From 9114e5e50995f0c7d2be5c24aa4712d89cd89f48 Mon Sep 17 00:00:00 2001 From: "Adam C. Emerson" Date: Thu, 16 Nov 2017 14:42:58 -0500 Subject: [PATCH 1/9] rgw: Add try_refresh_bucket_info function Sometimes operations fail with -ECANCELED. This means we got raced. If this happens we should update our bucket info from cache and try again. Some user reports suggest that our cache may be getting and staying out of sync. This is a bug and should be fixed, but it would also be nice if we were robust enough to notice the problem and refresh. So in that case, we invalidate the cache and fetch direct from the OSD, putting a warning in the log. Signed-off-by: Adam C. Emerson --- src/rgw/rgw_rados.cc | 45 +++++++++++++++++++++++++++++++++++++++++--- src/rgw/rgw_rados.h | 28 +++++++++++++++++++++++---- 2 files changed, 66 insertions(+), 7 deletions(-) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 3806bd0925b..04df975ead9 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -11874,15 +11874,27 @@ int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx, return 0; } -int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx, - const string& tenant, const string& bucket_name, RGWBucketInfo& info, - real_time *pmtime, map *pattrs) +int RGWRados::_get_bucket_info(RGWObjectCtx& obj_ctx, + const string& tenant, + const string& bucket_name, + RGWBucketInfo& info, + real_time *pmtime, + map *pattrs, + boost::optional refresh_version) { bucket_info_entry e; string bucket_entry; rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry); + if (binfo_cache->find(bucket_entry, &e)) { + if (refresh_version && + e.info.objv_tracker.read_version.compare(&(*refresh_version))) { + lderr(cct) << "WARNING: The bucket info cache is inconsistent. This is " + << "a failure that should be debugged. I am a nice machine, " + << "so I will try to recover." << dendl; + binfo_cache->invalidate(bucket_entry); + } info = e.info; if (pattrs) *pattrs = e.attrs; @@ -11933,6 +11945,7 @@ int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx, e.info.ep_objv = ot.read_version; info = e.info; if (ret < 0) { + lderr(cct) << "ERROR: get_bucket_instance_from_oid failed: " << ret << dendl; info.bucket.tenant = tenant; info.bucket.name = bucket_name; // XXX and why return anything in case of an error anyway? @@ -11954,9 +11967,35 @@ int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx, ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl; } + if (refresh_version && + refresh_version->compare(&info.objv_tracker.read_version)) { + lderr(cct) << "WARNING: The OSD has the same version I have. Something may " + << "have gone squirrelly. An administrator may have forced a " + << "change; otherwise there is a problem somewhere." << dendl; + } + return 0; } +int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx, + const string& tenant, const string& bucket_name, + RGWBucketInfo& info, + real_time *pmtime, map *pattrs) +{ + return _get_bucket_info(obj_ctx, tenant, bucket_name, info, pmtime, + pattrs, boost::none); +} + +int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info, + ceph::real_time *pmtime, + map *pattrs) +{ + RGWObjectCtx obj_ctx(this); + + return _get_bucket_info(obj_ctx, info.bucket.tenant, info.bucket.name, + info, pmtime, pattrs, info.objv_tracker.read_version); +} + int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point, bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime, map *pattrs) diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 1c95d0ac89b..57f8bf09c99 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -3407,12 +3407,32 @@ public: int convert_old_bucket_info(RGWObjectCtx& obj_ctx, const string& tenant_name, const string& bucket_name); static void make_bucket_entry_name(const string& tenant_name, const string& bucket_name, string& bucket_entry); + + +private: + int _get_bucket_info(RGWObjectCtx& obj_ctx, const string& tenant, + const string& bucket_name, RGWBucketInfo& info, + real_time *pmtime, + map *pattrs, + boost::optional refresh_version); +public: + + int get_bucket_info(RGWObjectCtx& obj_ctx, - const string& tenant_name, const string& bucket_name, - RGWBucketInfo& info, - ceph::real_time *pmtime, map *pattrs = NULL); + const string& tenant_name, const string& bucket_name, + RGWBucketInfo& info, + ceph::real_time *pmtime, map *pattrs = NULL); + + // Returns true on successful refresh. Returns false if there was an + // error or the version stored on the OSD is the same as that + // presented in the BucketInfo structure. + // + int try_refresh_bucket_info(RGWBucketInfo& info, + ceph::real_time *pmtime, + map *pattrs = nullptr); + int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv, - map *pattrs, bool create_entry_point); + map *pattrs, bool create_entry_point); int cls_rgw_init_index(librados::IoCtx& io_ctx, librados::ObjectWriteOperation& op, string& oid); int cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr); From 1a3fcc70c0747791aa423cd0aa7d2596eaf3d73c Mon Sep 17 00:00:00 2001 From: "Adam C. Emerson" Date: Fri, 17 Nov 2017 15:51:42 -0500 Subject: [PATCH 2/9] rgw: Add retry_raced_bucket_write If the OSD informs us that our bucket info is out of date when we need to write, we should have a way to update it. This template function allows us to wrap relevant sections of code so they'll be retried against new bucket info on -ECANCELED. Signed-off-by: Adam C. Emerson --- src/rgw/rgw_op.cc | 31 +++++++++++++++++++++++++++++++ src/rgw/rgw_op.h | 1 + 2 files changed, 32 insertions(+) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 4003bea61ff..d0404f77fa0 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -630,6 +630,37 @@ void rgw_bucket_object_pre_exec(struct req_state *s) dump_bucket_from_state(s); } +// So! Now and then when we try to update bucket information, the +// bucket has changed during the course of the operation. (Or we have +// a cache consistency problem that Watch/Notify isn't ruling out +// completely.) +// +// When this happens, we need to update the bucket info and try +// again. We have, however, to try the right *part* again. We can't +// simply re-send, since that will obliterate the previous update. +// +// Thus, callers of this function should include everything that +// merges information to be changed into the bucket information as +// well as the call to set it. +// +// The called function must return an integer, negative on error. In +// general, they should just return op_ret. +namespace { +template +int retry_raced_bucket_write(RGWRados* g, req_state* s, const F& f) { + auto r = f(); + for (auto i = 0u; i < 15u && r == -ECANCELED; ++i) { + r = g->try_refresh_bucket_info(s->bucket_info, nullptr, + &s->bucket_attrs); + if (r >= 0) { + r = f(); + } + } + return r; +} +} + + int RGWGetObj::verify_permission() { obj = rgw_obj(s->bucket, s->object); diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index 1cb44d2c48e..22fe2e7ffdb 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -120,6 +120,7 @@ protected: int do_aws4_auth_completion(); virtual int init_quota(); + public: RGWOp() : s(nullptr), From ebb86301b20098e15824f469001f6153b27965f5 Mon Sep 17 00:00:00 2001 From: "Adam C. Emerson" Date: Fri, 17 Nov 2017 15:53:05 -0500 Subject: [PATCH 3/9] rgw: Handle stale bucket info in RGWPutMetadataBucket Signed-off-by: Adam C. Emerson --- src/rgw/rgw_op.cc | 92 +++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index d0404f77fa0..42f6d121b2f 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -4017,55 +4017,61 @@ void RGWPutMetadataBucket::execute() return; } - /* Encode special metadata first as we're using std::map::emplace under - * the hood. This method will add the new items only if the map doesn't - * contain such keys yet. */ - if (has_policy) { - if (s->dialect.compare("swift") == 0) { - auto old_policy = \ - static_cast(s->bucket_acl.get()); - auto new_policy = static_cast(&policy); - new_policy->filter_merge(policy_rw_mask, old_policy); - policy = *new_policy; - } - buffer::list bl; - policy.encode(bl); - emplace_attr(RGW_ATTR_ACL, std::move(bl)); - } + op_ret = retry_raced_bucket_write(store, s, [this] { + /* Encode special metadata first as we're using std::map::emplace under + * the hood. This method will add the new items only if the map doesn't + * contain such keys yet. */ + if (has_policy) { + if (s->dialect.compare("swift") == 0) { + auto old_policy = \ + static_cast(s->bucket_acl.get()); + auto new_policy = static_cast(&policy); + new_policy->filter_merge(policy_rw_mask, old_policy); + policy = *new_policy; + } + buffer::list bl; + policy.encode(bl); + emplace_attr(RGW_ATTR_ACL, std::move(bl)); + } - if (has_cors) { - buffer::list bl; - cors_config.encode(bl); - emplace_attr(RGW_ATTR_CORS, std::move(bl)); - } + if (has_cors) { + buffer::list bl; + cors_config.encode(bl); + emplace_attr(RGW_ATTR_CORS, std::move(bl)); + } - /* It's supposed that following functions WILL NOT change any special - * attributes (like RGW_ATTR_ACL) if they are already present in attrs. */ - prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs); - populate_with_generic_attrs(s, attrs); + /* It's supposed that following functions WILL NOT change any + * special attributes (like RGW_ATTR_ACL) if they are already + * present in attrs. */ + prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs); + populate_with_generic_attrs(s, attrs); - /* According to the Swift's behaviour and its container_quota WSGI middleware - * implementation: anyone with write permissions is able to set the bucket - * quota. This stays in contrast to account quotas that can be set only by - * clients holding reseller admin privileges. */ - op_ret = filter_out_quota_info(attrs, rmattr_names, s->bucket_info.quota); - if (op_ret < 0) { - return; - } + /* According to the Swift's behaviour and its container_quota + * WSGI middleware implementation: anyone with write permissions + * is able to set the bucket quota. This stays in contrast to + * account quotas that can be set only by clients holding + * reseller admin privileges. */ + op_ret = filter_out_quota_info(attrs, rmattr_names, s->bucket_info.quota); + if (op_ret < 0) { + return op_ret; + } - if (swift_ver_location) { - s->bucket_info.swift_ver_location = *swift_ver_location; - s->bucket_info.swift_versioning = (! swift_ver_location->empty()); - } + if (swift_ver_location) { + s->bucket_info.swift_ver_location = *swift_ver_location; + s->bucket_info.swift_versioning = (!swift_ver_location->empty()); + } - /* Web site of Swift API. */ - filter_out_website(attrs, rmattr_names, s->bucket_info.website_conf); - s->bucket_info.has_website = !s->bucket_info.website_conf.is_empty(); + /* Web site of Swift API. */ + filter_out_website(attrs, rmattr_names, s->bucket_info.website_conf); + s->bucket_info.has_website = !s->bucket_info.website_conf.is_empty(); - /* Setting attributes also stores the provided bucket info. Due to this - * fact, the new quota settings can be serialized with the same call. */ - op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, - &s->bucket_info.objv_tracker); + /* Setting attributes also stores the provided bucket info. Due + * to this fact, the new quota settings can be serialized with + * the same call. */ + op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, + &s->bucket_info.objv_tracker); + return op_ret; + }); } int RGWPutMetadataObject::verify_permission() From a0a1e7c2ef992b8758bcfb20d893730c1b202475 Mon Sep 17 00:00:00 2001 From: "Adam C. Emerson" Date: Fri, 17 Nov 2017 15:59:44 -0500 Subject: [PATCH 4/9] rgw: Handle stale bucket info in RGWSetBucketVersioning Signed-off-by: Adam C. Emerson --- src/rgw/rgw_op.cc | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 42f6d121b2f..1761c570072 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -2090,17 +2090,20 @@ void RGWSetBucketVersioning::execute() } } - if (versioning_status == VersioningEnabled) { - s->bucket_info.flags |= BUCKET_VERSIONED; - s->bucket_info.flags &= ~BUCKET_VERSIONS_SUSPENDED; - } else if (versioning_status == VersioningSuspended) { - s->bucket_info.flags |= (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED); - } else { - return; - } + op_ret = retry_raced_bucket_write(store, s, [this] { + if (versioning_status == VersioningEnabled) { + s->bucket_info.flags |= BUCKET_VERSIONED; + s->bucket_info.flags &= ~BUCKET_VERSIONS_SUSPENDED; + } else if (versioning_status == VersioningSuspended) { + s->bucket_info.flags |= (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED); + } else { + return op_ret; + } + op_ret = store->put_bucket_instance_info(s->bucket_info, false, real_time(), + &s->bucket_attrs); + return op_ret; + }); - op_ret = store->put_bucket_instance_info(s->bucket_info, false, real_time(), - &s->bucket_attrs); if (op_ret < 0) { ldout(s->cct, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket.name << " returned err=" << op_ret << dendl; From b2b7385f194def1025a8947bab876c9856b06400 Mon Sep 17 00:00:00 2001 From: "Adam C. Emerson" Date: Fri, 17 Nov 2017 16:03:13 -0500 Subject: [PATCH 5/9] rgw: Handle stale bucket info in RGWSetBucketWebsite Signed-off-by: Adam C. Emerson --- src/rgw/rgw_op.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 1761c570072..325f3556fa0 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -2153,10 +2153,14 @@ void RGWSetBucketWebsite::execute() } } - s->bucket_info.has_website = true; - s->bucket_info.website_conf = website_conf; + op_ret = retry_raced_bucket_write(store, s, [this] { + s->bucket_info.has_website = true; + s->bucket_info.website_conf = website_conf; + op_ret = store->put_bucket_instance_info(s->bucket_info, false, + real_time(), &s->bucket_attrs); + return op_ret; + }); - op_ret = store->put_bucket_instance_info(s->bucket_info, false, real_time(), &s->bucket_attrs); if (op_ret < 0) { ldout(s->cct, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket.name << " returned err=" << op_ret << dendl; return; From f4d274248e43cb38ff2b27782c010b2c35b12b2b Mon Sep 17 00:00:00 2001 From: "Adam C. Emerson" Date: Fri, 17 Nov 2017 16:05:06 -0500 Subject: [PATCH 6/9] rgw: Handle stale bucket info in RGWDeleteBucketWebsite Signed-off-by: Adam C. Emerson --- src/rgw/rgw_op.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 325f3556fa0..770c6609e64 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -2179,10 +2179,13 @@ void RGWDeleteBucketWebsite::pre_exec() void RGWDeleteBucketWebsite::execute() { - s->bucket_info.has_website = false; - s->bucket_info.website_conf = RGWBucketWebsiteConf(); - - op_ret = store->put_bucket_instance_info(s->bucket_info, false, real_time(), &s->bucket_attrs); + op_ret = retry_raced_bucket_write(store, s, [this] { + s->bucket_info.has_website = false; + s->bucket_info.website_conf = RGWBucketWebsiteConf(); + op_ret = store->put_bucket_instance_info(s->bucket_info, false, + real_time(), &s->bucket_attrs); + return op_ret; + }); if (op_ret < 0) { ldout(s->cct, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket.name << " returned err=" << op_ret << dendl; return; From 1738b4f6b726b462abb436f78026c1577b55f05e Mon Sep 17 00:00:00 2001 From: "Adam C. Emerson" Date: Fri, 17 Nov 2017 16:15:04 -0500 Subject: [PATCH 7/9] rgw: Handle stale bucket info in RGWPutBucketPolicy Signed-off-by: Adam C. Emerson --- src/rgw/rgw_op.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 770c6609e64..aa62e40f24a 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -6879,15 +6879,15 @@ void RGWPutBucketPolicy::execute() } try { - Policy p(s->cct, s->bucket_tenant, in_data); - auto attrs = s->bucket_attrs; - attrs[RGW_ATTR_IAM_POLICY].clear(); - attrs[RGW_ATTR_IAM_POLICY].append(p.text); - op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, - &s->bucket_info.objv_tracker); - if (op_ret == -ECANCELED) { - op_ret = 0; /* lost a race, but it's ok because policies are immutable */ - } + const Policy p(s->cct, s->bucket_tenant, in_data); + op_ret = retry_raced_bucket_write(store, s, [&p, this] { + auto attrs = s->bucket_attrs; + attrs[RGW_ATTR_IAM_POLICY].clear(); + attrs[RGW_ATTR_IAM_POLICY].append(p.text); + op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, + &s->bucket_info.objv_tracker); + return op_ret; + }); } catch (rgw::IAM::PolicyParseException& e) { ldout(s->cct, 20) << "failed to parse policy: " << e.what() << dendl; op_ret = -EINVAL; From e397b7e6d0c49d625fb2b2363311e6486f2045fe Mon Sep 17 00:00:00 2001 From: "Adam C. Emerson" Date: Fri, 17 Nov 2017 16:16:38 -0500 Subject: [PATCH 8/9] rgw: Handle stale bucket info in RGWDeleteBucketPolicy Signed-off-by: Adam C. Emerson --- src/rgw/rgw_op.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index aa62e40f24a..95aec1ba175 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -6955,11 +6955,11 @@ int RGWDeleteBucketPolicy::verify_permission() void RGWDeleteBucketPolicy::execute() { - auto attrs = s->bucket_attrs; - attrs.erase(RGW_ATTR_IAM_POLICY); - op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, - &s->bucket_info.objv_tracker); - if (op_ret == -ECANCELED) { - op_ret = 0; /* lost a race, but it's ok because policies are immutable */ - } + op_ret = retry_raced_bucket_write(store, s, [this] { + auto attrs = s->bucket_attrs; + attrs.erase(RGW_ATTR_IAM_POLICY); + op_ret = rgw_bucket_set_attrs(store, s->bucket_info, attrs, + &s->bucket_info.objv_tracker); + return op_ret; + }); } From 4489cb58a15647a31ac0546d70400af5668404cb Mon Sep 17 00:00:00 2001 From: "Adam C. Emerson" Date: Fri, 17 Nov 2017 17:15:26 -0500 Subject: [PATCH 9/9] rgw: Expire entries in bucket info cache To bound the degree to which an RGW instance can go out to lunch if the watch/notify breaks down, force refresh of any cache entry over a certain age. Fifteen minutes by default, and expiration can be turned off entirely. This is separate from the LRU. The LRU removes entries based on the last time of access. This expiration patch forces refresh based on the last time they were updated. Signed-off-by: Adam C. Emerson --- src/common/options.cc | 22 +++++++++++++++++++++- src/rgw/rgw_rados.h | 18 ++++++++++++++---- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/src/common/options.cc b/src/common/options.cc index 0064af66523..78365c699e0 100644 --- a/src/common/options.cc +++ b/src/common/options.cc @@ -4300,7 +4300,7 @@ std::vector