mirror of
https://github.com/ceph/ceph
synced 2025-01-01 08:32:24 +00:00
rgw: reshard improvements
Improve the error log message emitted when renewing a reshard lock fails (for example, because the lock has already expired).

Add two new configurable options to manage resharding:

* rgw_reshard_batch_size: number of reshard entries to batch together before sending the operations to the CLS back-end.
* rgw_reshard_max_aio: maximum number of outstanding asynchronous I/O operations to allow at a time.

Change the default of rgw_reshard_bucket_lock_duration from 2 minutes to 6 minutes.

Add documentation, minimum values, tags, and service to a few rgw reshard configuration options. Change some rgw_reshard_* options from LEVEL_DEV to LEVEL_ADVANCED.

Signed-off-by: J. Eric Ivancich <ivancich@redhat.com>
This commit is contained in:
parent
5552971a8b
commit
b713bb77a1
@ -6261,13 +6261,32 @@ std::vector<Option> get_rgw_options() {
|
||||
.set_default(true)
|
||||
.set_description("Enable stats on bucket listing in Swift"),
|
||||
|
||||
Option("rgw_reshard_num_logs", Option::TYPE_INT, Option::LEVEL_DEV)
|
||||
Option("rgw_reshard_num_logs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
|
||||
.set_default(16)
|
||||
.set_description(""),
|
||||
.set_min(1)
|
||||
.set_description("")
|
||||
.add_service("rgw"),
|
||||
|
||||
Option("rgw_reshard_bucket_lock_duration", Option::TYPE_INT, Option::LEVEL_DEV)
|
||||
.set_default(120)
|
||||
.set_description(""),
|
||||
Option("rgw_reshard_bucket_lock_duration", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
|
||||
.set_default(360)
|
||||
.set_min(30)
|
||||
.set_description("Number of seconds the timeout on the reshard locks (bucket reshard lock and reshard log lock) are set to. As a reshard proceeds these locks can be renewed/extended. If too short, reshards cannot complete and will fail, causing a future reshard attempt. If too long a hung or crashed reshard attempt will keep the bucket locked for an extended period, not allowing RGW to detect the failed reshard attempt and recover.")
|
||||
.add_tag("performance")
|
||||
.add_service("rgw"),
|
||||
|
||||
Option("rgw_reshard_batch_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
|
||||
.set_default(64)
|
||||
.set_min(8)
|
||||
.set_description("Number of reshard entries to batch together before sending the operations to the CLS back-end")
|
||||
.add_tag("performance")
|
||||
.add_service("rgw"),
|
||||
|
||||
Option("rgw_reshard_max_aio", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
|
||||
.set_default(128)
|
||||
.set_min(16)
|
||||
.set_description("Maximum number of outstanding asynchronous I/O operations to allow at a time during resharding")
|
||||
.add_tag("performance")
|
||||
.add_service("rgw"),
|
||||
|
||||
Option("rgw_trust_forwarded_https", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
|
||||
.set_default(false)
|
||||
@ -6357,7 +6376,8 @@ std::vector<Option> get_rgw_options() {
|
||||
|
||||
Option("rgw_reshard_thread_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
|
||||
.set_default(10_min)
|
||||
.set_description(""),
|
||||
.set_min(10_min)
|
||||
.set_description("Number of seconds between processing of reshard log entries"),
|
||||
|
||||
Option("rgw_cache_expiry_interval", Option::TYPE_UINT,
|
||||
Option::LEVEL_ADVANCED)
|
||||
|
@ -2,6 +2,7 @@
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
|
||||
#include <limits>
|
||||
#include <sstream>
|
||||
|
||||
#include "rgw_rados.h"
|
||||
#include "rgw_bucket.h"
|
||||
@ -21,10 +22,6 @@ const string reshard_lock_name = "reshard_process";
|
||||
const string bucket_instance_lock_name = "bucket_instance_lock";
|
||||
|
||||
|
||||
#define RESHARD_SHARD_WINDOW 64
|
||||
#define RESHARD_MAX_AIO 128
|
||||
|
||||
|
||||
class BucketReshardShard {
|
||||
RGWRados *store;
|
||||
const RGWBucketInfo& bucket_info;
|
||||
@ -33,6 +30,8 @@ class BucketReshardShard {
|
||||
vector<rgw_cls_bi_entry> entries;
|
||||
map<uint8_t, rgw_bucket_category_stats> stats;
|
||||
deque<librados::AioCompletion *>& aio_completions;
|
||||
uint64_t max_aio_completions;
|
||||
uint64_t reshard_shard_batch_size;
|
||||
|
||||
int wait_next_completion() {
|
||||
librados::AioCompletion *c = aio_completions.front();
|
||||
@ -52,7 +51,7 @@ class BucketReshardShard {
|
||||
}
|
||||
|
||||
int get_completion(librados::AioCompletion **c) {
|
||||
if (aio_completions.size() >= RESHARD_MAX_AIO) {
|
||||
if (aio_completions.size() >= max_aio_completions) {
|
||||
int ret = wait_next_completion();
|
||||
if (ret < 0) {
|
||||
return ret;
|
||||
@ -74,6 +73,11 @@ public:
|
||||
{
|
||||
num_shard = (bucket_info.num_shards > 0 ? _num_shard : -1);
|
||||
bs.init(bucket_info.bucket, num_shard, nullptr /* no RGWBucketInfo */);
|
||||
|
||||
max_aio_completions =
|
||||
store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_max_aio");
|
||||
reshard_shard_batch_size =
|
||||
store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_batch_size");
|
||||
}
|
||||
|
||||
int get_num_shard() {
|
||||
@ -90,7 +94,7 @@ public:
|
||||
target.total_size_rounded += entry_stats.total_size_rounded;
|
||||
target.actual_size += entry_stats.actual_size;
|
||||
}
|
||||
if (entries.size() >= RESHARD_SHARD_WINDOW) {
|
||||
if (entries.size() >= reshard_shard_batch_size) {
|
||||
int ret = flush();
|
||||
if (ret < 0) {
|
||||
return ret;
|
||||
@ -401,7 +405,8 @@ RGWBucketReshardLock::RGWBucketReshardLock(RGWRados* _store,
|
||||
ephemeral(_ephemeral),
|
||||
internal_lock(reshard_lock_name)
|
||||
{
|
||||
const int lock_dur_secs = store->ctx()->_conf->rgw_reshard_bucket_lock_duration;
|
||||
const int lock_dur_secs = store->ctx()->_conf.get_val<uint64_t>(
|
||||
"rgw_reshard_bucket_lock_duration");
|
||||
duration = std::chrono::seconds(lock_dur_secs);
|
||||
|
||||
#define COOKIE_LEN 16
|
||||
@ -450,8 +455,14 @@ int RGWBucketReshardLock::renew(const Clock::time_point& now) {
|
||||
ret = internal_lock.lock_exclusive(&store->reshard_pool_ctx, lock_oid);
|
||||
}
|
||||
if (ret < 0) { /* expired or already locked by another processor */
|
||||
std::stringstream error_s;
|
||||
if (-ENOENT == ret) {
|
||||
error_s << "ENOENT (lock expired or never initially locked)";
|
||||
} else {
|
||||
error_s << ret << " (" << cpp_strerror(-ret) << ")";
|
||||
}
|
||||
ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " <<
|
||||
lock_oid << " with " << cpp_strerror(-ret) << dendl;
|
||||
lock_oid << " with error " << error_s.str() << dendl;
|
||||
return ret;
|
||||
}
|
||||
internal_lock.set_must_renew(false);
|
||||
@ -1093,7 +1104,7 @@ void *RGWReshard::ReshardWorker::entry() {
|
||||
|
||||
utime_t end = ceph_clock_now();
|
||||
end -= start;
|
||||
int secs = cct->_conf->rgw_reshard_thread_interval;
|
||||
int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
|
||||
|
||||
if (secs <= end.sec())
|
||||
continue; // next round
|
||||
|
Loading…
Reference in New Issue
Block a user