rgw: fix rgw versioned bucket stat accounting during reshard and check index
Fixes: https://tracker.ceph.com/issues/62760
Signed-off-by: Cory Snyder <csnyder@1111systems.com>

parent b79dcf640a
commit 152aadb71b
@@ -17,6 +17,39 @@

* CephFS: The period specifier ``m`` now implies minutes and the period specifier
  ``M`` now implies months. This has been made consistent with the rest
  of the system.
* RGW: New tools have been added to radosgw-admin for identifying and
  correcting issues with versioned bucket indexes. Historical bugs in the
  versioned bucket index transaction workflow made it possible for the index
  to accumulate extraneous "book-keeping" olh entries and plain placeholder
  entries. In some scenarios where clients made concurrent requests
  referencing the same object key, a large number of extra index entries
  could accumulate. When a significant number of these entries are present
  in a single bucket index shard, they can cause high bucket listing
  latencies and lifecycle processing failures. To check whether a versioned
  bucket has unnecessary olh entries, users can now run ``radosgw-admin
  bucket check olh``. If the ``--fix`` flag is used, the extra entries will
  be safely removed. A distinct issue is that some versioned buckets may
  also be retaining extra unlinked objects that are not listable from the
  S3/Swift APIs. These extra objects are typically the result of PUT requests
  that exited abnormally in the middle of a bucket index transaction, so the
  client would not have received a successful response. Bugs in prior
  releases made these unlinked objects easy to reproduce with any PUT request
  made on a bucket that was actively resharding. Besides the extra space that
  these hidden, unlinked objects consume, the failure mode that produced them
  can have another side effect: a client of an affected bucket may find the
  object associated with the key to be in an inconsistent state. To check
  whether a versioned bucket has unlinked entries, users can now run
  ``radosgw-admin bucket check unlinked``. If the ``--fix`` flag is used, the
  unlinked objects will be safely removed. Finally, a third issue made it
  possible for versioned bucket index stats to be accounted inaccurately. The
  tooling for recalculating versioned bucket stats also had a bug and was not
  previously capable of fixing these inaccuracies. This release resolves
  those issues, and users can now expect the existing ``radosgw-admin bucket
  check`` command to produce correct results. We recommend that users with
  versioned buckets, especially those that existed on prior releases, use
  these new tools to check whether their buckets are affected and to clean
  them up accordingly (example invocations are sketched below).
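As a quick reference, the checks described above could be invoked roughly as follows. The ``bucket check olh`` and ``bucket check unlinked`` subcommands and the ``--fix`` flag come from the note itself; the ``--bucket`` argument is assumed to follow the usual radosgw-admin convention for naming the target bucket, so confirm the exact spelling against the radosgw-admin help output:

    # report extraneous olh entries, then remove them
    radosgw-admin bucket check olh --bucket=<bucket-name>
    radosgw-admin bucket check olh --bucket=<bucket-name> --fix

    # report unlinked objects left behind by interrupted PUTs, then remove them
    radosgw-admin bucket check unlinked --bucket=<bucket-name>
    radosgw-admin bucket check unlinked --bucket=<bucket-name> --fix

    # recalculate versioned bucket stats with the existing check command
    radosgw-admin bucket check --bucket=<bucket-name> --fix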
>=18.0.0
@@ -676,77 +676,6 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
  }
} // rgw_bucket_list


static int check_index(cls_method_context_t hctx,
                       rgw_bucket_dir_header *existing_header,
                       rgw_bucket_dir_header *calc_header)
{
  int rc = read_bucket_header(hctx, existing_header);
  if (rc < 0) {
    CLS_LOG(1, "ERROR: check_index(): failed to read header\n");
    return rc;
  }

  calc_header->tag_timeout = existing_header->tag_timeout;
  calc_header->ver = existing_header->ver;
  calc_header->syncstopped = existing_header->syncstopped;

  map<string, bufferlist> keys;
  string start_obj;
  string filter_prefix;

#define CHECK_CHUNK_SIZE 1000
  bool done = false;
  bool more;

  do {
    rc = get_obj_vals(hctx, start_obj, filter_prefix, CHECK_CHUNK_SIZE, &keys, &more);
    if (rc < 0)
      return rc;

    for (auto kiter = keys.begin(); kiter != keys.end(); ++kiter) {
      if (!bi_is_plain_entry(kiter->first)) {
        done = true;
        break;
      }

      rgw_bucket_dir_entry entry;
      auto eiter = kiter->second.cbegin();
      try {
        decode(entry, eiter);
      } catch (ceph::buffer::error& err) {
        CLS_LOG(1, "ERROR: rgw_bucket_list(): failed to decode entry, key=%s", kiter->first.c_str());
        return -EIO;
      }
      if (entry.exists) {
        rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
        stats.num_entries++;
        stats.total_size += entry.meta.accounted_size;
        stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
        stats.actual_size += entry.meta.size;
      }

      start_obj = kiter->first;
    }
  } while (keys.size() == CHECK_CHUNK_SIZE && !done);

  return 0;
}

int rgw_bucket_check_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
  CLS_LOG(10, "entered %s", __func__);
  rgw_cls_check_index_ret ret;

  int rc = check_index(hctx, &ret.existing_header, &ret.calculated_header);
  if (rc < 0)
    return rc;

  encode(ret, *out);

  return 0;
}

static int write_bucket_header(cls_method_context_t hctx, rgw_bucket_dir_header *header)
{
  header->ver++;
@@ -757,18 +686,6 @@ static int write_bucket_header(cls_method_context_t hctx, rgw_bucket_dir_header
}


int rgw_bucket_rebuild_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
  CLS_LOG(10, "entered %s", __func__);
  rgw_bucket_dir_header existing_header;
  rgw_bucket_dir_header calc_header;
  int rc = check_index(hctx, &existing_header, &calc_header);
  if (rc < 0)
    return rc;

  return write_bucket_header(hctx, &calc_header);
}

int rgw_bucket_update_stats(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
  CLS_LOG(10, "entered %s", __func__);
@@ -3136,6 +3053,115 @@ static int list_olh_entries(cls_method_context_t hctx,
  return count;
}

static int check_index(cls_method_context_t hctx,
                       rgw_bucket_dir_header *existing_header,
                       rgw_bucket_dir_header *calc_header)
{
  int rc = read_bucket_header(hctx, existing_header);
  if (rc < 0) {
    CLS_LOG(1, "ERROR: check_index(): failed to read header\n");
    return rc;
  }

  calc_header->tag_timeout = existing_header->tag_timeout;
  calc_header->ver = existing_header->ver;
  calc_header->syncstopped = existing_header->syncstopped;

  std::list<rgw_cls_bi_entry> entries;
  string start_obj;
  string filter_prefix;

#define CHECK_CHUNK_SIZE 1000
  bool more;

  do {
    rc = list_plain_entries(hctx, filter_prefix, start_obj, CHECK_CHUNK_SIZE, &entries, &more);
    if (rc < 0) {
      return rc;
    }

    for (const auto & bientry : entries) {
      rgw_bucket_dir_entry entry;
      auto diter = bientry.data.cbegin();
      try {
        decode(entry, diter);
      } catch (ceph::buffer::error& err) {
        CLS_LOG(1, "ERROR:check_index(): failed to decode entry, key=%s", bientry.idx.c_str());
        return -EIO;
      }

      if (entry.exists && entry.key.instance.empty()) {
        rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
        stats.num_entries++;
        stats.total_size += entry.meta.accounted_size;
        stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
        stats.actual_size += entry.meta.size;
      }
      start_obj = bientry.idx;
    }
    entries.clear();
  } while (more);

  start_obj = "";
  do {
    rc = list_instance_entries(hctx, filter_prefix, start_obj, CHECK_CHUNK_SIZE, &entries, &more);
    if (rc < 0) {
      return rc;
    }

    for (const auto & bientry : entries) {
      rgw_bucket_dir_entry entry;
      auto diter = bientry.data.cbegin();
      try {
        decode(entry, diter);
      } catch (ceph::buffer::error& err) {
        CLS_LOG(1, "ERROR:check_index(): failed to decode entry, key=%s", bientry.idx.c_str());
        return -EIO;
      }

      if (entry.exists) {
        rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
        stats.num_entries++;
        stats.total_size += entry.meta.accounted_size;
        stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
        stats.actual_size += entry.meta.size;
      }
      start_obj = bientry.idx;
    }
    entries.clear();
  } while (more);

  return 0;
}

int rgw_bucket_rebuild_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
  CLS_LOG(10, "entered %s", __func__);
  rgw_bucket_dir_header existing_header;
  rgw_bucket_dir_header calc_header;
  int rc = check_index(hctx, &existing_header, &calc_header);
  if (rc < 0)
    return rc;

  return write_bucket_header(hctx, &calc_header);
}


int rgw_bucket_check_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
  CLS_LOG(10, "entered %s", __func__);
  rgw_cls_check_index_ret ret;

  int rc = check_index(hctx, &ret.existing_header, &ret.calculated_header);
  if (rc < 0)
    return rc;

  encode(ret, *out);

  return 0;
}


/* Lists all the entries that appear in a bucket index listing.
 *
 * It may not be obvious why this function calls three other "segment"
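To make the two headers returned by rgw_bucket_check_index() concrete, here is a minimal sketch, not part of this commit, of how a consumer of the reply might detect stats drift by comparing the existing header against the recalculated one. It uses only the types and fields visible in the code above (rgw_cls_check_index_ret, rgw_bucket_dir_header::stats, rgw_bucket_category_stats); the helper itself and the header path are assumptions.

#include "cls/rgw/cls_rgw_ops.h"  // rgw_cls_check_index_ret (assumed header path)

// Illustrative sketch only: compare the existing and freshly calculated bucket
// index headers, as encoded by rgw_bucket_check_index() above.
static bool bucket_stats_match_sketch(const rgw_cls_check_index_ret& ret)
{
  for (const auto& [category, calc] : ret.calculated_header.stats) {
    auto it = ret.existing_header.stats.find(category);
    if (it == ret.existing_header.stats.end()) {
      // a category present only in the recalculated header counts as drift
      // unless it is empty
      if (calc.num_entries != 0 || calc.total_size != 0) {
        return false;
      }
      continue;
    }
    const rgw_bucket_category_stats& cur = it->second;
    if (cur.num_entries != calc.num_entries ||
        cur.total_size != calc.total_size ||
        cur.total_size_rounded != calc.total_size_rounded ||
        cur.actual_size != calc.actual_size) {
      return false;
    }
  }
  return true;
}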
@@ -385,39 +385,31 @@ bool rgw_cls_bi_entry::get_info(cls_rgw_obj_key *key,
                                RGWObjCategory *category,
                                rgw_bucket_category_stats *accounted_stats)
{
  bool account = false;
  auto iter = data.cbegin();
  using ceph::decode;
  switch (type) {
  case BIIndexType::Plain:
    account = true;
    // NO BREAK; falls through to case InstanceIdx:
  case BIIndexType::Instance:
    {
      rgw_bucket_dir_entry entry;
      decode(entry, iter);
      account = (account && entry.exists);
      *key = entry.key;
      *category = entry.meta.category;
      accounted_stats->num_entries++;
      accounted_stats->total_size += entry.meta.accounted_size;
      accounted_stats->total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
      accounted_stats->actual_size += entry.meta.size;
    }
    break;
  case BIIndexType::OLH:
    {
      rgw_bucket_olh_entry entry;
      decode(entry, iter);
      *key = entry.key;
    }
    break;
  default:
    break;
  auto iter = data.cbegin();
  if (type == BIIndexType::OLH) {
    rgw_bucket_olh_entry entry;
    decode(entry, iter);
    *key = entry.key;
    return false;
  }

  return account;
  rgw_bucket_dir_entry entry;
  decode(entry, iter);
  *key = entry.key;
  *category = entry.meta.category;
  accounted_stats->num_entries++;
  accounted_stats->total_size += entry.meta.accounted_size;
  accounted_stats->total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
  accounted_stats->actual_size += entry.meta.size;
  if (type == BIIndexType::Plain) {
    return entry.exists && entry.key.instance.empty();
  } else if (type == BIIndexType::Instance) {
    return entry.exists;
  }
  return false;
}

void rgw_cls_bi_entry::generate_test_instances(list<rgw_cls_bi_entry*>& o)
{
  using ceph::encode;
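For context on how the refactored get_info() return value is meant to be consumed, here is a small illustrative sketch, not taken from this commit: only entries for which get_info() returns true are folded into calculated stats, so olh book-keeping entries, non-current plain entries, and non-existent entries no longer inflate the totals. The helper name and surrounding structure are hypothetical; get_info()'s signature, the stats fields, and the header path are as assumed below.

#include <map>
#include "cls/rgw/cls_rgw_types.h"  // rgw_cls_bi_entry, rgw_bucket_category_stats (assumed header path)

// Hypothetical helper: fold one bucket index entry into per-category stats,
// honoring get_info()'s "should this entry be accounted?" return value.
static void add_entry_stats_sketch(rgw_cls_bi_entry& bientry,
                                   std::map<RGWObjCategory, rgw_bucket_category_stats>& target_stats)
{
  cls_rgw_obj_key key;
  RGWObjCategory category;
  rgw_bucket_category_stats delta{};  // get_info() adds this entry's contribution into it
  if (!bientry.get_info(&key, &category, &delta)) {
    return;  // olh entries, non-current plain entries, and non-existent entries
  }
  rgw_bucket_category_stats& s = target_stats[category];
  s.num_entries += delta.num_entries;
  s.total_size += delta.total_size;
  s.total_size_rounded += delta.total_size_rounded;
  s.actual_size += delta.actual_size;
}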