rgw: fix rgw versioned bucket stat accounting during reshard and check index

Fixes: https://tracker.ceph.com/issues/62760
Signed-off-by: Cory Snyder <csnyder@1111systems.com>
This commit is contained in:
Cory Snyder 2023-09-07 17:23:14 +00:00
parent b79dcf640a
commit 152aadb71b
3 changed files with 163 additions and 112 deletions

View File

@ -17,6 +17,39 @@
* CephFS: The period specifier ``m`` now implies minutes and the period specifier
``M`` now implies months. This has been made consistent with the rest
of the system.
* RGW: New tools have been added to radosgw-admin for identifying and
correcting issues with versioned bucket indexes. Historical bugs with the
versioned bucket index transaction workflow made it possible for the index
to accumulate extraneous "book-keeping" olh entries and plain placeholder
entries. In some specific scenarios where clients made concurrent requests
referencing the same object key, it was likely that a lot of extra index
entries would accumulate. When a significant number of these entries are
present in a single bucket index shard, they can cause high bucket listing
latencies and lifecycle processing failures. To check whether a versioned
bucket has unnecessary olh entries, users can now run ``radosgw-admin
bucket check olh``. If the ``--fix`` flag is used, the extra entries will
be safely removed. Distinct from the issue described thus far, it is
also possible that some versioned buckets are maintaining extra unlinked
objects that are not listable from the S3/Swift APIs. These extra objects
are typically a result of PUT requests that exited abnormally, in the middle
of a bucket index transaction - so the client would not have received a
successful response. Bugs in prior releases made these unlinked objects easy
to reproduce with any PUT request that was made on a bucket that was actively
resharding. Besides the extra space that these hidden, unlinked objects
consume, there can be another side effect in certain scenarios, caused by
the nature of the failure mode that produced them, where a client of a bucket
that was a victim of this bug may find the object associated with the key to
be in an inconsistent state. To check whether a versioned bucket has unlinked
entries, users can now run ``radosgw-admin bucket check unlinked``. If the
``--fix`` flag is used, the unlinked objects will be safely removed. Finally,
a third issue made it possible for versioned bucket index stats to be
accounted inaccurately. The tooling for recalculating versioned bucket stats
also had a bug, and was not previously capable of fixing these inaccuracies.
This release resolves those issues and users can now expect that the existing
``radosgw-admin bucket check`` command will produce correct results. We
recommend that users with versioned buckets, especially those that existed
on prior releases, use these new tools to check whether their buckets are
affected and to clean them up accordingly.
>=18.0.0

View File

@ -676,77 +676,6 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
}
} // rgw_bucket_list
/*
 * Read the stored bucket index header into *existing_header and
 * recalculate per-category stats into *calc_header by scanning the
 * plain index entries, so callers can compare or repair the header.
 *
 * NOTE(review): only plain entries are accounted — the scan stops at
 * the first key that is not a plain entry, so instance/OLH entries of
 * versioned buckets are never counted by this version.
 *
 * Returns 0 on success; negative error code on header read failure,
 * listing failure, or entry decode failure (-EIO).
 */
static int check_index(cls_method_context_t hctx,
                       rgw_bucket_dir_header *existing_header,
                       rgw_bucket_dir_header *calc_header)
{
  int rc = read_bucket_header(hctx, existing_header);
  if (rc < 0) {
    CLS_LOG(1, "ERROR: check_index(): failed to read header\n");
    return rc;
  }

  // carry over header fields that are not derived from the entry scan
  calc_header->tag_timeout = existing_header->tag_timeout;
  calc_header->ver = existing_header->ver;
  calc_header->syncstopped = existing_header->syncstopped;

  map<string, bufferlist> keys;
  string start_obj;
  string filter_prefix;
#define CHECK_CHUNK_SIZE 1000
  bool done = false;
  bool more;

  do {
    rc = get_obj_vals(hctx, start_obj, filter_prefix, CHECK_CHUNK_SIZE, &keys, &more);
    if (rc < 0) {
      return rc;
    }

    for (auto kiter = keys.begin(); kiter != keys.end(); ++kiter) {
      // plain entries sort first; the first non-plain key ends the scan
      if (!bi_is_plain_entry(kiter->first)) {
        done = true;
        break;
      }

      rgw_bucket_dir_entry entry;
      auto eiter = kiter->second.cbegin();
      try {
        decode(entry, eiter);
      } catch (ceph::buffer::error& err) {
        // fix: log the actual function name (previously said
        // "rgw_bucket_list()", which made the error misleading)
        CLS_LOG(1, "ERROR: check_index(): failed to decode entry, key=%s", kiter->first.c_str());
        return -EIO;
      }

      if (entry.exists) {
        rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
        stats.num_entries++;
        stats.total_size += entry.meta.accounted_size;
        stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
        stats.actual_size += entry.meta.size;
      }
      start_obj = kiter->first;
    }
  } while (keys.size() == CHECK_CHUNK_SIZE && !done);

  return 0;
}
/*
 * cls entry point: recompute the bucket index stats and return both
 * the stored header and the freshly calculated one to the caller.
 */
int rgw_bucket_check_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
  CLS_LOG(10, "entered %s", __func__);

  rgw_cls_check_index_ret ret;
  const int rc = check_index(hctx, &ret.existing_header, &ret.calculated_header);
  if (rc < 0) {
    return rc;
  }

  encode(ret, *out);
  return 0;
}
static int write_bucket_header(cls_method_context_t hctx, rgw_bucket_dir_header *header)
{
header->ver++;
@ -757,18 +686,6 @@ static int write_bucket_header(cls_method_context_t hctx, rgw_bucket_dir_header
}
/*
 * cls entry point: recompute the bucket index stats and persist the
 * recalculated header, replacing the stored one.
 */
int rgw_bucket_rebuild_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
  CLS_LOG(10, "entered %s", __func__);

  rgw_bucket_dir_header existing_header;
  rgw_bucket_dir_header calc_header;

  const int rc = check_index(hctx, &existing_header, &calc_header);
  if (rc < 0) {
    return rc;
  }

  return write_bucket_header(hctx, &calc_header);
}
int rgw_bucket_update_stats(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
CLS_LOG(10, "entered %s", __func__);
@ -3136,6 +3053,115 @@ static int list_olh_entries(cls_method_context_t hctx,
return count;
}
/*
 * Read the stored bucket index header into *existing_header and
 * recalculate per-category stats into *calc_header by scanning both
 * the plain and the instance entries of the index, so callers can
 * compare or repair the stored header.
 *
 * During the plain pass, entries whose key carries a non-empty
 * instance are skipped: for versioned keys the accounting comes from
 * the corresponding instance entry, so counting both would
 * double-account.
 *
 * Returns 0 on success; negative error code on header read failure,
 * listing failure, or entry decode failure (-EIO).
 */
static int check_index(cls_method_context_t hctx,
                       rgw_bucket_dir_header *existing_header,
                       rgw_bucket_dir_header *calc_header)
{
  int rc = read_bucket_header(hctx, existing_header);
  if (rc < 0) {
    CLS_LOG(1, "ERROR: check_index(): failed to read header\n");
    return rc;
  }

  // carry over header fields that are not derived from the entry scan
  calc_header->tag_timeout = existing_header->tag_timeout;
  calc_header->ver = existing_header->ver;
  calc_header->syncstopped = existing_header->syncstopped;

  std::list<rgw_cls_bi_entry> entries;
  string filter_prefix;
#define CHECK_CHUNK_SIZE 1000

  // Shared scan loop over one class of index entries. `lister` pages
  // through entries CHECK_CHUNK_SIZE at a time; `plain_pass` enables
  // the versioned-key (non-empty instance) skip described above. This
  // replaces two nearly identical hand-written loops.
  auto scan = [&](auto&& lister, bool plain_pass) -> int {
    string start_obj;
    bool more = false;
    do {
      int r = lister(hctx, filter_prefix, start_obj, CHECK_CHUNK_SIZE, &entries, &more);
      if (r < 0) {
        return r;
      }
      for (const auto& bientry : entries) {
        rgw_bucket_dir_entry entry;
        auto diter = bientry.data.cbegin();
        try {
          decode(entry, diter);
        } catch (ceph::buffer::error& err) {
          CLS_LOG(1, "ERROR:check_index(): failed to decode entry, key=%s", bientry.idx.c_str());
          return -EIO;
        }
        if (entry.exists && (!plain_pass || entry.key.instance.empty())) {
          rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category];
          stats.num_entries++;
          stats.total_size += entry.meta.accounted_size;
          stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
          stats.actual_size += entry.meta.size;
        }
        start_obj = bientry.idx;
      }
      entries.clear();
    } while (more);
    return 0;
  };

  // first pass: plain entries (skipping versioned-key placeholders)
  rc = scan([](auto&&... args) {
    return list_plain_entries(std::forward<decltype(args)>(args)...);
  }, true);
  if (rc < 0) {
    return rc;
  }

  // second pass: instance entries of versioned objects
  return scan([](auto&&... args) {
    return list_instance_entries(std::forward<decltype(args)>(args)...);
  }, false);
}
/*
 * cls entry point: rebuild the bucket index header. Recalculates the
 * stats via check_index() and writes the result back, discarding the
 * previously stored values.
 */
int rgw_bucket_rebuild_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
  CLS_LOG(10, "entered %s", __func__);

  rgw_bucket_dir_header existing_header;
  rgw_bucket_dir_header calc_header;
  if (const int rc = check_index(hctx, &existing_header, &calc_header); rc < 0) {
    return rc;
  }

  return write_bucket_header(hctx, &calc_header);
}
/*
 * cls entry point: check the bucket index header. Returns the stored
 * header alongside a freshly recalculated one so the caller can spot
 * accounting drift.
 */
int rgw_bucket_check_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
  CLS_LOG(10, "entered %s", __func__);

  rgw_cls_check_index_ret ret;
  if (const int rc = check_index(hctx, &ret.existing_header, &ret.calculated_header); rc < 0) {
    return rc;
  }

  encode(ret, *out);
  return 0;
}
/* Lists all the entries that appear in a bucket index listing.
*
* It may not be obvious why this function calls three other "segment"

View File

// NOTE(review): this span is a rendered diff hunk of
// rgw_cls_bi_entry::get_info — it interleaves the removed (switch-based)
// body with the added (early-return) body. The tokens below are kept
// byte-identical to the rendering; only comments were added.
@ -385,39 +385,31 @@ bool rgw_cls_bi_entry::get_info(cls_rgw_obj_key *key,
RGWObjCategory *category,
rgw_bucket_category_stats *accounted_stats)
{
// --- removed body (pre-change): switch on the entry type; Plain falls
// through to Instance so both decode a rgw_bucket_dir_entry, but the
// return value ("account") is true only for existing Plain entries.
bool account = false;
auto iter = data.cbegin();
using ceph::decode;
switch (type) {
case BIIndexType::Plain:
account = true;
// NO BREAK; falls through to case InstanceIdx:
case BIIndexType::Instance:
{
rgw_bucket_dir_entry entry;
decode(entry, iter);
account = (account && entry.exists);
*key = entry.key;
*category = entry.meta.category;
// stats are accumulated unconditionally here, regardless of whether
// the entry is accounted — presumably the caller gates on the return
// value; verify against callers.
accounted_stats->num_entries++;
accounted_stats->total_size += entry.meta.accounted_size;
accounted_stats->total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
accounted_stats->actual_size += entry.meta.size;
}
break;
case BIIndexType::OLH:
{
// OLH entries decode as rgw_bucket_olh_entry and are never accounted
rgw_bucket_olh_entry entry;
decode(entry, iter);
*key = entry.key;
}
break;
default:
break;
// --- added body (post-change): OLH handled by early return; Plain and
// Instance share one decode/accumulate path. Plain entries now count
// only when they exist AND have an empty instance (versioned-key
// placeholders are accounted via their instance entry instead).
auto iter = data.cbegin();
if (type == BIIndexType::OLH) {
rgw_bucket_olh_entry entry;
decode(entry, iter);
*key = entry.key;
return false;
}
return account;
rgw_bucket_dir_entry entry;
decode(entry, iter);
*key = entry.key;
*category = entry.meta.category;
accounted_stats->num_entries++;
accounted_stats->total_size += entry.meta.accounted_size;
accounted_stats->total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size);
accounted_stats->actual_size += entry.meta.size;
if (type == BIIndexType::Plain) {
return entry.exists && entry.key.instance.empty();
} else if (type == BIIndexType::Instance) {
return entry.exists;
}
return false;
}
void rgw_cls_bi_entry::generate_test_instances(list<rgw_cls_bi_entry*>& o)
{
using ceph::encode;