Merge PR #45669 into main

* refs/pull/45669/head:
	client: switch to use 32 bits ext_num_fwd
	client: switch to use 32 bits ext_num_retry
	ceph_fs.h: add 32 bits extended num_retry and num_fwd support
	ceph_fs.h: switch to use its own encode/decode helpers

Reviewed-by: Venky Shankar <vshankar@redhat.com>
This commit is contained in:
Venky Shankar 2023-02-22 14:41:09 +05:30
commit 8ce318517d
9 changed files with 121 additions and 62 deletions

View File

@ -2477,7 +2477,7 @@ void Client::send_request(MetaRequest *request, MetaSession *session,
mds_rank_t mds = session->mds_num;
ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
<< " for mds." << mds << dendl;
auto r = build_client_request(request);
auto r = build_client_request(request, mds);
if (!r)
return;
@ -2522,31 +2522,31 @@ void Client::send_request(MetaRequest *request, MetaSession *session,
session->con->send_message2(std::move(r));
}
ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
ref_t<MClientRequest> Client::build_client_request(MetaRequest *request, mds_rank_t mds)
{
auto session = mds_sessions.at(mds);
bool old_version = !session->mds_features.test(CEPHFS_FEATURE_32BITS_RETRY_FWD);
/*
* The type of 'retry_attempt' in 'MetaRequest' is 'int',
* while in 'ceph_mds_request_head' the type of 'num_retry'
* is '__u8'. So in case the request retries exceeding 256
* times, the MDS will receive a incorrect retry seq.
* Avoid infinite retrying after overflow.
*
* In this case it's ususally a bug in MDS and continue
* retrying the request makes no sense.
*
* In future this could be fixed in ceph code, so avoid
* using the hardcode here.
* The client increments the retry count on each resend; if the MDS
* is an old version, limit retries to at most 256 times.
*/
int max_retry = sizeof(((struct ceph_mds_request_head*)0)->num_retry);
max_retry = 1 << (max_retry * CHAR_BIT);
if (request->retry_attempt >= max_retry) {
request->abort(-CEPHFS_EMULTIHOP);
request->caller_cond->notify_all();
ldout(cct, 1) << __func__ << " request tid " << request->tid
<< " seq overflow" << ", abort it" << dendl;
return nullptr;
if (request->retry_attempt) {
int old_max_retry = sizeof(((struct ceph_mds_request_head*)0)->num_retry);
old_max_retry = 1 << (old_max_retry * CHAR_BIT);
if ((old_version && request->retry_attempt >= old_max_retry) ||
(uint32_t)request->retry_attempt >= UINT32_MAX) {
request->abort(-CEPHFS_EMULTIHOP);
request->caller_cond->notify_all();
ldout(cct, 1) << __func__ << " request tid " << request->tid
<< " retry seq overflow" << ", abort it" << dendl;
return nullptr;
}
}
auto req = make_message<MClientRequest>(request->get_op());
auto req = make_message<MClientRequest>(request->get_op(), old_version);
req->set_tid(request->tid);
req->set_stamp(request->op_stamp);
memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
@ -2578,7 +2578,7 @@ ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
req->fscrypt_auth = request->fscrypt_auth;
req->fscrypt_file = request->fscrypt_file;
req->set_retry_attempt(request->retry_attempt++);
req->head.num_fwd = request->num_fwd;
req->head.ext_num_fwd = request->num_fwd;
const gid_t *_gids;
int gid_count = request->perms.get_gids(&_gids);
req->set_gid_list(gid_count, _gids);
@ -2607,32 +2607,20 @@ void Client::handle_client_request_forward(const MConstRef<MClientRequestForward
ceph_assert(request);
/*
* The type of 'num_fwd' in ceph 'MClientRequestForward'
* is 'int32_t', while in 'ceph_mds_request_head' the
* type is '__u8'. So in case the request bounces between
* MDSes exceeding 256 times, the client will get stuck.
* Avoid infinite retrying after overflow.
*
* In this case it's ususally a bug in MDS and continue
* bouncing the request makes no sense.
*
* In future this could be fixed in ceph code, so avoid
* using the hardcode here.
* The MDS increments the fwd count; on the client side, if the
* num_fwd is less than the one saved in the request, that means
* the MDS is an old version and its 8-bit counter has
* overflowed.
*/
int max_fwd = sizeof(((struct ceph_mds_request_head*)0)->num_fwd);
max_fwd = (1 << (max_fwd * CHAR_BIT)) - 1;
auto num_fwd = fwd->get_num_fwd();
if (num_fwd <= request->num_fwd || num_fwd >= max_fwd) {
if (request->num_fwd >= max_fwd || num_fwd >= max_fwd) {
request->abort(-CEPHFS_EMULTIHOP);
request->caller_cond->notify_all();
ldout(cct, 1) << __func__ << " tid " << tid << " seq overflow"
<< ", abort it" << dendl;
} else {
ldout(cct, 10) << __func__ << " tid " << tid
<< " old fwd seq " << fwd->get_num_fwd()
<< " <= req fwd " << request->num_fwd
<< ", ignore it" << dendl;
}
if (num_fwd <= request->num_fwd || (uint32_t)num_fwd >= UINT32_MAX) {
request->abort(-CEPHFS_EMULTIHOP);
request->caller_cond->notify_all();
ldout(cct, 0) << __func__ << " request tid " << tid << " new num_fwd "
<< num_fwd << " old num_fwd " << request->num_fwd << ", fwd seq overflow"
<< ", abort it" << dendl;
return;
}

View File

@ -956,7 +956,7 @@ protected:
void connect_mds_targets(mds_rank_t mds);
void send_request(MetaRequest *request, MetaSession *session,
bool drop_cap_releases=false);
MRef<MClientRequest> build_client_request(MetaRequest *request);
MRef<MClientRequest> build_client_request(MetaRequest *request, mds_rank_t mds);
void kick_requests(MetaSession *session);
void kick_requests_closed(MetaSession *session);
void handle_client_request_forward(const MConstRef<MClientRequestForward>& reply);

View File

@ -46,8 +46,8 @@ void MetaRequest::dump(Formatter *f) const
f->dump_unsigned("oldest_client_tid", head.oldest_client_tid);
f->dump_unsigned("mdsmap_epoch", head.mdsmap_epoch);
f->dump_unsigned("flags", head.flags);
f->dump_unsigned("num_retry", head.num_retry);
f->dump_unsigned("num_fwd", head.num_fwd);
f->dump_unsigned("num_retry", head.ext_num_retry);
f->dump_unsigned("num_fwd", head.ext_num_fwd);
f->dump_unsigned("num_releases", head.num_releases);
f->dump_int("abort_rc", abort_rc);

View File

@ -156,8 +156,8 @@ public:
// normal fields
void set_tid(ceph_tid_t t) { tid = t; }
void set_oldest_client_tid(ceph_tid_t t) { head.oldest_client_tid = t; }
void inc_num_fwd() { head.num_fwd = head.num_fwd + 1; }
void set_retry_attempt(int a) { head.num_retry = a; }
void inc_num_fwd() { head.ext_num_fwd = head.ext_num_fwd + 1; }
void set_retry_attempt(int a) { head.ext_num_retry = a; }
void set_filepath(const filepath& fp) { path = fp; }
void set_filepath2(const filepath& fp) { path2 = fp; }
void set_alternate_name(std::string an) { alternate_name = an; }

View File

@ -14,6 +14,8 @@
#include "msgr.h"
#include "rados.h"
#include "include/encoding.h"
#include "include/denc.h"
/*
* The data structures defined here are shared between Linux kernel and
@ -619,7 +621,7 @@ union ceph_mds_request_args {
} __attribute__ ((packed)) lookupino;
} __attribute__ ((packed));
#define CEPH_MDS_REQUEST_HEAD_VERSION 1
#define CEPH_MDS_REQUEST_HEAD_VERSION 2
/*
* Note that any change to this structure must ensure that it is compatible
@ -630,15 +632,68 @@ struct ceph_mds_request_head {
__le64 oldest_client_tid;
__le32 mdsmap_epoch; /* on client */
__le32 flags; /* CEPH_MDS_FLAG_* */
__u8 num_retry, num_fwd; /* count retry, fwd attempts */
__u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */
__le16 num_releases; /* # include cap/lease release records */
__le32 op; /* mds op code */
__le32 caller_uid, caller_gid;
__le64 ino; /* use this ino for openc, mkdir, mknod,
etc. (if replaying) */
union ceph_mds_request_args args;
__le32 ext_num_retry; /* new count retry attempts */
__le32 ext_num_fwd; /* new count fwd attempts */
} __attribute__ ((packed));
void inline encode(const struct ceph_mds_request_head& h, ceph::buffer::list& bl, bool old_version) {
using ceph::encode;
encode(h.version, bl);
encode(h.oldest_client_tid, bl);
encode(h.mdsmap_epoch, bl);
encode(h.flags, bl);
// For old MDS daemons
__u8 num_retry = __u32(h.ext_num_retry);
__u8 num_fwd = __u32(h.ext_num_fwd);
encode(num_retry, bl);
encode(num_fwd, bl);
encode(h.num_releases, bl);
encode(h.op, bl);
encode(h.caller_uid, bl);
encode(h.caller_gid, bl);
encode(h.ino, bl);
bl.append((char*)&h.args, sizeof(h.args));
if (!old_version) {
encode(h.ext_num_retry, bl);
encode(h.ext_num_fwd, bl);
}
}
/*
 * Deserialize a ceph_mds_request_head from 'bl' one field at a time, in
 * the same order encode() writes them.
 *
 * When the decoded head version is 2 or newer, the 32-bit ext_num_retry
 * and ext_num_fwd counters are read from the tail. Otherwise they are
 * filled in from the legacy 8-bit num_retry/num_fwd values, so readers
 * can always use the ext_* fields.
 */
void inline decode(struct ceph_mds_request_head& h, ceph::buffer::list::const_iterator& bl) {
  using ceph::decode;

  decode(h.version, bl);
  decode(h.oldest_client_tid, bl);
  decode(h.mdsmap_epoch, bl);
  decode(h.flags, bl);
  decode(h.num_retry, bl);
  decode(h.num_fwd, bl);
  decode(h.num_releases, bl);
  decode(h.op, bl);
  decode(h.caller_uid, bl);
  decode(h.caller_gid, bl);
  decode(h.ino, bl);

  // The args union is copied out as raw bytes.
  bl.copy(sizeof(h.args), reinterpret_cast<char*>(&h.args));

  if (h.version < 2) {
    // Legacy head: widen the 8-bit counters into the extended fields.
    h.ext_num_retry = h.num_retry;
    h.ext_num_fwd = h.num_fwd;
    return;
  }

  decode(h.ext_num_retry, bl);
  decode(h.ext_num_fwd, bl);
}
/* cap/lease release record */
struct ceph_mds_request_release {
__le64 ino, cap_id; /* ino and unique cap id */

View File

@ -320,7 +320,6 @@ WRITE_RAW_ENCODER(ceph_file_layout)
WRITE_RAW_ENCODER(ceph_dir_layout)
WRITE_RAW_ENCODER(ceph_mds_session_head)
WRITE_RAW_ENCODER(ceph_mds_request_head_legacy)
WRITE_RAW_ENCODER(ceph_mds_request_head)
WRITE_RAW_ENCODER(ceph_mds_request_release)
WRITE_RAW_ENCODER(ceph_filelock)
WRITE_RAW_ENCODER(ceph_mds_caps_head)

View File

@ -29,6 +29,7 @@ static const std::array feature_names
"alternate_name",
"notify_session_state",
"op_getvxattr",
"32bits_retry_fwd",
};
static_assert(feature_names.size() == CEPHFS_FEATURE_MAX + 1);

View File

@ -45,7 +45,8 @@ namespace ceph {
#define CEPHFS_FEATURE_ALTERNATE_NAME 15
#define CEPHFS_FEATURE_NOTIFY_SESSION_STATE 16
#define CEPHFS_FEATURE_OP_GETVXATTR 17
#define CEPHFS_FEATURE_MAX 17
#define CEPHFS_FEATURE_32BITS_RETRY_FWD 18
#define CEPHFS_FEATURE_MAX 18
#define CEPHFS_FEATURES_ALL { \
0, 1, 2, 3, 4, \
@ -64,6 +65,7 @@ namespace ceph {
CEPHFS_FEATURE_ALTERNATE_NAME, \
CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
CEPHFS_FEATURE_OP_GETVXATTR, \
CEPHFS_FEATURE_32BITS_RETRY_FWD, \
}
#define CEPHFS_METRIC_FEATURES_ALL { \

View File

@ -73,6 +73,7 @@ private:
public:
mutable struct ceph_mds_request_head head; /* XXX HACK! */
utime_t stamp;
bool peer_old_version = false;
struct Release {
mutable ceph_mds_request_release item;
@ -111,10 +112,11 @@ protected:
// cons
MClientRequest()
: MMDSOp(CEPH_MSG_CLIENT_REQUEST, HEAD_VERSION, COMPAT_VERSION) {}
MClientRequest(int op)
MClientRequest(int op, bool over=true)
: MMDSOp(CEPH_MSG_CLIENT_REQUEST, HEAD_VERSION, COMPAT_VERSION) {
memset(&head, 0, sizeof(head));
head.op = op;
peer_old_version = over;
}
~MClientRequest() final {}
@ -160,8 +162,8 @@ public:
// normal fields
void set_stamp(utime_t t) { stamp = t; }
void set_oldest_client_tid(ceph_tid_t t) { head.oldest_client_tid = t; }
void inc_num_fwd() { head.num_fwd = head.num_fwd + 1; }
void set_retry_attempt(int a) { head.num_retry = a; }
void inc_num_fwd() { head.ext_num_fwd = head.ext_num_fwd + 1; }
void set_retry_attempt(int a) { head.ext_num_retry = a; }
void set_filepath(const filepath& fp) { path = fp; }
void set_filepath2(const filepath& fp) { path2 = fp; }
void set_string2(const char *s) { path2.set_path(std::string_view(s), 0); }
@ -192,8 +194,8 @@ public:
utime_t get_stamp() const { return stamp; }
ceph_tid_t get_oldest_client_tid() const { return head.oldest_client_tid; }
int get_num_fwd() const { return head.num_fwd; }
int get_retry_attempt() const { return head.num_retry; }
int get_num_fwd() const { return head.ext_num_fwd; }
int get_retry_attempt() const { return head.ext_num_retry; }
int get_op() const { return head.op; }
unsigned get_caller_uid() const { return head.caller_uid; }
unsigned get_caller_gid() const { return head.caller_gid; }
@ -252,10 +254,20 @@ public:
void encode_payload(uint64_t features) override {
using ceph::encode;
head.num_releases = releases.size();
head.version = CEPH_MDS_REQUEST_HEAD_VERSION;
/*
* If the peer is an old version, we must skip all the
* new members, because an old version of the MDS or
* client will just copy the 'head' memory and is not
* smart enough to skip them.
*/
if (peer_old_version) {
head.version = 1;
} else {
head.version = CEPH_MDS_REQUEST_HEAD_VERSION;
}
if (features & CEPH_FEATURE_FS_BTIME) {
encode(head, payload);
encode(head, payload, peer_old_version);
} else {
struct ceph_mds_request_head_legacy old_mds_head;
@ -312,8 +324,10 @@ public:
out << " " << get_filepath2();
if (stamp != utime_t())
out << " " << stamp;
if (head.num_retry)
out << " RETRY=" << (int)head.num_retry;
if (head.ext_num_fwd)
out << " FWD=" << (int)head.ext_num_fwd;
if (head.ext_num_retry)
out << " RETRY=" << (int)head.ext_num_retry;
if (is_async())
out << " ASYNC";
if (is_replay())