Merge pull request #60067 from VallariAg/wip-nvmeof-healthcheck

mon: add nvmeof healthchecks
This commit is contained in:
Vallari Agrawal 2024-11-14 08:40:43 +05:30 committed by GitHub
commit 874ae379d7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 81 additions and 0 deletions

View File

@ -1640,6 +1640,25 @@ We encourage you to fix this by making the weights even on both dividing buckets
This can be done by making sure the combined weight of the OSDs on each dividing
bucket are the same.
NVMeoF Gateway
--------------
NVMEOF_SINGLE_GATEWAY
_____________________
One or more gateway groups have only one gateway. This is not ideal because
high availability (HA) is impossible with a single gateway in a group. This can lead to
problems with failover and failback operations for the NVMeoF gateway.
It's recommended to have multiple NVMeoF gateways in a group.
NVMEOF_GATEWAY_DOWN
___________________
Some of the gateways are in the GW_UNAVAILABLE state. If an NVMeoF daemon has crashed,
the daemon log file (found at ``/var/log/ceph/``) may contain troubleshooting information.
Miscellaneous
-------------

View File

@ -28,3 +28,5 @@ overrides:
mon:
# cephadm can take up to 5 minutes to bring up remaining mons
mon down mkfs grace: 300
log-ignorelist:
- NVMEOF_SINGLE_GATEWAY

View File

@ -8,6 +8,9 @@ overrides:
- out of quorum
# nvmeof daemon thrashing
- CEPHADM_FAILED_DAEMON
- NVMEOF_SINGLE_GATEWAY
- NVMEOF_GATEWAY_DOWN
- are in unavailable state
- is in error state
- failed cephadm daemon

View File

@ -3,6 +3,9 @@ overrides:
log-ignorelist:
# nvmeof daemon thrashing
- CEPHADM_FAILED_DAEMON
- NVMEOF_SINGLE_GATEWAY
- NVMEOF_GATEWAY_DOWN
- are in unavailable state
- is in error state
- failed cephadm daemon

View File

@ -16,7 +16,9 @@
#include "NVMeofGwMon.h"
#include "NVMeofGwMap.h"
#include "OSDMonitor.h"
#include "mon/health_check.h"
using std::list;
using std::map;
using std::make_pair;
using std::ostream;
@ -893,6 +895,47 @@ struct CMonRequestProposal : public Context {
}
};
// Fill `checks` with NVMeoF gateway warnings derived from created_gws.
//
//  NVMEOF_SINGLE_GATEWAY - one detail line per group whose gateway map holds
//                          exactly one entry.
//  NVMEOF_GATEWAY_DOWN   - one detail line per gateway whose availability is
//                          GW_UNAVAILABLE.
//
// A check is only added when it has at least one detail line; its count is
// the number of detail lines.
void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
{
  list<string> lone_gateway_groups;
  list<string> unavailable_gateways;

  for (const auto& [group_key, gateways] : created_gws) {
    if (gateways.size() == 1) {
      // The second element of the group key is what gets reported as the
      // group name.
      ostringstream line;
      line << "NVMeoF Gateway Group '" << group_key.second << "' has 1 gateway." ;
      lone_gateway_groups.push_back(line.str());
    }

    for (const auto& [gw_id, gw_state] : gateways) {
      if (gw_state.availability != gw_availability_t::GW_UNAVAILABLE) {
        continue;
      }
      ostringstream line;
      line << "NVMeoF Gateway '" << gw_id << "' is unavailable." ;
      unavailable_gateways.push_back(line.str());
    }
  }

  // Emits a HEALTH_WARN check named `code` whose summary is
  // "<number of lines><tail>", then moves the collected lines into the
  // check's detail. Does nothing when there are no lines.
  const auto raise_warning = [checks](const char *code, const char *tail,
                                      list<string>& lines) {
    if (lines.empty()) {
      return;
    }
    const auto count = lines.size();
    ostringstream summary;
    summary << count << tail;
    auto& check = checks->add(code, HEALTH_WARN, summary.str(), count);
    check.detail.swap(lines);
  };

  raise_warning("NVMEOF_SINGLE_GATEWAY",
                " group(s) have only 1 nvmeof gateway"
                "; HA is not possible with single gateway.",
                lone_gateway_groups);
  raise_warning("NVMEOF_GATEWAY_DOWN",
                " gateway(s) are in unavailable state"
                "; gateway might be down, try to redeploy.",
                unavailable_gateways);
}
int NVMeofGwMap::blocklist_gw(
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
NvmeAnaGrpId grpid, epoch_t &epoch, bool failover)

View File

@ -27,6 +27,9 @@
#include "NVMeofGwTypes.h"
using ceph::coarse_mono_clock;
class health_check_map_t;
class Monitor;
/*-------------------*/
class NVMeofGwMap
@ -140,6 +143,8 @@ public:
decode(fsm_timers, bl);
DECODE_FINISH(bl);
}
void get_health_checks(health_check_map_t *checks) const;
};
#include "NVMeofGwSerialize.h"

View File

@ -181,6 +181,11 @@ void NVMeofGwMon::encode_pending(MonitorDBStore::TransactionRef t)
<< HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHA) << dendl;
put_version(t, pending_map.epoch, bl);
put_last_committed(t, pending_map.epoch);
//health
health_check_map_t checks;
pending_map.get_health_checks(&checks);
encode_health(checks, t);
}
void NVMeofGwMon::update_from_paxos(bool *need_bootstrap)
@ -193,6 +198,7 @@ void NVMeofGwMon::update_from_paxos(bool *need_bootstrap)
bufferlist bl;
int err = get_version(version, bl);
ceph_assert(err == 0);
load_health();
auto p = bl.cbegin();
map.decode(p);