mirror of https://github.com/ceph/ceph
Merge pull request #60067 from VallariAg/wip-nvmeof-healthcheck
mon: add nvmeof healthchecks
This commit is contained in:
commit
874ae379d7
|
@ -1640,6 +1640,25 @@ We encourage you to fix this by making the weights even on both dividing buckets
|
|||
This can be done by making sure the combined weight of the OSDs on each dividing
|
||||
bucket are the same.
|
||||
|
||||
NVMeoF Gateway
|
||||
--------------
|
||||
|
||||
NVMEOF_SINGLE_GATEWAY
|
||||
_____________________
|
||||
|
||||
One of the gateway groups has only one gateway. This is not ideal because it makes
|
||||
high availability (HA) impossible with a single gateway in a group. This can lead to
|
||||
problems with failover and failback operations for the NVMeoF gateway.
|
||||
|
||||
It's recommended to have multiple NVMeoF gateways in a group.
|
||||
|
||||
NVMEOF_GATEWAY_DOWN
|
||||
___________________
|
||||
|
||||
Some of the gateways are in the GW_UNAVAILABLE state. If an NVMeoF daemon has crashed,
|
||||
the daemon log file (found at ``/var/log/ceph/``) may contain troubleshooting information.
|
||||
|
||||
|
||||
Miscellaneous
|
||||
-------------
|
||||
|
||||
|
|
|
@ -28,3 +28,5 @@ overrides:
|
|||
mon:
|
||||
# cephadm can take up to 5 minutes to bring up remaining mons
|
||||
mon down mkfs grace: 300
|
||||
log-ignorelist:
|
||||
- NVMEOF_SINGLE_GATEWAY
|
||||
|
|
|
@ -8,6 +8,9 @@ overrides:
|
|||
- out of quorum
|
||||
# nvmeof daemon thrashing
|
||||
- CEPHADM_FAILED_DAEMON
|
||||
- NVMEOF_SINGLE_GATEWAY
|
||||
- NVMEOF_GATEWAY_DOWN
|
||||
- are in unavailable state
|
||||
- is in error state
|
||||
- failed cephadm daemon
|
||||
|
||||
|
|
|
@ -3,6 +3,9 @@ overrides:
|
|||
log-ignorelist:
|
||||
# nvmeof daemon thrashing
|
||||
- CEPHADM_FAILED_DAEMON
|
||||
- NVMEOF_SINGLE_GATEWAY
|
||||
- NVMEOF_GATEWAY_DOWN
|
||||
- are in unavailable state
|
||||
- is in error state
|
||||
- failed cephadm daemon
|
||||
|
||||
|
|
|
@ -16,7 +16,9 @@
|
|||
#include "NVMeofGwMon.h"
|
||||
#include "NVMeofGwMap.h"
|
||||
#include "OSDMonitor.h"
|
||||
#include "mon/health_check.h"
|
||||
|
||||
using std::list;
|
||||
using std::map;
|
||||
using std::make_pair;
|
||||
using std::ostream;
|
||||
|
@ -893,6 +895,47 @@ struct CMonRequestProposal : public Context {
|
|||
}
|
||||
};
|
||||
|
||||
// Fill `checks` with the NVMeoF gateway health warnings derived from
// created_gws:
//   - NVMEOF_SINGLE_GATEWAY: one detail line for every group that holds
//     exactly one gateway.
//   - NVMEOF_GATEWAY_DOWN: one detail line for every gateway whose
//     availability is GW_UNAVAILABLE.
// A check is only added when at least one detail line was collected for it.
void NVMeofGwMap::get_health_checks(health_check_map_t *checks) const
{
  list<string> lone_gw_groups;
  list<string> unavailable_gws;

  for (const auto& [key, gateways] : created_gws) {
    // The second element of the group key is what gets printed as the
    // group's name in the detail message.
    const auto& group_name = key.second;
    if (gateways.size() == 1) {
      ostringstream line;
      line << "NVMeoF Gateway Group '" << group_name << "' has 1 gateway." ;
      lone_gw_groups.push_back(line.str());
    }

    for (const auto& [gw_name, gw_state] : gateways) {
      if (gw_state.availability != gw_availability_t::GW_UNAVAILABLE) {
        continue;
      }
      ostringstream line;
      line << "NVMeoF Gateway '" << gw_name << "' is unavailable." ;
      unavailable_gws.push_back(line.str());
    }
  }

  if (!lone_gw_groups.empty()) {
    // Capture the count before the detail list is swapped into the check.
    const auto affected = lone_gw_groups.size();
    ostringstream summary;
    summary << affected << " group(s) have only 1 nvmeof gateway"
            << "; HA is not possible with single gateway.";
    auto& check = checks->add("NVMEOF_SINGLE_GATEWAY", HEALTH_WARN,
                              summary.str(), affected);
    check.detail.swap(lone_gw_groups);
  }

  if (!unavailable_gws.empty()) {
    // Capture the count before the detail list is swapped into the check.
    const auto affected = unavailable_gws.size();
    ostringstream summary;
    summary << affected << " gateway(s) are in unavailable state"
            << "; gateway might be down, try to redeploy.";
    auto& check = checks->add("NVMEOF_GATEWAY_DOWN", HEALTH_WARN,
                              summary.str(), affected);
    check.detail.swap(unavailable_gws);
  }
}
|
||||
|
||||
int NVMeofGwMap::blocklist_gw(
|
||||
const NvmeGwId &gw_id, const NvmeGroupKey& group_key,
|
||||
NvmeAnaGrpId grpid, epoch_t &epoch, bool failover)
|
||||
|
|
|
@ -27,6 +27,9 @@
|
|||
#include "NVMeofGwTypes.h"
|
||||
|
||||
using ceph::coarse_mono_clock;
|
||||
|
||||
class health_check_map_t;
|
||||
|
||||
class Monitor;
|
||||
/*-------------------*/
|
||||
class NVMeofGwMap
|
||||
|
@ -140,6 +143,8 @@ public:
|
|||
decode(fsm_timers, bl);
|
||||
DECODE_FINISH(bl);
|
||||
}
|
||||
|
||||
// Adds the NVMeoF gateway health warnings for this map to `checks`:
// NVMEOF_SINGLE_GATEWAY for groups that contain exactly one gateway and
// NVMEOF_GATEWAY_DOWN for gateways in the GW_UNAVAILABLE state.
void get_health_checks(health_check_map_t *checks) const;
|
||||
};
|
||||
|
||||
#include "NVMeofGwSerialize.h"
|
||||
|
|
|
@ -181,6 +181,11 @@ void NVMeofGwMon::encode_pending(MonitorDBStore::TransactionRef t)
|
|||
<< HAVE_FEATURE(mon.get_quorum_con_features(), NVMEOFHA) << dendl;
|
||||
put_version(t, pending_map.epoch, bl);
|
||||
put_last_committed(t, pending_map.epoch);
|
||||
|
||||
//health
|
||||
health_check_map_t checks;
|
||||
pending_map.get_health_checks(&checks);
|
||||
encode_health(checks, t);
|
||||
}
|
||||
|
||||
void NVMeofGwMon::update_from_paxos(bool *need_bootstrap)
|
||||
|
@ -193,6 +198,7 @@ void NVMeofGwMon::update_from_paxos(bool *need_bootstrap)
|
|||
bufferlist bl;
|
||||
int err = get_version(version, bl);
|
||||
ceph_assert(err == 0);
|
||||
load_health();
|
||||
|
||||
auto p = bl.cbegin();
|
||||
map.decode(p);
|
||||
|
|
Loading…
Reference in New Issue