mirror of
https://github.com/ceph/ceph
synced 2024-12-13 15:08:33 +00:00
mon: Improvements to slow heartbeat health messages
Include crush parentage for each osd Fixes: https://tracker.ceph.com/issues/43126 Signed-off-by: David Zafman <dzafman@redhat.com>
This commit is contained in:
parent
98f5ce79a6
commit
886475b5fe
@ -171,18 +171,18 @@ By default we will warn about ping times which exceed 1 second (1000 millisecond
|
||||
|
||||
::
|
||||
|
||||
HEALTH_WARN Long heartbeat ping times on back interface seen, longest is 1118.001 msec
|
||||
HEALTH_WARN Slow OSD heartbeats on back (longest 1118.001ms)
|
||||
|
||||
The health detail will add the combination of OSDs are seeing the delays and by how much. There is a limit of 10
|
||||
detail line items.
|
||||
|
||||
::
|
||||
|
||||
[WRN] OSD_SLOW_PING_TIME_BACK: Long heartbeat ping times on back interface seen, longest is 1118.001 msec
|
||||
Slow heartbeat ping on back interface from osd.0 to osd.1 1118.001 msec
|
||||
Slow heartbeat ping on back interface from osd.0 to osd.2 1030.123 msec
|
||||
Slow heartbeat ping on back interface from osd.2 to osd.1 1015.321 msec
|
||||
Slow heartbeat ping on back interface from osd.1 to osd.0 1010.456 msec
|
||||
[WRN] OSD_SLOW_PING_TIME_BACK: Slow OSD heartbeats on back (longest 1118.001ms)
|
||||
Slow OSD heartbeats on back from osd.0 [dc1,rack1] to osd.1 [dc1,rack1] 1118.001 msec possibly improving
|
||||
Slow OSD heartbeats on back from osd.0 [dc1,rack1] to osd.2 [dc1,rack2] 1030.123 msec
|
||||
Slow OSD heartbeats on back from osd.2 [dc1,rack2] to osd.1 [dc1,rack1] 1015.321 msec
|
||||
Slow OSD heartbeats on back from osd.1 [dc1,rack1] to osd.0 [dc1,rack1] 1010.456 msec
|
||||
|
||||
To see even more detail and a complete dump of network performance information the ``dump_osd_network`` command can be used. Typically, this would be
|
||||
sent to a mgr, but it can be limited to a particular OSD's interactions by issuing it to any OSD. The current threshold which defaults to 1 second
|
||||
|
@ -119,8 +119,30 @@ function TEST_network_ping_test2() {
|
||||
run_osd $dir 2 || return 1
|
||||
|
||||
sleep 5
|
||||
ceph osd crush add-bucket dc1 datacenter
|
||||
ceph osd crush add-bucket dc2 datacenter
|
||||
ceph osd crush add-bucket dc3 datacenter
|
||||
ceph osd crush add-bucket rack1 rack
|
||||
ceph osd crush add-bucket rack2 rack
|
||||
ceph osd crush add-bucket rack3 rack
|
||||
ceph osd crush add-bucket host1 host
|
||||
ceph osd crush add-bucket host2 host
|
||||
ceph osd crush add-bucket host3 host
|
||||
ceph osd crush move dc1 root=default
|
||||
ceph osd crush move dc2 root=default
|
||||
ceph osd crush move dc3 root=default
|
||||
ceph osd crush move rack1 datacenter=dc1
|
||||
ceph osd crush move rack2 datacenter=dc2
|
||||
ceph osd crush move rack3 datacenter=dc3
|
||||
ceph osd crush move host1 rack=rack1
|
||||
ceph osd crush move host2 rack=rack2
|
||||
ceph osd crush move host3 rack=rack3
|
||||
ceph osd crush set osd.0 1.0 host=host1
|
||||
ceph osd crush set osd.1 1.0 host=host2
|
||||
ceph osd crush set osd.2 1.0 host=host3
|
||||
ceph osd crush rule create-simple myrule default host firstn
|
||||
|
||||
create_pool foo 16
|
||||
create_pool foo 16 16 replicated myrule
|
||||
|
||||
# write some objects
|
||||
timeout 20 rados bench -p foo 10 write -b 4096 --no-cleanup || return 1
|
||||
@ -130,11 +152,13 @@ function TEST_network_ping_test2() {
|
||||
flush_pg_stats
|
||||
|
||||
ceph health | tee $dir/health
|
||||
grep -q "Long heartbeat" $dir/health || return 1
|
||||
grep -q "Slow OSD heartbeats" $dir/health || return 1
|
||||
|
||||
ceph health detail | tee $dir/health
|
||||
grep -q "OSD_SLOW_PING_TIME_BACK" $dir/health || return 1
|
||||
grep -q "OSD_SLOW_PING_TIME_FRONT" $dir/health || return 1
|
||||
grep -q "Slow OSD heartbeats on front from osd[.][0-2] [[]dc[1-3],rack[1-3][]] \
|
||||
to osd[.][0-2] [[]dc[1-3],rack[1-3][]]" $dir/health || return 1
|
||||
rm -f $dir/health
|
||||
}
|
||||
|
||||
|
@ -2363,6 +2363,23 @@ void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map,
|
||||
}
|
||||
}
|
||||
|
||||
// Get crush parentage for an osd (skip root)
|
||||
set<std::string> PGMap::osd_parentage(const OSDMap& osdmap, int id) const
|
||||
{
|
||||
set<std::string> reporters_by_subtree;
|
||||
auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
|
||||
|
||||
auto loc = osdmap.crush->get_full_location(id);
|
||||
for (auto& [parent_bucket_type, parent_id] : loc) {
|
||||
// Should we show the root? Might not be too informative like "default"
|
||||
if (parent_bucket_type != "root" &&
|
||||
parent_bucket_type != reporter_subtree_level) {
|
||||
reporters_by_subtree.insert(parent_id);
|
||||
}
|
||||
}
|
||||
return reporters_by_subtree;
|
||||
}
|
||||
|
||||
void PGMap::get_health_checks(
|
||||
CephContext *cct,
|
||||
const OSDMap& osdmap,
|
||||
@ -2833,9 +2850,11 @@ void PGMap::get_health_checks(
|
||||
break;
|
||||
}
|
||||
max_detail--;
|
||||
ss << "Slow heartbeat ping on back interface from osd." << sback.from
|
||||
ss << "Slow OSD heartbeats on back from osd." << sback.from
|
||||
<< " [" << osd_parentage(osdmap, sback.from) << "]"
|
||||
<< (osdmap.is_down(sback.from) ? " (down)" : "")
|
||||
<< " to osd." << sback.to
|
||||
<< " [" << osd_parentage(osdmap, sback.to) << "]"
|
||||
<< (osdmap.is_down(sback.to) ? " (down)" : "")
|
||||
<< " " << fixed_u_to_string(sback.pingtime, 3) << " msec"
|
||||
<< (sback.improving ? " possibly improving" : "");
|
||||
@ -2850,9 +2869,12 @@ void PGMap::get_health_checks(
|
||||
break;
|
||||
}
|
||||
max_detail--;
|
||||
ss << "Slow heartbeat ping on front interface from osd." << sfront.from
|
||||
// Get crush parentage for each osd
|
||||
ss << "Slow OSD heartbeats on front from osd." << sfront.from
|
||||
<< " [" << osd_parentage(osdmap, sfront.from) << "]"
|
||||
<< (osdmap.is_down(sfront.from) ? " (down)" : "")
|
||||
<< " to osd." << sfront.to
|
||||
<< " [" << osd_parentage(osdmap, sfront.to) << "]"
|
||||
<< (osdmap.is_down(sfront.to) ? " (down)" : "")
|
||||
<< " " << fixed_u_to_string(sfront.pingtime, 3) << " msec"
|
||||
<< (sfront.improving ? " possibly improving" : "");
|
||||
@ -2860,16 +2882,16 @@ void PGMap::get_health_checks(
|
||||
}
|
||||
if (detail_back.size() != 0) {
|
||||
ostringstream ss;
|
||||
ss << "Long heartbeat ping times on back interface seen, longest is "
|
||||
<< fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << " msec";
|
||||
ss << "Slow OSD heartbeats on back (longest "
|
||||
<< fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << "ms)";
|
||||
auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str(),
|
||||
back_sorted.size());
|
||||
d.detail.swap(detail_back);
|
||||
}
|
||||
if (detail_front.size() != 0) {
|
||||
ostringstream ss;
|
||||
ss << "Long heartbeat ping times on front interface seen, longest is "
|
||||
<< fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << " msec";
|
||||
ss << "Slow OSD heartbeats on front (longest "
|
||||
<< fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << "ms)";
|
||||
auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str(),
|
||||
front_sorted.size());
|
||||
d.detail.swap(detail_front);
|
||||
|
@ -491,6 +491,7 @@ public:
|
||||
void get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid,
|
||||
bool primary, std::set<pg_t>& pgs) const;
|
||||
|
||||
set<std::string> osd_parentage(const OSDMap& osdmap, int id) const;
|
||||
void get_health_checks(
|
||||
CephContext *cct,
|
||||
const OSDMap& osdmap,
|
||||
|
Loading…
Reference in New Issue
Block a user