From 886475b5fecc3ed49ee72733963fe0003d84cc12 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 18 Dec 2019 21:20:25 -0800 Subject: [PATCH] mon: Improvements to slow heartbeat health messages Include crush parentage for each osd Fixes: https://tracker.ceph.com/issues/43126 Signed-off-by: David Zafman --- doc/rados/operations/monitoring.rst | 12 +++++----- qa/standalone/misc/network-ping.sh | 28 ++++++++++++++++++++++-- src/mon/PGMap.cc | 34 ++++++++++++++++++++++++----- src/mon/PGMap.h | 1 + 4 files changed, 61 insertions(+), 14 deletions(-) diff --git a/doc/rados/operations/monitoring.rst b/doc/rados/operations/monitoring.rst index 0dd248b4290..bc8a35cbb88 100644 --- a/doc/rados/operations/monitoring.rst +++ b/doc/rados/operations/monitoring.rst @@ -171,18 +171,18 @@ By default we will warn about ping times which exceed 1 second (1000 millisecond :: - HEALTH_WARN Long heartbeat ping times on back interface seen, longest is 1118.001 msec + HEALTH_WARN Slow OSD heartbeats on back (longest 1118.001ms) The health detail will add the combination of OSDs are seeing the delays and by how much. There is a limit of 10 detail line items. :: - [WRN] OSD_SLOW_PING_TIME_BACK: Long heartbeat ping times on back interface seen, longest is 1118.001 msec - Slow heartbeat ping on back interface from osd.0 to osd.1 1118.001 msec - Slow heartbeat ping on back interface from osd.0 to osd.2 1030.123 msec - Slow heartbeat ping on back interface from osd.2 to osd.1 1015.321 msec - Slow heartbeat ping on back interface from osd.1 to osd.0 1010.456 msec + [WRN] OSD_SLOW_PING_TIME_BACK: Slow OSD heartbeats on back (longest 1118.001ms) + Slow OSD heartbeats on back from osd.0 [dc1,rack1] to osd.1 [dc1,rack1] 1118.001 msec possibly improving + Slow OSD heartbeats on back from osd.0 [dc1,rack1] to osd.2 [dc1,rack2] 1030.123 msec + Slow OSD heartbeats on back from osd.2 [dc1,rack2] to osd.1 [dc1,rack1] 1015.321 msec + Slow OSD heartbeats on back from osd.1 [dc1,rack1] to osd.0 [dc1,rack1] 1010.456 msec To see even more detail and a complete dump of network performance information the ``dump_osd_network`` command can be used. Typically, this would be sent to a mgr, but it can be limited to a particular OSD's interactions by issuing it to any OSD. The current threshold which defaults to 1 second diff --git a/qa/standalone/misc/network-ping.sh b/qa/standalone/misc/network-ping.sh index b2b299d63ee..4745108c5ba 100755 --- a/qa/standalone/misc/network-ping.sh +++ b/qa/standalone/misc/network-ping.sh @@ -119,8 +119,30 @@ function TEST_network_ping_test2() { run_osd $dir 2 || return 1 sleep 5 + ceph osd crush add-bucket dc1 datacenter + ceph osd crush add-bucket dc2 datacenter + ceph osd crush add-bucket dc3 datacenter + ceph osd crush add-bucket rack1 rack + ceph osd crush add-bucket rack2 rack + ceph osd crush add-bucket rack3 rack + ceph osd crush add-bucket host1 host + ceph osd crush add-bucket host2 host + ceph osd crush add-bucket host3 host + ceph osd crush move dc1 root=default + ceph osd crush move dc2 root=default + ceph osd crush move dc3 root=default + ceph osd crush move rack1 datacenter=dc1 + ceph osd crush move rack2 datacenter=dc2 + ceph osd crush move rack3 datacenter=dc3 + ceph osd crush move host1 rack=rack1 + ceph osd crush move host2 rack=rack2 + ceph osd crush move host3 rack=rack3 + ceph osd crush set osd.0 1.0 host=host1 + ceph osd crush set osd.1 1.0 host=host2 + ceph osd crush set osd.2 1.0 host=host3 + ceph osd crush rule create-simple myrule default host firstn - create_pool foo 16 + create_pool foo 16 16 replicated myrule # write some objects timeout 20 rados bench -p foo 10 write -b 4096 --no-cleanup || return 1 @@ -130,11 +152,13 @@ function TEST_network_ping_test2() { flush_pg_stats ceph health | tee $dir/health - grep -q "Long heartbeat" $dir/health || return 1 + grep -q "Slow OSD heartbeats" $dir/health || return 1 ceph health detail | tee $dir/health grep -q "OSD_SLOW_PING_TIME_BACK" $dir/health || return 1 grep -q "OSD_SLOW_PING_TIME_FRONT" $dir/health || return 1 + grep -q "Slow OSD heartbeats on front from osd[.][0-2] [[]dc[1-3],rack[1-3][]] \ +to osd[.][0-2] [[]dc[1-3],rack[1-3][]]" $dir/health || return 1 rm -f $dir/health } diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 722e61f5d74..7d74817bad0 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -2363,6 +2363,23 @@ void PGMap::dump_pool_stats_and_io_rate(int64_t poolid, const OSDMap &osd_map, } } +// Get crush parentage for an osd (skip root) +set PGMap::osd_parentage(const OSDMap& osdmap, int id) const +{ + set reporters_by_subtree; + auto reporter_subtree_level = g_conf().get_val("mon_osd_reporter_subtree_level"); + + auto loc = osdmap.crush->get_full_location(id); + for (auto& [parent_bucket_type, parent_id] : loc) { + // Should we show the root? Might not be too informative like "default" + if (parent_bucket_type != "root" && + parent_bucket_type != reporter_subtree_level) { + reporters_by_subtree.insert(parent_id); + } + } + return reporters_by_subtree; +} + void PGMap::get_health_checks( CephContext *cct, const OSDMap& osdmap, @@ -2833,9 +2850,11 @@ void PGMap::get_health_checks( break; } max_detail--; - ss << "Slow heartbeat ping on back interface from osd." << sback.from + ss << "Slow OSD heartbeats on back from osd." << sback.from + << " [" << osd_parentage(osdmap, sback.from) << "]" << (osdmap.is_down(sback.from) ? " (down)" : "") << " to osd." << sback.to + << " [" << osd_parentage(osdmap, sback.to) << "]" << (osdmap.is_down(sback.to) ? " (down)" : "") << " " << fixed_u_to_string(sback.pingtime, 3) << " msec" << (sback.improving ? " possibly improving" : ""); @@ -2850,9 +2869,12 @@ void PGMap::get_health_checks( break; } max_detail--; - ss << "Slow heartbeat ping on front interface from osd." << sfront.from + // Get crush parentage for each osd + ss << "Slow OSD heartbeats on front from osd." << sfront.from + << " [" << osd_parentage(osdmap, sfront.from) << "]" << (osdmap.is_down(sfront.from) ? " (down)" : "") << " to osd." << sfront.to + << " [" << osd_parentage(osdmap, sfront.to) << "]" << (osdmap.is_down(sfront.to) ? " (down)" : "") << " " << fixed_u_to_string(sfront.pingtime, 3) << " msec" << (sfront.improving ? " possibly improving" : ""); @@ -2860,16 +2882,16 @@ void PGMap::get_health_checks( } if (detail_back.size() != 0) { ostringstream ss; - ss << "Long heartbeat ping times on back interface seen, longest is " - << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << " msec"; + ss << "Slow OSD heartbeats on back (longest " + << fixed_u_to_string(back_sorted.rbegin()->pingtime, 3) << "ms)"; auto& d = checks->add("OSD_SLOW_PING_TIME_BACK", HEALTH_WARN, ss.str(), back_sorted.size()); d.detail.swap(detail_back); } if (detail_front.size() != 0) { ostringstream ss; - ss << "Long heartbeat ping times on front interface seen, longest is " - << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << " msec"; + ss << "Slow OSD heartbeats on front (longest " + << fixed_u_to_string(front_sorted.rbegin()->pingtime, 3) << "ms)"; auto& d = checks->add("OSD_SLOW_PING_TIME_FRONT", HEALTH_WARN, ss.str(), front_sorted.size()); d.detail.swap(detail_front); diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 20c1cac2777..c05268c3ad8 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -491,6 +491,7 @@ public: void get_filtered_pg_stats(uint64_t state, int64_t poolid, int64_t osdid, bool primary, std::set& pgs) const; + set osd_parentage(const OSDMap& osdmap, int id) const; void get_health_checks( CephContext *cct, const OSDMap& osdmap,