mirror of https://github.com/ceph/ceph
qa: Added mon connection score tests
When we deploy 3 MONs, check that the connection scores are clean and that they stay clean for a 60-second grace period.

Fixes: https://tracker.ceph.com/issues/65695
Signed-off-by: Kamoltat <ksirivad@redhat.com>
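For context: each mon tracks a connection score for every peer, and the scores can be inspected over the admin socket with `ceph daemon mon.<id> connection scores dump`. Below is a sketch of the dump's shape as an annotated Python literal, limited to the fields the new test reads; anything else a real dump prints is omitted, and the values are illustrative:

    # Illustrative shape only: field names come from the checks in the
    # test added below; the values are made up.
    {
        "rank": 0,                  # this mon's own rank
        "reports": [                # one entry per mon, including itself
            {
                "peer_scores": [
                    {"peer_rank": 0, "peer_alive": True},
                    {"peer_rank": 1, "peer_alive": True},
                    {"peer_rank": 2, "peer_alive": True},
                ],
            },
            # ...two more reports, for ranks 1 and 2...
        ],
    }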
parent c05d4e2716
commit ed7f4e8829
qa/suites/rados/singleton/all/mon-connection-score.yaml (new file, 40 lines)
@@ -0,0 +1,40 @@
roles:
- - mon.a
  - mon.b
  - mon.c
  - osd.0
  - osd.1
  - osd.2
  - mgr.x
  - client.0

openstack:
  - volumes: # attached to each instance
      count: 3
      size: 10 # GB
tasks:
- install:
- ceph:
    pre-mgr-commands:
      - sudo ceph config set mgr mgr_pool false --force
    log-ignorelist:
      - overall HEALTH_
      - \(OSDMAP_FLAGS\)
      - \(OSD_
      - \(PG_
      - \(POOL_
      - \(CACHE_POOL_
      - \(OBJECT_
      - \(SLOW_OPS\)
      - \(REQUEST_SLOW\)
      - \(TOO_FEW_PGS\)
      - slow request
      - \(POOL_APP_NOT_ENABLED\)
      - overall HEALTH_
      - \(MGR_DOWN\)
      - \(MON_DOWN\)
      - \(PG_AVAILABILITY\)
      - \(SLOW_OPS\)
- cephfs_test_runner:
    modules:
      - tasks.mon_connection_score
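Note the final task: cephfs_test_runner imports each listed module from qa/tasks and runs the unittest-style cases it defines against the cluster deployed above, so the new tasks.mon_connection_score module further down is what actually performs the check.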
qa/tasks/ceph_test_case.py (modified; the path follows from the test's import of tasks.ceph_test_case)
@@ -353,13 +353,10 @@ class CephTestCase(unittest.TestCase, RunCephCmd):
         while True:
             if condition():
                 success_time_elapsed = 0
-                while success_time_elapsed < success_hold_time:
-                    if condition():
-                        success_time_elapsed += 1
-                        time.sleep(1)
-                        elapsed += 1
-                    else:
-                        break
+                while success_time_elapsed < success_hold_time and condition():
+                    success_time_elapsed += 1
+                    time.sleep(1)
+                    elapsed += 1
                 if success_time_elapsed == success_hold_time:
                     log.debug("wait_until_true_and_hold: success for {0}s".format(success_hold_time))
                     return
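This change collapses the nested if/else into the hold loop's condition, so the hold window aborts as soon as condition() flips back to false instead of breaking out a level deeper. A rough sketch of how the helper plausibly reads after the change; only the hunk above is from the source, and the timeout bookkeeping around it is an assumption:

    import time

    def wait_until_true_and_hold(condition, timeout, success_hold_time, period=5):
        # Sketch only: everything outside the hunk above is assumed, not
        # taken from qa/tasks/ceph_test_case.py.
        elapsed = 0
        while True:
            if condition():
                success_time_elapsed = 0
                # Hold loop from the hunk: bail out the moment the
                # condition stops holding.
                while success_time_elapsed < success_hold_time and condition():
                    success_time_elapsed += 1
                    time.sleep(1)
                    elapsed += 1
                if success_time_elapsed == success_hold_time:
                    return  # condition held for the full window
            if elapsed >= timeout:
                raise RuntimeError(
                    "Timed out after {}s waiting for condition to hold".format(elapsed))
            time.sleep(period)
            elapsed += period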
qa/tasks/mon_connection_score.py (new file, 95 lines)
@@ -0,0 +1,95 @@
from tasks.ceph_test_case import CephTestCase
import json
import logging
log = logging.getLogger(__name__)


class TestStretchClusterNew(CephTestCase):

    CLUSTER = "ceph"
    MONS = {
        "a": {
            "rank": 0,
        },
        "b": {
            "rank": 1,
        },
        "c": {
            "rank": 2,
        }
    }
    WRITE_PERIOD = 10
    RECOVERY_PERIOD = WRITE_PERIOD * 6
    SUCCESS_HOLD_TIME = 10

    def setUp(self):
        """
        Set up the cluster for the test.
        """
        super(TestStretchClusterNew, self).setUp()

    def tearDown(self):
        """
        Clean up the cluster after the test.
        """
        super(TestStretchClusterNew, self).tearDown()

    def _check_connection_score(self):
        """
        Check the connection score of all the mons.
        """
        for mon, _ in self.MONS.items():
            # get the connection score via the mon's admin socket
            cscore = self.ceph_cluster.mon_manager.raw_cluster_cmd(
                'daemon', 'mon.{}'.format(mon),
                'connection', 'scores', 'dump')
            # parse the connection score
            cscore = json.loads(cscore)
            # check that the current mon reports the expected rank
            if cscore["rank"] != self.MONS[mon]["rank"]:
                log.error(
                    "Rank mismatch {} != {}".format(
                        cscore["rank"], self.MONS[mon]["rank"]
                    )
                )
                return False
            # check that the current mon has a report for every peer
            # and for itself
            if len(cscore['reports']) != len(self.MONS):
                log.error(
                    "Reports count mismatch {}".format(cscore['reports'])
                )
                return False

            for report in cscore["reports"]:
                report_rank = []
                for peer in report["peer_scores"]:
                    # check that the peer is alive
                    if not peer["peer_alive"]:
                        log.error("Peer {} is not alive".format(peer))
                        return False
                    report_rank.append(peer["peer_rank"])

                # check that each report covers every rank, with no
                # duplicates and none missing
                expected_ranks = [
                    rank
                    for data in self.MONS.values()
                    for rank in data.values()
                ]
                # NOTE: list.sort() sorts in place and returns None, so
                # comparing its return values would always pass; compare
                # sorted copies instead.
                if sorted(report_rank) != sorted(expected_ranks):
                    log.error("Rank mismatch in report {}".format(report))
                    return False

        log.info("Connection score is clean!")
        return True

    def test_connection_score(self):
        # check that all mons are in quorum
        self.ceph_cluster.mon_manager.wait_for_mon_quorum_size(3)
        # check that all connection scores reflect this
        self.wait_until_true_and_hold(
            lambda: self._check_connection_score(),
            # wait up to 4 minutes for the connection scores to recover
            timeout=self.RECOVERY_PERIOD * 4,
            # hold the clean connection score for 60 seconds
            success_hold_time=self.SUCCESS_HOLD_TIME * 6
        )
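Outside teuthology, the same check can be run by hand against a live cluster (a vstart cluster, for instance). A minimal sketch, assuming the mon admin sockets are reachable from where it runs and that the ranks are 0..2 as in the MONS map above; connection_score_is_clean is a hypothetical helper, not part of the test:

    import json
    import subprocess

    EXPECTED_RANKS = [0, 1, 2]  # mon.a, mon.b, mon.c, as in the test's MONS map

    def connection_score_is_clean(mon_id):
        # Query the mon's admin socket, as the test does via raw_cluster_cmd().
        out = subprocess.check_output(
            ['ceph', 'daemon', 'mon.{}'.format(mon_id),
             'connection', 'scores', 'dump'])
        cscore = json.loads(out)
        # every mon, including this one, must be reported on
        if len(cscore['reports']) != len(EXPECTED_RANKS):
            return False
        for report in cscore['reports']:
            peers = report['peer_scores']
            # every peer must be alive and every rank covered exactly once
            if not all(p['peer_alive'] for p in peers):
                return False
            if sorted(p['peer_rank'] for p in peers) != EXPECTED_RANKS:
                return False
        return True

    if __name__ == '__main__':
        print(all(connection_score_is_clean(m) for m in ('a', 'b', 'c')))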