qa: Added mon connection score tests

When we deploy 3 MONs, check that the connection
scores are clean, with a 60-second grace period.

Fixes: https://tracker.ceph.com/issues/65695

Signed-off-by: Kamoltat <ksirivad@redhat.com>
Kamoltat 2024-07-17 22:26:55 +00:00
parent c05d4e2716
commit ed7f4e8829
3 changed files with 139 additions and 7 deletions
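
The test exercises the "connection scores" that each mon tracks about its peers for the connectivity election strategy. As a minimal sketch of how those scores can be dumped and parsed outside of teuthology, here is the same admin-socket command the new test issues, wrapped in a hypothetical helper (the helper name is made up, and it assumes it runs on the host that owns the mon's admin socket):

import json
import subprocess

def dump_connection_scores(mon_id):
    # Same command the test below issues via mon_manager.raw_cluster_cmd():
    # ask the mon's admin socket for its connection score report.
    out = subprocess.check_output(
        ["ceph", "daemon", "mon.{}".format(mon_id),
         "connection", "scores", "dump"])
    return json.loads(out)

scores = dump_connection_scores("a")
print(scores["rank"], len(scores["reports"]))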


@@ -0,0 +1,40 @@
roles:
- - mon.a
  - mon.b
  - mon.c
  - osd.0
  - osd.1
  - osd.2
  - mgr.x
  - client.0
openstack:
- volumes: # attached to each instance
    count: 3
    size: 10 # GB
tasks:
- install:
- ceph:
    pre-mgr-commands:
      - sudo ceph config set mgr mgr_pool false --force
    log-ignorelist:
      - overall HEALTH_
      - \(OSDMAP_FLAGS\)
      - \(OSD_
      - \(PG_
      - \(POOL_
      - \(CACHE_POOL_
      - \(OBJECT_
      - \(SLOW_OPS\)
      - \(REQUEST_SLOW\)
      - \(TOO_FEW_PGS\)
      - slow request
      - \(POOL_APP_NOT_ENABLED\)
      - overall HEALTH_
      - \(MGR_DOWN\)
      - \(MON_DOWN\)
      - \(PG_AVAILABILITY\)
      - \(SLOW_OPS\)
- cephfs_test_runner:
    modules:
      - tasks.mon_connection_score

qa/tasks/ceph_test_case.py

@@ -353,13 +353,10 @@ class CephTestCase(unittest.TestCase, RunCephCmd):
         while True:
             if condition():
                 success_time_elapsed = 0
-                while success_time_elapsed < success_hold_time:
-                    if condition():
-                        success_time_elapsed += 1
-                        time.sleep(1)
-                        elapsed += 1
-                    else:
-                        break
+                while success_time_elapsed < success_hold_time and condition():
+                    success_time_elapsed += 1
+                    time.sleep(1)
+                    elapsed += 1
                 if success_time_elapsed == success_hold_time:
                     log.debug("wait_until_true_and_hold: success for {0}s".format(success_hold_time))
                     return
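
For readers skimming the hunk: the rework folds the inner if/else into the while condition, so a single failed re-check of condition() exits the hold loop and falls through to the success_hold_time comparison. A standalone sketch of the resulting hold-and-retry pattern (the signature, timeout bookkeeping, and exception below are illustrative assumptions, not the actual ceph_test_case implementation):

import time

def wait_until_true_and_hold(condition, timeout, success_hold_time):
    # Poll `condition` until it holds continuously for `success_hold_time`
    # seconds, or give up once roughly `timeout` seconds have elapsed.
    elapsed = 0
    while True:
        if condition():
            success_time_elapsed = 0
            # Any failed re-check drops out of this loop and restarts the hold.
            while success_time_elapsed < success_hold_time and condition():
                success_time_elapsed += 1
                time.sleep(1)
                elapsed += 1
            if success_time_elapsed == success_hold_time:
                return
        if elapsed >= timeout:
            raise RuntimeError(
                "condition did not hold for {}s within {}s".format(
                    success_hold_time, timeout))
        time.sleep(1)
        elapsed += 1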

qa/tasks/mon_connection_score.py

@@ -0,0 +1,95 @@
from tasks.ceph_test_case import CephTestCase
import json
import logging

log = logging.getLogger(__name__)


class TestStretchClusterNew(CephTestCase):
    CLUSTER = "ceph"
    MONS = {
        "a": {
            "rank": 0,
        },
        "b": {
            "rank": 1,
        },
        "c": {
            "rank": 2,
        }
    }
    WRITE_PERIOD = 10
    RECOVERY_PERIOD = WRITE_PERIOD * 6
    SUCCESS_HOLD_TIME = 10

    def setUp(self):
        """
        Set up the cluster for the test.
        """
        super(TestStretchClusterNew, self).setUp()

    def tearDown(self):
        """
        Clean up the cluster after the test.
        """
        super(TestStretchClusterNew, self).tearDown()

    def _check_connection_score(self):
        """
        Check the connection score of all the mons.
        """
        for mon, _ in self.MONS.items():
            # get the connection score
            cscore = self.ceph_cluster.mon_manager.raw_cluster_cmd(
                'daemon', 'mon.{}'.format(mon),
                'connection', 'scores', 'dump')
            # parse the connection score
            cscore = json.loads(cscore)
            # check if the current mon rank is correct
            if cscore["rank"] != self.MONS[mon]["rank"]:
                log.error(
                    "Rank mismatch {} != {}".format(
                        cscore["rank"], self.MONS[mon]["rank"]
                    )
                )
                return False
            # check if the current mon has a report for every peer and itself
            if len(cscore['reports']) != len(self.MONS):
                log.error(
                    "Reports count mismatch {}".format(cscore['reports'])
                )
                return False
            for report in cscore["reports"]:
                report_rank = []
                for peer in report["peer_scores"]:
                    # check if the peer is alive
                    if not peer["peer_alive"]:
                        log.error("Peer {} is not alive".format(peer))
                        return False
                    report_rank.append(peer["peer_rank"])
                # check if the report covers all the ranks with no duplicates
                expected_ranks = [
                    rank
                    for data in self.MONS.values()
                    for rank in data.values()
                ]
                if sorted(report_rank) != sorted(expected_ranks):
                    log.error("Rank mismatch in report {}".format(report))
                    return False
        log.info("Connection score is clean!")
        return True

    def test_connection_score(self):
        # check if all mons are in quorum
        self.ceph_cluster.mon_manager.wait_for_mon_quorum_size(3)
        # check if all connection scores reflect this
        self.wait_until_true_and_hold(
            lambda: self._check_connection_score(),
            # wait up to 4 minutes for the connection scores to recover
            timeout=self.RECOVERY_PERIOD * 4,
            # hold the clean connection score for 60 seconds
            success_hold_time=self.SUCCESS_HOLD_TIME * 6
        )
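
For reference, _check_connection_score() expects each dump to look roughly like the structure below (an illustrative example for a healthy 3-mon quorum; the field names are the ones the test reads, while the values and any omitted fields are made up):

example_dump_for_mon_a = {
    "rank": 0,
    "reports": [
        {
            "peer_scores": [
                {"peer_rank": 0, "peer_alive": True},
                {"peer_rank": 1, "peer_alive": True},
                {"peer_rank": 2, "peer_alive": True},
            ],
        },
        # ... one report per quorum member, three in total here
    ],
}

Note that the rank comparison in the checker compares sorted() copies of the rank lists rather than the return value of list.sort(), since list.sort() sorts in place and returns None, which would make the comparison meaningless.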