From e2ce8ed1ff9846d4efa7de434a56edc66305e767 Mon Sep 17 00:00:00 2001
From: Laura Flores
Date: Thu, 18 Jan 2024 18:57:24 +0000
Subject: [PATCH] mgr: add read balancer support inside the balancer module

Read balancing may now be managed automatically via the balancer
manager module. Users may choose between two new modes: ``upmap-read``,
which offers upmap and read optimization simultaneously, or ``read``,
which may be used to only optimize reads.

Existing balancer commands have also been updated to contain more
information about read balancing.

Run the following commands to test the new automatic behavior:
`ceph balancer on` (on by default)
`ceph balancer mode <upmap-read|read>`
`ceph balancer status`

Run the following commands to test the new supervised behavior:
`ceph balancer off`
`ceph balancer mode <upmap-read|read>`
`ceph balancer eval` | `ceph balancer eval <pool-name>`
`ceph balancer eval-verbose` | `ceph balancer eval-verbose <pool-name>`
`ceph balancer optimize <plan-name>`
`ceph balancer show <plan-name>`
`ceph balancer eval <plan-name>`
`ceph balancer execute <plan-name>`

In the balancer module, there is also a new "self_test" function which
tests the module's basic functionality. This test can be triggered with
the following commands:
`ceph mgr module enable selftest`
`ceph mgr self-test module balancer`

Related Trello: https://trello.com/c/sWoKctzL/859-add-read-balancer-support-inside-the-balancer-module

Signed-off-by: Laura Flores
---
 doc/dev/balancer-design.rst                   |   1 -
 doc/rados/operations/balancer.rst             |  48 ++++-
 doc/rados/operations/read-balancer.rst        |  47 ++++-
 src/mgr/PyOSDMap.cc                           |  32 +++
 src/pybind/mgr/balancer/module.py             | 190 +++++++++++++++++-
 src/pybind/mgr/ceph_module.pyi                |   1 +
 src/pybind/mgr/mgr_module.py                  |   4 +
 src/test/cli-integration/balancer/misplaced.t |   2 +
 8 files changed, 311 insertions(+), 14 deletions(-)

diff --git a/doc/dev/balancer-design.rst b/doc/dev/balancer-design.rst
index 684d163528f..cf45473f851 100644
--- a/doc/dev/balancer-design.rst
+++ b/doc/dev/balancer-design.rst
@@ -55,4 +55,3 @@ Plans for the Next Version
 --------------------------
 
 1. Improve behavior for heterogeneous OSDs in a pool
-2. Offer read balancing as an online option to the balancer manager module
diff --git a/doc/rados/operations/balancer.rst b/doc/rados/operations/balancer.rst
index 5f87865bcf3..949ff17c24a 100644
--- a/doc/rados/operations/balancer.rst
+++ b/doc/rados/operations/balancer.rst
@@ -21,9 +21,9 @@ To check the current status of the balancer, run the following command:
 Automatic balancing
 -------------------
 
-When the balancer is in ``upmap`` mode, the automatic balancing feature is
-enabled by default. For more details, see :ref:`upmap`. To disable the
-balancer, run the following command:
+When the balancer is in ``upmap`` mode, which is the default, the automatic
+upmap balancing feature is enabled. For more details, see :ref:`upmap`.
+To disable the balancer, run the following command:
 
 .. prompt:: bash $
 
@@ -34,6 +34,10 @@ The balancer mode can be changed from ``upmap`` mode to ``crush-compat`` mode.
 ``crush-compat`` mode, the balancer automatically makes small changes to the
 data distribution in order to ensure that OSDs are utilized equally.
 
+Additional modes include ``upmap-read`` and ``read``. ``upmap-read`` mode
+combines the upmap balancer with the read balancer so that both writes
+and reads are optimized. ``read`` mode can be used when only read optimization
+is desired. For more details, see :ref:`read_balancer`.
 
 Throttling
 ----------
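The new modes documented above can also be driven from a script. The following
is a minimal sketch (illustrative only, not part of this patch) that shells out
to the ceph CLI; it assumes a reachable cluster and relies on the fact that the
balancer's status output is JSON:

    import json
    import subprocess

    def set_balancer_mode(mode: str) -> dict:
        """Switch the balancer mode and return the parsed status."""
        valid = ('none', 'crush-compat', 'upmap', 'read', 'upmap-read')
        if mode not in valid:
            raise ValueError('unknown balancer mode: %s' % mode)
        subprocess.run(['ceph', 'balancer', 'on'], check=True)
        subprocess.run(['ceph', 'balancer', 'mode', mode], check=True)
        # 'ceph balancer status' prints a JSON object, so parse it directly.
        out = subprocess.run(['ceph', 'balancer', 'status'],
                             check=True, capture_output=True, text=True)
        return json.loads(out.stdout)

    if __name__ == '__main__':
        status = set_balancer_mode('upmap-read')
        print('active:', status.get('active'), 'mode:', status.get('mode'))

The 'active' and 'mode' keys are the ones the module's show_status() emits, as
seen later in this patch.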
@@ -102,7 +106,7 @@ and then run the following command:
 Modes
 -----
 
-There are two supported balancer modes:
+There are four supported balancer modes:
 
 #. **crush-compat**. This mode uses the compat weight-set feature (introduced
    in Luminous) to manage an alternative set of weights for devices in the
@@ -135,13 +139,45 @@ There are two supported balancer modes:
 
    To use ``upmap``, all clients must be Luminous or newer.
 
-The default mode is ``upmap``. The mode can be changed to ``crush-compat`` by
-running the following command:
+#. **read**. In Reef and later releases, the OSDMap can store explicit
+   mappings for individual primary OSDs as exceptions to the normal CRUSH
+   placement calculation. These ``pg-upmap-primary`` entries provide
+   fine-grained control over primary PG mappings. This mode optimizes the
+   placement of individual primary PGs in order to achieve balanced reads
+   (that is, a balanced distribution of primary PGs) in a cluster. In
+   ``read`` mode, upmap behavior is not exercised, so this mode is best for
+   use cases in which only read balancing is desired.
+
+   To use ``pg-upmap-primary``, all clients must be Reef or newer. For more
+   details about client compatibility, see :ref:`read_balancer`.
+
+#. **upmap-read**. This balancer mode combines the optimization benefits of
+   both ``upmap`` and ``read`` mode. As in ``read`` mode, ``upmap-read``
+   makes use of ``pg-upmap-primary``. As such, only Reef and later clients
+   are compatible. For more details about client compatibility, see
+   :ref:`read_balancer`.
+
+   ``upmap-read`` is highly recommended for achieving the ``upmap`` mode's
+   offering of balanced PG distribution as well as the ``read`` mode's
+   offering of balanced reads.
+
+The default mode is ``upmap``. The mode can be changed to ``crush-compat`` by
+running the following command:
 
 .. prompt:: bash $
 
    ceph balancer mode crush-compat
 
+The mode can be changed to ``read`` by running the following command:
+
+.. prompt:: bash $
+
+   ceph balancer mode read
+
+The mode can be changed to ``upmap-read`` by running the following command:
+
+.. prompt:: bash $
+
+   ceph balancer mode upmap-read
+
 Supervised optimization
 -----------------------
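The read-balancer documentation below requires that the cluster already
restricts itself to Reef-or-newer clients. A hedged pre-flight sketch, not from
the patch, that checks this before switching modes; RELEASE_ORDER is a
hand-maintained stand-in for Ceph's release ordering, not an import:

    import json
    import subprocess

    # Ordering of the releases relevant here; a stand-in, not a Ceph API.
    RELEASE_ORDER = ['luminous', 'mimic', 'nautilus', 'octopus',
                     'pacific', 'quincy', 'reef', 'squid']

    def reads_can_be_balanced() -> bool:
        """Return True when require_min_compat_client is reef or newer."""
        dump = json.loads(subprocess.run(
            ['ceph', 'osd', 'dump', '--format', 'json'],
            check=True, capture_output=True, text=True).stdout)
        client = dump.get('require_min_compat_client', '')
        return (client in RELEASE_ORDER
                and RELEASE_ORDER.index(client) >= RELEASE_ORDER.index('reef'))

    if not reads_can_be_balanced():
        print('run "ceph osd set-require-min-compat-client reef" first')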
diff --git a/doc/rados/operations/read-balancer.rst b/doc/rados/operations/read-balancer.rst
index 0833e4326c9..a2c189dbb16 100644
--- a/doc/rados/operations/read-balancer.rst
+++ b/doc/rados/operations/read-balancer.rst
@@ -17,9 +17,50 @@ you may want to try improving your read performance with the read balancer.
 Online Optimization
 ===================
 
-At present, there is no online option for the read balancer. However, we plan to add
-the read balancer as an option to the :ref:`balancer` in the next Ceph version
-so it can be enabled to run automatically in the background like the upmap balancer.
+Enabling
+--------
+
+To enable automatic read balancing, you must turn on the *balancer module*
+(enabled by default in new clusters) and set the mode to ``read`` or
+``upmap-read``:
+
+.. prompt:: bash $
+
+   ceph balancer on
+   ceph balancer mode <read|upmap-read>
+
+Both ``read`` and ``upmap-read`` modes make use of ``pg-upmap-primary``. In
+order to use ``pg-upmap-primary``, the cluster cannot have any pre-Reef
+clients.
+
+If you want to use a different balancer or if you want to make your
+own custom ``pg-upmap-primary`` entries, you might want to turn off the
+balancer in order to avoid conflicts:
+
+.. prompt:: bash $
+
+   ceph balancer off
+
+To allow use of the new feature on an existing cluster, you must restrict the
+cluster to supporting only Reef (and newer) clients. To do so, run the
+following command:
+
+.. prompt:: bash $
+
+   ceph osd set-require-min-compat-client reef
+
+This command will fail if any pre-Reef clients or daemons are connected to
+the monitors. To see which client versions are in use, run the following
+command:
+
+.. prompt:: bash $
+
+   ceph features
+
+Balancer Module
+---------------
+
+The `balancer` module for ``ceph-mgr`` will automatically balance the number
+of primary PGs per OSD if set to ``read`` or ``upmap-read`` mode. See
+:ref:`balancer` for more information.
 
 Offline Optimization
 ====================
diff --git a/src/mgr/PyOSDMap.cc b/src/mgr/PyOSDMap.cc
index 83475f5ee5f..58d2da41b9b 100644
--- a/src/mgr/PyOSDMap.cc
+++ b/src/mgr/PyOSDMap.cc
@@ -162,6 +162,36 @@ static PyObject *osdmap_calc_pg_upmaps(BasePyOSDMap* self, PyObject *args)
   return PyLong_FromLong(r);
 }
 
+static PyObject *osdmap_balance_primaries(BasePyOSDMap* self, PyObject *args)
+{
+  int pool_id;
+  BasePyOSDMapIncremental *incobj;
+  if (!PyArg_ParseTuple(args, "iO:balance_primaries",
+                        &pool_id, &incobj)) {
+    return nullptr;
+  }
+  auto check_pool = self->osdmap->get_pg_pool(pool_id);
+  if (!check_pool) {
+    derr << __func__ << " pool '" << pool_id
+         << "' does not exist" << dendl;
+    return nullptr;
+  }
+  dout(10) << __func__ << " osdmap " << self->osdmap
+           << " pool_id " << pool_id
+           << " inc " << incobj->inc
+           << dendl;
+  PyThreadState *tstate = PyEval_SaveThread();
+  OSDMap tmp_osd_map;
+  tmp_osd_map.deepish_copy_from(*(self->osdmap));
+  int r = self->osdmap->balance_primaries(g_ceph_context,
+                                          pool_id,
+                                          incobj->inc,
+                                          tmp_osd_map);
+  PyEval_RestoreThread(tstate);
+  dout(10) << __func__ << " r = " << r << dendl;
+  return PyLong_FromLong(r);
+}
+
 static PyObject *osdmap_map_pool_pgs_up(BasePyOSDMap* self, PyObject *args)
 {
   int poolid;
@@ -324,6 +354,8 @@ PyMethodDef BasePyOSDMap_methods[] = {
    "Get pools that have CRUSH rules that TAKE the given root"},
   {"_calc_pg_upmaps", (PyCFunction)osdmap_calc_pg_upmaps, METH_VARARGS,
    "Calculate new pg-upmap values"},
+  {"_balance_primaries", (PyCFunction)osdmap_balance_primaries, METH_VARARGS,
+   "Calculate new pg-upmap-primary values"},
   {"_map_pool_pgs_up", (PyCFunction)osdmap_map_pool_pgs_up, METH_VARARGS,
    "Calculate up set mappings for all PGs in a pool"},
   {"_pg_to_up_acting_osds", (PyCFunction)osdmap_pg_to_up_acting_osds, METH_VARARGS,
diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py
index 2cbaf10c09f..54780d1eac8 100644
--- a/src/pybind/mgr/balancer/module.py
+++ b/src/pybind/mgr/balancer/module.py
@@ -9,7 +9,7 @@ import json
 import math
 import random
 import time
-from mgr_module import CLIReadCommand, CLICommand, CommandResult, MgrModule, Option, OSDMap
+from mgr_module import CLIReadCommand, CLICommand, CommandResult, MgrModule, Option, OSDMap, CephReleases
 from threading import Event
 from typing import cast, Any, Dict, List, Optional, Sequence, Tuple, Union
 from mgr_module import CRUSHMap
@@ -55,6 +55,8 @@ class Mode(enum.Enum):
     none = 'none'
     crush_compat = 'crush-compat'
     upmap = 'upmap'
+    read = 'read'
+    upmap_read = 'upmap-read'
 
 
 class Plan(object):
@@ -116,6 +118,10 @@ class MsPlan(Plan):
                     osdlist += [m['from'], m['to']]
                 ls.append('ceph osd pg-upmap-items %s %s' %
                           (item['pgid'], ' '.join([str(a) for a in osdlist])))
+        for item in incdump.get('new_pg_upmap_primaries', []):
+            ls.append('ceph osd pg-upmap-primary %s %s' % (item['pgid'], item['primary_osd']))
+        for item in incdump.get('old_pg_upmap_primaries', []):
+            ls.append('ceph osd rm-pg-upmap-primary %s' % item['pgid'])
         return '\n'.join(ls)
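To make the new plan-dump output concrete, here is a self-contained rendering
of the same logic added to MsPlan above. The sample incremental dump is made
up, but the key names (new_pg_upmap_primaries, old_pg_upmap_primaries, pgid,
primary_osd) match the ones the patch consumes:

    # Illustration only: convert an incremental dump into the equivalent
    # CLI commands, mirroring the loops added to MsPlan above.
    def primary_commands(incdump: dict) -> list:
        ls = []
        for item in incdump.get('new_pg_upmap_primaries', []):
            ls.append('ceph osd pg-upmap-primary %s %s'
                      % (item['pgid'], item['primary_osd']))
        for item in incdump.get('old_pg_upmap_primaries', []):
            ls.append('ceph osd rm-pg-upmap-primary %s' % item['pgid'])
        return ls

    # Sample values are fabricated for demonstration:
    print('\n'.join(primary_commands({
        'new_pg_upmap_primaries': [{'pgid': '1.0', 'primary_osd': 3}],
        'old_pg_upmap_primaries': [{'pgid': '1.5'}],
    })))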
@@ -142,6 +148,9 @@ class Eval:
 
         self.score = 0.0
 
+        self.read_balance_score_by_pool: Dict[str, Dict[str, float]] = {}
+        self.read_balance_score_acting_by_pool: Dict[str, float] = {}
+
     def show(self, verbose: bool = False) -> str:
         if verbose:
             r = self.ms.desc + '\n'
@@ -155,9 +164,12 @@ class Eval:
             r += 'stats_by_root %s\n' % self.stats_by_root
             r += 'score_by_pool %s\n' % self.score_by_pool
             r += 'score_by_root %s\n' % self.score_by_root
+            r += 'score %f (lower is better)\n' % self.score
+            r += 'read_balance_score_by_pool %s\n' % self.read_balance_score_by_pool
         else:
             r = self.ms.desc + ' '
-        r += 'score %f (lower is better)\n' % self.score
+            r += 'score %f (lower is better)\n' % self.score
+            r += 'read_balance_scores (lower is better) %s\n' % self.read_balance_score_acting_by_pool
         return r
 
     def calc_stats(self, count, target, total):
@@ -290,7 +302,7 @@ class Module(MgrModule):
         Option(name='mode',
                desc='Balancer mode',
                default='upmap',
-               enum_allowed=['none', 'crush-compat', 'upmap'],
+               enum_allowed=['none', 'crush-compat', 'upmap', 'read', 'upmap-read'],
               runtime=True),
         Option(name='sleep_interval',
                type='secs',
@@ -394,6 +406,19 @@ class Module(MgrModule):
                          self.get("pool_stats"),
                          'initialize compat weight-set')
             self.get_compat_weight_set_weights(ms)  # ignore error
+        elif (mode == Mode.read) or (mode == Mode.upmap_read):
+            try:
+                release = CephReleases[min_compat_client]
+                if release.value < CephReleases.reef.value:
+                    warn = ('min_compat_client "%s" '
+                            '< "reef", which is required for pg-upmap-primary. '
+                            'Try "ceph osd set-require-min-compat-client reef" '
+                            'before enabling this mode' % min_compat_client)
+                    return (-errno.EPERM, '', warn)
+            except KeyError:
+                self.log.error('Unable to apply mode {} due to unknown min_compat_client {}'.format(mode, min_compat_client))
+                warn = ('Unable to apply mode {} due to unknown min_compat_client {}.'.format(mode, min_compat_client))
+                return (-errno.EPERM, '', warn)
         self.set_module_option('mode', mode.value)
         return (0, '', '')
@@ -894,6 +919,23 @@ class Module(MgrModule):
                 'objects': objects,
                 'bytes': bytes,
             }
+            try:
+                read_balance_scores = pi['read_balance']
+                pe.read_balance_score_acting_by_pool[pool] = read_balance_scores['score_acting']
+                pe.read_balance_score_by_pool[pool] = {
+                    'score_acting': read_balance_scores['score_acting'],
+                    'score_stable': read_balance_scores['score_stable'],
+                    'optimal_score': read_balance_scores['optimal_score'],
+                    'raw_score_acting': read_balance_scores['raw_score_acting'],
+                    'raw_score_stable': read_balance_scores['raw_score_stable'],
+                    'primary_affinity_weighted': read_balance_scores['primary_affinity_weighted'],
+                    'average_primary_affinity': read_balance_scores['average_primary_affinity'],
+                    'average_primary_affinity_weighted': read_balance_scores['average_primary_affinity_weighted']
+                }
+            except KeyError:
+                self.log.debug("Skipping pool '{}' since it does not have a read_balance_score, "
+                               "likely because it is not replicated.".format(pool))
+
         for root in pe.total_by_root:
             pe.count_by_root[root] = {
                 'pgs': {
@@ -998,6 +1040,14 @@ class Module(MgrModule):
             return self.do_upmap(plan)
         elif plan.mode == 'crush-compat':
             return self.do_crush_compat(cast(MsPlan, plan))
+        elif plan.mode == 'read':
+            return self.do_read_balancing(plan)
+        elif plan.mode == 'upmap-read':
+            r_upmap, detail_upmap = self.do_upmap(plan)
+            r_read, detail_read = self.do_read_balancing(plan)
+            if (r_upmap < 0) and (r_read < 0):
+                return r_upmap, detail_upmap
+            return 0, ''
         elif plan.mode == 'none':
             detail = 'Please do "ceph balancer mode" to choose a valid mode first'
             self.log.info('Idle')
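The ``upmap-read`` branch above returns failure only when both sub-optimizers
fail, so a no-op on one side does not mask progress on the other. An isolated
sketch of that contract; the lambdas stand in for do_upmap and
do_read_balancing:

    import errno
    from typing import Callable, Tuple

    OptimizeFn = Callable[[], Tuple[int, str]]

    def combine(upmap: OptimizeFn, read: OptimizeFn) -> Tuple[int, str]:
        # Mirrors the 'upmap-read' branch of do_optimize() above.
        r_upmap, detail_upmap = upmap()
        r_read, _detail_read = read()
        if r_upmap < 0 and r_read < 0:
            return r_upmap, detail_upmap  # both failed: surface the upmap error
        return 0, ''                      # at least one side made progress

    # upmap found nothing further, but read balancing prepared changes:
    assert combine(lambda: (-errno.EALREADY, 'optimal'),
                   lambda: (0, '')) == (0, '')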
@@ -1007,6 +1057,83 @@ class Module(MgrModule):
             self.log.info(detail)
             return -errno.EINVAL, detail
 
+    def do_read_balancing(self, plan: Plan) -> Tuple[int, str]:
+        self.log.info('do_read_balancing')
+        osdmap_dump = plan.osdmap_dump
+        msg = 'Unable to find further optimization, ' \
+              'or distribution is already perfect'
+
+        if len(plan.pools):
+            pools = plan.pools
+        else:  # all
+            pools = [str(i['pool_name']) for i in osdmap_dump.get('pools', [])]
+        if len(pools) == 0:
+            detail = 'No pools available'
+            self.log.info(detail)
+            return -errno.ENOENT, detail
+        self.log.info('pools %s' % pools)
+
+        adjusted_pools = []
+        inc = plan.inc
+        total_num_changes = 0
+        pools_with_pg_merge = []
+        crush_rule_by_pool_name = {}
+        no_read_balance_info = []
+        replicated_pools_with_optimal_score = []
+        rb_error_message = {}
+        for p in osdmap_dump.get('pools', []):
+            for pool_pg_status in plan.pg_status.get('pgs_by_pool_state', []):
+                if pool_pg_status['pool_id'] != p['pool']:
+                    continue
+                for state in pool_pg_status['pg_state_counts']:
+                    if state['state_name'] != 'active+clean':
+                        msg = "Not all PGs are active+clean; try again later."
+                        return -errno.EALREADY, msg
+            if p['pg_num'] > p['pg_num_target']:
+                pools_with_pg_merge.append(p['pool_name'])
+            crush_rule_by_pool_name[p['pool_name']] = p['crush_rule']
+            if 'read_balance' not in p:
+                no_read_balance_info.append(p['pool_name'])
+            if 'read_balance' in p:
+                if 'error_message' in p['read_balance']:
+                    rb_error_message[p['pool_name']] = p['read_balance']['error_message']
+                elif p['read_balance']['score_acting'] == p['read_balance']['optimal_score']:
+                    replicated_pools_with_optimal_score.append(p['pool_name'])
+        for pool in pools:
+            if pool not in crush_rule_by_pool_name:
+                self.log.debug('pool %s does not exist' % pool)
+                continue
+            if pool in pools_with_pg_merge:
+                self.log.debug('pool %s has pending PG(s) for merging, skipping for now' % pool)
+                continue
+            if pool in no_read_balance_info:
+                self.log.debug('pool %s has no read_balance information, skipping' % pool)
+                continue
+            if pool in replicated_pools_with_optimal_score:
+                self.log.debug('pool %s is already balanced, skipping' % pool)
+                continue
+            if pool in rb_error_message:
+                self.log.error(rb_error_message[pool])
+                continue
+            adjusted_pools.append(pool)
+        pool_dump = osdmap_dump.get('pools', [])
+        for pool in adjusted_pools:
+            for p in pool_dump:
+                if p['pool_name'] == pool:
+                    pool_id = p['pool']
+                    break
+            num_changes = plan.osdmap.balance_primaries(pool_id, inc)
+            total_num_changes += num_changes
+        if total_num_changes < 0:
+            self.no_optimization_needed = True
+            self.log.debug('unable to balance reads.')
+            return -errno.EALREADY, msg
+        self.log.info('prepared {} read changes'.format(total_num_changes))
+        if total_num_changes == 0:
+            self.no_optimization_needed = True
+            return -errno.EALREADY, msg
+        return 0, ''
+
     def do_upmap(self, plan: Plan) -> Tuple[int, str]:
         self.log.info('do_upmap')
         max_optimizations = cast(float, self.get_module_option('upmap_max_optimizations'))
@@ -1067,7 +1194,7 @@ class Module(MgrModule):
             left -= did
             if left <= 0:
                 break
-        self.log.info('prepared %d/%d changes' % (total_did, max_optimizations))
+        self.log.info('prepared %d/%d upmap changes' % (total_did, max_optimizations))
         if total_did == 0:
             self.no_optimization_needed = True
             return -errno.EALREADY, 'Unable to find further optimization, ' \
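For context on the binding introduced in PyOSDMap.cc, this is roughly how a mgr
module could call it directly. A hedged sketch, not from the patch: it assumes
it runs as a method of a MgrModule subclass, where get_osdmap() and the
OSDMap incremental helper are available:

    def prepare_read_changes(self, pool_id: int) -> int:
        """Stage pg-upmap-primary changes for one pool, as do_read_balancing() does."""
        osdmap = self.get_osdmap()      # snapshot of the current OSDMap
        inc = osdmap.new_incremental()  # incremental the changes are staged in
        num_changes = osdmap.balance_primaries(pool_id, inc)
        if num_changes < 0:
            self.log.debug('unable to balance reads for pool %d', pool_id)
        return num_changes

The staged incremental would still have to be applied (the balancer does this
through its execute path, shown below).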
@@ -1422,6 +1549,19 @@ class Module(MgrModule):
             }), 'foo')
             commands.append(result)
 
+        # read
+        for item in incdump.get('new_pg_upmap_primaries', []):
+            self.log.info('ceph osd pg-upmap-primary %s primary_osd %s', item['pgid'],
+                          item['primary_osd'])
+            result = CommandResult('foo')
+            self.send_command(result, 'mon', '', json.dumps({
+                'prefix': 'osd pg-upmap-primary',
+                'format': 'json',
+                'pgid': item['pgid'],
+                'id': item['primary_osd'],
+            }), 'foo')
+            commands.append(result)
+
         # wait for commands
         self.log.debug('commands %s' % commands)
         for result in commands:
@@ -1437,3 +1577,45 @@ class Module(MgrModule):
             'active': self.active,
             'mode': self.mode,
         }
+
+    def self_test(self) -> None:
+        # turn balancer on
+        self.on()
+
+        # Get min-compat-client
+        min_compat_client = self.get_osdmap().dump().get('require_min_compat_client', '')
+        release = CephReleases[min_compat_client]
+
+        # Check upmap mode warning
+        r, _, warn = self.set_mode(Mode.upmap)
+        if release.value < CephReleases.luminous.value:
+            if r >= 0:
+                raise RuntimeError('upmap mode did not properly warn about min_compat_client')
+            if warn == '':
+                raise RuntimeError('upmap mode warning is empty when it should not be.')
+
+        # Check read mode warning
+        r, _, warn = self.set_mode(Mode.read)
+        if release.value < CephReleases.reef.value:
+            if r >= 0:
+                raise RuntimeError('read mode did not properly warn about min_compat_client')
+            if warn == '':
+                raise RuntimeError('read mode warning is empty when it should not be.')
+
+        # Check upmap-read mode warning
+        r, _, warn = self.set_mode(Mode.upmap_read)
+        if release.value < CephReleases.reef.value:
+            if r >= 0:
+                raise RuntimeError('upmap-read mode did not properly warn about min_compat_client')
+            if warn == '':
+                raise RuntimeError('upmap-read mode warning is empty when it should not be.')
+
+        # Check status
+        r, status, _ = self.show_status()
+        if r < 0:
+            raise RuntimeError('Balancer status was unsuccessful')
+        if status == '':
+            raise RuntimeError('Balancer status was empty')
+
+        # Turn off
+        self.off()
diff --git a/src/pybind/mgr/ceph_module.pyi b/src/pybind/mgr/ceph_module.pyi
index 50147f08f30..df4a3782a0c 100644
--- a/src/pybind/mgr/ceph_module.pyi
+++ b/src/pybind/mgr/ceph_module.pyi
@@ -19,6 +19,7 @@ class BasePyOSDMap(object):
     def _get_crush(self):...
     def _get_pools_by_take(self, take):...
     def _calc_pg_upmaps(self, inc, max_deviation, max_iterations, pool):...
+    def _balance_primaries(self, pool_id, inc):...
     def _map_pool_pgs_up(self, poolid):...
     def _pg_to_up_acting_osds(self, pool_id, ps):...
     def _pool_raw_used_rate(self, pool_id):...
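The release gate that self_test() exercises reduces to an enum-order
comparison. A stand-alone sketch with a hypothetical MinimalReleases stand-in
for the module's CephReleases enum; only the relative ordering of the values
matters here:

    import enum

    class MinimalReleases(enum.Enum):
        # Hypothetical stand-in for CephReleases; values follow Ceph's
        # release numbering (luminous = 12, reef = 18).
        luminous = 12
        reef = 18

    def mode_allowed(mode: str, min_compat_client: str) -> bool:
        required = {'upmap': MinimalReleases.luminous,
                    'read': MinimalReleases.reef,
                    'upmap-read': MinimalReleases.reef}
        try:
            client = MinimalReleases[min_compat_client]
        except KeyError:
            return False  # unknown release names are rejected, as in set_mode()
        return client.value >= required[mode].value

    assert mode_allowed('upmap', 'reef')
    assert not mode_allowed('read', 'luminous')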
diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py
index ef908bd81d3..28fd69738eb 100644
--- a/src/pybind/mgr/mgr_module.py
+++ b/src/pybind/mgr/mgr_module.py
@@ -210,6 +210,10 @@ class OSDMap(ceph_module.BasePyOSDMap):
                                      inc, max_deviation,
                                      max_iterations, pools)
 
+    def balance_primaries(self, pool_id: int,
+                          inc: 'OSDMapIncremental') -> int:
+        return self._balance_primaries(pool_id, inc)
+
     def map_pool_pgs_up(self, poolid: int) -> List[int]:
         return self._map_pool_pgs_up(poolid)
 
diff --git a/src/test/cli-integration/balancer/misplaced.t b/src/test/cli-integration/balancer/misplaced.t
index 050cceb6402..b9510caa4d0 100644
--- a/src/test/cli-integration/balancer/misplaced.t
+++ b/src/test/cli-integration/balancer/misplaced.t
@@ -12,6 +12,7 @@
   $ ceph config set osd.* target_max_misplaced_ratio .07
   $ ceph balancer eval
   current cluster score [0-9]*\.?[0-9]+.* (re)
+  read_balance_scores \(lower is better\) {'rbd': [0-9]*\.?[0-9]+.*, 'balancer_opt': [0-9]*\.?[0-9]+.*} (re)
 # Turn off active balancer to use manual commands
   $ ceph balancer off
   $ ceph balancer optimize test_plan balancer_opt
@@ -22,6 +23,7 @@
   $ ceph balancer execute test_plan
   $ ceph balancer eval
   current cluster score [0-9]*\.?[0-9]+.* (re)
+  read_balance_scores \(lower is better\) {'rbd': [0-9]*\.?[0-9]+.*, 'balancer_opt': [0-9]*\.?[0-9]+.*} (re)
 # Plan is gone after execution ?
   $ ceph balancer execute test_plan
   Error ENOENT: plan test_plan not found
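Finally, the read_balance_scores line asserted in misplaced.t comes from
per-pool data in the OSDMap dump. A hedged sketch, not part of the patch, that
prints those scores for a live cluster; the key names match the ones read in
the eval code above, and pools without a read_balance section (for example,
non-replicated pools) are skipped, as in the module:

    import json
    import subprocess

    dump = json.loads(subprocess.run(
        ['ceph', 'osd', 'dump', '--format', 'json'],
        check=True, capture_output=True, text=True).stdout)

    for pool in dump.get('pools', []):
        rb = pool.get('read_balance')
        if rb is None:
            continue  # no read_balance info; skip, as do_read_balancing() does
        print('%s: score_acting=%.2f optimal=%.2f'
              % (pool['pool_name'], rb['score_acting'], rb['optimal_score']))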