From 7a1d6872c74a03b2493ecef0f2c4a5f9656a7c5e Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 12 Dec 2019 19:42:10 -0800 Subject: [PATCH 1/8] Partially revert "mgr/balancer: balance pools with same crush_rule in batch" This partially reverts commit 3a730d751deff892e7a0bddba87eba3dbb829c3e Fixes: https://tracker.ceph.com/issues/43307 Signed-off-by: David Zafman --- src/pybind/mgr/balancer/module.py | 37 +++++++++++++++++-------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index c6953d37a00..2290aa0fe04 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -979,14 +979,17 @@ class Module(MgrModule): detail = 'No pools available' self.log.info(detail) return -errno.ENOENT, detail + # shuffle pool list so they all get equal (in)attention + random.shuffle(pools) + self.log.info('pools %s' % pools) + adjusted_pools = [] inc = plan.inc total_did = 0 left = max_optimizations pools_with_pg_merge = [p['pool_name'] for p in osdmap_dump.get('pools', []) if p['pg_num'] > p['pg_num_target']] crush_rule_by_pool_name = dict((p['pool_name'], p['crush_rule']) for p in osdmap_dump.get('pools', [])) - pools_by_crush_rule = {} # group pools by crush_rule for pool in pools: if pool not in crush_rule_by_pool_name: self.log.info('pool %s does not exist' % pool) @@ -994,36 +997,36 @@ class Module(MgrModule): if pool in pools_with_pg_merge: self.log.info('pool %s has pending PG(s) for merging, skipping for now' % pool) continue - crush_rule = crush_rule_by_pool_name[pool] - if crush_rule not in pools_by_crush_rule: - pools_by_crush_rule[crush_rule] = [] - pools_by_crush_rule[crush_rule].append(pool) - classified_pools = list(pools_by_crush_rule.values()) + adjusted_pools.append(pool) # shuffle so all pools get equal (in)attention - random.shuffle(classified_pools) - for it in classified_pools: - pool_dump = osdmap_dump.get('pools', []) + random.shuffle(adjusted_pools) + pool_dump = osdmap_dump.get('pools', []) + for pool in adjusted_pools: num_pg = 0 for p in pool_dump: - if p['pool_name'] in it: - num_pg += p['pg_num'] + if p['pool_name'] == pool: + num_pg = p['pg_num'] + pool_id = p['pool'] + break # note that here we deliberately exclude any scrubbing pgs too # since scrubbing activities have significant impacts on performance - pool_ids = list(p['pool'] for p in pool_dump if p['pool_name'] in it) num_pg_active_clean = 0 for p in plan.pg_status.get('pgs_by_pool_state', []): - if len(pool_ids) and p['pool_id'] not in pool_ids: + pgs_pool_id = p['pool_id'] + if pgs_pool_id != pool_id: continue for s in p['pg_state_counts']: if s['state_name'] == 'active+clean': num_pg_active_clean += s['count'] break - available = max_optimizations - (num_pg - num_pg_active_clean) - did = plan.osdmap.calc_pg_upmaps(inc, max_deviation, available, it) - self.log.info('prepared %d changes for pool(s) %s' % (did, it)) + available = left - (num_pg - num_pg_active_clean) + did = plan.osdmap.calc_pg_upmaps(inc, max_deviation, available, [pool]) total_did += did - self.log.info('prepared %d changes in total' % total_did) + left -= did + if left <= 0: + break + self.log.info('prepared %d/%d changes' % (total_did, max_optimizations)) if total_did == 0: return -errno.EALREADY, 'Unable to find further optimization, ' \ 'or pool(s) pg_num is decreasing, ' \ From a92f48773e9c59d49ee248de1d7951e7c9d3ac25 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 10 Dec 2019 08:55:46 -0800 Subject: [PATCH 2/8] 
Revert "tools: osdmaptool sync with balancer module behavior" This was the rules batching version of osdmaptool This reverts commit f165d4ca39b1edef4bf22a7597af79339c95026f. Signed-off-by: David Zafman --- src/test/cli/osdmaptool/upmap-out.t | 4 +- src/test/cli/osdmaptool/upmap.t | 4 +- src/tools/osdmaptool.cc | 60 +++++++---------------------- 3 files changed, 18 insertions(+), 50 deletions(-) diff --git a/src/test/cli/osdmaptool/upmap-out.t b/src/test/cli/osdmaptool/upmap-out.t index 6bafe38b11a..8a167aba09a 100644 --- a/src/test/cli/osdmaptool/upmap-out.t +++ b/src/test/cli/osdmaptool/upmap-out.t @@ -8,8 +8,8 @@ writing upmap command output to: c checking for upmap cleanups upmap, max-count 11, max deviation 1 - prepared 11 changes for pools(s) rbd - prepared 11 changes in total + pools rbd + prepared 11/11 changes $ cat c ceph osd pg-upmap-items 1.7 142 145 ceph osd pg-upmap-items 1.8 219 223 diff --git a/src/test/cli/osdmaptool/upmap.t b/src/test/cli/osdmaptool/upmap.t index 36746fbfff6..8fa1a610313 100644 --- a/src/test/cli/osdmaptool/upmap.t +++ b/src/test/cli/osdmaptool/upmap.t @@ -7,8 +7,8 @@ writing upmap command output to: c checking for upmap cleanups upmap, max-count 11, max deviation 1 - prepared 11 changes for pools(s) rbd - prepared 11 changes in total + pools rbd + prepared 11/11 changes $ cat c ceph osd pg-upmap-items 1.7 142 147 ceph osd pg-upmap-items 1.8 219 223 diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc index 6a2f6799524..4d0436860b6 100644 --- a/src/tools/osdmaptool.cc +++ b/src/tools/osdmaptool.cc @@ -147,7 +147,6 @@ int main(int argc, const char **argv) std::set upmap_pools; int64_t pg_num = -1; bool test_map_pgs_dump_all = false; - bool debug = false; std::string val; std::ostringstream err; @@ -187,8 +186,6 @@ int main(int argc, const char **argv) createsimple = true; } else if (ceph_argparse_flag(args, i, "--health", (char*)NULL)) { health = true; - } else if (ceph_argparse_flag(args, i, "--debug", (char*)NULL)) { - debug = true; } else if (ceph_argparse_flag(args, i, "--with-default-pool", (char*)NULL)) { createpool = true; } else if (ceph_argparse_flag(args, i, "--create-from-conf", (char*)NULL)) { @@ -406,55 +403,27 @@ int main(int argc, const char **argv) cout << "No pools available" << std::endl; goto skip_upmap; } - if (debug) { - cout << "pools "; - for (auto& i: pools) - cout << osdmap.get_pool_name(i) << " "; - cout << std::endl; - } - map< int, set > pools_by_rule; - for (auto&i: pools) { - const string& pool_name = osdmap.get_pool_name(i); - const pg_pool_t *p = osdmap.get_pg_pool(i); - const int rule = p->get_crush_rule(); - if (!osdmap.crush->rule_exists(rule)) { - cout << " pool " << pool_name << " does not exist" << std::endl; - continue; - } - if (p->get_pg_num() > p->get_pg_num_target()) { - cout << "pool " << pool_name << " has pending PG(s) for merging, skipping for now" << std::endl; - continue; - } - if (debug) { - cout << "pool " << i << " rule " << rule << std::endl; - } - pools_by_rule[rule].emplace(i); - } - vector rules; - for (auto& r: pools_by_rule) - rules.push_back(r.first); std::random_device rd; - std::shuffle(rules.begin(), rules.end(), std::mt19937{rd()}); - if (debug) { - for (auto& r: rules) - cout << "rule: " << r << " " << pools_by_rule[r] << std::endl; - } + std::shuffle(pools.begin(), pools.end(), std::mt19937{rd()}); + cout << "pools "; + for (auto& i: pools) + cout << osdmap.get_pool_name(i) << " "; + cout << std::endl; int total_did = 0; - int available = upmap_max; - for (auto& r: rules) { - // 
Assume all PGs are active+clean - // available = upmap_max - (num_pg - num_pg_active_clean) + int left = upmap_max; + for (auto& i: pools) { + set one_pool; + one_pool.insert(i); int did = osdmap.calc_pg_upmaps( g_ceph_context, upmap_deviation, - available, pools_by_rule[r], + left, one_pool, &pending_inc); - cout << "prepared " << did << " changes for pools(s) "; - for (auto i: pools_by_rule[r]) - cout << osdmap.get_pool_name(i) << " "; - cout << std::endl; total_did += did; + left -= did; + if (left <= 0) + break; } - cout << "prepared " << total_did << " changes in total" << std::endl; + cout << "prepared " << total_did << "/" << upmap_max << " changes" << std::endl; if (total_did > 0) { print_inc_upmaps(pending_inc, upmap_fd); if (upmap_save) { @@ -464,7 +433,6 @@ int main(int argc, const char **argv) } } else { cout << "Unable to find further optimization, " - << "or pool(s) pg_num is decreasing, " << "or distribution is already perfect" << std::endl; } From 8e46bbbf36e3d3795c1d3ba50c431ac80467da20 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 13 Dec 2019 17:14:51 -0800 Subject: [PATCH 3/8] test: Fix test case for pool based balancing instead of rule batched Signed-off-by: David Zafman --- qa/standalone/mgr/balancer.sh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/qa/standalone/mgr/balancer.sh b/qa/standalone/mgr/balancer.sh index f7aba3156d3..ce38dd13754 100755 --- a/qa/standalone/mgr/balancer.sh +++ b/qa/standalone/mgr/balancer.sh @@ -195,17 +195,20 @@ function TEST_balancer2() { sleep 30 ceph osd df - # FINAL_PER_OSD2 should distribute evenly + # We should be with plue or minus 1 of FINAL_PER_OSD2 + # This is because here each pool is balanced independently + MIN=$(expr $FINAL_PER_OSD2 - 1) + MAX=$(expr $FINAL_PER_OSD2 + 1) PGS=$(ceph osd df --format=json-pretty | jq '.nodes[0].pgs') - test $PGS -eq $FINAL_PER_OSD2 || return 1 + test $PGS -ge $MIN -a $PGS -le $MAX || return 1 PGS=$(ceph osd df --format=json-pretty | jq '.nodes[1].pgs') - test $PGS -eq $FINAL_PER_OSD2 || return 1 + test $PGS -ge $MIN -a $PGS -le $MAX || return 1 PGS=$(ceph osd df --format=json-pretty | jq '.nodes[2].pgs') - test $PGS -eq $FINAL_PER_OSD2 || return 1 + test $PGS -ge $MIN -a $PGS -le $MAX || return 1 PGS=$(ceph osd df --format=json-pretty | jq '.nodes[3].pgs') - test $PGS -eq $FINAL_PER_OSD2 || return 1 + test $PGS -ge $MIN -a $PGS -le $MAX || return 1 PGS=$(ceph osd df --format=json-pretty | jq '.nodes[4].pgs') - test $PGS -eq $FINAL_PER_OSD2 || return 1 + test $PGS -ge $MIN -a $PGS -le $MAX || return 1 teardown $dir || return 1 } From e42a6ccb1819be4988e3ed7bd78fcf513f8d1589 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 17 Dec 2019 17:35:14 -0800 Subject: [PATCH 4/8] tools: osdmaptool document non-upmap options that were missing Signed-off-by: David Zafman --- doc/man/8/osdmaptool.rst | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/doc/man/8/osdmaptool.rst b/doc/man/8/osdmaptool.rst index 068338d31b3..fd65f7a3128 100644 --- a/doc/man/8/osdmaptool.rst +++ b/doc/man/8/osdmaptool.rst @@ -13,6 +13,8 @@ Synopsis | **osdmaptool** *mapfilename* [--print] [--createsimple *numosd* [--pgbits *bitsperosd* ] ] [--clobber] +| **osdmaptool** *mapfilename* [--import-crush *crushmap*] +| **osdmaptool** *mapfilename* [--export-crush *crushmap*] Description @@ -111,6 +113,10 @@ Options mark osds up and in (but do not persist). +.. option:: --mark-out + + mark an osd as out (but do not persist) + .. 
option:: --tree Displays a hierarchical tree of the map. @@ -119,6 +125,15 @@ Options clears pg_temp and primary_temp variables. +.. option:: --health + + dump health checks + +.. option:: --with-default-pool + + include default pool when creating map + + Example ======= @@ -130,19 +145,19 @@ To view the result:: osdmaptool --print osdmap -To view the mappings of placement groups for pool 0:: +To view the mappings of placement groups for pool 1:: - osdmaptool --test-map-pgs-dump rbd --pool 0 + osdmaptool osdmap --test-map-pgs-dump --pool 1 pool 0 pg_num 8 - 0.0 [0,2,1] 0 - 0.1 [2,0,1] 2 - 0.2 [0,1,2] 0 - 0.3 [2,0,1] 2 - 0.4 [0,2,1] 0 - 0.5 [0,2,1] 0 - 0.6 [0,1,2] 0 - 0.7 [1,0,2] 1 + 1.0 [0,2,1] 0 + 1.1 [2,0,1] 2 + 1.2 [0,1,2] 0 + 1.3 [2,0,1] 2 + 1.4 [0,2,1] 0 + 1.5 [0,2,1] 0 + 1.6 [0,1,2] 0 + 1.7 [1,0,2] 1 #osd count first primary c wt wt osd.0 8 5 5 1 1 osd.1 8 1 1 1 1 @@ -157,7 +172,7 @@ To view the mappings of placement groups for pool 0:: size 3 8 In which, - #. pool 0 has 8 placement groups. And two tables follow: + #. pool 1 has 8 placement groups. And two tables follow: #. A table for placement groups. Each row presents a placement group. With columns of: * placement group id, From 184e9d1ae3b5bcc332d5fe3330d46a5cb8fcacd6 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Tue, 17 Dec 2019 19:38:51 -0800 Subject: [PATCH 5/8] doc: Add upmap options to osdmaptool man page and give example Signed-off-by: David Zafman --- doc/man/8/osdmaptool.rst | 28 ++++++++++++++++++++++++ doc/rados/operations/upmap.rst | 39 ++++++++++++++++++---------------- 2 files changed, 49 insertions(+), 18 deletions(-) diff --git a/doc/man/8/osdmaptool.rst b/doc/man/8/osdmaptool.rst index fd65f7a3128..3ac3abb4369 100644 --- a/doc/man/8/osdmaptool.rst +++ b/doc/man/8/osdmaptool.rst @@ -15,6 +15,10 @@ Synopsis [--pgbits *bitsperosd* ] ] [--clobber] | **osdmaptool** *mapfilename* [--import-crush *crushmap*] | **osdmaptool** *mapfilename* [--export-crush *crushmap*] +| **osdmaptool** *mapfilename* [--upmap *file*] [--upmap-max *max-optimizations*] + [--upmap-deviation *max-deviation*] [--upmap-pool *poolname*] + [--upmap-save *file*] [--upmap-save *newosdmap*] +| **osdmaptool** *mapfilename* [--upmap-cleanup] [--upmap-save *newosdmap*] Description @@ -133,6 +137,30 @@ Options include default pool when creating map +.. option:: --upmap-cleanup + + clean up pg_upmap[_items] entries, writing commands to [default: - for stdout] + +.. option:: --upmap + + calculate pg upmap entries to balance pg layout writing commands to [default: - for stdout] + +.. option:: --upmap-max + + set max upmap entries to calculate [default: 10] + +.. option:: --upmap-deviation + + max deviation from target [default: 1] + +.. option:: --upmap-pool + + restrict upmap balancing to 1 pool or the option can be repeated for multiple pools + +.. option:: --upmap-save + + write modified OSDMap with upmap changes + Example ======= diff --git a/doc/rados/operations/upmap.rst b/doc/rados/operations/upmap.rst index 58f63226197..40072d9ed68 100644 --- a/doc/rados/operations/upmap.rst +++ b/doc/rados/operations/upmap.rst @@ -23,14 +23,12 @@ use with:: ceph features -A word of caution +Balancer module ----------------- -This is a new feature and not very user friendly. At the time of this -writing we are working on a new `balancer` module for ceph-mgr that -will eventually do all of this automatically. +The new `balancer` module for ceph-mgr will automatically balance +the number of PGs per OSD. 
See ``Balancer`` -Until then, Offline optimization -------------------- @@ -43,7 +41,8 @@ Upmap entries are updated with an offline optimizer built into ``osdmaptool``. #. Run the optimizer:: - osdmaptool om --upmap out.txt [--upmap-pool ] [--upmap-max ] [--upmap-deviation ] + osdmaptool om --upmap out.txt [--upmap-pool ] + [--upmap-max ] [--upmap-deviation ] It is highly recommended that optimization be done for each pool individually, or for sets of similarly-utilized pools. You can @@ -52,24 +51,28 @@ Upmap entries are updated with an offline optimizer built into ``osdmaptool``. kind of data (e.g., RBD image pools, yes; RGW index pool and RGW data pool, no). - The ``max-count`` value is the maximum number of upmap entries to - identify in the run. The default is 100, but you may want to make - this a smaller number so that the tool completes more quickly (but - does less work). If it cannot find any additional changes to make - it will stop early (i.e., when the pool distribution is perfect). + The ``max-optimizations`` value is the maximum number of upmap entries to + identify in the run. The default is `10` like the ceph-mgr balancer module, + but you should use a larger number if you are doing offline optimization. + If it cannot find any additional changes to make it will stop early + (i.e., when the pool distribution is perfect). - The ``max-deviation`` value defaults to `.01` (i.e., 1%). If an OSD - utilization varies from the average by less than this amount it - will be considered perfect. + The ``max-deviation`` value defaults to `1`. If an OSD PG count + varies from the computed target number by less than or equal + to this amount it will be considered perfect. -#. The proposed changes are written to the output file ``out.txt`` in - the example above. These are normal ceph CLI commands that can be - run to apply the changes to the cluster. This can be done with:: +#. Apply the changes:: source out.txt + The proposed changes are written to the output file ``out.txt`` in + the example above. These are normal ceph CLI commands that can be + run to apply the changes to the cluster. + + The above steps can be repeated as many times as necessary to achieve a perfect distribution of PGs for each set of pools. You can see some (gory) details about what the tool is doing by -passing ``--debug-osd 10`` to ``osdmaptool``. +passing ``--debug-osd 10`` and even more with ``--debug-crush 10`` +to ``osdmaptool``. 
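Patches 1 and 2 above restore the same per-pool budgeting loop in two places, the mgr balancer module (Python) and osdmaptool (C++). For readers skimming the diffs, the control flow they share can be reduced to the following Python sketch; ``calc_pg_upmaps`` is a stand-in for the real ``OSDMap.calc_pg_upmaps()`` binding and ``pg_stats`` is an assumed per-pool lookup, so this is illustrative rather than the code in the tree::

    import random

    def plan_upmaps(pools, pg_stats, calc_pg_upmaps, max_optimizations=10):
        # pools          -- pool names eligible for balancing
        # pg_stats       -- assumed lookup: pool -> (num_pg, num_pg_active_clean)
        # calc_pg_upmaps -- stand-in taking a budget and a single pool,
        #                   returning how many upmap changes were prepared
        random.shuffle(pools)             # equal (in)attention across runs
        total_did = 0
        left = max_optimizations
        for pool in pools:
            num_pg, num_pg_active_clean = pg_stats[pool]
            # Leave headroom for PGs that are not yet active+clean
            # (including scrubbing PGs), since remapping them is disruptive.
            available = left - (num_pg - num_pg_active_clean)
            did = calc_pg_upmaps(available, [pool])
            total_did += did
            left -= did
            if left <= 0:
                break
        print('prepared %d/%d changes' % (total_did, max_optimizations))
        return total_did

Handing each pool its own call against a shared budget is what lets a pool that is already balanced pass its unused budget on to the next pool in the shuffled order.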
From 621acf8ce7f48253e9d2189a9a2ee432fa1d3ba1 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 18 Dec 2019 11:27:02 -0800 Subject: [PATCH 6/8] osdmaptool: Add --upmap-active to simulate active upmap balancing Signed-off-by: David Zafman --- doc/man/8/osdmaptool.rst | 58 ++++++++++++++++- doc/rados/operations/upmap.rst | 7 ++ src/test/cli/osdmaptool/help.t | 1 + src/tools/osdmaptool.cc | 113 +++++++++++++++++++++++---------- 4 files changed, 144 insertions(+), 35 deletions(-) diff --git a/doc/man/8/osdmaptool.rst b/doc/man/8/osdmaptool.rst index 3ac3abb4369..cfe52f63f6d 100644 --- a/doc/man/8/osdmaptool.rst +++ b/doc/man/8/osdmaptool.rst @@ -17,7 +17,7 @@ Synopsis | **osdmaptool** *mapfilename* [--export-crush *crushmap*] | **osdmaptool** *mapfilename* [--upmap *file*] [--upmap-max *max-optimizations*] [--upmap-deviation *max-deviation*] [--upmap-pool *poolname*] - [--upmap-save *file*] [--upmap-save *newosdmap*] + [--upmap-save *file*] [--upmap-save *newosdmap*] [--upmap-active] | **osdmaptool** *mapfilename* [--upmap-cleanup] [--upmap-save *newosdmap*] @@ -27,6 +27,8 @@ Description **osdmaptool** is a utility that lets you create, view, and manipulate OSD cluster maps from the Ceph distributed storage system. Notably, it lets you extract the embedded CRUSH map or import a new CRUSH map. +It can also simulate the upmap balancer mode so you can get a sense of +what is needed to balance your PGs. Options @@ -161,6 +163,10 @@ Options write modified OSDMap with upmap changes +.. option:: --upmap-active + + Act like an active balancer, keep applying changes until balanced + Example ======= @@ -244,6 +250,56 @@ placement group distribution, whose standard deviation is 1.41421:: size 20 size 364 + To simulate the active balancer in upmap mode:: + + osdmaptool --upmap upmaps.out --upmap-active --upmap-deviation 6 --upmap-max 11 osdmap + + osdmaptool: osdmap file 'osdmap' + writing upmap command output to: upmaps.out + checking for upmap cleanups + upmap, max-count 11, max deviation 6 + pools movies photos metadata data + prepared 11/11 changes + Time elapsed 0.00310404 secs + pools movies photos metadata data + prepared 11/11 changes + Time elapsed 0.00283402 secs + pools data metadata movies photos + prepared 11/11 changes + Time elapsed 0.003122 secs + pools photos metadata data movies + prepared 11/11 changes + Time elapsed 0.00324372 secs + pools movies metadata data photos + prepared 1/11 changes + Time elapsed 0.00222609 secs + pools data movies photos metadata + prepared 0/11 changes + Time elapsed 0.00209916 secs + Unable to find further optimization, or distribution is already perfect + osd.0 pgs 41 + osd.1 pgs 42 + osd.2 pgs 42 + osd.3 pgs 41 + osd.4 pgs 46 + osd.5 pgs 39 + osd.6 pgs 39 + osd.7 pgs 43 + osd.8 pgs 41 + osd.9 pgs 46 + osd.10 pgs 46 + osd.11 pgs 46 + osd.12 pgs 46 + osd.13 pgs 41 + osd.14 pgs 40 + osd.15 pgs 40 + osd.16 pgs 39 + osd.17 pgs 46 + osd.18 pgs 46 + osd.19 pgs 39 + osd.20 pgs 42 + Total time elapsed 0.0167765 secs, 5 rounds + Availability ============ diff --git a/doc/rados/operations/upmap.rst b/doc/rados/operations/upmap.rst index 40072d9ed68..3fe65ea8d6f 100644 --- a/doc/rados/operations/upmap.rst +++ b/doc/rados/operations/upmap.rst @@ -43,6 +43,7 @@ Upmap entries are updated with an offline optimizer built into ``osdmaptool``. osdmaptool om --upmap out.txt [--upmap-pool ] [--upmap-max ] [--upmap-deviation ] + [--upmap-active] It is highly recommended that optimization be done for each pool individually, or for sets of similarly-utilized pools. 
You can @@ -61,6 +62,12 @@ Upmap entries are updated with an offline optimizer built into ``osdmaptool``. varies from the computed target number by less than or equal to this amount it will be considered perfect. + The ``--upmap-active`` option simulates the behavior of the active + balancer in upmap mode. It keeps cycling until the OSDs are balanced + and reports how many rounds and how long each round is taking. The + elapsed time for rounds indicates the CPU load ceph-mgr will be + consuming when it tries to compute the next optimization plan. + #. Apply the changes:: source out.txt diff --git a/src/test/cli/osdmaptool/help.t b/src/test/cli/osdmaptool/help.t index 807ec7c3f2c..09b1d785518 100644 --- a/src/test/cli/osdmaptool/help.t +++ b/src/test/cli/osdmaptool/help.t @@ -28,6 +28,7 @@ max deviation from target [default: 1] --upmap-pool restrict upmap balancing to 1 or more pools --upmap-save write modified OSDMap with upmap changes + --upmap-active Act like an active balancer, keep applying changes until balanced --dump displays the map in plain text when is 'plain', 'json' if specified format is not supported --tree displays a tree of the map --test-crush [--range-first --range-last ] map pgs to acting osds diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc index 4d0436860b6..b74c91468a6 100644 --- a/src/tools/osdmaptool.cc +++ b/src/tools/osdmaptool.cc @@ -19,6 +19,7 @@ #include "common/errno.h" #include "common/safe_io.h" #include "mon/health_check.h" +#include #include #include "global/global_init.h" @@ -56,6 +57,7 @@ void usage() cout << " max deviation from target [default: 1]" << std::endl; cout << " --upmap-pool restrict upmap balancing to 1 or more pools" << std::endl; cout << " --upmap-save write modified OSDMap with upmap changes" << std::endl; + cout << " --upmap-active Act like an active balancer, keep applying changes until balanced" << std::endl; cout << " --dump displays the map in plain text when is 'plain', 'json' if specified format is not supported" << std::endl; cout << " --tree displays a tree of the map" << std::endl; cout << " --test-crush [--range-first --range-last ] map pgs to acting osds" << std::endl; @@ -144,6 +146,7 @@ int main(int argc, const char **argv) std::string upmap_file = "-"; int upmap_max = 10; int upmap_deviation = 1; + bool upmap_active = false; std::set upmap_pools; int64_t pg_num = -1; bool test_map_pgs_dump_all = false; @@ -184,6 +187,8 @@ int main(int argc, const char **argv) exit(EXIT_FAILURE); } createsimple = true; + } else if (ceph_argparse_flag(args, i, "--upmap-active", (char*)NULL)) { + upmap_active = true; } else if (ceph_argparse_flag(args, i, "--health", (char*)NULL)) { health = true; } else if (ceph_argparse_flag(args, i, "--with-default-pool", (char*)NULL)) { @@ -379,9 +384,8 @@ int main(int argc, const char **argv) cout << "upmap, max-count " << upmap_max << ", max deviation " << upmap_deviation << std::endl; - OSDMap::Incremental pending_inc(osdmap.get_epoch()+1); - pending_inc.fsid = osdmap.get_fsid(); vector pools; + set upmap_pool_nums; for (auto& s : upmap_pools) { int64_t p = osdmap.lookup_pg_pool_name(s); if (p < 0) { @@ -389,6 +393,7 @@ int main(int argc, const char **argv) exit(1); } pools.push_back(p); + upmap_pool_nums.insert(p); } if (!pools.empty()) { cout << " limiting to pools " << upmap_pools << " (" << pools << ")" @@ -403,39 +408,79 @@ int main(int argc, const char **argv) cout << "No pools available" << std::endl; goto skip_upmap; } - std::random_device rd; - std::shuffle(pools.begin(), 
pools.end(), std::mt19937{rd()}); - cout << "pools "; - for (auto& i: pools) - cout << osdmap.get_pool_name(i) << " "; - cout << std::endl; - int total_did = 0; - int left = upmap_max; - for (auto& i: pools) { - set one_pool; - one_pool.insert(i); - int did = osdmap.calc_pg_upmaps( - g_ceph_context, upmap_deviation, - left, one_pool, - &pending_inc); - total_did += did; - left -= did; - if (left <= 0) - break; - } - cout << "prepared " << total_did << "/" << upmap_max << " changes" << std::endl; - if (total_did > 0) { - print_inc_upmaps(pending_inc, upmap_fd); - if (upmap_save) { - int r = osdmap.apply_incremental(pending_inc); - ceph_assert(r == 0); - modified = true; + int rounds = 0; + struct timespec round_start; + int r = clock_gettime(CLOCK_MONOTONIC, &round_start); + assert(r == 0); + do { + std::random_device rd; + std::shuffle(pools.begin(), pools.end(), std::mt19937{rd()}); + cout << "pools "; + for (auto& i: pools) + cout << osdmap.get_pool_name(i) << " "; + cout << std::endl; + OSDMap::Incremental pending_inc(osdmap.get_epoch()+1); + pending_inc.fsid = osdmap.get_fsid(); + int total_did = 0; + int left = upmap_max; + struct timespec begin, end; + r = clock_gettime(CLOCK_MONOTONIC, &begin); + assert(r == 0); + for (auto& i: pools) { + set one_pool; + one_pool.insert(i); + int did = osdmap.calc_pg_upmaps( + g_ceph_context, upmap_deviation, + left, one_pool, + &pending_inc); + total_did += did; + left -= did; + if (left <= 0) + break; } - } else { - cout << "Unable to find further optimization, " - << "or distribution is already perfect" - << std::endl; - } + r = clock_gettime(CLOCK_MONOTONIC, &end); + assert(r == 0); + cout << "prepared " << total_did << "/" << upmap_max << " changes" << std::endl; + float elapsed_time = (end.tv_sec - begin.tv_sec) + 1.0e-9*(end.tv_nsec - begin.tv_nsec); + if (upmap_active) + cout << "Time elapsed " << elapsed_time << " secs" << std::endl; + if (total_did > 0) { + print_inc_upmaps(pending_inc, upmap_fd); + if (upmap_save || upmap_active) { + int r = osdmap.apply_incremental(pending_inc); + ceph_assert(r == 0); + if (upmap_save) + modified = true; + } + } else { + cout << "Unable to find further optimization, " + << "or distribution is already perfect" + << std::endl; + if (upmap_active) { + map> pgs_by_osd; + for (auto& i : osdmap.get_pools()) { + if (!upmap_pool_nums.empty() && !upmap_pool_nums.count(i.first)) + continue; + for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) { + pg_t pg(ps, i.first); + vector up; + osdmap.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr); + //ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl; + for (auto osd : up) { + if (osd != CRUSH_ITEM_NONE) + pgs_by_osd[osd].insert(pg); + } + } + } + for (auto& i : pgs_by_osd) + cout << "osd." 
<< i.first << " pgs " << i.second.size() << std::endl; + float elapsed_time = (end.tv_sec - round_start.tv_sec) + 1.0e-9*(end.tv_nsec - round_start.tv_nsec); + cout << "Total time elapsed " << elapsed_time << " secs, " << rounds << " rounds" << std::endl; + } + break; + } + ++rounds; + } while(upmap_active); } skip_upmap: if (upmap_file != "-") { From b0a1b758d012dfea40db3feca1a841c96f79defe Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 13 Dec 2019 17:43:44 -0800 Subject: [PATCH 7/8] mgr: Change default upmap_max_deviation to 5 Fixes: https://tracker.ceph.com/issues/43312 Signed-off-by: David Zafman --- doc/man/8/osdmaptool.rst | 2 +- doc/rados/operations/upmap.rst | 2 +- qa/standalone/mgr/balancer.sh | 1 + src/pybind/mgr/balancer/module.py | 2 +- src/test/cli/osdmaptool/help.t | 2 +- src/test/cli/osdmaptool/upmap-out.t | 2 +- src/test/cli/osdmaptool/upmap.t | 2 +- src/tools/osdmaptool.cc | 4 ++-- 8 files changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/man/8/osdmaptool.rst b/doc/man/8/osdmaptool.rst index cfe52f63f6d..81a962a0050 100644 --- a/doc/man/8/osdmaptool.rst +++ b/doc/man/8/osdmaptool.rst @@ -153,7 +153,7 @@ Options .. option:: --upmap-deviation - max deviation from target [default: 1] + max deviation from target [default: 5] .. option:: --upmap-pool diff --git a/doc/rados/operations/upmap.rst b/doc/rados/operations/upmap.rst index 3fe65ea8d6f..8bad0d95928 100644 --- a/doc/rados/operations/upmap.rst +++ b/doc/rados/operations/upmap.rst @@ -58,7 +58,7 @@ Upmap entries are updated with an offline optimizer built into ``osdmaptool``. If it cannot find any additional changes to make it will stop early (i.e., when the pool distribution is perfect). - The ``max-deviation`` value defaults to `1`. If an OSD PG count + The ``max-deviation`` value defaults to `5`. If an OSD PG count varies from the computed target number by less than or equal to this amount it will be considered perfect. 
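The ``--upmap-active`` mode added in patch 6 wraps the per-pool pass sketched earlier in an outer loop: plan up to ``upmap-max`` changes, apply them, and repeat until a round prepares nothing. A rough Python sketch of that loop, with ``plan_round`` and ``apply_plan`` standing in for the osdmaptool internals (an approximation, not the C++ code itself)::

    import random
    import time

    def simulate_active_balancer(pools, plan_round, apply_plan, upmap_max=10):
        # plan_round -- stand-in for one per-pool planning pass; returns
        #               (number of changes prepared, the pending increment)
        # apply_plan -- stand-in for OSDMap.apply_incremental()
        rounds = 0
        start = time.monotonic()
        while True:
            random.shuffle(pools)
            begin = time.monotonic()
            did, pending = plan_round(pools, upmap_max)
            print('prepared %d/%d changes' % (did, upmap_max))
            print('Time elapsed %g secs' % (time.monotonic() - begin))
            if did == 0:
                break   # balanced, or no further optimization possible
            apply_plan(pending)
            rounds += 1
        print('Total time elapsed %g secs, %d rounds'
              % (time.monotonic() - start, rounds))

With the larger default deviation of 5 introduced by this patch, more OSDs already count as perfect, so such a loop should settle in fewer rounds and each planning pass should cost ceph-mgr less CPU.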
diff --git a/qa/standalone/mgr/balancer.sh b/qa/standalone/mgr/balancer.sh index ce38dd13754..984b6504ba0 100755 --- a/qa/standalone/mgr/balancer.sh +++ b/qa/standalone/mgr/balancer.sh @@ -141,6 +141,7 @@ function TEST_balancer2() { done ceph osd set-require-min-compat-client luminous + ceph config set mgr mgr/balancer/upmap_max_deviation 1 ceph balancer mode upmap || return 1 ceph balancer on || return 1 ceph config set mgr mgr/balancer/sleep_interval 5 diff --git a/src/pybind/mgr/balancer/module.py b/src/pybind/mgr/balancer/module.py index 2290aa0fe04..bbee82ee6a4 100644 --- a/src/pybind/mgr/balancer/module.py +++ b/src/pybind/mgr/balancer/module.py @@ -313,7 +313,7 @@ class Module(MgrModule): { 'name': 'upmap_max_deviation', 'type': 'int', - 'default': 1, + 'default': 5, 'min': 1, 'desc': 'deviation below which no optimization is attempted', 'long_desc': 'If the number of PGs are within this count then no optimization is attempted', diff --git a/src/test/cli/osdmaptool/help.t b/src/test/cli/osdmaptool/help.t index 09b1d785518..136d3b05e89 100644 --- a/src/test/cli/osdmaptool/help.t +++ b/src/test/cli/osdmaptool/help.t @@ -25,7 +25,7 @@ writing commands to [default: - for stdout] --upmap-max set max upmap entries to calculate [default: 10] --upmap-deviation - max deviation from target [default: 1] + max deviation from target [default: 5] --upmap-pool restrict upmap balancing to 1 or more pools --upmap-save write modified OSDMap with upmap changes --upmap-active Act like an active balancer, keep applying changes until balanced diff --git a/src/test/cli/osdmaptool/upmap-out.t b/src/test/cli/osdmaptool/upmap-out.t index 8a167aba09a..02b13ec561b 100644 --- a/src/test/cli/osdmaptool/upmap-out.t +++ b/src/test/cli/osdmaptool/upmap-out.t @@ -7,7 +7,7 @@ marking OSD@147 as out writing upmap command output to: c checking for upmap cleanups - upmap, max-count 11, max deviation 1 + upmap, max-count 11, max deviation 5 pools rbd prepared 11/11 changes $ cat c diff --git a/src/test/cli/osdmaptool/upmap.t b/src/test/cli/osdmaptool/upmap.t index 8fa1a610313..23a5d5d32d4 100644 --- a/src/test/cli/osdmaptool/upmap.t +++ b/src/test/cli/osdmaptool/upmap.t @@ -6,7 +6,7 @@ marking all OSDs up and in writing upmap command output to: c checking for upmap cleanups - upmap, max-count 11, max deviation 1 + upmap, max-count 11, max deviation 5 pools rbd prepared 11/11 changes $ cat c diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc index b74c91468a6..fbbcddafc6a 100644 --- a/src/tools/osdmaptool.cc +++ b/src/tools/osdmaptool.cc @@ -54,7 +54,7 @@ void usage() cout << " writing commands to [default: - for stdout]" << std::endl; cout << " --upmap-max set max upmap entries to calculate [default: 10]" << std::endl; cout << " --upmap-deviation " << std::endl; - cout << " max deviation from target [default: 1]" << std::endl; + cout << " max deviation from target [default: 5]" << std::endl; cout << " --upmap-pool restrict upmap balancing to 1 or more pools" << std::endl; cout << " --upmap-save write modified OSDMap with upmap changes" << std::endl; cout << " --upmap-active Act like an active balancer, keep applying changes until balanced" << std::endl; @@ -145,7 +145,7 @@ int main(int argc, const char **argv) bool health = false; std::string upmap_file = "-"; int upmap_max = 10; - int upmap_deviation = 1; + int upmap_deviation = 5; bool upmap_active = false; std::set upmap_pools; int64_t pg_num = -1; From c65d5c8d1421438da999698f1f734d6a61adb3bb Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 20 Dec 
2019 13:46:34 -0800
Subject: [PATCH 8/8] test: Sort pool list because the order isn't guaranteed
 from "balancer pool ls"

Signed-off-by: David Zafman
---
 qa/standalone/mgr/balancer.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qa/standalone/mgr/balancer.sh b/qa/standalone/mgr/balancer.sh
index 984b6504ba0..7e87cbf4a5f 100755
--- a/qa/standalone/mgr/balancer.sh
+++ b/qa/standalone/mgr/balancer.sh
@@ -67,9 +67,9 @@ function TEST_balancer() {
     ceph balancer pool add $TEST_POOL1 || return 1
     ceph balancer pool add $TEST_POOL2 || return 1
     ceph balancer pool ls || return 1
-    eval POOL=$(ceph balancer pool ls | jq '.[0]')
+    eval POOL=$(ceph balancer pool ls | jq 'sort | .[0]')
     test "$POOL" = "$TEST_POOL1" || return 1
-    eval POOL=$(ceph balancer pool ls | jq '.[1]')
+    eval POOL=$(ceph balancer pool ls | jq 'sort | .[1]')
     test "$POOL" = "$TEST_POOL2" || return 1
     ceph balancer pool rm $TEST_POOL1 || return 1
     ceph balancer pool rm $TEST_POOL2 || return 1
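Patch 8 addresses the fact that ``ceph balancer pool ls`` prints its JSON list in no guaranteed order, so the test now sorts with ``jq 'sort'`` before indexing. The same order-insensitive check, written as a small Python helper for illustration (the helper name is made up here, and the CLI call assumes a running cluster)::

    import json
    import subprocess

    def balancer_pools_sorted():
        # 'ceph balancer pool ls' emits a JSON list whose order is not
        # guaranteed; sort it before comparing against the expected pools.
        out = subprocess.check_output(['ceph', 'balancer', 'pool', 'ls'])
        return sorted(json.loads(out))

    # Usage in a test, mirroring the shell check above:
    # assert balancer_pools_sorted() == sorted([TEST_POOL1, TEST_POOL2])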