From 621acf8ce7f48253e9d2189a9a2ee432fa1d3ba1 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 18 Dec 2019 11:27:02 -0800 Subject: [PATCH] osdmaptool: Add --upmap-active to simulate active upmap balancing Signed-off-by: David Zafman --- doc/man/8/osdmaptool.rst | 58 ++++++++++++++++- doc/rados/operations/upmap.rst | 7 ++ src/test/cli/osdmaptool/help.t | 1 + src/tools/osdmaptool.cc | 113 +++++++++++++++++++++++---------- 4 files changed, 144 insertions(+), 35 deletions(-) diff --git a/doc/man/8/osdmaptool.rst b/doc/man/8/osdmaptool.rst index 3ac3abb4369..cfe52f63f6d 100644 --- a/doc/man/8/osdmaptool.rst +++ b/doc/man/8/osdmaptool.rst @@ -17,7 +17,7 @@ Synopsis | **osdmaptool** *mapfilename* [--export-crush *crushmap*] | **osdmaptool** *mapfilename* [--upmap *file*] [--upmap-max *max-optimizations*] [--upmap-deviation *max-deviation*] [--upmap-pool *poolname*] - [--upmap-save *file*] [--upmap-save *newosdmap*] + [--upmap-save *file*] [--upmap-save *newosdmap*] [--upmap-active] | **osdmaptool** *mapfilename* [--upmap-cleanup] [--upmap-save *newosdmap*] @@ -27,6 +27,8 @@ Description **osdmaptool** is a utility that lets you create, view, and manipulate OSD cluster maps from the Ceph distributed storage system. Notably, it lets you extract the embedded CRUSH map or import a new CRUSH map. +It can also simulate the upmap balancer mode so you can get a sense of +what is needed to balance your PGs. Options @@ -161,6 +163,10 @@ Options write modified OSDMap with upmap changes +.. option:: --upmap-active + + Act like an active balancer, keep applying changes until balanced + Example ======= @@ -244,6 +250,56 @@ placement group distribution, whose standard deviation is 1.41421:: size 20 size 364 + To simulate the active balancer in upmap mode:: + + osdmaptool --upmap upmaps.out --upmap-active --upmap-deviation 6 --upmap-max 11 osdmap + + osdmaptool: osdmap file 'osdmap' + writing upmap command output to: upmaps.out + checking for upmap cleanups + upmap, max-count 11, max deviation 6 + pools movies photos metadata data + prepared 11/11 changes + Time elapsed 0.00310404 secs + pools movies photos metadata data + prepared 11/11 changes + Time elapsed 0.00283402 secs + pools data metadata movies photos + prepared 11/11 changes + Time elapsed 0.003122 secs + pools photos metadata data movies + prepared 11/11 changes + Time elapsed 0.00324372 secs + pools movies metadata data photos + prepared 1/11 changes + Time elapsed 0.00222609 secs + pools data movies photos metadata + prepared 0/11 changes + Time elapsed 0.00209916 secs + Unable to find further optimization, or distribution is already perfect + osd.0 pgs 41 + osd.1 pgs 42 + osd.2 pgs 42 + osd.3 pgs 41 + osd.4 pgs 46 + osd.5 pgs 39 + osd.6 pgs 39 + osd.7 pgs 43 + osd.8 pgs 41 + osd.9 pgs 46 + osd.10 pgs 46 + osd.11 pgs 46 + osd.12 pgs 46 + osd.13 pgs 41 + osd.14 pgs 40 + osd.15 pgs 40 + osd.16 pgs 39 + osd.17 pgs 46 + osd.18 pgs 46 + osd.19 pgs 39 + osd.20 pgs 42 + Total time elapsed 0.0167765 secs, 5 rounds + Availability ============ diff --git a/doc/rados/operations/upmap.rst b/doc/rados/operations/upmap.rst index 40072d9ed68..3fe65ea8d6f 100644 --- a/doc/rados/operations/upmap.rst +++ b/doc/rados/operations/upmap.rst @@ -43,6 +43,7 @@ Upmap entries are updated with an offline optimizer built into ``osdmaptool``. osdmaptool om --upmap out.txt [--upmap-pool ] [--upmap-max ] [--upmap-deviation ] + [--upmap-active] It is highly recommended that optimization be done for each pool individually, or for sets of similarly-utilized pools. You can @@ -61,6 +62,12 @@ Upmap entries are updated with an offline optimizer built into ``osdmaptool``. varies from the computed target number by less than or equal to this amount it will be considered perfect. + The ``--upmap-active`` option simulates the behavior of the active + balancer in upmap mode. It keeps cycling until the OSDs are balanced + and reports how many rounds and how long each round is taking. The + elapsed time for rounds indicates the CPU load ceph-mgr will be + consuming when it tries to compute the next optimization plan. + #. Apply the changes:: source out.txt diff --git a/src/test/cli/osdmaptool/help.t b/src/test/cli/osdmaptool/help.t index 807ec7c3f2c..09b1d785518 100644 --- a/src/test/cli/osdmaptool/help.t +++ b/src/test/cli/osdmaptool/help.t @@ -28,6 +28,7 @@ max deviation from target [default: 1] --upmap-pool restrict upmap balancing to 1 or more pools --upmap-save write modified OSDMap with upmap changes + --upmap-active Act like an active balancer, keep applying changes until balanced --dump displays the map in plain text when is 'plain', 'json' if specified format is not supported --tree displays a tree of the map --test-crush [--range-first --range-last ] map pgs to acting osds diff --git a/src/tools/osdmaptool.cc b/src/tools/osdmaptool.cc index 4d0436860b6..b74c91468a6 100644 --- a/src/tools/osdmaptool.cc +++ b/src/tools/osdmaptool.cc @@ -19,6 +19,7 @@ #include "common/errno.h" #include "common/safe_io.h" #include "mon/health_check.h" +#include #include #include "global/global_init.h" @@ -56,6 +57,7 @@ void usage() cout << " max deviation from target [default: 1]" << std::endl; cout << " --upmap-pool restrict upmap balancing to 1 or more pools" << std::endl; cout << " --upmap-save write modified OSDMap with upmap changes" << std::endl; + cout << " --upmap-active Act like an active balancer, keep applying changes until balanced" << std::endl; cout << " --dump displays the map in plain text when is 'plain', 'json' if specified format is not supported" << std::endl; cout << " --tree displays a tree of the map" << std::endl; cout << " --test-crush [--range-first --range-last ] map pgs to acting osds" << std::endl; @@ -144,6 +146,7 @@ int main(int argc, const char **argv) std::string upmap_file = "-"; int upmap_max = 10; int upmap_deviation = 1; + bool upmap_active = false; std::set upmap_pools; int64_t pg_num = -1; bool test_map_pgs_dump_all = false; @@ -184,6 +187,8 @@ int main(int argc, const char **argv) exit(EXIT_FAILURE); } createsimple = true; + } else if (ceph_argparse_flag(args, i, "--upmap-active", (char*)NULL)) { + upmap_active = true; } else if (ceph_argparse_flag(args, i, "--health", (char*)NULL)) { health = true; } else if (ceph_argparse_flag(args, i, "--with-default-pool", (char*)NULL)) { @@ -379,9 +384,8 @@ int main(int argc, const char **argv) cout << "upmap, max-count " << upmap_max << ", max deviation " << upmap_deviation << std::endl; - OSDMap::Incremental pending_inc(osdmap.get_epoch()+1); - pending_inc.fsid = osdmap.get_fsid(); vector pools; + set upmap_pool_nums; for (auto& s : upmap_pools) { int64_t p = osdmap.lookup_pg_pool_name(s); if (p < 0) { @@ -389,6 +393,7 @@ int main(int argc, const char **argv) exit(1); } pools.push_back(p); + upmap_pool_nums.insert(p); } if (!pools.empty()) { cout << " limiting to pools " << upmap_pools << " (" << pools << ")" @@ -403,39 +408,79 @@ int main(int argc, const char **argv) cout << "No pools available" << std::endl; goto skip_upmap; } - std::random_device rd; - std::shuffle(pools.begin(), pools.end(), std::mt19937{rd()}); - cout << "pools "; - for (auto& i: pools) - cout << osdmap.get_pool_name(i) << " "; - cout << std::endl; - int total_did = 0; - int left = upmap_max; - for (auto& i: pools) { - set one_pool; - one_pool.insert(i); - int did = osdmap.calc_pg_upmaps( - g_ceph_context, upmap_deviation, - left, one_pool, - &pending_inc); - total_did += did; - left -= did; - if (left <= 0) - break; - } - cout << "prepared " << total_did << "/" << upmap_max << " changes" << std::endl; - if (total_did > 0) { - print_inc_upmaps(pending_inc, upmap_fd); - if (upmap_save) { - int r = osdmap.apply_incremental(pending_inc); - ceph_assert(r == 0); - modified = true; + int rounds = 0; + struct timespec round_start; + int r = clock_gettime(CLOCK_MONOTONIC, &round_start); + assert(r == 0); + do { + std::random_device rd; + std::shuffle(pools.begin(), pools.end(), std::mt19937{rd()}); + cout << "pools "; + for (auto& i: pools) + cout << osdmap.get_pool_name(i) << " "; + cout << std::endl; + OSDMap::Incremental pending_inc(osdmap.get_epoch()+1); + pending_inc.fsid = osdmap.get_fsid(); + int total_did = 0; + int left = upmap_max; + struct timespec begin, end; + r = clock_gettime(CLOCK_MONOTONIC, &begin); + assert(r == 0); + for (auto& i: pools) { + set one_pool; + one_pool.insert(i); + int did = osdmap.calc_pg_upmaps( + g_ceph_context, upmap_deviation, + left, one_pool, + &pending_inc); + total_did += did; + left -= did; + if (left <= 0) + break; } - } else { - cout << "Unable to find further optimization, " - << "or distribution is already perfect" - << std::endl; - } + r = clock_gettime(CLOCK_MONOTONIC, &end); + assert(r == 0); + cout << "prepared " << total_did << "/" << upmap_max << " changes" << std::endl; + float elapsed_time = (end.tv_sec - begin.tv_sec) + 1.0e-9*(end.tv_nsec - begin.tv_nsec); + if (upmap_active) + cout << "Time elapsed " << elapsed_time << " secs" << std::endl; + if (total_did > 0) { + print_inc_upmaps(pending_inc, upmap_fd); + if (upmap_save || upmap_active) { + int r = osdmap.apply_incremental(pending_inc); + ceph_assert(r == 0); + if (upmap_save) + modified = true; + } + } else { + cout << "Unable to find further optimization, " + << "or distribution is already perfect" + << std::endl; + if (upmap_active) { + map> pgs_by_osd; + for (auto& i : osdmap.get_pools()) { + if (!upmap_pool_nums.empty() && !upmap_pool_nums.count(i.first)) + continue; + for (unsigned ps = 0; ps < i.second.get_pg_num(); ++ps) { + pg_t pg(ps, i.first); + vector up; + osdmap.pg_to_up_acting_osds(pg, &up, nullptr, nullptr, nullptr); + //ldout(cct, 20) << __func__ << " " << pg << " up " << up << dendl; + for (auto osd : up) { + if (osd != CRUSH_ITEM_NONE) + pgs_by_osd[osd].insert(pg); + } + } + } + for (auto& i : pgs_by_osd) + cout << "osd." << i.first << " pgs " << i.second.size() << std::endl; + float elapsed_time = (end.tv_sec - round_start.tv_sec) + 1.0e-9*(end.tv_nsec - round_start.tv_nsec); + cout << "Total time elapsed " << elapsed_time << " secs, " << rounds << " rounds" << std::endl; + } + break; + } + ++rounds; + } while(upmap_active); } skip_upmap: if (upmap_file != "-") {