From 930eae214c54f5e0e790c09f35da14dff8585ac1 Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Mon, 13 Feb 2017 17:42:33 -0800 Subject: [PATCH] OSDMonitor: get stripe_width via stripe_unit in ec profile With bluestore, making the smallest write match min_alloc_size avoids write amplification. With EC pools this is the stripe unit, or stripe_width / num_data_chunks. Rather than requiring people to divide by k to get the smallest ec write, allow it to be specified directly via stripe_unit. Store it in the ec profile so changing a monitor config option isn't necessary to set it. This is particularly important for ec overwrites since they allow random i/o which should match bluestore's checksum granularity (aka min_alloc_size). Signed-off-by: Josh Durgin --- PendingReleaseNotes | 7 ++- .../configuration/pool-pg-config-ref.rst | 14 +++-- doc/rados/operations/erasure-code-profile.rst | 26 ++++++-- src/common/config.h | 2 - src/common/config_opts.h | 2 +- src/mon/OSDMonitor.cc | 62 +++++++++++++++---- src/mon/OSDMonitor.h | 1 + src/test/erasure-code/TestErasureCodeLrc.cc | 8 +-- src/test/erasure-code/test-erasure-code.sh | 9 ++- src/test/mon/osd-pool-create.sh | 20 +++--- 10 files changed, 109 insertions(+), 42 deletions(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 3a27fdd7895..e2b043034d2 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -78,7 +78,6 @@ still works. See the documentation page "Mount CephFS in your file systems table" for details. - 12.0.1 ------ @@ -106,3 +105,9 @@ * The RGW api for getting object torrent has changed its params from 'get_torrent' to 'torrent' so that it can be compatible with Amazon S3. Now the request for object torrent is like 'GET /ObjectName?torrent'. + +* The configuration option "osd pool erasure code stripe width" has + been replaced by "osd pool erasure code stripe unit", and given the + ability to be overridden by the erasure code profile setting + "stripe_unit". 
For more details see "Erasure Code Profiles" in the + documentation. diff --git a/doc/rados/configuration/pool-pg-config-ref.rst b/doc/rados/configuration/pool-pg-config-ref.rst index 9f8414128b0..9cb2c8f54ff 100644 --- a/doc/rados/configuration/pool-pg-config-ref.rst +++ b/doc/rados/configuration/pool-pg-config-ref.rst @@ -85,14 +85,18 @@ Ceph configuration file. make pool creation work in the absence of ruleset 0. -``osd pool erasure code stripe width`` +``osd pool erasure code stripe unit`` -:Description: Sets the desired size, in bytes, of an object stripe on every - erasure coded pools. Every object if size S will be stored as - N stripes and each stripe will be encoded/decoded individually. +:Description: Sets the default size, in bytes, of a chunk of an object + stripe for erasure coded pools. Every object of size S + will be stored as N stripes, with each data chunk + receiving ``stripe unit`` bytes. Each stripe of ``N * + stripe unit`` bytes will be encoded/decoded + individually. This option can be overridden by the + ``stripe_unit`` setting in an erasure code profile. :Type: Unsigned 32-bit Integer -:Default: ``4096`` +:Default: ``4096`` ``osd pool default size`` diff --git a/doc/rados/operations/erasure-code-profile.rst b/doc/rados/operations/erasure-code-profile.rst index 3262b3db8dc..ddf772d36ca 100644 --- a/doc/rados/operations/erasure-code-profile.rst +++ b/doc/rados/operations/erasure-code-profile.rst @@ -39,6 +39,7 @@ To create a new erasure code profile:: ceph osd erasure-code-profile set {name} \ [{directory=directory}] \ [{plugin=plugin}] \ + [{stripe_unit=stripe_unit}] \ [{key=value} ...] \ [--force] @@ -60,23 +61,40 @@ Where: plugins`_ for more information. :Type: String -:Required: No. +:Required: No. :Default: jerasure +``{stripe_unit=stripe_unit}`` + +:Description: The amount of data in a data chunk, per stripe. 
For + example, a profile with 2 data chunks and stripe_unit=4K + would put the range 0-4K in chunk 0, 4K-8K in chunk 1, + then 8K-12K in chunk 0 again. This should be a multiple + of 4K for best performance. The default value is taken + from the monitor config option + ``osd_pool_erasure_code_stripe_unit`` when a pool is + created. The stripe_width of a pool using this profile + will be the number of data chunks multiplied by this + stripe_unit. + +:Type: String +:Required: No. + ``{key=value}`` :Description: The semantic of the remaining key/value pairs is defined by the erasure code plugin. :Type: String -:Required: No. +:Required: No. ``--force`` -:Description: Override an existing profile by the same name. +:Description: Override an existing profile by the same name, and allow + setting a non-4K-aligned stripe_unit. :Type: String -:Required: No. +:Required: No. osd erasure-code-profile rm ============================ diff --git a/src/common/config.h b/src/common/config.h index fe5343cdce0..bbabc14a887 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -38,8 +38,6 @@ enum { #define OSD_REP_SPLAY 1 #define OSD_REP_CHAIN 2 -#define OSD_POOL_ERASURE_CODE_STRIPE_WIDTH 4096 - class CephContext; extern const char *CEPH_CONF_FILE_DEFAULT; diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 2c8f947795b..836af93ebad 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -675,7 +675,7 @@ OPTION(osd_crush_update_on_start, OPT_BOOL, true) OPTION(osd_crush_initial_weight, OPT_DOUBLE, -1) // if >=0, the initial weight is for newly added osds. 
OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET) -OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes +OPTION(osd_pool_erasure_code_stripe_unit, OPT_U32, 4096) // in bytes OPTION(osd_pool_default_size, OPT_INT, 3) OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; ceph will use size-size/2 OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 7a5eddb1832..07a99f23724 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -4893,8 +4893,9 @@ void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& prof } } -int OSDMonitor::normalize_profile(const string& profilename, - ErasureCodeProfile &profile, +int OSDMonitor::normalize_profile(const string& profilename, + ErasureCodeProfile &profile, + bool force, ostream *ss) { ErasureCodeInterfaceRef erasure_code; @@ -4904,10 +4905,39 @@ int OSDMonitor::normalize_profile(const string& profilename, int err = instance.factory(plugin->second, g_conf->get_val("erasure_code_dir"), profile, &erasure_code, ss); - if (err) + if (err) { return err; + } - return erasure_code->init(profile, ss); + err = erasure_code->init(profile, ss); + if (err) { + return err; + } + + auto it = profile.find("stripe_unit"); + if (it != profile.end()) { + string err_str; + uint32_t stripe_unit = strict_si_cast(it->second.c_str(), &err_str); + if (!err_str.empty()) { + *ss << "could not parse stripe_unit '" << it->second + << "': " << err_str << std::endl; + return -EINVAL; + } + uint32_t data_chunks = erasure_code->get_data_chunk_count(); + uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks); + if (chunk_size != stripe_unit) { + *ss << 
"stripe_unit " << stripe_unit << " does not match ec profile " + << "alignment. Would be padded to " << chunk_size + << std::endl; + return -EINVAL; + } + if ((stripe_unit % 4096) != 0 && !force) { + *ss << "stripe_unit should be a multiple of 4096 bytes for best performance. " + << "use --force to override this check" << std::endl; + return -EINVAL; + } + } + return 0; } int OSDMonitor::crush_ruleset_create_erasure(const string &name, @@ -5130,12 +5160,22 @@ int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type, break; case pg_pool_t::TYPE_ERASURE: { + ErasureCodeProfile profile = + osdmap.get_erasure_code_profile(erasure_code_profile); ErasureCodeInterfaceRef erasure_code; err = get_erasure_code(erasure_code_profile, &erasure_code, ss); - uint32_t desired_stripe_width = g_conf->osd_pool_erasure_code_stripe_width; - if (err == 0) - *stripe_width = erasure_code->get_data_chunk_count() * - erasure_code->get_chunk_size(desired_stripe_width); + if (err) + break; + uint32_t data_chunks = erasure_code->get_data_chunk_count(); + uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit; + auto it = profile.find("stripe_unit"); + if (it != profile.end()) { + string err_str; + stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str); + assert(err_str.empty()); + } + *stripe_width = data_chunks * + erasure_code->get_chunk_size(stripe_unit * data_chunks); } break; default: @@ -6831,14 +6871,14 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, if (err) goto reply; } - err = normalize_profile(name, profile_map, &ss); + err = normalize_profile(name, profile_map, force, &ss); if (err) goto reply; if (osdmap.has_erasure_code_profile(name)) { ErasureCodeProfile existing_profile_map = osdmap.get_erasure_code_profile(name); - err = normalize_profile(name, existing_profile_map, &ss); + err = normalize_profile(name, existing_profile_map, force, &ss); if (err) goto reply; @@ -6892,7 +6932,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, 
&ss); if (err) goto reply; - err = normalize_profile(name, profile_map, &ss); + err = normalize_profile(name, profile_map, true, &ss); if (err) goto reply; dout(20) << "erasure code profile set " << profile << "=" diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 32d08f78d3e..3ccbeecec76 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -297,6 +297,7 @@ private: const string& profile) const; int normalize_profile(const string& profilename, ErasureCodeProfile &profile, + bool force, ostream *ss); int crush_ruleset_create_erasure(const string &name, const string &profile, diff --git a/src/test/erasure-code/TestErasureCodeLrc.cc b/src/test/erasure-code/TestErasureCodeLrc.cc index 52fce2e6b42..50543945e3a 100644 --- a/src/test/erasure-code/TestErasureCodeLrc.cc +++ b/src/test/erasure-code/TestErasureCodeLrc.cc @@ -615,8 +615,8 @@ TEST(ErasureCodeLrc, encode_decode) profile["layers"] = description_string; EXPECT_EQ(0, lrc.init(profile, &cerr)); EXPECT_EQ(4U, lrc.get_data_chunk_count()); - unsigned int stripe_width = g_conf->osd_pool_erasure_code_stripe_width; - unsigned int chunk_size = stripe_width / lrc.get_data_chunk_count(); + unsigned int chunk_size = g_conf->osd_pool_erasure_code_stripe_unit; + unsigned int stripe_width = lrc.get_data_chunk_count() * chunk_size; EXPECT_EQ(chunk_size, lrc.get_chunk_size(stripe_width)); set want_to_encode; map encoded; @@ -745,8 +745,8 @@ TEST(ErasureCodeLrc, encode_decode_2) profile["layers"] = description_string; EXPECT_EQ(0, lrc.init(profile, &cerr)); EXPECT_EQ(4U, lrc.get_data_chunk_count()); - unsigned int stripe_width = g_conf->osd_pool_erasure_code_stripe_width; - unsigned int chunk_size = stripe_width / lrc.get_data_chunk_count(); + unsigned int chunk_size = g_conf->osd_pool_erasure_code_stripe_unit; + unsigned int stripe_width = lrc.get_data_chunk_count() * chunk_size; EXPECT_EQ(chunk_size, lrc.get_chunk_size(stripe_width)); set want_to_encode; map encoded; diff --git 
a/src/test/erasure-code/test-erasure-code.sh b/src/test/erasure-code/test-erasure-code.sh index b4417f664f6..cb1123febe0 100755 --- a/src/test/erasure-code/test-erasure-code.sh +++ b/src/test/erasure-code/test-erasure-code.sh @@ -259,8 +259,9 @@ function TEST_alignment_constraints() { # imposed by the stripe width # See http://tracker.ceph.com/issues/8622 # - local stripe_width=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_width) - local block_size=$((stripe_width - 1)) + local stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit) + eval local $(ceph osd erasure-code-profile get myprofile | grep k=) + local block_size=$((stripe_unit * k - 1)) dd if=/dev/zero of=$dir/ORIGINAL bs=$block_size count=2 rados --block-size=$block_size \ --pool ecpool put UNALIGNED $dir/ORIGINAL || return 1 @@ -268,9 +269,7 @@ function TEST_alignment_constraints() { } function chunk_size() { - local stripe_width=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_width) - eval local $(ceph osd erasure-code-profile get default | grep k=) - echo $(($stripe_width / $k)) + echo $(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit) } # diff --git a/src/test/mon/osd-pool-create.sh b/src/test/mon/osd-pool-create.sh index 3deef6fc10d..3fd4feef8df 100755 --- a/src/test/mon/osd-pool-create.sh +++ b/src/test/mon/osd-pool-create.sh @@ -122,30 +122,32 @@ function TEST_erasure_code_profile_default() { ceph osd erasure-code-profile ls | grep default || return 1 } -function TEST_erasure_crush_stripe_width() { +function TEST_erasure_crush_stripe_unit() { local dir=$1 - # the default stripe width is used to initialize the pool + # the default stripe unit is used to initialize the pool run_mon $dir a --public-addr $CEPH_MON - stripe_width=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_width) + stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit) + eval local $(ceph osd erasure-code-profile get myprofile | 
grep k=) + stripe_width=$((stripe_unit * k)) ceph osd pool create pool_erasure 12 12 erasure ceph --format json osd dump | tee $dir/osd.json grep '"stripe_width":'$stripe_width $dir/osd.json > /dev/null || return 1 } -function TEST_erasure_crush_stripe_width_padded() { +function TEST_erasure_crush_stripe_unit_padded() { local dir=$1 - # setting osd_pool_erasure_code_stripe_width modifies the stripe_width + # setting osd_pool_erasure_code_stripe_unit modifies the stripe_width # and it is padded as required by the default plugin profile+=" plugin=jerasure" profile+=" technique=reed_sol_van" k=4 profile+=" k=$k" profile+=" m=2" - expected_chunk_size=2048 - actual_stripe_width=$(($expected_chunk_size * $k)) - desired_stripe_width=$(($actual_stripe_width - 1)) + actual_stripe_unit=2048 + desired_stripe_unit=$((actual_stripe_unit - 1)) + actual_stripe_width=$((actual_stripe_unit * k)) run_mon $dir a \ - --osd_pool_erasure_code_stripe_width $desired_stripe_width \ + --osd_pool_erasure_code_stripe_unit $desired_stripe_unit \ --osd_pool_default_erasure_code_profile "$profile" || return 1 ceph osd pool create pool_erasure 12 12 erasure ceph osd dump | tee $dir/osd.json