From 930eae214c54f5e0e790c09f35da14dff8585ac1 Mon Sep 17 00:00:00 2001
From: Josh Durgin
Date: Mon, 13 Feb 2017 17:42:33 -0800
Subject: [PATCH 01/11] OSDMonitor: get stripe_width via stripe_unit in ec profile

With bluestore, making the smallest write match min_alloc_size avoids
write amplification. With EC pools this is the stripe unit, or
stripe_width / num_data_chunks.

Rather than requiring people to divide by k to get the smallest ec
write, allow it to be specified directly via stripe_unit. Store it in
the ec profile so changing a monitor config option isn't necessary to
set it.

This is particularly important for ec overwrites since they allow
random i/o which should match bluestore's checksum granularity (aka
min_alloc_size).

Signed-off-by: Josh Durgin
---
 PendingReleaseNotes                           |  7 ++-
 .../configuration/pool-pg-config-ref.rst      | 14 +++--
 doc/rados/operations/erasure-code-profile.rst | 26 ++++++--
 src/common/config.h                           |  2 -
 src/common/config_opts.h                      |  2 +-
 src/mon/OSDMonitor.cc                         | 62 +++++++++++++++----
 src/mon/OSDMonitor.h                          |  1 +
 src/test/erasure-code/TestErasureCodeLrc.cc   |  8 +--
 src/test/erasure-code/test-erasure-code.sh    |  9 ++-
 src/test/mon/osd-pool-create.sh               | 20 +++---
 10 files changed, 109 insertions(+), 42 deletions(-)

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 3a27fdd7895..e2b043034d2 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -78,7 +78,6 @@
   still works. See the documentation page "Mount CephFS in your file
   systems table" for details.

-
 12.0.1
 ------

@@ -106,3 +105,9 @@
 * The RGW api for getting object torrent has changed its params from
   'get_torrent' to 'torrent' so that it can be compatible with Amazon S3. Now
   the request for object torrent is like 'GET /ObjectName?torrent'.
+
+* The configuration option "osd pool erasure code stripe width" has
+  been replaced by "osd pool erasure code stripe unit", and given the
+  ability to be overridden by the erasure code profile setting
+  "stripe_unit". For more details see "Erasure Code Profiles" in the
+  documentation.
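To make the new relationship concrete: a pool's stripe_width is now derived
as the number of data chunks times the profile's stripe_unit. A minimal
sketch of the resulting behavior, with illustrative profile and pool names
(the final grep mirrors the checks in the tests updated below; it is not
part of the patch itself):

    # k=4 data chunks at a 4096-byte stripe_unit gives
    # stripe_width = 4 * 4096 = 16384
    ceph osd erasure-code-profile set exampleprofile k=4 m=2 stripe_unit=4096
    ceph osd pool create examplepool 12 12 erasure exampleprofile
    ceph --format json osd dump | grep '"stripe_width":16384'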
diff --git a/doc/rados/configuration/pool-pg-config-ref.rst b/doc/rados/configuration/pool-pg-config-ref.rst
index 9f8414128b0..9cb2c8f54ff 100644
--- a/doc/rados/configuration/pool-pg-config-ref.rst
+++ b/doc/rados/configuration/pool-pg-config-ref.rst
@@ -85,14 +85,18 @@ Ceph configuration file.
               make pool creation work in the absence of ruleset 0.

-``osd pool erasure code stripe width``
+``osd pool erasure code stripe unit``

-:Description: Sets the desired size, in bytes, of an object stripe on every
-              erasure coded pools. Every object if size S will be stored as
-              N stripes and each stripe will be encoded/decoded individually.
+:Description: Sets the default size, in bytes, of a chunk of an object
+              stripe for erasure coded pools. Every object of size S
+              will be stored as N stripes, with each data chunk
+              receiving ``stripe unit`` bytes. Each stripe of ``N *
+              stripe unit`` bytes will be encoded/decoded
+              individually. This option can be overridden by the
+              ``stripe_unit`` setting in an erasure code profile.

 :Type: Unsigned 32-bit Integer
-:Default: ``4096``
+:Default: ``4096``

 ``osd pool default size``

diff --git a/doc/rados/operations/erasure-code-profile.rst b/doc/rados/operations/erasure-code-profile.rst
index 3262b3db8dc..ddf772d36ca 100644
--- a/doc/rados/operations/erasure-code-profile.rst
+++ b/doc/rados/operations/erasure-code-profile.rst
@@ -39,6 +39,7 @@ To create a new erasure code profile::
     ceph osd erasure-code-profile set {name} \
          [{directory=directory}] \
          [{plugin=plugin}] \
+         [{stripe_unit=stripe_unit}] \
          [{key=value} ...] \
          [--force]

@@ -60,23 +61,40 @@ Where:
               plugins`_ for more information.

 :Type: String
-:Required: No.
+:Required: No.
 :Default: jerasure

+``{stripe_unit=stripe_unit}``
+
+:Description: The amount of data in a data chunk, per stripe. For
+              example, a profile with 2 data chunks and stripe_unit=4K
+              would put the range 0-4K in chunk 0, 4K-8K in chunk 1,
+              then 8K-12K in chunk 0 again. This should be a multiple
+              of 4K for best performance. The default value is taken
+              from the monitor config option
+              ``osd_pool_erasure_code_stripe_unit`` when a pool is
+              created. The stripe_width of a pool using this profile
+              will be the number of data chunks multiplied by this
+              stripe_unit.
+
+:Type: String
+:Required: No.
+
 ``{key=value}``

 :Description: The semantic of the remaining key/value pairs is defined
               by the erasure code plugin.

 :Type: String
-:Required: No.
+:Required: No.

 ``--force``

-:Description: Override an existing profile by the same name.
+:Description: Override an existing profile by the same name, and allow
+              setting a non-4K-aligned stripe_unit.

 :Type: String
-:Required: No.
+:Required: No.

 osd erasure-code-profile rm
 ============================

diff --git a/src/common/config.h b/src/common/config.h
index fe5343cdce0..bbabc14a887 100644
--- a/src/common/config.h
+++ b/src/common/config.h
@@ -38,8 +38,6 @@ enum {
 #define OSD_REP_SPLAY   1
 #define OSD_REP_CHAIN   2

-#define OSD_POOL_ERASURE_CODE_STRIPE_WIDTH 4096
-
 class CephContext;

 extern const char *CEPH_CONF_FILE_DEFAULT;

diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 2c8f947795b..836af93ebad 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -675,7 +675,7 @@
 OPTION(osd_crush_update_on_start, OPT_BOOL, true)
 OPTION(osd_crush_initial_weight, OPT_DOUBLE, -1) // if >=0, the initial weight is for newly added osds.
 OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
 OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
-OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
+OPTION(osd_pool_erasure_code_stripe_unit, OPT_U32, 4096) // in bytes
 OPTION(osd_pool_default_size, OPT_INT, 3)
 OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; ceph will use size-size/2
 OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf

diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 7a5eddb1832..07a99f23724 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -4893,8 +4893,9 @@ void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& prof
   }
 }

-int OSDMonitor::normalize_profile(const string& profilename,
-                                  ErasureCodeProfile &profile,
+int OSDMonitor::normalize_profile(const string& profilename,
+                                  ErasureCodeProfile &profile,
+                                  bool force,
                                   ostream *ss)
 {
   ErasureCodeInterfaceRef erasure_code;
@@ -4904,10 +4905,39 @@ int OSDMonitor::normalize_profile(const string& profilename,
   int err = instance.factory(plugin->second,
                             g_conf->get_val<std::string>("erasure_code_dir"),
                             profile, &erasure_code, ss);
-  if (err)
+  if (err) {
     return err;
+  }

-  return erasure_code->init(profile, ss);
+  err = erasure_code->init(profile, ss);
+  if (err) {
+    return err;
+  }
+
+  auto it = profile.find("stripe_unit");
+  if (it != profile.end()) {
+    string err_str;
+    uint32_t stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
+    if (!err_str.empty()) {
+      *ss << "could not parse stripe_unit '" << it->second
+         << "': " << err_str << std::endl;
+      return -EINVAL;
+    }
+    uint32_t data_chunks = erasure_code->get_data_chunk_count();
+    uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
+    if (chunk_size != stripe_unit) {
+      *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
+         << "alignment. Would be padded to " << chunk_size
+         << std::endl;
+      return -EINVAL;
+    }
+    if ((stripe_unit % 4096) != 0 && !force) {
+      *ss << "stripe_unit should be a multiple of 4096 bytes for best performance. "
+         << "Use --force to override this check" << std::endl;
+      return -EINVAL;
+    }
+  }
+  return 0;
 }

 int OSDMonitor::crush_ruleset_create_erasure(const string &name,
@@ -5130,12 +5160,22 @@ int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
     break;
   case pg_pool_t::TYPE_ERASURE:
     {
+      ErasureCodeProfile profile =
+       osdmap.get_erasure_code_profile(erasure_code_profile);
       ErasureCodeInterfaceRef erasure_code;
       err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
-      uint32_t desired_stripe_width = g_conf->osd_pool_erasure_code_stripe_width;
-      if (err == 0)
-       *stripe_width = erasure_code->get_data_chunk_count() *
-         erasure_code->get_chunk_size(desired_stripe_width);
+      if (err)
+       break;
+      uint32_t data_chunks = erasure_code->get_data_chunk_count();
+      uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit;
+      auto it = profile.find("stripe_unit");
+      if (it != profile.end()) {
+       string err_str;
+       stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
+       assert(err_str.empty());
+      }
+      *stripe_width = data_chunks *
+       erasure_code->get_chunk_size(stripe_unit * data_chunks);
     }
     break;
   default:
@@ -6831,14 +6871,14 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       if (err)
        goto reply;
     }
-    err = normalize_profile(name, profile_map, &ss);
+    err = normalize_profile(name, profile_map, force, &ss);
     if (err)
       goto reply;

     if (osdmap.has_erasure_code_profile(name)) {
       ErasureCodeProfile existing_profile_map =
        osdmap.get_erasure_code_profile(name);
-      err = normalize_profile(name, existing_profile_map, &ss);
+      err = normalize_profile(name, existing_profile_map, force, &ss);
       if (err)
        goto reply;

@@ -6892,7 +6932,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
                                      &ss);
       if (err)
        goto reply;
-      err = normalize_profile(name, profile_map, &ss);
+      err = normalize_profile(name, profile_map, true, &ss);
       if (err)
        goto reply;

       dout(20) << "erasure code profile set " << profile << "="
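The validation added above means a stripe_unit the plugin would pad is
rejected outright, and one that is not a multiple of 4096 is only accepted
with --force. A hedged illustration of the resulting CLI behavior (profile
name illustrative; this is the same pattern the tests later in this series
rely on):

    # 2K satisfies jerasure's chunk alignment for k=2 but is not a
    # multiple of 4096, so the profile is only accepted with --force
    ceph osd erasure-code-profile set exampleprofile k=2 m=1 stripe_unit=2K --force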
dout(20) << "erasure code profile set " << profile << "=" diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 32d08f78d3e..3ccbeecec76 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -297,6 +297,7 @@ private: const string& profile) const; int normalize_profile(const string& profilename, ErasureCodeProfile &profile, + bool force, ostream *ss); int crush_ruleset_create_erasure(const string &name, const string &profile, diff --git a/src/test/erasure-code/TestErasureCodeLrc.cc b/src/test/erasure-code/TestErasureCodeLrc.cc index 52fce2e6b42..50543945e3a 100644 --- a/src/test/erasure-code/TestErasureCodeLrc.cc +++ b/src/test/erasure-code/TestErasureCodeLrc.cc @@ -615,8 +615,8 @@ TEST(ErasureCodeLrc, encode_decode) profile["layers"] = description_string; EXPECT_EQ(0, lrc.init(profile, &cerr)); EXPECT_EQ(4U, lrc.get_data_chunk_count()); - unsigned int stripe_width = g_conf->osd_pool_erasure_code_stripe_width; - unsigned int chunk_size = stripe_width / lrc.get_data_chunk_count(); + unsigned int chunk_size = g_conf->osd_pool_erasure_code_stripe_unit; + unsigned int stripe_width = lrc.get_data_chunk_count() * chunk_size; EXPECT_EQ(chunk_size, lrc.get_chunk_size(stripe_width)); set want_to_encode; map encoded; @@ -745,8 +745,8 @@ TEST(ErasureCodeLrc, encode_decode_2) profile["layers"] = description_string; EXPECT_EQ(0, lrc.init(profile, &cerr)); EXPECT_EQ(4U, lrc.get_data_chunk_count()); - unsigned int stripe_width = g_conf->osd_pool_erasure_code_stripe_width; - unsigned int chunk_size = stripe_width / lrc.get_data_chunk_count(); + unsigned int chunk_size = g_conf->osd_pool_erasure_code_stripe_unit; + unsigned int stripe_width = lrc.get_data_chunk_count() * chunk_size; EXPECT_EQ(chunk_size, lrc.get_chunk_size(stripe_width)); set want_to_encode; map encoded; diff --git a/src/test/erasure-code/test-erasure-code.sh b/src/test/erasure-code/test-erasure-code.sh index b4417f664f6..cb1123febe0 100755 --- a/src/test/erasure-code/test-erasure-code.sh +++ b/src/test/erasure-code/test-erasure-code.sh @@ -259,8 +259,9 @@ function TEST_alignment_constraints() { # imposed by the stripe width # See http://tracker.ceph.com/issues/8622 # - local stripe_width=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_width) - local block_size=$((stripe_width - 1)) + local stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit) + eval local $(ceph osd erasure-code-profile get myprofile | grep k=) + local block_size=$((stripe_unit * k - 1)) dd if=/dev/zero of=$dir/ORIGINAL bs=$block_size count=2 rados --block-size=$block_size \ --pool ecpool put UNALIGNED $dir/ORIGINAL || return 1 @@ -268,9 +269,7 @@ function TEST_alignment_constraints() { } function chunk_size() { - local stripe_width=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_width) - eval local $(ceph osd erasure-code-profile get default | grep k=) - echo $(($stripe_width / $k)) + echo $(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit) } # diff --git a/src/test/mon/osd-pool-create.sh b/src/test/mon/osd-pool-create.sh index 3deef6fc10d..3fd4feef8df 100755 --- a/src/test/mon/osd-pool-create.sh +++ b/src/test/mon/osd-pool-create.sh @@ -122,30 +122,32 @@ function TEST_erasure_code_profile_default() { ceph osd erasure-code-profile ls | grep default || return 1 } -function TEST_erasure_crush_stripe_width() { +function TEST_erasure_crush_stripe_unit() { local dir=$1 - # the default stripe width is used to initialize the pool + # the default stripe unit is used to initialize the pool run_mon 
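Since chunk_size() now simply echoes the configured stripe_unit, callers
that need the full stripe size recompute it as stripe_unit * k, the same
way TEST_alignment_constraints does above. A small sketch of that
arithmetic (assuming the default profile, as the old chunk_size() did):

    stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
    eval $(ceph osd erasure-code-profile get default | grep k=)
    echo "one full stripe is $((stripe_unit * k)) bytes"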
diff --git a/src/test/mon/osd-pool-create.sh b/src/test/mon/osd-pool-create.sh
index 3deef6fc10d..3fd4feef8df 100755
--- a/src/test/mon/osd-pool-create.sh
+++ b/src/test/mon/osd-pool-create.sh
@@ -122,30 +122,32 @@ function TEST_erasure_code_profile_default() {
     ceph osd erasure-code-profile ls | grep default || return 1
 }

-function TEST_erasure_crush_stripe_width() {
+function TEST_erasure_crush_stripe_unit() {
     local dir=$1
-    # the default stripe width is used to initialize the pool
+    # the default stripe unit is used to initialize the pool
     run_mon $dir a --public-addr $CEPH_MON
-    stripe_width=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_width)
+    stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
+    eval local $(ceph osd erasure-code-profile get myprofile | grep k=)
+    stripe_width=$((stripe_unit * k))
     ceph osd pool create pool_erasure 12 12 erasure
     ceph --format json osd dump | tee $dir/osd.json
     grep '"stripe_width":'$stripe_width $dir/osd.json > /dev/null || return 1
 }

-function TEST_erasure_crush_stripe_width_padded() {
+function TEST_erasure_crush_stripe_unit_padded() {
     local dir=$1
-    # setting osd_pool_erasure_code_stripe_width modifies the stripe_width
+    # setting osd_pool_erasure_code_stripe_unit modifies the stripe_width
     # and it is padded as required by the default plugin
     profile+=" plugin=jerasure"
     profile+=" technique=reed_sol_van"
     k=4
     profile+=" k=$k"
     profile+=" m=2"
-    expected_chunk_size=2048
-    actual_stripe_width=$(($expected_chunk_size * $k))
-    desired_stripe_width=$(($actual_stripe_width - 1))
+    actual_stripe_unit=2048
+    desired_stripe_unit=$((actual_stripe_unit - 1))
+    actual_stripe_width=$((actual_stripe_unit * k))
     run_mon $dir a \
-        --osd_pool_erasure_code_stripe_width $desired_stripe_width \
+        --osd_pool_erasure_code_stripe_unit $desired_stripe_unit \
        --osd_pool_default_erasure_code_profile "$profile" || return 1
     ceph osd pool create pool_erasure 12 12 erasure
     ceph osd dump | tee $dir/osd.json

From 6fba80c1fac7f012303b27ef1e6fdb9d90c81a40 Mon Sep 17 00:00:00 2001
From: Josh Durgin
Date: Tue, 14 Feb 2017 00:04:12 -0800
Subject: [PATCH 02/11] osd, OSDMonitor, qa: mark ec overwrites non-experimental

Keep the pool flag around so we can distinguish between a pool that
should maintain hashes for each chunk (where a missing hash is a bug)
and an overwrites pool, where we rely on bluestore checksums for
detecting corruption.
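In user-visible terms, after this patch the flag is set like any other
pool option, with no experimental-features gate; for example (pool name
illustrative):

    ceph osd pool create ecpool 12 12 erasure
    ceph osd pool set ecpool allow_ec_overwrites true
    # the flag is one-way; disabling it is rejected with EINVAL
    ceph osd pool set ecpool allow_ec_overwrites false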
Signed-off-by: Josh Durgin --- .../krbd/rbd-nomount/tasks/krbd_data_pool.yaml | 6 ------ .../ec-pool-snaps-few-objects-overwrites.yaml | 2 +- .../ec-small-objects-fast-read-overwrites.yaml | 2 +- .../workloads/ec-small-objects-overwrites.yaml | 2 +- .../ec-snaps-few-objects-overwrites.yaml | 2 +- qa/suites/rbd/cli/pool/ec-data-pool.yaml | 4 +--- qa/suites/rbd/librbd/pool/ec-data-pool.yaml | 4 +--- qa/suites/rbd/qemu/pool/ec-data-pool.yaml | 4 +--- qa/tasks/ceph_manager.py | 13 ++++++------- qa/tasks/rados.py | 10 ++++------ qa/workunits/rbd/krbd_data_pool.sh | 2 +- src/mon/FSCommands.cc | 2 +- src/mon/MonCommands.h | 2 +- src/mon/OSDMonitor.cc | 15 +++++---------- src/osd/ECBackend.cc | 12 ++++++------ src/osd/osd_types.h | 2 +- 16 files changed, 32 insertions(+), 52 deletions(-) diff --git a/qa/suites/krbd/rbd-nomount/tasks/krbd_data_pool.yaml b/qa/suites/krbd/rbd-nomount/tasks/krbd_data_pool.yaml index 24985fb79d5..35b9d67ebff 100644 --- a/qa/suites/krbd/rbd-nomount/tasks/krbd_data_pool.yaml +++ b/qa/suites/krbd/rbd-nomount/tasks/krbd_data_pool.yaml @@ -1,9 +1,3 @@ -overrides: - ceph: - conf: - global: - enable experimental unrecoverable data corrupting features: debug_white_box_testing_ec_overwrites - tasks: - workunit: clients: diff --git a/qa/suites/rados/thrash-erasure-code/workloads/ec-pool-snaps-few-objects-overwrites.yaml b/qa/suites/rados/thrash-erasure-code/workloads/ec-pool-snaps-few-objects-overwrites.yaml index 7d494fdad98..d2ad70a57ec 100644 --- a/qa/suites/rados/thrash-erasure-code/workloads/ec-pool-snaps-few-objects-overwrites.yaml +++ b/qa/suites/rados/thrash-erasure-code/workloads/ec-pool-snaps-few-objects-overwrites.yaml @@ -12,7 +12,7 @@ tasks: objects: 50 pool_snaps: true ec_pool: true - erasure_code_use_hacky_overwrites: true + erasure_code_use_overwrites: true op_weights: read: 100 write: 100 diff --git a/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-fast-read-overwrites.yaml b/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-fast-read-overwrites.yaml index c2e16487572..b3f831b778e 100644 --- a/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-fast-read-overwrites.yaml +++ b/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-fast-read-overwrites.yaml @@ -14,7 +14,7 @@ tasks: objects: 1024 size: 16384 ec_pool: true - erasure_code_use_hacky_overwrites: true + erasure_code_use_overwrites: true fast_read: true op_weights: read: 100 diff --git a/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-overwrites.yaml b/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-overwrites.yaml index 80a58172169..9baacef4890 100644 --- a/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-overwrites.yaml +++ b/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-overwrites.yaml @@ -14,7 +14,7 @@ tasks: objects: 1024 size: 16384 ec_pool: true - erasure_code_use_hacky_overwrites: true + erasure_code_use_overwrites: true op_weights: read: 100 write: 100 diff --git a/qa/suites/rados/thrash-erasure-code/workloads/ec-snaps-few-objects-overwrites.yaml b/qa/suites/rados/thrash-erasure-code/workloads/ec-snaps-few-objects-overwrites.yaml index 00263c414cc..b7c53819938 100644 --- a/qa/suites/rados/thrash-erasure-code/workloads/ec-snaps-few-objects-overwrites.yaml +++ b/qa/suites/rados/thrash-erasure-code/workloads/ec-snaps-few-objects-overwrites.yaml @@ -11,7 +11,7 @@ tasks: ops: 4000 objects: 50 ec_pool: true - erasure_code_use_hacky_overwrites: true + erasure_code_use_overwrites: true op_weights: 
read: 100 write: 100 diff --git a/qa/suites/rbd/cli/pool/ec-data-pool.yaml b/qa/suites/rbd/cli/pool/ec-data-pool.yaml index a6c35e57ae3..523d7450afb 100644 --- a/qa/suites/rbd/cli/pool/ec-data-pool.yaml +++ b/qa/suites/rbd/cli/pool/ec-data-pool.yaml @@ -3,12 +3,10 @@ tasks: client.0: - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2 - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile - - sudo ceph osd pool set datapool debug_white_box_testing_ec_overwrites true + - sudo ceph osd pool set datapool allow_ec_overwrites true overrides: ceph: conf: - global: - enable experimental unrecoverable data corrupting features: debug_white_box_testing_ec_overwrites client: rbd default data pool: datapool diff --git a/qa/suites/rbd/librbd/pool/ec-data-pool.yaml b/qa/suites/rbd/librbd/pool/ec-data-pool.yaml index a6c35e57ae3..523d7450afb 100644 --- a/qa/suites/rbd/librbd/pool/ec-data-pool.yaml +++ b/qa/suites/rbd/librbd/pool/ec-data-pool.yaml @@ -3,12 +3,10 @@ tasks: client.0: - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2 - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile - - sudo ceph osd pool set datapool debug_white_box_testing_ec_overwrites true + - sudo ceph osd pool set datapool allow_ec_overwrites true overrides: ceph: conf: - global: - enable experimental unrecoverable data corrupting features: debug_white_box_testing_ec_overwrites client: rbd default data pool: datapool diff --git a/qa/suites/rbd/qemu/pool/ec-data-pool.yaml b/qa/suites/rbd/qemu/pool/ec-data-pool.yaml index a6c35e57ae3..523d7450afb 100644 --- a/qa/suites/rbd/qemu/pool/ec-data-pool.yaml +++ b/qa/suites/rbd/qemu/pool/ec-data-pool.yaml @@ -3,12 +3,10 @@ tasks: client.0: - sudo ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2 - sudo ceph osd pool create datapool 4 4 erasure teuthologyprofile - - sudo ceph osd pool set datapool debug_white_box_testing_ec_overwrites true + - sudo ceph osd pool set datapool allow_ec_overwrites true overrides: ceph: conf: - global: - enable experimental unrecoverable data corrupting features: debug_white_box_testing_ec_overwrites client: rbd default data pool: datapool diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py index c6d5ef4691d..f4c9ebc7e9d 100644 --- a/qa/tasks/ceph_manager.py +++ b/qa/tasks/ceph_manager.py @@ -1479,7 +1479,7 @@ class CephManager: def create_pool_with_unique_name(self, pg_num=16, erasure_code_profile_name=None, min_size=None, - erasure_code_use_hacky_overwrites=False): + erasure_code_use_overwrites=False): """ Create a pool named unique_pool_X where X is unique. """ @@ -1492,7 +1492,7 @@ class CephManager: pg_num, erasure_code_profile_name=erasure_code_profile_name, min_size=min_size, - erasure_code_use_hacky_overwrites=erasure_code_use_hacky_overwrites) + erasure_code_use_overwrites=erasure_code_use_overwrites) return name @contextlib.contextmanager @@ -1504,15 +1504,14 @@ class CephManager: def create_pool(self, pool_name, pg_num=16, erasure_code_profile_name=None, min_size=None, - erasure_code_use_hacky_overwrites=False): + erasure_code_use_overwrites=False): """ Create a pool named from the pool_name parameter. :param pool_name: name of the pool being created. :param pg_num: initial number of pgs. 
:param erasure_code_profile_name: if set and !None create an erasure coded pool using the profile - :param erasure_code_use_hacky_overwrites: if true, use the hacky - overwrites mode + :param erasure_code_use_overwrites: if true, allow overwrites """ with self.lock: assert isinstance(pool_name, basestring) @@ -1531,10 +1530,10 @@ class CephManager: 'osd', 'pool', 'set', pool_name, 'min_size', str(min_size)) - if erasure_code_use_hacky_overwrites: + if erasure_code_use_overwrites: self.raw_cluster_cmd( 'osd', 'pool', 'set', pool_name, - 'debug_white_box_testing_ec_overwrites', + 'allow_ec_overwrites', 'true') self.pools[pool_name] = pg_num time.sleep(1) diff --git a/qa/tasks/rados.py b/qa/tasks/rados.py index 15a35e5988d..4e4f746cb05 100644 --- a/qa/tasks/rados.py +++ b/qa/tasks/rados.py @@ -55,9 +55,7 @@ def task(ctx, config): rollback: 2 snap_remove: 0 ec_pool: create an ec pool, defaults to False - erasure_code_use_hacky_overwrites: use the whitebox - testing experimental - overwrites mode + erasure_code_use_overwrites: test overwrites, default false erasure_code_profile: name: teuthologyprofile k: 2 @@ -139,7 +137,7 @@ def task(ctx, config): 'ceph_test_rados'] if config.get('ec_pool', False): args.extend(['--no-omap']) - if config.get('erasure_code_use_hacky_overwrites', False): + if config.get('erasure_code_use_overwrites', False): args.extend(['--no-sparse']) else: args.extend(['--ec-pool']) @@ -230,8 +228,8 @@ def task(ctx, config): else: pool = manager.create_pool_with_unique_name( erasure_code_profile_name=profile_name, - erasure_code_use_hacky_overwrites= - config.get('erasure_code_use_hacky_overwrites', False) + erasure_code_use_overwrites= + config.get('erasure_code_use_overwrites', False) ) created_pools.append(pool) if config.get('fast_read', False): diff --git a/qa/workunits/rbd/krbd_data_pool.sh b/qa/workunits/rbd/krbd_data_pool.sh index 2c83d34ca87..0b7b6c933ec 100755 --- a/qa/workunits/rbd/krbd_data_pool.sh +++ b/qa/workunits/rbd/krbd_data_pool.sh @@ -84,7 +84,7 @@ function count_data_objects() { ceph osd pool create repdata 24 24 ceph osd erasure-code-profile set teuthologyprofile ruleset-failure-domain=osd m=1 k=2 ceph osd pool create ecdata 24 24 erasure teuthologyprofile -ceph osd pool set ecdata debug_white_box_testing_ec_overwrites true +ceph osd pool set ecdata allow_ec_overwrites true ceph osd pool create rbdnonzero 24 24 ceph osd pool create clonesonly 24 24 diff --git a/src/mon/FSCommands.cc b/src/mon/FSCommands.cc index cc2dfe04c0b..630736c3989 100644 --- a/src/mon/FSCommands.cc +++ b/src/mon/FSCommands.cc @@ -849,7 +849,7 @@ int FileSystemCommandHandler::_check_pool( << " is an erasure-coded pool. 
Use of erasure-coded pools"
        << " for CephFS metadata is not permitted";
     return -EINVAL;
-  } else if (pool->is_erasure() && !pool->is_hacky_ecoverwrites()) {
+  } else if (pool->is_erasure() && !pool->allows_ecoverwrites()) {
     // non-overwriteable EC pools are only acceptable with a cache tier overlay
     if (!pool->has_tiers() || !pool->has_read_tier() || !pool->has_write_tier()) {
       *ss << "pool '" << pool_name << "' (id '" << pool_id << "')"
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index 3945e4b3480..9506814f7be 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -737,7 +737,7 @@ COMMAND("osd pool get " \
        "get pool parameter <var>", "osd", "r", "cli,rest")
 COMMAND("osd pool set " \
        "name=pool,type=CephPoolname " \
-       "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_rule|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|debug_white_box_testing_ec_overwrites " \
+       "name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_rule|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|min_write_recency_for_promote|fast_read|hit_set_grade_decay_rate|hit_set_search_last_n|scrub_min_interval|scrub_max_interval|deep_scrub_interval|recovery_priority|recovery_op_priority|scrub_priority|compression_mode|compression_algorithm|compression_required_ratio|compression_max_blob_size|compression_min_blob_size|csum_type|csum_min_block|csum_max_block|allow_ec_overwrites " \
        "name=val,type=CephString " \
        "name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
        "set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 07a99f23724..5cdbac9866c 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -5768,18 +5768,13 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
       ss << "expecting value 'true' or '1'";
       return -EINVAL;
     }
-  } else if (var == "debug_white_box_testing_ec_overwrites") {
+  } else if (var == "allow_ec_overwrites") {
+    if (!p.is_erasure()) {
+      ss << "ec overwrites can only be enabled for an erasure coded pool";
+      return -EINVAL;
+    }
     if (val == "true" || (interr.empty() && n == 1)) {
-      if (cct->check_experimental_feature_enabled(
-           "debug_white_box_testing_ec_overwrites")) {
-       p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
-      } else {
-       ss << "debug_white_box_testing_ec_overwrites is an experimental feature "
-          << "and must be enabled. Note, this feature does not yet actually "
-          << "work. This flag merely enables some of the preliminary support "
-          << "for testing purposes.";
-       return -ENOTSUP;
-      }
+      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
     } else if (val == "false" || (interr.empty() && n == 0)) {
       ss << "ec overwrites cannot be disabled once enabled";
       return -EINVAL;
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index ea285cc4e6f..569bb89529e 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -954,7 +954,7 @@ void ECBackend::handle_sub_read(
        ++i) {
     int r = 0;
     ECUtil::HashInfoRef hinfo;
-    if (!get_parent()->get_pool().is_hacky_ecoverwrites()) {
+    if (!get_parent()->get_pool().allows_ecoverwrites()) {
       hinfo = get_hash_info(i->first);
       if (!hinfo) {
        r = -EIO;
@@ -989,7 +989,7 @@ void ECBackend::handle_sub_read(
          );
       }

-      if (!get_parent()->get_pool().is_hacky_ecoverwrites()) {
+      if (!get_parent()->get_pool().allows_ecoverwrites()) {
        // This shows that we still need deep scrub because large enough files
        // are read in sections, so the digest check here won't be done here.
        // Do NOT check osd_read_eio_on_bad_digest here. We need to report
@@ -1732,7 +1732,7 @@ bool ECBackend::try_state_to_reads()
   Op *op = &(waiting_state.front());
   if (op->requires_rmw() && pipeline_state.cache_invalid()) {
-    assert(get_parent()->get_pool().is_hacky_ecoverwrites());
+    assert(get_parent()->get_pool().allows_ecoverwrites());
     dout(20) << __func__ << ": blocking " << *op
            << " because it requires an rmw and the cache is invalid "
            << pipeline_state
@@ -1786,7 +1786,7 @@ bool ECBackend::try_state_to_reads()
   dout(10) << __func__ << ": " << *op << dendl;

   if (!op->remote_read.empty()) {
-    assert(get_parent()->get_pool().is_hacky_ecoverwrites());
+    assert(get_parent()->get_pool().allows_ecoverwrites());
     objects_read_async_no_cache(
       op->remote_read,
       [this, op](map<hobject_t, pair<int, extent_map> > &&results) {
@@ -1859,7 +1859,7 @@ bool ECBackend::try_reads_to_commit()
   dout(20) << __func__ << ": written: " << written << dendl;
   dout(20) << __func__ << ": op: " << *op << dendl;

-  if (!get_parent()->get_pool().is_hacky_ecoverwrites()) {
+  if (!get_parent()->get_pool().allows_ecoverwrites()) {
     for (auto &&i: op->log_entries) {
       if (i.requires_kraken()) {
        derr << __func__ << ": log entry " << i << " requires kraken"
@@ -2365,7 +2365,7 @@ void ECBackend::be_deep_scrub(
       o.digest_present = false;
       return;
     } else {
-      if (!get_parent()->get_pool().is_hacky_ecoverwrites()) {
+      if (!get_parent()->get_pool().allows_ecoverwrites()) {
        assert(hinfo->has_chunk_hash());
        if (hinfo->get_total_chunk_size() != pos) {
          dout(0) << "_scan_list  " << poid << " got incorrect size on read" << dendl;
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 1c4e4c65a6c..550ea8dfca3 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -1448,7 +1448,7 @@ public:
   }
   uint64_t required_alignment() const { return stripe_width; }

-  bool is_hacky_ecoverwrites() const {
+  bool allows_ecoverwrites() const {
     return has_flag(FLAG_EC_OVERWRITES);
   }

From a219319137e8ea269c7a8ca2652b68918c1c4c15 Mon Sep 17 00:00:00 2001
From: Josh Durgin
Date: Tue, 14 Feb 2017 17:03:19 -0800
Subject: [PATCH 03/11] qa/tasks/rados: test sparse reads with ec overwrites

Signed-off-by: Josh Durgin
---
 qa/tasks/rados.py             | 4 +---
 qa/workunits/cephtool/test.sh | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/qa/tasks/rados.py b/qa/tasks/rados.py
index 4e4f746cb05..2ef542fd278 100644
--- a/qa/tasks/rados.py
+++ b/qa/tasks/rados.py
@@ -137,9 +137,7 @@ def task(ctx, config):
                 'ceph_test_rados']
         if config.get('ec_pool', False):
             args.extend(['--no-omap'])
-            if config.get('erasure_code_use_overwrites', False):
-                args.extend(['--no-sparse'])
-            else:
+            if not config.get('erasure_code_use_overwrites', False):
                 args.extend(['--ec-pool'])
         if config.get('write_fadvise_dontneed', False):
             args.extend(['--write-fadvise-dontneed'])
diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh
index 8390c422293..94c11aeebf2 100755
--- a/qa/workunits/cephtool/test.sh
+++ b/qa/workunits/cephtool/test.sh
@@ -989,7 +989,7 @@ function test_mon_mds()

   # We should be permitted to use an EC pool with overwrites enabled
   # as the data pool...
-  ceph osd pool set mds-ec-pool debug_white_box_testing_ec_overwrites true --yes-i-really-mean-it
+  ceph osd pool set mds-ec-pool allow_ec_overwrites true
   ceph fs new $FS_NAME fs_metadata mds-ec-pool --force 2>$TMPFILE
   fail_all_mds $FS_NAME
   ceph fs rm $FS_NAME --yes-i-really-mean-it

From 3ca750d41dfe33c6efea4abc96d2bd426a9742b9 Mon Sep 17 00:00:00 2001
From: Josh Durgin
Date: Thu, 16 Feb 2017 00:30:00 -0800
Subject: [PATCH 04/11] test/osd/osd-scrub-repair.sh: add ec overwrites test cases

Move pool and profile creation into a single function, and add an
'allow_overwrites' parameter for it so each ec test can be
parameterized by it.

Signed-off-by: Josh Durgin
---
 qa/workunits/ceph-helpers.sh     |  32 +++++-
 src/test/osd/osd-scrub-repair.sh | 184 +++++++++++++++++++++----------
 2 files changed, 157 insertions(+), 59 deletions(-)

diff --git a/qa/workunits/ceph-helpers.sh b/qa/workunits/ceph-helpers.sh
index 8642d376a73..c806508e204 100755
--- a/qa/workunits/ceph-helpers.sh
+++ b/qa/workunits/ceph-helpers.sh
@@ -520,6 +520,27 @@ function run_osd() {
     activate_osd $dir $id "$@"
 }

+function run_osd_bluestore() {
+    local dir=$1
+    shift
+    local id=$1
+    shift
+    local osd_data=$dir/$id
+
+    local ceph_disk_args
+    ceph_disk_args+=" --statedir=$dir"
+    ceph_disk_args+=" --sysconfdir=$dir"
+    ceph_disk_args+=" --prepend-to-path="
+
+    mkdir -p $osd_data
+    ceph-disk $ceph_disk_args \
+        prepare --bluestore $osd_data || return 1
+
+    local ceph_osd_args
+    ceph_osd_args+=" --enable-experimental-unrecoverable-data-corrupting-features=bluestore"
+    activate_osd $dir $id $ceph_osd_args "$@"
+}
+
 function test_run_osd() {
     local dir=$1

@@ -635,6 +656,7 @@ function activate_osd() {
     ceph_disk_args+=" --prepend-to-path="

     local ceph_args="$CEPH_ARGS"
+    ceph_args+=" --enable-experimental-unrecoverable-data-corrupting-features=bluestore"
     ceph_args+=" --osd-failsafe-full-ratio=.99"
     ceph_args+=" --osd-journal-size=100"
     ceph_args+=" --osd-scrub-load-threshold=2000"
@@ -993,10 +1015,18 @@ function objectstore_tool() {
     shift
     local osd_data=$dir/$id

+    local osd_type=$(cat $osd_data/type)
+
     kill_daemons $dir TERM osd.$id >&2 < /dev/null || return 1
+
+    local journal_args
+    if [ "$osd_type" == "filestore" ]; then
+        journal_args=" --journal-path $osd_data/journal"
+    fi
     ceph-objectstore-tool \
+        --enable-experimental-unrecoverable-data-corrupting-features=bluestore \
        --data-path $osd_data \
-       --journal-path $osd_data/journal \
+        $journal_args \
        "$@" || return 1
     activate_osd $dir $id $ceph_osd_args >&2 || return 1
     wait_for_clean >&2
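With these helper changes, individual tests no longer decide journal
arguments themselves: objectstore_tool reads the backend recorded in
$osd_data/type and only adds --journal-path for filestore. A sketch of the
intended call pattern in a test (object name illustrative):

    run_osd_bluestore $dir 0 || return 1                 # bluestore osd, no journal
    objectstore_tool $dir 0 SOMEOBJ remove || return 1   # journal args added only for filestore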
diff --git a/src/test/osd/osd-scrub-repair.sh b/src/test/osd/osd-scrub-repair.sh
index 866383eb111..dc8cc500650 100755
--- a/src/test/osd/osd-scrub-repair.sh
+++ b/src/test/osd/osd-scrub-repair.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/bash -x
 #
 # Copyright (C) 2014 Red Hat
 #
@@ -154,11 +154,6 @@ function corrupt_and_repair_one() {
 function corrupt_and_repair_erasure_coded() {
     local dir=$1
     local poolname=$2
-    local profile=$3
-
-    ceph osd pool create $poolname 1 1 erasure $profile \
-        || return 1
-    wait_for_clean || return 1

     add_something $dir $poolname || return 1

@@ -176,8 +171,25 @@
 }

-function TEST_auto_repair_erasure_coded() {
+function create_ec_pool() {
+    local poolname=$1
+    local allow_overwrites=$2
+
+    ceph osd erasure-code-profile set myprofile ruleset-failure-domain=osd $3 $4 $5 $6 $7 || return 1
+
+    ceph osd pool create "$poolname" 1 1 erasure myprofile || return 1
+
+    if [ "$allow_overwrites" = "true" ]; then
+        ceph osd pool set "$poolname" allow_ec_overwrites true || return 1
+    fi
+
+    wait_for_clean || return 1
+    return 0
+}
+
+function auto_repair_erasure_coded() {
     local dir=$1
+    local allow_overwrites=$2
     local poolname=ecpool

     # Launch a cluster with 5 seconds scrub interval
     setup $dir || return 1
     run_mon $dir a || return 1
     run_mgr $dir x || return 1
     local ceph_osd_args="--osd-scrub-auto-repair=true \
             --osd-deep-scrub-interval=5 \
             --osd-scrub-max-interval=5 \
             --osd-scrub-min-interval=5 \
             --osd-scrub-interval-randomize-ratio=0"
     for id in $(seq 0 2) ; do
-        run_osd $dir $id $ceph_osd_args
+        if [ "$allow_overwrites" = "true" ]; then
+            run_osd_bluestore $dir $id $ceph_osd_args || return 1
+        else
+            run_osd $dir $id $ceph_osd_args || return 1
+        fi
     done
     wait_for_clean || return 1

     # Create an EC pool
-    ceph osd erasure-code-profile set myprofile \
-        k=2 m=1 ruleset-failure-domain=osd || return 1
-    ceph osd pool create $poolname 8 8 erasure myprofile || return 1
-    wait_for_clean || return 1
+    create_ec_pool $poolname $allow_overwrites k=2 m=1 || return 1

     # Put an object
     local payload=ABCDEF
@@ -222,69 +235,95 @@
     teardown $dir || return 1
 }

-function TEST_corrupt_and_repair_jerasure() {
+function TEST_auto_repair_erasure_coded_appends() {
+    auto_repair_erasure_coded $1 false
+}
+
+function TEST_auto_repair_erasure_coded_overwrites() {
+    auto_repair_erasure_coded $1 true
+}
+
+function corrupt_and_repair_jerasure() {
     local dir=$1
+    local allow_overwrites=$2
     local poolname=ecpool
-    local profile=myprofile

     setup $dir || return 1
     run_mon $dir a || return 1
     run_mgr $dir x || return 1
     for id in $(seq 0 3) ; do
-        run_osd $dir $id || return 1
+        if [ "$allow_overwrites" = "true" ]; then
+            run_osd_bluestore $dir $id || return 1
+        else
+            run_osd $dir $id || return 1
+        fi
     done
     wait_for_clean || return 1

-    ceph osd erasure-code-profile set $profile \
-        k=2 m=2 ruleset-failure-domain=osd || return 1
-
-    corrupt_and_repair_erasure_coded $dir $poolname $profile || return 1
+    create_ec_pool $poolname $allow_overwrites k=2 m=2 || return 1
+    corrupt_and_repair_erasure_coded $dir $poolname || return 1

     teardown $dir || return 1
 }

-function TEST_corrupt_and_repair_lrc() {
+function TEST_corrupt_and_repair_jerasure_appends() {
+    corrupt_and_repair_jerasure $1
+}
+
+function TEST_corrupt_and_repair_jerasure_overwrites() {
+    corrupt_and_repair_jerasure $1 true
+}
+
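Each of these TEST_* pairs follows the same parameterization pattern: the
shared function takes the test directory plus an allow_overwrites flag, and
create_ec_pool forwards any remaining key=value arguments to the profile.
For example, the scrub test later in this file creates its pool with:

    create_ec_pool $poolname $allow_overwrites k=2 m=1 stripe_unit=2K --force || return 1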
+function corrupt_and_repair_lrc() {
     local dir=$1
+    local allow_overwrites=$2
     local poolname=ecpool
-    local profile=myprofile

     setup $dir || return 1
     run_mon $dir a || return 1
     run_mgr $dir x || return 1
     for id in $(seq 0 9) ; do
-        run_osd $dir $id || return 1
+        if [ "$allow_overwrites" = "true" ]; then
+            run_osd_bluestore $dir $id || return 1
+        else
+            run_osd $dir $id || return 1
+        fi
     done
     wait_for_clean || return 1

-    ceph osd erasure-code-profile set $profile \
-        pluing=lrc \
-        k=4 m=2 l=3 \
-        ruleset-failure-domain=osd || return 1
-
-    corrupt_and_repair_erasure_coded $dir $poolname $profile || return 1
+    create_ec_pool $poolname $allow_overwrites k=4 m=2 l=3 plugin=lrc || return 1
+    corrupt_and_repair_erasure_coded $dir $poolname || return 1

     teardown $dir || return 1
 }

-function TEST_unfound_erasure_coded() {
+function TEST_corrupt_and_repair_lrc_appends() {
+    corrupt_and_repair_lrc $1
+}
+
+function TEST_corrupt_and_repair_lrc_overwrites() {
+    corrupt_and_repair_lrc $1 true
+}
+
+function unfound_erasure_coded() {
     local dir=$1
+    local allow_overwrites=$2
     local poolname=ecpool
     local payload=ABCDEF

     setup $dir || return 1
     run_mon $dir a || return 1
     run_mgr $dir x || return 1
-    run_osd $dir 0 || return 1
-    run_osd $dir 1 || return 1
-    run_osd $dir 2 || return 1
-    run_osd $dir 3 || return 1
+    for id in $(seq 0 3) ; do
+        if [ "$allow_overwrites" = "true" ]; then
+            run_osd_bluestore $dir $id || return 1
+        else
+            run_osd $dir $id || return 1
+        fi
+    done
     wait_for_clean || return 1

-    ceph osd erasure-code-profile set myprofile \
-        k=2 m=2 ruleset-failure-domain=osd || return 1
-    ceph osd pool create $poolname 1 1 erasure myprofile \
-        || return 1
-    wait_for_clean || return 1
+    create_ec_pool $poolname $allow_overwrites k=2 m=2 || return 1

     add_something $dir $poolname || return 1

@@ -324,27 +363,35 @@
     teardown $dir || return 1
 }

+function TEST_unfound_erasure_coded_appends() {
+    unfound_erasure_coded $1
+}
+
+function TEST_unfound_erasure_coded_overwrites() {
+    unfound_erasure_coded $1 true
+}
+
 #
 # list_missing for EC pool
 #
-function TEST_list_missing_erasure_coded() {
+function list_missing_erasure_coded() {
     local dir=$1
+    local allow_overwrites=$2
     local poolname=ecpool
-    local profile=myprofile

     setup $dir || return 1
     run_mon $dir a || return 1
     run_mgr $dir x || return 1
     for id in $(seq 0 2) ; do
-        run_osd $dir $id || return 1
+        if [ "$allow_overwrites" = "true" ]; then
+            run_osd_bluestore $dir $id || return 1
+        else
+            run_osd $dir $id || return 1
+        fi
     done
     wait_for_clean || return 1

-    ceph osd erasure-code-profile set $profile \
-        k=2 m=1 ruleset-failure-domain=osd || return 1
-    ceph osd pool create $poolname 1 1 erasure $profile \
-        || return 1
-    wait_for_clean || return 1
+    create_ec_pool $poolname $allow_overwrites k=2 m=1 || return 1

     # Put an object and remove the two shards (including primary)
     add_something $dir $poolname MOBJ0 || return 1
@@ -360,17 +407,17 @@
     done

     id=${osds0[0]}
-    ceph-objectstore-tool --data-path $dir/$id --journal-path $dir/$id/journal \
+    ceph-objectstore-tool --data-path $dir/$id --enable-experimental-unrecoverable-data-corrupting-features=bluestore \
        MOBJ0 remove || return 1
     id=${osds0[1]}
-    ceph-objectstore-tool --data-path $dir/$id --journal-path $dir/$id/journal \
+    ceph-objectstore-tool --data-path $dir/$id --enable-experimental-unrecoverable-data-corrupting-features=bluestore \
        MOBJ0 remove || return 1

     id=${osds1[1]}
-    ceph-objectstore-tool --data-path $dir/$id --journal-path $dir/$id/journal \
+    ceph-objectstore-tool --data-path $dir/$id --enable-experimental-unrecoverable-data-corrupting-features=bluestore \
        MOBJ1 remove || return 1
     id=${osds1[2]}
-    ceph-objectstore-tool --data-path $dir/$id --journal-path $dir/$id/journal \
+    ceph-objectstore-tool --data-path $dir/$id --enable-experimental-unrecoverable-data-corrupting-features=bluestore \
        MOBJ1 remove || return 1

     for id in $(seq 0 2) ; do
@@ -394,6 +441,14 @@
     teardown $dir || return 1
 }

+function TEST_list_missing_erasure_coded_appends() {
+    list_missing_erasure_coded $1 false
+}
+
+function TEST_list_missing_erasure_coded_overwrites() {
+    list_missing_erasure_coded $1 true
+}
+
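When debugging these shard-removal tests, it can help to confirm which OSDs
actually hold an object's shards before and after the ceph-objectstore-tool
calls; one standard way, using the pool and object names from the test above:

    ceph osd map ecpool MOBJ0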
 #
 # Corrupt one copy of a replicated pool
 #
@@ -1513,25 +1568,25 @@ EOF
 #
 # Test scrub errors for an erasure coded pool
 #
-function TEST_corrupt_scrub_erasure() {
+function corrupt_scrub_erasure() {
     local dir=$1
+    local allow_overwrites=$2
     local poolname=ecpool
-    local profile=myprofile
     local total_objs=5

     setup $dir || return 1
     run_mon $dir a || return 1
     run_mgr $dir x || return 1
     for id in $(seq 0 2) ; do
-        run_osd $dir $id || return 1
+        if [ "$allow_overwrites" = "true" ]; then
+            run_osd_bluestore $dir $id || return 1
+        else
+            run_osd $dir $id || return 1
+        fi
     done
     wait_for_clean || return 1

-    ceph osd erasure-code-profile set $profile \
-        k=2 m=1 ruleset-failure-domain=osd || return 1
-    ceph osd pool create $poolname 1 1 erasure $profile \
-        || return 1
-    wait_for_clean || return 1
+    create_ec_pool $poolname $allow_overwrites k=2 m=1 stripe_unit=2K --force || return 1

     for i in $(seq 1 $total_objs) ; do
         objname=EOBJ${i}
@@ -2151,6 +2206,11 @@ EOF
 EOF

     jq "$jqfilter" $dir/json | python -c "$sortkeys" | sed -e "$sedfilter" > $dir/csjson
+    if [ "$allow_overwrites" = "true" ]
+    then
+      grep -v data_digest $dir/csjson | grep -v ec_size_error > $dir/csjson.new && mv $dir/csjson.new $dir/csjson
+      grep -v data_digest $dir/checkcsjson | grep -v ec_size_error > $dir/checkcsjson.new && mv $dir/checkcsjson.new $dir/checkcsjson
+    fi
     diff -y $termwidth $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
     if test $getjson = "yes"
     then
@@ -2166,6 +2226,14 @@ EOF
     teardown $dir || return 1
 }

+function TEST_corrupt_scrub_erasure_appends() {
+    corrupt_scrub_erasure $1 false
+}
+
+function TEST_corrupt_scrub_erasure_overwrites() {
+    corrupt_scrub_erasure $1 true
+}
+
 #
 # Test to make sure that a periodic scrub won't cause deep-scrub info to be lost
 #

From 4315221179a557f9acd7527cc8be92716b15b332 Mon Sep 17 00:00:00 2001
From: Josh Durgin
Date: Thu, 6 Apr 2017 11:40:41 -0700
Subject: [PATCH 05/11] osd,OSDMonitor: try to protect against ec overwrites with filestore

This isn't perfect, but it's better than nothing. Prevent enabling the
allow_ec_overwrites flag if any of a sample of pgs in the pool map to
osds using filestore. This mainly protects filestore-only clusters
from enabling it.

If a filestore osd is started later, warn in the cluster log when it
gets a pg with ec overwrites enabled.

Signed-off-by: Josh Durgin
---
 qa/workunits/cephtool/test.sh | 12 +++++++++
 src/mon/OSDMonitor.cc         | 47 +++++++++++++++++++++++++++++++++++
 src/mon/OSDMonitor.h          |  3 +++
 src/osd/OSD.cc                |  7 ++++++
 4 files changed, 69 insertions(+)

diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh
index 94c11aeebf2..4398ab61146 100755
--- a/qa/workunits/cephtool/test.sh
+++ b/qa/workunits/cephtool/test.sh
@@ -1320,7 +1320,19 @@ function test_mon_osd_pool()
   # should fail because the type is not the same
   expect_false ceph osd pool create replicated 12 12 erasure
   ceph osd lspools | grep replicated
+  ceph osd pool create ec_test 1 1 erasure
+  set +e
+  ceph osd metadata | grep osd_objectstore_type | grep -qc bluestore
+  if [ $? -eq 0 ]; then
+      ceph osd pool set ec_test allow_ec_overwrites true >& $TMPFILE
+      check_response $? 22 "pool must only be stored on bluestore for scrubbing to work"
+  else
+      ceph osd pool set ec_test allow_ec_overwrites true || return 1
+      expect_false ceph osd pool set ec_test allow_ec_overwrites false
+  fi
+  set -e
   ceph osd pool delete replicated replicated --yes-i-really-really-mean-it
+  ceph osd pool delete ec_test ec_test --yes-i-really-really-mean-it
 }

 function test_mon_osd_pool_quota()

diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 5cdbac9866c..1b48c8edccb 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -1232,6 +1232,48 @@ int OSDMonitor::load_metadata(int osd, map<string,string>& m, ostream *err)
   return 0;
 }

+int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
+{
+  map<string, string> metadata;
+  int r = load_metadata(osd, metadata, nullptr);
+  if (r < 0)
+    return r;
+
+  auto it = metadata.find("osd_objectstore");
+  if (it == metadata.end())
+    return -ENOENT;
+  *type = it->second;
+  return 0;
+}
+
+bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
+                                                const pg_pool_t &pool,
+                                                ostream *err)
+{
+  // just check a few pgs for efficiency - this can't give a guarantee anyway,
+  // since filestore osds could always join the pool later
+  set<int> checked_osds;
+  for (unsigned ps = 0; ps < MIN(8, pool.get_pg_num()); ++ps) {
+    vector<int> up, acting;
+    pg_t pgid(ps, pool_id, -1);
+    osdmap.pg_to_up_acting_osds(pgid, up, acting);
+    for (int osd : up) {
+      if (checked_osds.find(osd) != checked_osds.end())
+       continue;
+      string objectstore_type;
+      int r = get_osd_objectstore_type(osd, &objectstore_type);
+      // allow with missing metadata, e.g. due to an osd never booting yet
+      if (r < 0 || objectstore_type == "bluestore") {
+       checked_osds.insert(osd);
+       continue;
+      }
+      *err << "osd." << osd << " uses " << objectstore_type;
+      return false;
+    }
+  }
+  return true;
+}
+
 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
 {
   map<string,string> m;
@@ -5782,6 +5824,11 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
       ss << "expecting value 'true', 'false', '0', or '1'";
       return -EINVAL;
     }
+    stringstream err;
+    if (!is_pool_currently_all_bluestore(pool, p, &err)) {
+      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
+      return -EINVAL;
+    }
   } else if (var == "target_max_objects") {
     if (interr.length()) {
       ss << "error parsing int '" << val << "': " << interr;
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index 3ccbeecec76..5144e89b6d1 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -411,6 +411,9 @@ private:
   OpTracker op_tracker;

   int load_metadata(int osd, map<string,string>& m, ostream *err);
+  int get_osd_objectstore_type(int osd, std::string *type);
+  bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
+                                      ostream *err);

   // when we last received PG stats from each osd
   map<int,utime_t> last_osd_report;
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 1082c66e5db..809e8f21e5d 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -3894,6 +3894,13 @@ int OSD::handle_pg_peering_evt(
     switch (result) {
     case RES_NONE: {
       const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
+      if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
+         store->get_type() != "bluestore") {
+       clog->warn() << "pg " << pgid
+                    << " is at risk of silent data corruption: "
+                    << "the pool allows ec overwrites but is not stored in "
+                    << "bluestore, so deep scrubbing will not detect bitrot";
+      }
       PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
       PG::_init(*rctx.transaction, pgid, pp);
From 25e8e1a08e8ba3b122e99f0108e12b2c120644dd Mon Sep 17 00:00:00 2001
From: Josh Durgin
Date: Wed, 12 Apr 2017 17:59:57 -0700
Subject: [PATCH 06/11] PendingReleaseNotes: note about ec overwrites

Signed-off-by: Josh Durgin
---
 PendingReleaseNotes | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index e2b043034d2..f010976e7ff 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -111,3 +111,8 @@
   ability to be overridden by the erasure code profile setting
   "stripe_unit". For more details see "Erasure Code Profiles" in the
   documentation.
+
+* rbd and cephfs can use erasure coding with bluestore. This may be
+  enabled by setting 'allow_ec_overwrites' to 'true' for a pool. Since
+  this relies on bluestore's checksumming to do deep scrubbing,
+  enabling this on a pool stored on filestore is not allowed.

From bad606605abbec852a261316d4e7bbe116b8c007 Mon Sep 17 00:00:00 2001
From: Josh Durgin
Date: Wed, 12 Apr 2017 19:39:22 -0700
Subject: [PATCH 07/11] doc: update ec docs for overwrite support

Signed-off-by: Josh Durgin
---
 doc/rados/operations/erasure-code.rst | 40 +++++++++++++++++++++------
 doc/rados/operations/pools.rst        | 11 ++++++++
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/doc/rados/operations/erasure-code.rst b/doc/rados/operations/erasure-code.rst
index 43a97c57caa..6878777ac72 100644
--- a/doc/rados/operations/erasure-code.rst
+++ b/doc/rados/operations/erasure-code.rst
@@ -113,12 +113,41 @@ no two *chunks* are stored in the same rack.

 More information can be found in the `erasure code profiles
 <../erasure-code-profile>`_ documentation.

+
+Erasure Coding with Overwrites
+------------------------------
+
+By default, erasure coded pools only work with uses like RGW that
+perform full object writes and appends.
+
+Since Luminous, partial writes for an erasure coded pool may be
+enabled with a per-pool setting. This lets RBD and Cephfs store their
+data in an erasure coded pool::
+
+    ceph osd pool set ec_pool allow_ec_overwrites true
+
+This can only be enabled on a pool residing on bluestore OSDs, since
+bluestore's checksumming is used to detect bitrot or other corruption
+during deep-scrub. In addition to being unsafe, using filestore with
+ec overwrites yields low performance compared to bluestore.
+
+Erasure coded pools do not support omap, so to use them with RBD and
+Cephfs you must instruct them to store their data in an ec pool, and
+their metadata in a replicated pool. For RBD, this means using the
+erasure coded pool as the ``--data-pool`` during image creation::
+
+    rbd create --size 1G --data-pool ec_pool replicated_pool/image_name
+
+For Cephfs, using an erasure coded pool means setting that pool in
+a `file layout <../../cephfs/file-layouts>`_.
+
+
 Erasure coded pool and cache tiering
 ------------------------------------

 Erasure coded pools require more resources than replicated pools and
-lack some functionalities such as partial writes. To overcome these
-limitations, it is recommended to set a `cache tier <../cache-tiering>`_
+lack some functionalities such as omap. To overcome these
+limitations, one can set up a `cache tier <../cache-tiering>`_
 before the erasure coded pool.

 For instance, if the pool *hot-storage* is made of fast storage::
@@ -131,13 +160,6 @@
 will place the *hot-storage* pool as tier of *ecpool* in *writeback*
 mode so that every write and read to the *ecpool* are actually using
 the *hot-storage* and benefit from its flexibility and speed.
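As a concrete sketch of the file-layout approach mentioned above (directory
and pool names illustrative; this uses the standard CephFS layout vxattrs
rather than anything introduced by this series):

    setfattr -n ceph.dir.layout.pool -v ec_pool /mnt/cephfs/ecdir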
-It is not possible to create an RBD image on an erasure coded pool -because it requires partial writes. It is however possible to create -an RBD image on an erasure coded pools when a replicated pool tier set -a cache tier:: - - $ rbd create --size 10G ecpool/myvolume - More information can be found in the `cache tiering <../cache-tiering>`_ documentation. diff --git a/doc/rados/operations/pools.rst b/doc/rados/operations/pools.rst index d370677b1bb..7db41d6364a 100644 --- a/doc/rados/operations/pools.rst +++ b/doc/rados/operations/pools.rst @@ -301,6 +301,16 @@ You may set values for the following keys: :Description: The ruleset to use for mapping object placement in the cluster. :Type: Integer +.. _allow_ec_overwrites: + +``allow_ec_overwrites`` + +:Description: Whether writes to an erasure coded pool can update part + of an object, so cephfs and rbd can use it. See + `Erasure Coding with Overwrites`_ for more details. +:Type: Boolean +:Version: ``12.2.0`` and above + .. _hashpspool: ``hashpspool`` @@ -731,3 +741,4 @@ a size of 3). .. _Pool, PG and CRUSH Config Reference: ../../configuration/pool-pg-config-ref .. _Bloom Filter: http://en.wikipedia.org/wiki/Bloom_filter .. _setting the number of placement groups: ../placement-groups#set-the-number-of-placement-groups +.. _Erasure Coding with Overwrites: ../erasure-code#erasure-coding-with-overwrites From a3e5cba3f124aa8df95816b822f15fb2a2dc79c6 Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Thu, 13 Apr 2017 12:04:10 -0700 Subject: [PATCH 08/11] qa: move ec overwrites tests to a bluestore-only subsuite Signed-off-by: Josh Durgin --- qa/suites/rados/thrash-erasure-code-overwrites/% | 0 qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml | 1 + qa/suites/rados/thrash-erasure-code-overwrites/clusters | 1 + qa/suites/rados/thrash-erasure-code-overwrites/fast | 1 + qa/suites/rados/thrash-erasure-code-overwrites/leveldb.yaml | 1 + qa/suites/rados/thrash-erasure-code-overwrites/msgr-failures | 1 + qa/suites/rados/thrash-erasure-code-overwrites/rados.yaml | 1 + qa/suites/rados/thrash-erasure-code-overwrites/thrashers | 1 + .../workloads/ec-pool-snaps-few-objects-overwrites.yaml | 0 .../workloads/ec-small-objects-fast-read-overwrites.yaml | 0 .../workloads/ec-small-objects-overwrites.yaml | 0 .../workloads/ec-snaps-few-objects-overwrites.yaml | 0 12 files changed, 7 insertions(+) create mode 100644 qa/suites/rados/thrash-erasure-code-overwrites/% create mode 120000 qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml create mode 120000 qa/suites/rados/thrash-erasure-code-overwrites/clusters create mode 120000 qa/suites/rados/thrash-erasure-code-overwrites/fast create mode 120000 qa/suites/rados/thrash-erasure-code-overwrites/leveldb.yaml create mode 120000 qa/suites/rados/thrash-erasure-code-overwrites/msgr-failures create mode 120000 qa/suites/rados/thrash-erasure-code-overwrites/rados.yaml create mode 120000 qa/suites/rados/thrash-erasure-code-overwrites/thrashers rename qa/suites/rados/{thrash-erasure-code => thrash-erasure-code-overwrites}/workloads/ec-pool-snaps-few-objects-overwrites.yaml (100%) rename qa/suites/rados/{thrash-erasure-code => thrash-erasure-code-overwrites}/workloads/ec-small-objects-fast-read-overwrites.yaml (100%) rename qa/suites/rados/{thrash-erasure-code => thrash-erasure-code-overwrites}/workloads/ec-small-objects-overwrites.yaml (100%) rename qa/suites/rados/{thrash-erasure-code => thrash-erasure-code-overwrites}/workloads/ec-snaps-few-objects-overwrites.yaml (100%) diff --git 
a/qa/suites/rados/thrash-erasure-code-overwrites/% b/qa/suites/rados/thrash-erasure-code-overwrites/% new file mode 100644 index 00000000000..e69de29bb2d diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml b/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml new file mode 120000 index 00000000000..1249ffda0ee --- /dev/null +++ b/qa/suites/rados/thrash-erasure-code-overwrites/bluestore.yaml @@ -0,0 +1 @@ +../thrash-erasure-code/objectstore/bluestore.yaml \ No newline at end of file diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/clusters b/qa/suites/rados/thrash-erasure-code-overwrites/clusters new file mode 120000 index 00000000000..646ea04cd93 --- /dev/null +++ b/qa/suites/rados/thrash-erasure-code-overwrites/clusters @@ -0,0 +1 @@ +../thrash-erasure-code/clusters \ No newline at end of file diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/fast b/qa/suites/rados/thrash-erasure-code-overwrites/fast new file mode 120000 index 00000000000..6170b30e009 --- /dev/null +++ b/qa/suites/rados/thrash-erasure-code-overwrites/fast @@ -0,0 +1 @@ +../thrash-erasure-code/fast \ No newline at end of file diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/leveldb.yaml b/qa/suites/rados/thrash-erasure-code-overwrites/leveldb.yaml new file mode 120000 index 00000000000..531ecf3336a --- /dev/null +++ b/qa/suites/rados/thrash-erasure-code-overwrites/leveldb.yaml @@ -0,0 +1 @@ +../thrash-erasure-code/leveldb.yaml \ No newline at end of file diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/msgr-failures b/qa/suites/rados/thrash-erasure-code-overwrites/msgr-failures new file mode 120000 index 00000000000..70c9ca130e2 --- /dev/null +++ b/qa/suites/rados/thrash-erasure-code-overwrites/msgr-failures @@ -0,0 +1 @@ +../thrash-erasure-code/msgr-failures \ No newline at end of file diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/rados.yaml b/qa/suites/rados/thrash-erasure-code-overwrites/rados.yaml new file mode 120000 index 00000000000..017df6f601e --- /dev/null +++ b/qa/suites/rados/thrash-erasure-code-overwrites/rados.yaml @@ -0,0 +1 @@ +../thrash-erasure-code/rados.yaml \ No newline at end of file diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/thrashers b/qa/suites/rados/thrash-erasure-code-overwrites/thrashers new file mode 120000 index 00000000000..40ff82cf70a --- /dev/null +++ b/qa/suites/rados/thrash-erasure-code-overwrites/thrashers @@ -0,0 +1 @@ +../thrash-erasure-code/thrashers \ No newline at end of file diff --git a/qa/suites/rados/thrash-erasure-code/workloads/ec-pool-snaps-few-objects-overwrites.yaml b/qa/suites/rados/thrash-erasure-code-overwrites/workloads/ec-pool-snaps-few-objects-overwrites.yaml similarity index 100% rename from qa/suites/rados/thrash-erasure-code/workloads/ec-pool-snaps-few-objects-overwrites.yaml rename to qa/suites/rados/thrash-erasure-code-overwrites/workloads/ec-pool-snaps-few-objects-overwrites.yaml diff --git a/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-fast-read-overwrites.yaml b/qa/suites/rados/thrash-erasure-code-overwrites/workloads/ec-small-objects-fast-read-overwrites.yaml similarity index 100% rename from qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-fast-read-overwrites.yaml rename to qa/suites/rados/thrash-erasure-code-overwrites/workloads/ec-small-objects-fast-read-overwrites.yaml diff --git a/qa/suites/rados/thrash-erasure-code/workloads/ec-small-objects-overwrites.yaml 
From 4a7c05818306f9c3f6861b9f61f526a36de0d7af Mon Sep 17 00:00:00 2001
From: Josh Durgin
Date: Thu, 13 Apr 2017 17:31:02 -0700
Subject: [PATCH 09/11] qa: use 4k stripe_width again for test_rados_tool.sh

The stripe_unit change altered the default stripe_width to depend on
k. With k=2, setting stripe_unit=2K restores the 4K stripe_width the
test expects; --force is needed because 2K is not 4K-aligned.

Signed-off-by: Josh Durgin
---
 qa/workunits/rados/test_rados_tool.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qa/workunits/rados/test_rados_tool.sh b/qa/workunits/rados/test_rados_tool.sh
index 3a5ddd2e859..a792835d099 100755
--- a/qa/workunits/rados/test_rados_tool.sh
+++ b/qa/workunits/rados/test_rados_tool.sh
@@ -87,7 +87,7 @@ run_expect_nosignal "$RADOS_TOOL" --object_locator "asdf" ls
 run_expect_nosignal "$RADOS_TOOL" --namespace "asdf" ls

 run_expect_succ "$RADOS_TOOL" mkpool "$POOL"
-run_expect_succ "$CEPH_TOOL" osd erasure-code-profile set myprofile k=2 m=1 ruleset-failure-domain=osd
+run_expect_succ "$CEPH_TOOL" osd erasure-code-profile set myprofile k=2 m=1 stripe_unit=2K ruleset-failure-domain=osd --force
 run_expect_succ "$CEPH_TOOL" osd pool create "$POOL_EC" 100 100 erasure myprofile

From f52b9d19f2bc7438afeec8d69d469fb51aeac021 Mon Sep 17 00:00:00 2001
From: Josh Durgin
Date: Tue, 18 Apr 2017 22:54:04 -0700
Subject: [PATCH 10/11] qa: use bluestore for cephfs cephtool tests

This lets the tests enable ec overwrites.

Signed-off-by: Josh Durgin
---
 src/test/cephtool-test-mds.sh | 2 +-
 src/test/vstart_wrapper.sh    | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/test/cephtool-test-mds.sh b/src/test/cephtool-test-mds.sh
index 93ea99b4f85..90f309c0427 100755
--- a/src/test/cephtool-test-mds.sh
+++ b/src/test/cephtool-test-mds.sh
@@ -18,7 +18,7 @@ source $(dirname $0)/detect-build-env-vars.sh

 CEPH_CLI_TEST_DUP_COMMAND=1 \
-MDS=1 MON=1 OSD=3 MGR=0 CEPH_PORT=7200 $CEPH_ROOT/src/test/vstart_wrapper.sh \
+MDS=1 MON=1 OSD=3 MGR=0 CEPH_PORT=7200 CEPH_OBJECTSTORE="bluestore" $CEPH_ROOT/src/test/vstart_wrapper.sh \
     $CEPH_ROOT/qa/workunits/cephtool/test.sh \
     --test-mds \
     --asok-does-not-need-root
diff --git a/src/test/vstart_wrapper.sh b/src/test/vstart_wrapper.sh
index 62cd6e16b8a..17fd9836f66 100755
--- a/src/test/vstart_wrapper.sh
+++ b/src/test/vstart_wrapper.sh
@@ -30,8 +30,13 @@ function vstart_setup()
     trap "teardown $CEPH_DIR" EXIT
     export LC_ALL=C # some tests are vulnerable to i18n
     export PATH="$(pwd):${PATH}"
+    OBJSTORE_ARGS=""
+    if [ "bluestore" = "${CEPH_OBJECTSTORE}" ]; then
+        OBJSTORE_ARGS="-b"
+    fi
     $CEPH_ROOT/src/vstart.sh \
         --short \
+        $OBJSTORE_ARGS \
         -o 'paxos propose interval = 0.01' \
         -n -l || return 1
     export CEPH_CONF=$CEPH_DIR/ceph.conf
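For context, the net effect of the two hunks above is that vstart_wrapper.sh now maps CEPH_OBJECTSTORE=bluestore to an extra -b argument for vstart.sh (which selects bluestore there), and the cephtool wrapper opts in. A sketch of the equivalent manual invocation, with the values taken from cephtool-test-mds.sh::

    # Boot a 3-OSD bluestore vstart cluster and run the cephtool MDS tests.
    CEPH_OBJECTSTORE="bluestore" \
    MDS=1 MON=1 OSD=3 MGR=0 CEPH_PORT=7200 \
    $CEPH_ROOT/src/test/vstart_wrapper.sh \
        $CEPH_ROOT/qa/workunits/cephtool/test.sh --test-mds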
From cab2968e8f7d61f72c568a6cd6e4f08aa7777a5a Mon Sep 17 00:00:00 2001
From: Josh Durgin
Date: Wed, 19 Apr 2017 16:46:09 -0700
Subject: [PATCH 11/11] qa: use bluestore for (k)rbd ec overwrites tests

Only bluestore supports ec overwrites.

Signed-off-by: Josh Durgin
---
 .../krbd/rbd-nomount/tasks/krbd_data_pool.yaml | 18 ++++++++++++++++++
 qa/suites/rbd/cli/pool/ec-data-pool.yaml       | 15 +++++++++++++++
 qa/suites/rbd/librbd/pool/ec-data-pool.yaml    | 15 +++++++++++++++
 qa/suites/rbd/qemu/pool/ec-data-pool.yaml      | 15 +++++++++++++++
 4 files changed, 63 insertions(+)

diff --git a/qa/suites/krbd/rbd-nomount/tasks/krbd_data_pool.yaml b/qa/suites/krbd/rbd-nomount/tasks/krbd_data_pool.yaml
index 35b9d67ebff..1dab39755c8 100644
--- a/qa/suites/krbd/rbd-nomount/tasks/krbd_data_pool.yaml
+++ b/qa/suites/krbd/rbd-nomount/tasks/krbd_data_pool.yaml
@@ -1,3 +1,21 @@
+overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
+  ceph:
+    fs: xfs
+    conf:
+      osd: # force bluestore since it's required for ec overwrites
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 30
+        debug bdev: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        enable experimental unrecoverable data corrupting features: "*"
+        osd debug randomize hobject sort order: false
+# this doesn't work with failures because the log writes are not atomic across the two backends
+# bluestore bluefs env mirror: true
 tasks:
 - workunit:
     clients:
diff --git a/qa/suites/rbd/cli/pool/ec-data-pool.yaml b/qa/suites/rbd/cli/pool/ec-data-pool.yaml
index 523d7450afb..9558cc63a8c 100644
--- a/qa/suites/rbd/cli/pool/ec-data-pool.yaml
+++ b/qa/suites/rbd/cli/pool/ec-data-pool.yaml
@@ -6,7 +6,22 @@ tasks:
 - sudo ceph osd pool set datapool allow_ec_overwrites true

 overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
   ceph:
+    fs: xfs
     conf:
       client:
         rbd default data pool: datapool
+      osd: # force bluestore since it's required for ec overwrites
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 30
+        debug bdev: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        enable experimental unrecoverable data corrupting features: "*"
+        osd debug randomize hobject sort order: false
+# this doesn't work with failures because the log writes are not atomic across the two backends
+# bluestore bluefs env mirror: true
diff --git a/qa/suites/rbd/librbd/pool/ec-data-pool.yaml b/qa/suites/rbd/librbd/pool/ec-data-pool.yaml
index 523d7450afb..9558cc63a8c 100644
--- a/qa/suites/rbd/librbd/pool/ec-data-pool.yaml
+++ b/qa/suites/rbd/librbd/pool/ec-data-pool.yaml
@@ -6,7 +6,22 @@ tasks:
 - sudo ceph osd pool set datapool allow_ec_overwrites true

 overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
   ceph:
+    fs: xfs
     conf:
       client:
         rbd default data pool: datapool
+      osd: # force bluestore since it's required for ec overwrites
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 30
+        debug bdev: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        enable experimental unrecoverable data corrupting features: "*"
+        osd debug randomize hobject sort order: false
+# this doesn't work with failures because the log writes are not atomic across the two backends
+# bluestore bluefs env mirror: true
diff --git a/qa/suites/rbd/qemu/pool/ec-data-pool.yaml b/qa/suites/rbd/qemu/pool/ec-data-pool.yaml
index 523d7450afb..9558cc63a8c 100644
--- a/qa/suites/rbd/qemu/pool/ec-data-pool.yaml
+++ b/qa/suites/rbd/qemu/pool/ec-data-pool.yaml
@@ -6,7 +6,22 @@ tasks:
 - sudo ceph osd pool set datapool allow_ec_overwrites true

 overrides:
+  thrashosds:
+    bdev_inject_crash: 2
+    bdev_inject_crash_probability: .5
   ceph:
+    fs: xfs
     conf:
       client:
         rbd default data pool: datapool
+      osd: # force bluestore since it's required for ec overwrites
+        osd objectstore: bluestore
+        bluestore block size: 96636764160
+        debug bluestore: 30
+        debug bdev: 20
+        debug bluefs: 20
+        debug rocksdb: 10
+        enable experimental unrecoverable data corrupting features: "*"
+        osd debug randomize hobject sort order: false
+# this doesn't work with failures because the log writes are not atomic across the two backends
+# bluestore bluefs env mirror: true
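Taken together with the stripe_unit and allow_ec_overwrites changes earlier in the series, the workflow these suites exercise looks roughly like the sketch below. Pool, profile, and image names are invented for the example, and ``rbd create --data-pool`` is the existing data-pool feature the suites select via the ``rbd default data pool`` client option; none of these commands are added by this patch::

    # EC profile with an explicit per-chunk write size; with k=2 the
    # resulting stripe_width is k * stripe_unit = 8K. Overwrites on this
    # pool require bluestore OSDs.
    ceph osd erasure-code-profile set ecprofile k=2 m=1 stripe_unit=4K
    ceph osd pool create ecpool 32 32 erasure ecprofile
    ceph osd pool set ecpool allow_ec_overwrites true

    # Keep image metadata on a replicated pool; put the data on the EC pool.
    ceph osd pool create rbdpool 32
    rbd create --size 1G --data-pool ecpool rbdpool/myimage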