mirror of
https://github.com/ceph/ceph
synced 2025-01-04 10:12:30 +00:00
OSDMonitor: get stripe_width via stripe_unit in ec profile
With bluestore, making the smallest write match min_alloc_size avoids write amplification. With EC pools this is the stripe unit, or stripe_width / num_data_chunks. Rather than requiring people to divide by k to get the smallest ec write, allow it to be specified directly via stripe_unit. Store it in the ec profile so changing a monitor config option isn't necessary to set it. This is particularly important for ec overwrites since they allow random i/o which should match bluestore's checksum granularity (aka min_alloc_size). Signed-off-by: Josh Durgin <jdurgin@redhat.com>
This commit is contained in:
parent
5080affedc
commit
930eae214c
@ -78,7 +78,6 @@
|
||||
still works. See the documentation page "Mount CephFS in your
|
||||
file systems table" for details.
|
||||
|
||||
|
||||
12.0.1
|
||||
------
|
||||
|
||||
@ -106,3 +105,9 @@
|
||||
* The RGW api for getting object torrent has changed its params from 'get_torrent'
|
||||
to 'torrent' so that it can be compatible with Amazon S3. Now the request for
|
||||
object torrent is like 'GET /ObjectName?torrent'.
|
||||
|
||||
* The configuration option "osd pool erasure code stripe width" has
|
||||
been replaced by "osd pool erasure code stripe unit", and given the
|
||||
ability to be overridden by the erasure code profile setting
|
||||
"stripe_unit". For more details see "Erasure Code Profiles" in the
|
||||
documentation.
|
||||
|
@ -85,14 +85,18 @@ Ceph configuration file.
|
||||
make pool creation work in the absence of ruleset 0.
|
||||
|
||||
|
||||
``osd pool erasure code stripe width``
|
||||
``osd pool erasure code stripe unit``
|
||||
|
||||
:Description: Sets the desired size, in bytes, of an object stripe on every
|
||||
erasure coded pools. Every object of size S will be stored as
|
||||
N stripes and each stripe will be encoded/decoded individually.
|
||||
:Description: Sets the default size, in bytes, of a chunk of an object
|
||||
stripe for erasure coded pools. Every object of size S
|
||||
will be stored as N stripes, with each data chunk
|
||||
receiving ``stripe unit`` bytes. Each stripe of ``k *
|
||||
stripe unit`` bytes (``k`` being the number of data chunks) will be encoded/decoded
|
||||
individually. This option can be overridden by the
|
||||
``stripe_unit`` setting in an erasure code profile.
|
||||
|
||||
:Type: Unsigned 32-bit Integer
|
||||
:Default: ``4096``
|
||||
:Default: ``4096``
|
||||
|
||||
|
||||
``osd pool default size``
|
||||
|
@ -39,6 +39,7 @@ To create a new erasure code profile::
|
||||
ceph osd erasure-code-profile set {name} \
|
||||
[{directory=directory}] \
|
||||
[{plugin=plugin}] \
|
||||
[{stripe_unit=stripe_unit}] \
|
||||
[{key=value} ...] \
|
||||
[--force]
|
||||
|
||||
@ -60,23 +61,40 @@ Where:
|
||||
plugins`_ for more information.
|
||||
|
||||
:Type: String
|
||||
:Required: No.
|
||||
:Required: No.
|
||||
:Default: jerasure
|
||||
|
||||
``{stripe_unit=stripe_unit}``
|
||||
|
||||
:Description: The amount of data in a data chunk, per stripe. For
|
||||
example, a profile with 2 data chunks and stripe_unit=4K
|
||||
would put the range 0-4K in chunk 0, 4K-8K in chunk 1,
|
||||
then 8K-12K in chunk 0 again. This should be a multiple
|
||||
of 4K for best performance. The default value is taken
|
||||
from the monitor config option
|
||||
``osd_pool_erasure_code_stripe_unit`` when a pool is
|
||||
created. The stripe_width of a pool using this profile
|
||||
will be the number of data chunks multiplied by this
|
||||
stripe_unit.
|
||||
|
||||
:Type: String
|
||||
:Required: No.
|
||||
|
||||
``{key=value}``
|
||||
|
||||
:Description: The semantics of the remaining key/value pairs are defined
|
||||
by the erasure code plugin.
|
||||
|
||||
:Type: String
|
||||
:Required: No.
|
||||
:Required: No.
|
||||
|
||||
``--force``
|
||||
|
||||
:Description: Override an existing profile by the same name.
|
||||
:Description: Override an existing profile by the same name, and allow
|
||||
setting a non-4K-aligned stripe_unit.
|
||||
|
||||
:Type: String
|
||||
:Required: No.
|
||||
:Required: No.
|
||||
|
||||
osd erasure-code-profile rm
|
||||
============================
|
||||
|
@ -38,8 +38,6 @@ enum {
|
||||
#define OSD_REP_SPLAY 1
|
||||
#define OSD_REP_CHAIN 2
|
||||
|
||||
#define OSD_POOL_ERASURE_CODE_STRIPE_WIDTH 4096
|
||||
|
||||
class CephContext;
|
||||
|
||||
extern const char *CEPH_CONF_FILE_DEFAULT;
|
||||
|
@ -675,7 +675,7 @@ OPTION(osd_crush_update_on_start, OPT_BOOL, true)
|
||||
OPTION(osd_crush_initial_weight, OPT_DOUBLE, -1) // if >=0, the initial weight is for newly added osds.
|
||||
OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
|
||||
OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
|
||||
OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
|
||||
OPTION(osd_pool_erasure_code_stripe_unit, OPT_U32, 4096) // in bytes
|
||||
OPTION(osd_pool_default_size, OPT_INT, 3)
|
||||
OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; ceph will use size-size/2
|
||||
OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf
|
||||
|
@ -4893,8 +4893,9 @@ void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& prof
|
||||
}
|
||||
}
|
||||
|
||||
int OSDMonitor::normalize_profile(const string& profilename,
|
||||
ErasureCodeProfile &profile,
|
||||
int OSDMonitor::normalize_profile(const string& profilename,
|
||||
ErasureCodeProfile &profile,
|
||||
bool force,
|
||||
ostream *ss)
|
||||
{
|
||||
ErasureCodeInterfaceRef erasure_code;
|
||||
@ -4904,10 +4905,39 @@ int OSDMonitor::normalize_profile(const string& profilename,
|
||||
int err = instance.factory(plugin->second,
|
||||
g_conf->get_val<std::string>("erasure_code_dir"),
|
||||
profile, &erasure_code, ss);
|
||||
if (err)
|
||||
if (err) {
|
||||
return err;
|
||||
}
|
||||
|
||||
return erasure_code->init(profile, ss);
|
||||
err = erasure_code->init(profile, ss);
|
||||
if (err) {
|
||||
return err;
|
||||
}
|
||||
|
||||
auto it = profile.find("stripe_unit");
|
||||
if (it != profile.end()) {
|
||||
string err_str;
|
||||
uint32_t stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
|
||||
if (!err_str.empty()) {
|
||||
*ss << "could not parse stripe_unit '" << it->second
|
||||
<< "': " << err_str << std::endl;
|
||||
return -EINVAL;
|
||||
}
|
||||
uint32_t data_chunks = erasure_code->get_data_chunk_count();
|
||||
uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
|
||||
if (chunk_size != stripe_unit) {
|
||||
*ss << "stripe_unit " << stripe_unit << " does not match ec profile "
|
||||
<< "alignment. Would be padded to " << chunk_size
|
||||
<< std::endl;
|
||||
return -EINVAL;
|
||||
}
|
||||
if ((stripe_unit % 4096) != 0 && !force) {
|
||||
*ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
|
||||
<< "use --force to override this check" << std::endl;
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int OSDMonitor::crush_ruleset_create_erasure(const string &name,
|
||||
@ -5130,12 +5160,22 @@ int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
|
||||
break;
|
||||
case pg_pool_t::TYPE_ERASURE:
|
||||
{
|
||||
ErasureCodeProfile profile =
|
||||
osdmap.get_erasure_code_profile(erasure_code_profile);
|
||||
ErasureCodeInterfaceRef erasure_code;
|
||||
err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
|
||||
uint32_t desired_stripe_width = g_conf->osd_pool_erasure_code_stripe_width;
|
||||
if (err == 0)
|
||||
*stripe_width = erasure_code->get_data_chunk_count() *
|
||||
erasure_code->get_chunk_size(desired_stripe_width);
|
||||
if (err)
|
||||
break;
|
||||
uint32_t data_chunks = erasure_code->get_data_chunk_count();
|
||||
uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit;
|
||||
auto it = profile.find("stripe_unit");
|
||||
if (it != profile.end()) {
|
||||
string err_str;
|
||||
stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
|
||||
assert(err_str.empty());
|
||||
}
|
||||
*stripe_width = data_chunks *
|
||||
erasure_code->get_chunk_size(stripe_unit * data_chunks);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@ -6831,14 +6871,14 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
|
||||
if (err)
|
||||
goto reply;
|
||||
}
|
||||
err = normalize_profile(name, profile_map, &ss);
|
||||
err = normalize_profile(name, profile_map, force, &ss);
|
||||
if (err)
|
||||
goto reply;
|
||||
|
||||
if (osdmap.has_erasure_code_profile(name)) {
|
||||
ErasureCodeProfile existing_profile_map =
|
||||
osdmap.get_erasure_code_profile(name);
|
||||
err = normalize_profile(name, existing_profile_map, &ss);
|
||||
err = normalize_profile(name, existing_profile_map, force, &ss);
|
||||
if (err)
|
||||
goto reply;
|
||||
|
||||
@ -6892,7 +6932,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
|
||||
&ss);
|
||||
if (err)
|
||||
goto reply;
|
||||
err = normalize_profile(name, profile_map, &ss);
|
||||
err = normalize_profile(name, profile_map, true, &ss);
|
||||
if (err)
|
||||
goto reply;
|
||||
dout(20) << "erasure code profile set " << profile << "="
|
||||
|
@ -297,6 +297,7 @@ private:
|
||||
const string& profile) const;
|
||||
int normalize_profile(const string& profilename,
|
||||
ErasureCodeProfile &profile,
|
||||
bool force,
|
||||
ostream *ss);
|
||||
int crush_ruleset_create_erasure(const string &name,
|
||||
const string &profile,
|
||||
|
@ -615,8 +615,8 @@ TEST(ErasureCodeLrc, encode_decode)
|
||||
profile["layers"] = description_string;
|
||||
EXPECT_EQ(0, lrc.init(profile, &cerr));
|
||||
EXPECT_EQ(4U, lrc.get_data_chunk_count());
|
||||
unsigned int stripe_width = g_conf->osd_pool_erasure_code_stripe_width;
|
||||
unsigned int chunk_size = stripe_width / lrc.get_data_chunk_count();
|
||||
unsigned int chunk_size = g_conf->osd_pool_erasure_code_stripe_unit;
|
||||
unsigned int stripe_width = lrc.get_data_chunk_count() * chunk_size;
|
||||
EXPECT_EQ(chunk_size, lrc.get_chunk_size(stripe_width));
|
||||
set<int> want_to_encode;
|
||||
map<int, bufferlist> encoded;
|
||||
@ -745,8 +745,8 @@ TEST(ErasureCodeLrc, encode_decode_2)
|
||||
profile["layers"] = description_string;
|
||||
EXPECT_EQ(0, lrc.init(profile, &cerr));
|
||||
EXPECT_EQ(4U, lrc.get_data_chunk_count());
|
||||
unsigned int stripe_width = g_conf->osd_pool_erasure_code_stripe_width;
|
||||
unsigned int chunk_size = stripe_width / lrc.get_data_chunk_count();
|
||||
unsigned int chunk_size = g_conf->osd_pool_erasure_code_stripe_unit;
|
||||
unsigned int stripe_width = lrc.get_data_chunk_count() * chunk_size;
|
||||
EXPECT_EQ(chunk_size, lrc.get_chunk_size(stripe_width));
|
||||
set<int> want_to_encode;
|
||||
map<int, bufferlist> encoded;
|
||||
|
@ -259,8 +259,9 @@ function TEST_alignment_constraints() {
|
||||
# imposed by the stripe width
|
||||
# See http://tracker.ceph.com/issues/8622
|
||||
#
|
||||
local stripe_width=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_width)
|
||||
local block_size=$((stripe_width - 1))
|
||||
local stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
|
||||
eval local $(ceph osd erasure-code-profile get myprofile | grep k=)
|
||||
local block_size=$((stripe_unit * k - 1))
|
||||
dd if=/dev/zero of=$dir/ORIGINAL bs=$block_size count=2
|
||||
rados --block-size=$block_size \
|
||||
--pool ecpool put UNALIGNED $dir/ORIGINAL || return 1
|
||||
@ -268,9 +269,7 @@ function TEST_alignment_constraints() {
|
||||
}
|
||||
|
||||
function chunk_size() {
|
||||
local stripe_width=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_width)
|
||||
eval local $(ceph osd erasure-code-profile get default | grep k=)
|
||||
echo $(($stripe_width / $k))
|
||||
echo $(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
|
||||
}
|
||||
|
||||
#
|
||||
|
@ -122,30 +122,32 @@ function TEST_erasure_code_profile_default() {
|
||||
ceph osd erasure-code-profile ls | grep default || return 1
|
||||
}
|
||||
|
||||
function TEST_erasure_crush_stripe_width() {
|
||||
function TEST_erasure_crush_stripe_unit() {
|
||||
local dir=$1
|
||||
# the default stripe width is used to initialize the pool
|
||||
# the default stripe unit is used to initialize the pool
|
||||
run_mon $dir a --public-addr $CEPH_MON
|
||||
stripe_width=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_width)
|
||||
stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
|
||||
eval local $(ceph osd erasure-code-profile get myprofile | grep k=)
|
||||
stripe_width = $((stripe_unit * k))
|
||||
ceph osd pool create pool_erasure 12 12 erasure
|
||||
ceph --format json osd dump | tee $dir/osd.json
|
||||
grep '"stripe_width":'$stripe_width $dir/osd.json > /dev/null || return 1
|
||||
}
|
||||
|
||||
function TEST_erasure_crush_stripe_width_padded() {
|
||||
function TEST_erasure_crush_stripe_unit_padded() {
|
||||
local dir=$1
|
||||
# setting osd_pool_erasure_code_stripe_width modifies the stripe_width
|
||||
# setting osd_pool_erasure_code_stripe_unit modifies the stripe_width
|
||||
# and it is padded as required by the default plugin
|
||||
profile+=" plugin=jerasure"
|
||||
profile+=" technique=reed_sol_van"
|
||||
k=4
|
||||
profile+=" k=$k"
|
||||
profile+=" m=2"
|
||||
expected_chunk_size=2048
|
||||
actual_stripe_width=$(($expected_chunk_size * $k))
|
||||
desired_stripe_width=$(($actual_stripe_width - 1))
|
||||
actual_stripe_unit=2048
|
||||
desired_stripe_unit=$((actual_stripe_unit - 1))
|
||||
actual_stripe_width=$((actual_stripe_unit * k))
|
||||
run_mon $dir a \
|
||||
--osd_pool_erasure_code_stripe_width $desired_stripe_width \
|
||||
--osd_pool_erasure_code_stripe_unit $desired_stripe_unit \
|
||||
--osd_pool_default_erasure_code_profile "$profile" || return 1
|
||||
ceph osd pool create pool_erasure 12 12 erasure
|
||||
ceph osd dump | tee $dir/osd.json
|
||||
|
Loading…
Reference in New Issue
Block a user