OSDMonitor: get stripe_width via stripe_unit in ec profile

With bluestore, making the smallest write match min_alloc_size avoids
write amplification. With EC pools this is the stripe unit, or
stripe_width / num_data_chunks. Rather than requiring people to divide
by k to get the smallest ec write, allow it to be specified directly
via stripe_unit. Store it in the ec profile so changing a monitor
config option isn't necessary to set it.

This is particularly important for ec overwrites since they allow random i/o
which should match bluestore's checksum granularity (aka min_alloc_size).

Signed-off-by: Josh Durgin <jdurgin@redhat.com>
This commit is contained in:
Josh Durgin 2017-02-13 17:42:33 -08:00
parent 5080affedc
commit 930eae214c
10 changed files with 109 additions and 42 deletions

View File

@ -78,7 +78,6 @@
still works. See the documentation page "Mount CephFS in your
file systems table" for details.
12.0.1
------
@ -106,3 +105,9 @@
* The RGW api for getting object torrent has changed its params from 'get_torrent'
to 'torrent' so that it can be compatible with Amazon S3. Now the request for
object torrent is like 'GET /ObjectName?torrent'.
* The configuration option "osd pool erasure code stripe width" has
been replaced by "osd pool erasure code stripe unit", and given the
ability to be overridden by the erasure code profile setting
"stripe_unit". For more details see "Erasure Code Profiles" in the
documentation.

View File

@ -85,14 +85,18 @@ Ceph configuration file.
make pool creation work in the absence of ruleset 0.
``osd pool erasure code stripe width``
``osd pool erasure code stripe unit``
:Description: Sets the desired size, in bytes, of an object stripe on every
erasure coded pools. Every object if size S will be stored as
N stripes and each stripe will be encoded/decoded individually.
:Description: Sets the default size, in bytes, of a chunk of an object
stripe for erasure coded pools. Every object of size S
will be stored as N stripes, with each data chunk
receiving ``stripe unit`` bytes. Each stripe of ``k *
stripe unit`` bytes (``k`` being the number of data chunks) will be encoded/decoded
individually. This option is overridden by the
``stripe_unit`` setting in an erasure code profile.
:Type: Unsigned 32-bit Integer
:Default: ``4096``
:Default: ``4096``
``osd pool default size``

View File

@ -39,6 +39,7 @@ To create a new erasure code profile::
ceph osd erasure-code-profile set {name} \
[{directory=directory}] \
[{plugin=plugin}] \
[{stripe_unit=stripe_unit}] \
[{key=value} ...] \
[--force]
@ -60,23 +61,40 @@ Where:
plugins`_ for more information.
:Type: String
:Required: No.
:Required: No.
:Default: jerasure
``{stripe_unit=stripe_unit}``
:Description: The amount of data in a data chunk, per stripe. For
example, a profile with 2 data chunks and stripe_unit=4K
would put the range 0-4K in chunk 0, 4K-8K in chunk 1,
then 8K-12K in chunk 0 again. This should be a multiple
of 4K for best performance. The default value is taken
from the monitor config option
``osd_pool_erasure_code_stripe_unit`` when a pool is
created. The stripe_width of a pool using this profile
will be the number of data chunks multiplied by this
stripe_unit.
:Type: String
:Required: No.
``{key=value}``
:Description: The semantics of the remaining key/value pairs are defined
by the erasure code plugin.
:Type: String
:Required: No.
:Required: No.
``--force``
:Description: Override an existing profile by the same name.
:Description: Override an existing profile by the same name, and allow
setting a non-4K-aligned stripe_unit.
:Type: String
:Required: No.
:Required: No.
osd erasure-code-profile rm
============================

View File

@ -38,8 +38,6 @@ enum {
#define OSD_REP_SPLAY 1
#define OSD_REP_CHAIN 2
#define OSD_POOL_ERASURE_CODE_STRIPE_WIDTH 4096
class CephContext;
extern const char *CEPH_CONF_FILE_DEFAULT;

View File

@ -675,7 +675,7 @@ OPTION(osd_crush_update_on_start, OPT_BOOL, true)
OPTION(osd_crush_initial_weight, OPT_DOUBLE, -1) // if >=0, the initial weight is for newly added osds.
OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes
OPTION(osd_pool_erasure_code_stripe_unit, OPT_U32, 4096) // in bytes
OPTION(osd_pool_default_size, OPT_INT, 3)
OPTION(osd_pool_default_min_size, OPT_INT, 0) // 0 means no specific default; ceph will use size-size/2
OPTION(osd_pool_default_pg_num, OPT_INT, 8) // number of PGs for new pools. Configure in global or mon section of ceph.conf

View File

@ -4893,8 +4893,9 @@ void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& prof
}
}
int OSDMonitor::normalize_profile(const string& profilename,
ErasureCodeProfile &profile,
int OSDMonitor::normalize_profile(const string& profilename,
ErasureCodeProfile &profile,
bool force,
ostream *ss)
{
ErasureCodeInterfaceRef erasure_code;
@ -4904,10 +4905,39 @@ int OSDMonitor::normalize_profile(const string& profilename,
int err = instance.factory(plugin->second,
g_conf->get_val<std::string>("erasure_code_dir"),
profile, &erasure_code, ss);
if (err)
if (err) {
return err;
}
return erasure_code->init(profile, ss);
err = erasure_code->init(profile, ss);
if (err) {
return err;
}
auto it = profile.find("stripe_unit");
if (it != profile.end()) {
string err_str;
uint32_t stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
if (!err_str.empty()) {
*ss << "could not parse stripe_unit '" << it->second
<< "': " << err_str << std::endl;
return -EINVAL;
}
uint32_t data_chunks = erasure_code->get_data_chunk_count();
uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
if (chunk_size != stripe_unit) {
*ss << "stripe_unit " << stripe_unit << " does not match ec profile "
<< "alignment. Would be padded to " << chunk_size
<< std::endl;
return -EINVAL;
}
if ((stripe_unit % 4096) != 0 && !force) {
*ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
<< "use --force to override this check" << std::endl;
return -EINVAL;
}
}
return 0;
}
int OSDMonitor::crush_ruleset_create_erasure(const string &name,
@ -5130,12 +5160,22 @@ int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
break;
case pg_pool_t::TYPE_ERASURE:
{
ErasureCodeProfile profile =
osdmap.get_erasure_code_profile(erasure_code_profile);
ErasureCodeInterfaceRef erasure_code;
err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
uint32_t desired_stripe_width = g_conf->osd_pool_erasure_code_stripe_width;
if (err == 0)
*stripe_width = erasure_code->get_data_chunk_count() *
erasure_code->get_chunk_size(desired_stripe_width);
if (err)
break;
uint32_t data_chunks = erasure_code->get_data_chunk_count();
uint32_t stripe_unit = g_conf->osd_pool_erasure_code_stripe_unit;
auto it = profile.find("stripe_unit");
if (it != profile.end()) {
string err_str;
stripe_unit = strict_si_cast<uint32_t>(it->second.c_str(), &err_str);
assert(err_str.empty());
}
*stripe_width = data_chunks *
erasure_code->get_chunk_size(stripe_unit * data_chunks);
}
break;
default:
@ -6831,14 +6871,14 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
if (err)
goto reply;
}
err = normalize_profile(name, profile_map, &ss);
err = normalize_profile(name, profile_map, force, &ss);
if (err)
goto reply;
if (osdmap.has_erasure_code_profile(name)) {
ErasureCodeProfile existing_profile_map =
osdmap.get_erasure_code_profile(name);
err = normalize_profile(name, existing_profile_map, &ss);
err = normalize_profile(name, existing_profile_map, force, &ss);
if (err)
goto reply;
@ -6892,7 +6932,7 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
&ss);
if (err)
goto reply;
err = normalize_profile(name, profile_map, &ss);
err = normalize_profile(name, profile_map, true, &ss);
if (err)
goto reply;
dout(20) << "erasure code profile set " << profile << "="

View File

@ -297,6 +297,7 @@ private:
const string& profile) const;
int normalize_profile(const string& profilename,
ErasureCodeProfile &profile,
bool force,
ostream *ss);
int crush_ruleset_create_erasure(const string &name,
const string &profile,

View File

@ -615,8 +615,8 @@ TEST(ErasureCodeLrc, encode_decode)
profile["layers"] = description_string;
EXPECT_EQ(0, lrc.init(profile, &cerr));
EXPECT_EQ(4U, lrc.get_data_chunk_count());
unsigned int stripe_width = g_conf->osd_pool_erasure_code_stripe_width;
unsigned int chunk_size = stripe_width / lrc.get_data_chunk_count();
unsigned int chunk_size = g_conf->osd_pool_erasure_code_stripe_unit;
unsigned int stripe_width = lrc.get_data_chunk_count() * chunk_size;
EXPECT_EQ(chunk_size, lrc.get_chunk_size(stripe_width));
set<int> want_to_encode;
map<int, bufferlist> encoded;
@ -745,8 +745,8 @@ TEST(ErasureCodeLrc, encode_decode_2)
profile["layers"] = description_string;
EXPECT_EQ(0, lrc.init(profile, &cerr));
EXPECT_EQ(4U, lrc.get_data_chunk_count());
unsigned int stripe_width = g_conf->osd_pool_erasure_code_stripe_width;
unsigned int chunk_size = stripe_width / lrc.get_data_chunk_count();
unsigned int chunk_size = g_conf->osd_pool_erasure_code_stripe_unit;
unsigned int stripe_width = lrc.get_data_chunk_count() * chunk_size;
EXPECT_EQ(chunk_size, lrc.get_chunk_size(stripe_width));
set<int> want_to_encode;
map<int, bufferlist> encoded;

View File

@ -259,8 +259,9 @@ function TEST_alignment_constraints() {
# imposed by the stripe width
# See http://tracker.ceph.com/issues/8622
#
local stripe_width=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_width)
local block_size=$((stripe_width - 1))
local stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
eval local $(ceph osd erasure-code-profile get myprofile | grep k=)
local block_size=$((stripe_unit * k - 1))
dd if=/dev/zero of=$dir/ORIGINAL bs=$block_size count=2
rados --block-size=$block_size \
--pool ecpool put UNALIGNED $dir/ORIGINAL || return 1
@ -268,9 +269,7 @@ function TEST_alignment_constraints() {
}
function chunk_size() {
local stripe_width=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_width)
eval local $(ceph osd erasure-code-profile get default | grep k=)
echo $(($stripe_width / $k))
echo $(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
}
#

View File

@ -122,30 +122,32 @@ function TEST_erasure_code_profile_default() {
ceph osd erasure-code-profile ls | grep default || return 1
}
function TEST_erasure_crush_stripe_width() {
function TEST_erasure_crush_stripe_unit() {
local dir=$1
# the default stripe width is used to initialize the pool
# the default stripe unit is used to initialize the pool
run_mon $dir a --public-addr $CEPH_MON
stripe_width=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_width)
stripe_unit=$(ceph-conf --show-config-value osd_pool_erasure_code_stripe_unit)
eval local $(ceph osd erasure-code-profile get myprofile | grep k=)
stripe_width = $((stripe_unit * k))
ceph osd pool create pool_erasure 12 12 erasure
ceph --format json osd dump | tee $dir/osd.json
grep '"stripe_width":'$stripe_width $dir/osd.json > /dev/null || return 1
}
function TEST_erasure_crush_stripe_width_padded() {
function TEST_erasure_crush_stripe_unit_padded() {
local dir=$1
# setting osd_pool_erasure_code_stripe_width modifies the stripe_width
# setting osd_pool_erasure_code_stripe_unit modifies the stripe_width
# and it is padded as required by the default plugin
profile+=" plugin=jerasure"
profile+=" technique=reed_sol_van"
k=4
profile+=" k=$k"
profile+=" m=2"
expected_chunk_size=2048
actual_stripe_width=$(($expected_chunk_size * $k))
desired_stripe_width=$(($actual_stripe_width - 1))
actual_stripe_unit=2048
desired_stripe_unit=$((actual_stripe_unit - 1))
actual_stripe_width=$((actual_stripe_unit * k))
run_mon $dir a \
--osd_pool_erasure_code_stripe_width $desired_stripe_width \
--osd_pool_erasure_code_stripe_unit $desired_stripe_unit \
--osd_pool_default_erasure_code_profile "$profile" || return 1
ceph osd pool create pool_erasure 12 12 erasure
ceph osd dump | tee $dir/osd.json