crush: document tunables and rule step set_

Signed-off-by: Loic Dachary <loic@dachary.org>
This commit is contained in:
Loic Dachary 2017-03-02 22:18:37 +01:00
parent 0c2f5f7214
commit 42aa85bd95
2 changed files with 93 additions and 22 deletions

View File

@ -76,6 +76,9 @@ extern struct crush_rule *crush_make_rule(int len, int ruleset, int type, int mi
*
* - __CRUSH_RULE_NOOP__ do nothing.
* - __CRUSH_RULE_TAKE__ select the __arg1__ item
* - __CRUSH_RULE_EMIT__ append the selection to the results and clear
* the selection
*
* - __CRUSH_RULE_CHOOSE_FIRSTN__ and __CRUSH_RULE_CHOOSE_INDEP__
* recursively explore each bucket currently selected, looking for
* __arg1__ items of type __arg2__ and select them.
@ -83,17 +86,54 @@ extern struct crush_rule *crush_make_rule(int len, int ruleset, int type, int mi
* recursively explore each bucket currently selected, looking for
* __arg1__ leaves within all the buckets of type __arg2__ and
* select them.
* - __CRUSH_RULE_EMIT__ append the selection to the results and clear
* the selection
*
* In all __CHOOSE__ steps, if __arg1__ is zero, the number of items
* to select is determined by the __max_result__ argument of
* crush_do_rule(), i.e. __arg1__ is __max_result__ minus the number of
* items already in the result.
*
* - __CRUSH_RULE_SET_CHOOSE_TRIES__ and __CRUSH_RULE_SET_CHOOSELEAF_TRIES__
*
* The CHOOSE_FIRSTN and CHOOSE_INDEP rule steps look for buckets of
* a given type, randomly selecting them. If they are unlucky and
* find the same bucket twice, they will try N+1 times (N being the
* value of the choose_total_tries tunable). If there is a previous
* SET_CHOOSE_TRIES step in the same rule, they will try C times
* instead (C being the value of the argument of the
* SET_CHOOSE_TRIES step).
*
* Note: the __choose_total_tries__ tunable defined in crush_map is
* the number of retries, not the number of tries. The number of tries
* is the number of retries + 1. The SET_CHOOSE_TRIES rule step sets the
* number of tries and does not need the + 1. This confusing
* difference is inherited from an off-by-one bug from years ago.
*
* The CHOOSELEAF_FIRSTN and CHOOSELEAF_INDEP rule steps do the same
* as CHOOSE_FIRSTN and CHOOSE_INDEP but also recursively explore
* each bucket found, looking for a single device. The same device
* may be found in two different buckets because the crush map is
* not a strict hierarchy, it is a DAG. When such a collision
* happens, they will try again. The number of times they try to
* find a non-colliding device is:
*
* - If FIRSTN and there is no previous SET_CHOOSELEAF_TRIES rule
* step: try N + 1 times (N being the value of the
* __choose_total_tries__ tunable defined in crush_map)
*
* - If FIRSTN and there is a previous SET_CHOOSELEAF_TRIES rule
* step: try P times (P being the value of the argument of the
* SET_CHOOSELEAF_TRIES rule step)
*
* - If INDEP and there is no previous SET_CHOOSELEAF_TRIES rule
* step: try 1 time.
*
* - If INDEP and there is a previous SET_CHOOSELEAF_TRIES rule step: try
* P times (P being the value of the argument of the SET_CHOOSELEAF_TRIES
* rule step)
*
* @param rule the rule in which the step is inserted
* @param pos the zero based step index
* @param op one of __CRUSH_RULE_NOOP__, __CRUSH_RULE_TAKE__, __CRUSH_RULE_CHOOSE_FIRSTN__, __CRUSH_RULE_CHOOSE_INDEP__, __CRUSH_RULE_CHOOSELEAF_FIRSTN__, __CRUSH_RULE_CHOOSELEAF_INDEP__ or __CRUSH_RULE_EMIT__
* @param op one of __CRUSH_RULE_NOOP__, __CRUSH_RULE_TAKE__, __CRUSH_RULE_CHOOSE_FIRSTN__, __CRUSH_RULE_CHOOSE_INDEP__, __CRUSH_RULE_CHOOSELEAF_FIRSTN__, __CRUSH_RULE_CHOOSELEAF_INDEP__, __CRUSH_RULE_SET_CHOOSE_TRIES__, __CRUSH_RULE_SET_CHOOSELEAF_TRIES__ or __CRUSH_RULE_EMIT__
* @param arg1 first argument for __op__
* @param arg2 second argument for __op__
*/

View File

@ -315,28 +315,54 @@ struct crush_map {
*/
__s32 max_devices;
/*! choose local retries before re-descent */
/*! Backward compatibility tunable. It implements a bad solution
* and must always be set to 0 except for backward compatibility
* purposes
*/
__u32 choose_local_tries;
/*! choose local attempts using a fallback permutation before
*! re-descent */
/*! Backward compatibility tunable. It implements a bad solution
* and must always be set to 0 except for backward compatibility
* purposes
*/
__u32 choose_local_fallback_tries;
/*! choose attempts before giving up */
/*! Tunable. The default value when the SET_CHOOSE_TRIES or
* SET_CHOOSELEAF_TRIES steps are omitted in a rule. See the
* documentation for crush_rule_set_step() for more
* information
*/
__u32 choose_total_tries;
/*! attempt chooseleaf inner descent once for firstn mode; on
*! reject retry outer descent. Note that this does *not*
*! apply to a collision: in that case we will retry as we used
*! to. */
/*! Backward compatibility tunable. It should always be set
* to 1 except for backward compatibility. Implemented in 2012,
* it was generalized in late 2013 and is now mostly unused except
* in one corner case, which is why it must be set to 1.
*
* Attempt chooseleaf inner descent once for firstn mode; on
* reject retry outer descent. Note that this does *not*
* apply to a collision: in that case we will retry as we
* used to.
*/
__u32 chooseleaf_descend_once;
/*! if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
*! bits. a value of 1 is best for new clusters. for legacy clusters
*! that want to limit reshuffling, a value of 3 or 4 will make the
*! mappings line up a bit better with previous mappings. */
/*! Backward compatibility tunable. It is a fix for bad
* mappings implemented in 2014 at
* https://github.com/ceph/ceph/pull/1185. It should always
* be set to 1 except for backward compatibility.
*
* If non-zero, feed r into chooseleaf, bit-shifted right by
* (r-1) bits. A value of 1 is best for new clusters. For
* legacy clusters that want to limit reshuffling, a value of
* 3 or 4 will make the mappings line up a bit better with
* previous mappings.
*/
__u8 chooseleaf_vary_r;
/*! if true, it makes chooseleaf firstn to return stable results (if
*! no local retry) so that data migrations would be optimal when some
*! device fails. */
/*! Backward compatibility tunable. It is an improvement that
* avoids unnecessary mapping changes, implemented at
* https://github.com/ceph/ceph/pull/6572 and explained in
* this post: "chooseleaf may cause some unnecessary pg
* migrations" in October 2015
* https://www.mail-archive.com/ceph-devel@vger.kernel.org/msg26075.html
* It should always be set to 1 except for backward compatibility.
*/
__u8 chooseleaf_stable;
/*! @cond INTERNAL */
@ -354,12 +380,17 @@ struct crush_map {
size_t working_size;
#ifndef __KERNEL__
/*
* version 0 (original) of straw_calc has various flaws. version 1
* fixes a few of them.
/*! @endcond */
/*! Backward compatibility tunable. It is a fix for the straw
* scaler values of the straw algorithm, which is deprecated
* (straw2 replaces it). The fix was implemented at
* https://github.com/ceph/ceph/pull/3057. It should always
* be set to 1 except for backward compatibility.
*
*/
__u8 straw_calc_version;
/*! @cond INTERNAL */
/*
* allowed bucket algs is a bitmask, here the bit positions
* are CRUSH_BUCKET_*. note that these are *bits* and