mirror of
https://github.com/ceph/ceph
synced 2025-01-03 01:22:53 +00:00
crush: document tunables and rule step set_
Signed-off-by: Loic Dachary <loic@dachary.org>
This commit is contained in:
parent
0c2f5f7214
commit
42aa85bd95
@ -76,6 +76,9 @@ extern struct crush_rule *crush_make_rule(int len, int ruleset, int type, int mi
|
||||
*
|
||||
* - __CRUSH_RULE_NOOP__ do nothing.
|
||||
* - __CRUSH_RULE_TAKE__ select the __arg1__ item
|
||||
* - __CRUSH_RULE_EMIT__ append the selection to the results and clear
|
||||
* the selection
|
||||
*
|
||||
* - __CRUSH_RULE_CHOOSE_FIRSTN__ and __CRUSH_RULE_CHOOSE_INDEP__
|
||||
* recursively explore each bucket currently selected, looking for
|
||||
* __arg1__ items of type __arg2__ and select them.
|
||||
@ -83,17 +86,54 @@ extern struct crush_rule *crush_make_rule(int len, int ruleset, int type, int mi
|
||||
* recursively explore each bucket currently selected, looking for
|
||||
* __arg1__ leaves within all the buckets of type __arg2__ and
|
||||
* select them.
|
||||
* - __CRUSH_RULE_EMIT__ append the selection to the results and clear
|
||||
* the selection
|
||||
*
|
||||
* In all __CHOOSE__ steps, if __arg1__ is zero, the number of items
|
||||
* to select is determined by the __max_result__ argument of
|
||||
* crush_do_rule(), i.e. __arg1__ is __max_result__ minus the number of
|
||||
* items already in the result.
|
||||
*
|
||||
* - __CRUSH_RULE_SET_CHOOSE_TRIES__ and __CRUSH_RULE_SET_CHOOSELEAF_TRIES__
|
||||
*
|
||||
* The CHOOSE_FIRSTN and CHOOSE_INDEP rule steps look for buckets of
|
||||
* a given type, randomly selecting them. If they are unlucky and
|
||||
* find the same bucket twice, they will try N+1 times (N being the
|
||||
* value of the choose_total_tries tunable). If there is a previous
|
||||
* SET_CHOOSE_TRIES step in the same rule, it will try C times
|
||||
* instead (C being the value of the argument of the
|
||||
* SET_CHOOSE_TRIES step).
|
||||
*
|
||||
* Note: the __choose_total_tries__ tunable defined in crush_map is
|
||||
* the number of retries, not the number of tries. The number of tries
|
||||
* is the number of retries + 1. The SET_CHOOSE_TRIES rule step sets the
|
||||
* number of tries and does not need the + 1. This confusing
|
||||
* difference is inherited from an off-by-one bug from years ago.
|
||||
*
|
||||
* The CHOOSELEAF_FIRSTN and CHOOSELEAF_INDEP rule steps do the same
|
||||
* as CHOOSE_FIRSTN and CHOOSE_INDEP but also recursively explore
|
||||
* each bucket found, looking for a single device. The same device
|
||||
* may be found in two different buckets because the crush map is
|
||||
* not a strict hierarchy; it is a DAG. When such a collision
|
||||
* happens, they will try again. The number of times they try to
|
||||
* find a non-colliding device is:
|
||||
*
|
||||
* - If FIRSTN and there is no previous SET_CHOOSELEAF_TRIES rule
|
||||
* step: try N + 1 times (N being the value of the
|
||||
* __choose_total_tries__ tunable defined in crush_map)
|
||||
*
|
||||
* - If FIRSTN and there is a previous SET_CHOOSELEAF_TRIES rule
|
||||
* step: try P times (P being the value of the argument of the
|
||||
* SET_CHOOSELEAF_TRIES rule step)
|
||||
*
|
||||
* - If INDEP and there is no previous SET_CHOOSELEAF_TRIES rule
|
||||
* step: try 1 time.
|
||||
*
|
||||
* - If INDEP and there is a previous SET_CHOOSELEAF_TRIES rule step: try
|
||||
* P times (P being the value of the argument of the SET_CHOOSELEAF_TRIES
|
||||
* rule step)
|
||||
*
|
||||
* @param rule the rule in which the step is inserted
|
||||
* @param pos the zero based step index
|
||||
* @param op one of __CRUSH_RULE_NOOP__, __CRUSH_RULE_TAKE__, __CRUSH_RULE_CHOOSE_FIRSTN__, __CRUSH_RULE_CHOOSE_INDEP__, __CRUSH_RULE_CHOOSELEAF_FIRSTN__, __CRUSH_RULE_CHOOSELEAF_INDEP__ or __CRUSH_RULE_EMIT__
|
||||
* @param op one of __CRUSH_RULE_NOOP__, __CRUSH_RULE_TAKE__, __CRUSH_RULE_CHOOSE_FIRSTN__, __CRUSH_RULE_CHOOSE_INDEP__, __CRUSH_RULE_CHOOSELEAF_FIRSTN__, __CRUSH_RULE_CHOOSELEAF_INDEP__, __CRUSH_RULE_SET_CHOOSE_TRIES__, __CRUSH_RULE_SET_CHOOSELEAF_TRIES__ or __CRUSH_RULE_EMIT__
|
||||
* @param arg1 first argument for __op__
|
||||
* @param arg2 second argument for __op__
|
||||
*/
|
||||
|
@ -315,28 +315,54 @@ struct crush_map {
|
||||
*/
|
||||
__s32 max_devices;
|
||||
|
||||
/*! choose local retries before re-descent */
|
||||
/*! Backward compatibility tunable. It implements a bad solution
|
||||
* and must always be set to 0 except for backward compatibility
|
||||
* purposes
|
||||
*/
|
||||
__u32 choose_local_tries;
|
||||
/*! choose local attempts using a fallback permutation before
|
||||
*! re-descent */
|
||||
/*! Backward compatibility tunable. It implements a bad solution
|
||||
* and must always be set to 0 except for backward compatibility
|
||||
* purposes
|
||||
*/
|
||||
__u32 choose_local_fallback_tries;
|
||||
/*! choose attempts before giving up */
|
||||
/*! Tunable. The default value when the SET_CHOOSE_TRIES or
|
||||
* SET_CHOOSELEAF_TRIES steps are omitted in a rule. See the
|
||||
* documentation for crush_rule_set_step() for more
|
||||
* information
|
||||
*/
|
||||
__u32 choose_total_tries;
|
||||
/*! attempt chooseleaf inner descent once for firstn mode; on
|
||||
*! reject retry outer descent. Note that this does *not*
|
||||
*! apply to a collision: in that case we will retry as we used
|
||||
*! to. */
|
||||
/*! Backward compatibility tunable. It should always be set
|
||||
* to 1 except for backward compatibility. Implemented in 2012,
|
||||
* it was generalized in late 2013 and is mostly unused except
|
||||
* in one border case, which is why it must be set to 1.
|
||||
*
|
||||
* Attempt chooseleaf inner descent once for firstn mode; on
|
||||
* reject retry outer descent. Note that this does *not*
|
||||
* apply to a collision: in that case we will retry as we
|
||||
* used to.
|
||||
*/
|
||||
__u32 chooseleaf_descend_once;
|
||||
|
||||
/*! if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
|
||||
*! bits. a value of 1 is best for new clusters. for legacy clusters
|
||||
*! that want to limit reshuffling, a value of 3 or 4 will make the
|
||||
*! mappings line up a bit better with previous mappings. */
|
||||
/*! Backward compatibility tunable. It is a fix for bad
|
||||
* mappings implemented in 2014 at
|
||||
* https://github.com/ceph/ceph/pull/1185. It should always
|
||||
* be set to 1 except for backward compatibility.
|
||||
*
|
||||
* If non-zero, feed r into chooseleaf, bit-shifted right by
|
||||
* (r-1) bits. A value of 1 is best for new clusters. For
|
||||
* legacy clusters that want to limit reshuffling, a value of
|
||||
* 3 or 4 will make the mappings line up a bit better with
|
||||
* previous mappings.
|
||||
*/
|
||||
__u8 chooseleaf_vary_r;
|
||||
|
||||
/*! if true, it makes chooseleaf firstn to return stable results (if
|
||||
*! no local retry) so that data migrations would be optimal when some
|
||||
*! device fails. */
|
||||
/*! Backward compatibility tunable. It is an improvement that
|
||||
* avoids unnecessary mapping changes, implemented at
|
||||
* https://github.com/ceph/ceph/pull/6572 and explained in
|
||||
* this post: "chooseleaf may cause some unnecessary pg
|
||||
* migrations" in October 2015
|
||||
* https://www.mail-archive.com/ceph-devel@vger.kernel.org/msg26075.html
|
||||
* It should always be set to 1 except for backward compatibility.
|
||||
*/
|
||||
__u8 chooseleaf_stable;
|
||||
|
||||
/*! @cond INTERNAL */
|
||||
@ -354,12 +380,17 @@ struct crush_map {
|
||||
size_t working_size;
|
||||
|
||||
#ifndef __KERNEL__
|
||||
/*
|
||||
* version 0 (original) of straw_calc has various flaws. version 1
|
||||
* fixes a few of them.
|
||||
/*! @endcond */
|
||||
/*! Backward compatibility tunable. It is a fix for the straw
|
||||
* scaler values of the straw algorithm, which is deprecated
|
||||
* (straw2 replaces it); the fix was implemented at
|
||||
* https://github.com/ceph/ceph/pull/3057. It should always
|
||||
* be set to 1 except for backward compatibility.
|
||||
*
|
||||
*/
|
||||
__u8 straw_calc_version;
|
||||
|
||||
/*! @cond INTERNAL */
|
||||
/*
|
||||
* allowed bucket algs is a bitmask, here the bit positions
|
||||
* are CRUSH_BUCKET_*. note that these are *bits* and
|
||||
|
Loading…
Reference in New Issue
Block a user