From 42aa85bd95b50f8bffc54c531331a13c0c73e033 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Thu, 2 Mar 2017 22:18:37 +0100 Subject: [PATCH] crush: document tunables and rule step set_ Signed-off-by: Loic Dachary --- src/crush/builder.h | 46 ++++++++++++++++++++++++++++-- src/crush/crush.h | 69 ++++++++++++++++++++++++++++++++------------- 2 files changed, 93 insertions(+), 22 deletions(-) diff --git a/src/crush/builder.h b/src/crush/builder.h index aad103a97de..c6c004abe8d 100644 --- a/src/crush/builder.h +++ b/src/crush/builder.h @@ -76,6 +76,9 @@ extern struct crush_rule *crush_make_rule(int len, int ruleset, int type, int mi * * - __CRUSH_RULE_NOOP__ do nothing. * - __CRUSH_RULE_TAKE__ select the __arg1__ item + * - __CRUSH_RULE_EMIT__ append the selection to the results and clear + * the selection + * * - __CRUSH_RULE_CHOOSE_FIRSTN__ and __CRUSH_RULE_CHOOSE_INDEP__ * recursively explore each bucket currently selected, looking for * __arg1__ items of type __arg2__ and select them. @@ -83,17 +86,54 @@ extern struct crush_rule *crush_make_rule(int len, int ruleset, int type, int mi * recursively explore each bucket currently selected, looking for * __arg1__ leaves within all the buckets of type __arg2__ and * select them. - * - __CRUSH_RULE_EMIT__ append the selection to the results and clear - * the selection * * In all __CHOOSE__ steps, if __arg1__ is zero, the number of items * to select is determined by the __max_result__ argument of * crush_do_rule(), i.e. __arg1__ is __max_result__ minus the number of * items already in the result. * + * - __CRUSH_RULE_SET_CHOOSE_TRIES__ and __CRUSH_RULE_SET_CHOOSELEAF_TRIES__ + * + * The CHOOSE_FIRSTN and CHOOSE_INDEP rule step look for buckets of + * a given type, randomly selecting them. If they are unlucky and + * find the same bucket twice, they will try N+1 times (N being the + * value of the choose_total_tries tunable). 
If there is a previous + * SET_CHOOSE_TRIES step in the same rule, they will try C times + * instead (C being the value of the argument of the + * SET_CHOOSE_TRIES step). + * + * Note: the __choose_total_tries__ tunable defined in crush_map is + * the number of retries, not the number of tries. The number of tries + * is the number of retries + 1. The SET_CHOOSE_TRIES rule step sets the + * number of tries and does not need the + 1. This confusing + * difference is inherited from an off-by-one bug from years ago. + * + * The CHOOSELEAF_FIRSTN and CHOOSELEAF_INDEP rule steps do the same + * as CHOOSE_FIRSTN and CHOOSE_INDEP but also recursively explore + * each bucket found, looking for a single device. The same device + * may be found in two different buckets because the crush map is + * not a strict hierarchy; it is a DAG. When such a collision + * happens, they will try again. The number of times they try to + * find a non-colliding device is: + * + * - If FIRSTN and there is no previous SET_CHOOSELEAF_TRIES rule + * step: try N + 1 times (N being the value of the + * __choose_total_tries__ tunable defined in crush_map) + * + * - If FIRSTN and there is a previous SET_CHOOSELEAF_TRIES rule + * step: try P times (P being the value of the argument of the + * SET_CHOOSELEAF_TRIES rule step) + * + * - If INDEP and there is no previous SET_CHOOSELEAF_TRIES rule + * step: try 1 time. 
+ * + * - If INDEP and there is a previous SET_CHOOSELEAF_TRIES rule step: try + * P times (P being the value of the argument of the SET_CHOOSELEAF_TRIES + * rule step) + * * @param rule the rule in which the step is inserted * @param pos the zero based step index - * @param op one of __CRUSH_RULE_NOOP__, __CRUSH_RULE_TAKE__, __CRUSH_RULE_CHOOSE_FIRSTN__, __CRUSH_RULE_CHOOSE_INDEP__, __CRUSH_RULE_CHOOSELEAF_FIRSTN__, __CRUSH_RULE_CHOOSELEAF_INDEP__ or __CRUSH_RULE_EMIT__ + * @param op one of __CRUSH_RULE_NOOP__, __CRUSH_RULE_TAKE__, __CRUSH_RULE_CHOOSE_FIRSTN__, __CRUSH_RULE_CHOOSE_INDEP__, __CRUSH_RULE_CHOOSELEAF_FIRSTN__, __CRUSH_RULE_CHOOSELEAF_INDEP__, __CRUSH_RULE_SET_CHOOSE_TRIES__, __CRUSH_RULE_SET_CHOOSELEAF_TRIES__ or __CRUSH_RULE_EMIT__ * @param arg1 first argument for __op__ * @param arg2 second argument for __op__ */ diff --git a/src/crush/crush.h b/src/crush/crush.h index 9af8c6aceb4..0e332bea0c3 100644 --- a/src/crush/crush.h +++ b/src/crush/crush.h @@ -315,28 +315,54 @@ struct crush_map { */ __s32 max_devices; - /*! choose local retries before re-descent */ + /*! Backward compatibility tunable. It implements a bad solution + * and must always be set to 0 except for backward compatibility + * purposes. + */ __u32 choose_local_tries; - /*! choose local attempts using a fallback permutation before - *! re-descent */ + /*! Backward compatibility tunable. It implements a bad solution + * and must always be set to 0 except for backward compatibility + * purposes. + */ __u32 choose_local_fallback_tries; - /*! choose attempts before giving up */ + /*! Tunable. The default value when the SET_CHOOSE_TRIES or + * SET_CHOOSELEAF_TRIES steps are omitted in a rule. See the + * documentation for crush_rule_set_step() for more + * information. + */ __u32 choose_total_tries; - /*! attempt chooseleaf inner descent once for firstn mode; on - *! reject retry outer descent. Note that this does *not* - *! apply to a collision: in that case we will retry as we used - *! to. */ + /*! 
Backward compatibility tunable. It should always be set + * to 1 except for backward compatibility. Implemented in 2012, + * it was generalized in late 2013 and is mostly unused except + * in one corner case, which is why it must be set to 1. + * + * Attempt chooseleaf inner descent once for firstn mode; on + * reject, retry outer descent. Note that this does *not* + * apply to a collision: in that case we will retry as we + * used to. + */ __u32 chooseleaf_descend_once; - - /*! if non-zero, feed r into chooseleaf, bit-shifted right by (r-1) - *! bits. a value of 1 is best for new clusters. for legacy clusters - *! that want to limit reshuffling, a value of 3 or 4 will make the - *! mappings line up a bit better with previous mappings. */ + /*! Backward compatibility tunable. It is a fix for bad + * mappings implemented in 2014 at + * https://github.com/ceph/ceph/pull/1185. It should always + * be set to 1 except for backward compatibility. + * + * If non-zero, feed r into chooseleaf, bit-shifted right by + * (r-1) bits. A value of 1 is best for new clusters. For + * legacy clusters that want to limit reshuffling, a value of + * 3 or 4 will make the mappings line up a bit better with + * previous mappings. + */ __u8 chooseleaf_vary_r; - /*! if true, it makes chooseleaf firstn to return stable results (if - *! no local retry) so that data migrations would be optimal when some - *! device fails. */ + /*! Backward compatibility tunable. It is an improvement that + * avoids unnecessary mapping changes, implemented at + * https://github.com/ceph/ceph/pull/6572 and explained in + * this post: "chooseleaf may cause some unnecessary pg + * migrations" in October 2015 + * https://www.mail-archive.com/ceph-devel@vger.kernel.org/msg26075.html + * It should always be set to 1 except for backward compatibility. + */ __u8 chooseleaf_stable; /*! 
@cond INTERNAL */ @@ -354,12 +380,17 @@ struct crush_map { size_t working_size; #ifndef __KERNEL__ - /* - * version 0 (original) of straw_calc has various flaws. version 1 - * fixes a few of them. + /*! @endcond */ + /*! Backward compatibility tunable. It is a fix for the straw + * scaler values of the straw algorithm, which is deprecated + * (straw2 replaces it). The fix was implemented at + * https://github.com/ceph/ceph/pull/3057. It should always + * be set to 1 except for backward compatibility. + * */ __u8 straw_calc_version; + /*! @cond INTERNAL */ /* * allowed bucket algs is a bitmask, here the bit positions * are CRUSH_BUCKET_*. note that these are *bits* and