Merge pull request #8357 from liewegas/wip-osd-prestart

osd: update crush_location from ceph-osd on startup

Reviewed-by: Kefu Chai <kchai@redhat.com>
This commit is contained in:
Sage Weil 2016-05-09 08:59:05 -04:00
commit a28b71e3c9
13 changed files with 268 additions and 47 deletions

View File

@ -10,3 +10,14 @@
New monitors will now use rocksdb by default, but if that file is
not present, existing monitors will use leveldb. The ``mon keyvaluedb`` option
now only affects the backend chosen when a monitor is created.
* The 'osd crush initial weight' option allows you to specify a CRUSH
weight for a newly added OSD. Previously a value of 0 (the default)
meant that we should use the size of the OSD's store to weight the
new OSD. Now, a value of 0 means it should have a weight of 0, and
a negative value (the new default) means we should automatically
weight the OSD based on its size. If your configuration file
explicitly specifies a value of 0 for this option you will need to
change it to a negative value (e.g., -1) to preserve the previous
behavior of weighting the OSD by its size.

View File

@ -184,7 +184,8 @@ set(crush_srcs
crush/hash.c
crush/CrushWrapper.cc
crush/CrushCompiler.cc
crush/CrushTester.cc)
crush/CrushTester.cc
crush/CrushLocation.cc)
add_library(crush STATIC ${crush_srcs})

View File

@ -20,32 +20,6 @@ fi
data="/var/lib/ceph/osd/${cluster:-ceph}-$id"
journal="$data/journal"
update="$(ceph-conf --cluster=${cluster:-ceph} --name=osd.$id --lookup osd_crush_update_on_start || :)"
if [ "${update:-1}" = "1" -o "${update:-1}" = "true" ]; then
# update location in crush
hook="$(ceph-conf --cluster=${cluster:-ceph} --name=osd.$id --lookup osd_crush_location_hook || :)"
if [ -z "$hook" ]; then
hook="/usr/bin/ceph-crush-location"
fi
location="$($hook --cluster ${cluster:-ceph} --id $id --type osd)"
weight="$(ceph-conf --cluster=${cluster:-ceph} --name=osd.$id --lookup osd_crush_initial_weight || :)"
if [ -e $data/block ]; then
defaultweight=`blockdev --getsize64 $data/block | awk '{ d= $1/1099511627776 ; r = sprintf("%.4f", d); print r }'`
else
defaultweight=`df -P -k $data/ | tail -1 | awk '{ d= $2/1073741824 ; r = sprintf("%.4f", d); print r }'`
fi
ceph \
--cluster="${cluster:-ceph}" \
--name="osd.$id" \
--keyring="$data/keyring" \
osd crush create-or-move \
-- \
"$id" \
"${weight:-${defaultweight:-1}}" \
$location
fi
if [ -L "$journal" -a ! -e "$journal" ]; then
udevadm settle --timeout=5 || :
if [ -L "$journal" -a ! -e "$journal" ]; then

View File

@ -239,6 +239,7 @@ public:
// Returns a statically allocated, NULL-terminated array naming the config
// options this object tracks.
// NOTE(review): presumably these are the options whose changes are
// delivered to handle_conf_change() -- confirm against the observer API.
const char** get_tracked_conf_keys() const {
  static const char *tracked_options[] = {
    "enable_experimental_unrecoverable_data_corrupting_features",
    "crush_location",
    NULL
  };
  return tracked_options;
}
@ -246,13 +247,20 @@ public:
void handle_conf_change(const md_config_t *conf,
const std::set <std::string> &changed) {
ceph_spin_lock(&cct->_feature_lock);
get_str_set(conf->enable_experimental_unrecoverable_data_corrupting_features,
cct->_experimental_features);
ceph_spin_unlock(&cct->_feature_lock);
if (!cct->_experimental_features.empty())
lderr(cct) << "WARNING: the following dangerous and experimental features are enabled: "
<< cct->_experimental_features << dendl;
if (changed.count(
"enable_experimental_unrecoverable_data_corrupting_features")) {
ceph_spin_lock(&cct->_feature_lock);
get_str_set(
conf->enable_experimental_unrecoverable_data_corrupting_features,
cct->_experimental_features);
ceph_spin_unlock(&cct->_feature_lock);
if (!cct->_experimental_features.empty())
lderr(cct) << "WARNING: the following dangerous and experimental features are enabled: "
<< cct->_experimental_features << dendl;
}
if (changed.count("crush_location")) {
cct->crush_location.update_from_conf();
}
}
};
@ -459,6 +467,7 @@ CephContext::CephContext(uint32_t module_type_, int init_flags_)
_crypto_aes(NULL),
_plugin_registry(NULL),
_lockdep_obs(NULL),
crush_location(this),
_cct_perf(NULL)
{
ceph_spin_init(&_service_thread_lock);

View File

@ -25,6 +25,7 @@
#include "include/atomic.h"
#include "common/cmdparse.h"
#include "include/Spinlock.h"
#include "crush/CrushLocation.h"
#include <boost/noncopyable.hpp>
class AdminSocket;
@ -247,6 +248,10 @@ private:
md_config_obs_t *_lockdep_obs;
public:
CrushLocation crush_location;
private:
enum {
l_cct_first,
l_cct_total_workers,

View File

@ -417,6 +417,8 @@ OPTION(client_use_faked_inos, OPT_BOOL, false)
OPTION(client_mds_namespace, OPT_INT, -1)
OPTION(crush_location, OPT_STR, "") // whitespace-separated list of key=value pairs describing crush location
OPTION(crush_location_hook, OPT_STR, "")
OPTION(crush_location_hook_timeout, OPT_INT, 10)
OPTION(objecter_tick_interval, OPT_DOUBLE, 5.0)
OPTION(objecter_timeout, OPT_DOUBLE, 10.0) // before we ask for a map
@ -610,9 +612,9 @@ OPTION(osd_pg_op_threshold_ratio, OPT_U64, 2) // the expected maximu
OPTION(osd_pg_bits, OPT_INT, 6) // bits per osd
OPTION(osd_pgp_bits, OPT_INT, 6) // bits per osd
OPTION(osd_crush_chooseleaf_type, OPT_INT, 1) // 1 = host
// This parameter is not consumed by ceph C code but the upstart scripts.
// OPTION(osd_crush_initial_weight, OPT_DOUBLE, 0) // the initial weight is for newly added osds.
OPTION(osd_pool_use_gmt_hitset, OPT_BOOL, true) // try to use gmt for hitset archive names if all osds in cluster support it.
OPTION(osd_crush_update_on_start, OPT_BOOL, true)
OPTION(osd_crush_initial_weight, OPT_DOUBLE, -1) // if >=0, the initial weight is for newly added osds.
OPTION(osd_pool_default_crush_rule, OPT_INT, -1) // deprecated for osd_pool_default_crush_replicated_ruleset
OPTION(osd_pool_default_crush_replicated_ruleset, OPT_INT, CEPH_DEFAULT_CRUSH_REPLICATED_RULESET)
OPTION(osd_pool_erasure_code_stripe_width, OPT_U32, OSD_POOL_ERASURE_CODE_STRIPE_WIDTH) // in bytes

105
src/crush/CrushLocation.cc Normal file
View File

@ -0,0 +1,105 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#include "CrushLocation.h"
#include "CrushWrapper.h"
#include "common/config.h"
#include "include/str_list.h"
#include "common/debug.h"
#include <common/SubProcess.h>
#include <vector>
// Refresh the cached location from the crush_location config option.
// An empty option is not parsed: the cached location is left untouched
// and 0 is returned.  Otherwise the result of _parse() is returned.
int CrushLocation::update_from_conf()
{
  const std::string& configured = cct->_conf->crush_location;
  if (configured.length() == 0)
    return 0;
  return _parse(configured);
}
/**
 * Parse a crush location string and, on success, install it as the
 * cached location.
 *
 * @param s key=value pairs separated by any of the characters ";, \t"
 * @return 0 on success; -EINVAL if the pairs do not parse, in which case
 *         the cached location is left as it was
 */
int CrushLocation::_parse(const std::string& s)
{
  std::multimap<std::string,std::string> new_crush_location;
  std::vector<std::string> lvec;
  get_str_vec(s, ";, \t", lvec);
  int r = CrushWrapper::parse_loc_multimap(lvec, &new_crush_location);

  // loc is shared with get_location(); take the lock before any access,
  // including the read performed by the failure message below.
  std::lock_guard<std::mutex> l(lock);
  if (r < 0) {
    // Report the string that was actually handed to us: it may be hook
    // output rather than the crush_location config option.
    lderr(cct) << "warning: crush_location '" << s
	       << "' does not parse, keeping original crush_location "
	       << loc << dendl;
    return -EINVAL;
  }
  loc.swap(new_crush_location);
  lgeneric_dout(cct, 10) << "crush_location is " << loc << dendl;
  return 0;
}
/**
 * Run the configured crush_location_hook and parse its stdout as the new
 * crush location.
 *
 * The hook is invoked with "--cluster <cluster> --id <id> --type <type>"
 * taken from the config, and is bounded by crush_location_hook_timeout.
 * At most 100 KiB of stdout is read; trailing whitespace is stripped
 * before parsing.
 *
 * @return 0 if no hook is configured or the output parsed; the spawn or
 *         read error if either of those failed; -EINVAL if joining the
 *         hook failed or its output did not parse
 */
int CrushLocation::update_from_hook()
{
  if (cct->_conf->crush_location_hook.length() == 0)
    return 0;

  SubProcessTimed hook(
    cct->_conf->crush_location_hook.c_str(),
    SubProcess::CLOSE, SubProcess::PIPE, SubProcess::PIPE,
    cct->_conf->crush_location_hook_timeout);
  hook.add_cmd_args(
    "--cluster", cct->_conf->cluster.c_str(),
    "--id", cct->_conf->name.get_id().c_str(),
    "--type", cct->_conf->name.get_type_str(),
    NULL);
  int ret = hook.spawn();
  if (ret != 0) {
    lderr(cct) << "error: failed run " << cct->_conf->crush_location_hook << ": "
	       << hook.err() << dendl;
    return ret;
  }

  bufferlist bl;
  ret = bl.read_fd(hook.get_stdout(), 100 * 1024);
  if (ret < 0) {
    lderr(cct) << "error: failed read stdout from "
	       << cct->_conf->crush_location_hook
	       << ": " << cpp_strerror(-ret) << dendl;
    // Dump whatever the hook wrote to stderr to help diagnose the failure.
    bufferlist err;
    err.read_fd(hook.get_stderr(), 100 * 1024);
    lderr(cct) << "stderr:\n";
    err.hexdump(*_dout);
    *_dout << dendl;
  }

  // Every successful spawn() is paired with a join(), even when reading
  // stdout failed, so the child is always reaped before we return.
  // NOTE(review): SubProcess is not visible here; presumably a spawned
  // child that is never joined is leaked (or trips an assertion) -- confirm.
  if (hook.join() != 0) {
    lderr(cct) << "error: failed to join: " << hook.err() << dendl;
    return -EINVAL;
  }

  if (ret < 0)
    return ret;

  std::string out;
  bl.copy(0, bl.length(), out);
  // find_last_not_of() yields npos for all-whitespace output; npos+1 wraps
  // to 0, so the whole string is erased in that case.
  out.erase(out.find_last_not_of(" \n\r\t")+1);
  return _parse(out);
}
/**
 * Establish the initial crush location.
 *
 * Sources are tried in order and the first one that is configured wins:
 *   1. the crush_location config option,
 *   2. the crush_location_hook program,
 *   3. a built-in default of host=<hostname> root=default.
 *
 * @return the result of update_from_conf() or update_from_hook() when one
 *         of those is used, otherwise 0
 */
int CrushLocation::init_on_startup()
{
  if (cct->_conf->crush_location.length()) {
    return update_from_conf();
  }
  if (cct->_conf->crush_location_hook.length()) {
    return update_from_hook();
  }

  // start with a sane default
  //
  // gethostname() need not NUL-terminate a truncated name.  The buffer is
  // zero-filled and one byte is withheld from the call, so the final byte
  // always remains a terminator.
  char hostname[HOST_NAME_MAX + 1] = {0};
  int r = gethostname(hostname, sizeof(hostname)-1);
  if (r < 0)
    strcpy(hostname, "unknown_host");

  std::lock_guard<std::mutex> l(lock);
  loc.clear();
  loc.insert(make_pair<std::string,std::string>("host", hostname));
  loc.insert(make_pair<std::string,std::string>("root", "default"));
  lgeneric_dout(cct, 10) << "crush_location is (default) " << loc << dendl;
  return 0;
}

35
src/crush/CrushLocation.h Normal file
View File

@ -0,0 +1,35 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#ifndef CEPH_CRUSH_LOCATION_H
#define CEPH_CRUSH_LOCATION_H
#include <map>
#include <mutex>
#include <string>
class CephContext;
/**
 * Caches a crush location as a multimap of key/value pairs and guards it
 * with a mutex so it can be refreshed and read from different threads.
 *
 * The location can be populated from the crush_location config option,
 * from the output of the crush_location_hook program, or from a built-in
 * default; see the individual methods.
 */
class CrushLocation {
  CephContext *cct;
  std::multimap<std::string,std::string> loc;  ///< current location
  mutable std::mutex lock;  ///< guards loc; mutable so const readers can lock

  /// parse key=value pairs from s and, on success, replace loc
  int _parse(const std::string& s);

public:
  /// Seeds the location from the config.  explicit: a CephContext pointer
  /// must never convert silently into a CrushLocation.
  explicit CrushLocation(CephContext *c) : cct(c) {
    update_from_conf();
  }

  int update_from_conf();  ///< refresh from config
  int update_from_hook();  ///< call hook, if present
  int init_on_startup();   ///< pick config, hook or default, in that order

  /// Returns a copy of the current location, taken under the lock.
  std::multimap<std::string,std::string> get_location() const {
    std::lock_guard<std::mutex> l(lock);
    return loc;
  }
};
#endif

View File

@ -5,11 +5,13 @@ libcrush_la_SOURCES = \
crush/hash.c \
crush/CrushWrapper.cc \
crush/CrushCompiler.cc \
crush/CrushTester.cc
crush/CrushTester.cc \
crush/CrushLocation.cc
noinst_LTLIBRARIES += libcrush.la
noinst_HEADERS += \
crush/CrushCompiler.h \
crush/CrushLocation.h \
crush/CrushTester.h \
crush/CrushTreeDumper.h \
crush/CrushWrapper.h \

View File

@ -321,6 +321,8 @@ void global_init(std::vector < const char * > *alt_def_args,
if (code_env == CODE_ENVIRONMENT_DAEMON && !(flags & CINIT_FLAG_NO_DAEMON_ACTIONS))
output_ceph_version();
g_ceph_context->crush_location.init_on_startup();
}
void global_print_banner(void)

View File

@ -2207,6 +2207,12 @@ int OSD::init()
}
}
r = update_crush_location();
if (r < 0) {
osd_lock.Lock();
goto monout;
}
osd_lock.Lock();
if (is_stopping())
return 0;
@ -2752,6 +2758,82 @@ int OSD::shutdown()
return r;
}
int OSD::update_crush_location()
{
if (!g_conf->osd_crush_update_on_start) {
dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
return 0;
}
char weight[32];
if (g_conf->osd_crush_initial_weight >= 0) {
snprintf(weight, sizeof(weight), "%.4lf", g_conf->osd_crush_initial_weight);
} else {
struct statfs st;
int r = store->statfs(&st);
if (r < 0) {
derr << "statfs: " << cpp_strerror(r) << dendl;
return r;
}
snprintf(weight, sizeof(weight), "%.4lf",
MAX((double).00001,
(double)(st.f_blocks * st.f_bsize) /
(double)(1ull << 40 /* TB */)));
}
std::multimap<string,string> loc = cct->crush_location.get_location();
dout(10) << __func__ << " crush location is " << loc << dendl;
string cmd =
string("{\"prefix\": \"osd crush create-or-move\", ") +
string("\"id\": ") + stringify(whoami) + string(", ") +
string("\"weight\":") + weight + string(", ") +
string("\"args\": [");
for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
if (p != loc.begin())
cmd += ", ";
cmd += "\"" + p->first + "=" + p->second + "\"";
}
cmd += "]}";
bool created = false;
while (true) {
dout(10) << __func__ << " cmd: " << cmd << dendl;
vector<string> vcmd{cmd};
bufferlist inbl;
C_SaferCond w;
string outs;
int r = monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
if (r == 0)
r = w.wait();
if (r < 0) {
if (r == -ENOENT && !created) {
string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
+ ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
vector<string> vnewcmd{newcmd};
bufferlist inbl;
C_SaferCond w;
string outs;
int r = monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
if (r == 0)
r = w.wait();
if (r < 0) {
derr << __func__ << " fail: osd does not exist and created failed: "
<< cpp_strerror(r) << dendl;
return r;
}
created = true;
continue;
}
derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
return r;
}
break;
}
return 0;
}
void OSD::write_superblock(ObjectStore::Transaction& t)
{
dout(10) << "write_superblock " << superblock << dendl;

View File

@ -2404,6 +2404,8 @@ protected:
}
private:
int update_crush_location();
static int write_meta(ObjectStore *store,
uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami);

View File

@ -184,16 +184,7 @@ void Objecter::handle_conf_change(const struct md_config_t *conf,
void Objecter::update_crush_location()
{
unique_lock wl(rwlock);
std::multimap<string,string> new_crush_location;
vector<string> lvec;
get_str_vec(cct->_conf->crush_location, ";, \t", lvec);
int r = CrushWrapper::parse_loc_multimap(lvec, &new_crush_location);
if (r < 0) {
lderr(cct) << "warning: crush_location '" << cct->_conf->crush_location
<< "' does not parse, leave origin crush_location untouched." << dendl;
return;
}
crush_location = new_crush_location;
crush_location = cct->crush_location.get_location();
}
// messages ------------------------------