ceph/src/mon/OSDMonitor.h


// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
/* Object Store Device (OSD) Monitor
 */

#ifndef CEPH_OSDMONITOR_H
#define CEPH_OSDMONITOR_H

#include <map>
#include <set>
using namespace std;

#include "include/types.h"
#include "msg/Messenger.h"

#include "osd/OSDMap.h"

#include "PaxosService.h"
#include "Session.h"

class Monitor;
class PGMap;

#include "messages/MOSDBoot.h"
#include "messages/MMonCommand.h"
#include "messages/MOSDMap.h"
#include "messages/MPoolOp.h"

#include "erasure-code/ErasureCodeInterface.h"
#include "common/TrackedOp.h"
#include "mon/MonOpRequest.h"

#define OSD_METADATA_PREFIX "osd_metadata"
/// information about a particular peer's failure reports for one osd
struct failure_reporter_t {
  int num_reports;       ///< reports from this reporter
  utime_t failed_since;  ///< when they think it failed
  MonOpRequestRef op;    ///< most recent failure op request

  failure_reporter_t() : num_reports(0) {}
  failure_reporter_t(utime_t s) : num_reports(1), failed_since(s) {}
  ~failure_reporter_t() { }
};
/// information about all failure reports for one osd
struct failure_info_t {
  map<int, failure_reporter_t> reporters;  ///< reporter -> # reports
  utime_t max_failed_since;                ///< most recent failed_since
  int num_reports;

  failure_info_t() : num_reports(0) {}

  utime_t get_failed_since() {
    if (max_failed_since == utime_t() && !reporters.empty()) {
      // the report carrying the old max must have been canceled; recalculate.
      for (map<int, failure_reporter_t>::iterator p = reporters.begin();
           p != reporters.end();
           ++p)
        if (p->second.failed_since > max_failed_since)
          max_failed_since = p->second.failed_since;
    }
    return max_failed_since;
  }

  // Record the latest report.  Return the reporter's previous op request,
  // if any, so the caller can discard it.
  MonOpRequestRef add_report(int who, utime_t failed_since,
                             MonOpRequestRef op) {
    map<int, failure_reporter_t>::iterator p = reporters.find(who);
    if (p == reporters.end()) {
      if (max_failed_since == utime_t())
        max_failed_since = failed_since;
      else if (max_failed_since < failed_since)
        max_failed_since = failed_since;
      p = reporters.insert(
        map<int, failure_reporter_t>::value_type(
          who, failure_reporter_t(failed_since))).first;
    } else {
      p->second.num_reports++;
    }
    num_reports++;

    MonOpRequestRef ret = p->second.op;
    p->second.op = op;
    return ret;
  }

  void take_report_messages(list<MonOpRequestRef>& ls) {
    for (map<int, failure_reporter_t>::iterator p = reporters.begin();
         p != reporters.end();
         ++p) {
      if (p->second.op) {
        ls.push_back(p->second.op);
        p->second.op.reset();
      }
    }
  }

  void cancel_report(int who) {
    map<int, failure_reporter_t>::iterator p = reporters.find(who);
    if (p == reporters.end())
      return;
    num_reports -= p->second.num_reports;
    reporters.erase(p);
    if (reporters.empty())
      max_failed_since = utime_t();
  }
};
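
// Illustrative sketch only (not part of the original header): how a caller
// might aggregate failure reports with failure_info_t and retire superseded
// op requests.  The ids, the threshold, and mark_down_target() below are
// hypothetical.
//
//   failure_info_t& fi = failure_info[target_osd];
//   MonOpRequestRef old_op = fi.add_report(reporter_osd, failed_since, op);
//   if (old_op)
//     old_op.reset();                // drop the reporter's previous request
//   if (fi.reporters.size() >= 2)    // hypothetical reporter threshold
//     mark_down_target(target_osd);  // hypothetical follow-up action
//   fi.cancel_report(reporter_osd);  // a reporter may later retract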
class OSDMonitor : public PaxosService {
public:
  OSDMap osdmap;

private:
  // [leader]
  OSDMap::Incremental pending_inc;
  map<int, bufferlist> pending_metadata;
  set<int> pending_metadata_rm;
  map<int, failure_info_t> failure_info;
  map<int,utime_t> down_pending_out;  // osd down -> out
  map<int,double> osd_weight;

  void check_failures(utime_t now);
  bool check_failure(utime_t now, int target_osd, failure_info_t& fi);

  // map thrashing
  int thrash_map;
  int thrash_last_up_osd;
  bool thrash();

  bool _have_pending_crush();
  CrushWrapper &_get_stable_crush();
  void _get_pending_crush(CrushWrapper& newcrush);
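
  // Illustrative sketch only (not part of the original header): the usual
  // pattern for staging a crush change on the pending incremental via the
  // helper above; the mutation step is elided and hypothetical.
  //
  //   CrushWrapper newcrush;
  //   _get_pending_crush(newcrush);   // start from pending (or stable) crush
  //   /* ...mutate newcrush... */
  //   pending_inc.crush.clear();
  //   newcrush.encode(pending_inc.crush);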

  // svc
public:
  void create_initial();
private:
  void update_from_paxos(bool *need_bootstrap);
  void create_pending();  // prepare a new pending
  void encode_pending(MonitorDBStore::TransactionRef t);
  void on_active();
  void on_shutdown();

  /**
   * We have not delegated full-map stashing to PaxosService for some time
   * now, so this function has nothing to do in the current context.
   */
  virtual void encode_full(MonitorDBStore::TransactionRef t) { }

  /**
   * Do not let PaxosService periodically stash full osdmaps, or we will
   * break our locally-managed full maps.  (update_from_paxos loads the
   * latest and writes them out going forward from there, but if we just
   * synced that may mean we skip some.)
   */
  virtual bool should_stash_full() {
    return false;
  }

  /**
   * hook into trim to include the oldest full map in the trim transaction
   *
   * This ensures that anyone post-sync will have enough to rebuild their
   * full osdmaps.
   */
  void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first);

  void update_msgr_features();
  int check_cluster_features(uint64_t features, stringstream &ss);
  /**
   * Check if the cluster supports the features required by the given crush
   * map.  Outputs the daemons which don't support it to the stringstream.
   *
   * @returns true if the map is passable, false otherwise
   */
  bool validate_crush_against_features(const CrushWrapper *newcrush,
                                       stringstream &ss);

  void share_map_with_random_osd();

  void maybe_prime_pg_temp();
  void prime_pg_temp(OSDMap& next,
                     ceph::unordered_map<pg_t, pg_stat_t>::iterator pp);
  int prime_pg_temp(OSDMap& next, PGMap *pg_map, int osd);

  void update_logger();

  void handle_query(PaxosServiceMessage *m);
  bool preprocess_query(MonOpRequestRef op);  // true if processed.
  bool prepare_update(MonOpRequestRef op);
  bool should_propose(double &delay);

  version_t get_trim_to();
  bool can_mark_down(int o);
  bool can_mark_up(int o);
  bool can_mark_out(int o);
  bool can_mark_in(int o);

  // ...
  MOSDMap *build_latest_full();
  MOSDMap *build_incremental(epoch_t first, epoch_t last);
  void send_full(MonOpRequestRef op);
  void send_incremental(MonOpRequestRef op, epoch_t first);
  void send_incremental(epoch_t first, MonSession *session, bool onetime);

  int reweight_by_utilization(int oload, std::string& out_str, bool by_pg,
                              const set<int64_t> *pools);
  void print_utilization(ostream &out, Formatter *f, bool tree) const;

  bool check_source(PaxosServiceMessage *m, uuid_d fsid);

  bool preprocess_get_osdmap(MonOpRequestRef op);

  bool preprocess_mark_me_down(MonOpRequestRef op);

  friend class C_AckMarkedDown;
  bool preprocess_failure(MonOpRequestRef op);
  bool prepare_failure(MonOpRequestRef op);
  bool prepare_mark_me_down(MonOpRequestRef op);
  void process_failures();
  void take_all_failures(list<MonOpRequestRef>& ls);

  bool preprocess_boot(MonOpRequestRef op);
  bool prepare_boot(MonOpRequestRef op);
  void _booted(MonOpRequestRef op, bool logit);

  bool preprocess_alive(MonOpRequestRef op);
  bool prepare_alive(MonOpRequestRef op);
  void _reply_map(MonOpRequestRef op, epoch_t e);

  bool preprocess_pgtemp(MonOpRequestRef op);
  bool prepare_pgtemp(MonOpRequestRef op);

  int _check_remove_pool(int64_t pool, const pg_pool_t *pi, ostream *ss);
  bool _check_become_tier(
      int64_t tier_pool_id, const pg_pool_t *tier_pool,
      int64_t base_pool_id, const pg_pool_t *base_pool,
      int *err, ostream *ss) const;
  bool _check_remove_tier(
      int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
      int *err, ostream *ss) const;

  int _prepare_remove_pool(int64_t pool, ostream *ss);
  int _prepare_rename_pool(int64_t pool, string newname);

  bool preprocess_pool_op(MonOpRequestRef op);
  bool preprocess_pool_op_create(MonOpRequestRef op);
  bool prepare_pool_op(MonOpRequestRef op);
  bool prepare_pool_op_create(MonOpRequestRef op);
  bool prepare_pool_op_delete(MonOpRequestRef op);

  int crush_rename_bucket(const string& srcname,
                          const string& dstname,
                          ostream *ss);
  int normalize_profile(ErasureCodeProfile &profile, ostream *ss);
  int crush_ruleset_create_erasure(const string &name,
                                   const string &profile,
                                   int *ruleset,
                                   ostream *ss);
  int get_crush_ruleset(const string &ruleset_name,
                        int *crush_ruleset,
                        ostream *ss);
  int get_erasure_code(const string &erasure_code_profile,
                       ErasureCodeInterfaceRef *erasure_code,
                       ostream *ss) const;
  int prepare_pool_crush_ruleset(const unsigned pool_type,
                                 const string &erasure_code_profile,
                                 const string &ruleset_name,
                                 int *crush_ruleset,
                                 ostream *ss);
  bool erasure_code_profile_in_use(const map<int64_t, pg_pool_t> &pools,
                                   const string &profile,
                                   ostream *ss);
  int parse_erasure_code_profile(const vector<string> &erasure_code_profile,
                                 map<string,string> *erasure_code_profile_map,
                                 ostream *ss);
  int prepare_pool_size(const unsigned pool_type,
                        const string &erasure_code_profile,
                        unsigned *size, unsigned *min_size,
                        ostream *ss);
  int prepare_pool_stripe_width(const unsigned pool_type,
                                const string &erasure_code_profile,
                                unsigned *stripe_width,
                                ostream *ss);
  int prepare_new_pool(string& name, uint64_t auid,
                       int crush_ruleset,
                       const string &crush_ruleset_name,
                       unsigned pg_num, unsigned pgp_num,
                       const string &erasure_code_profile,
                       const unsigned pool_type,
                       const uint64_t expected_num_objects,
                       ostream *ss);
  int prepare_new_pool(MonOpRequestRef op);

  void update_pool_flags(int64_t pool_id, uint64_t flags);
  bool update_pools_status();
  void get_pools_health(list<pair<health_status_t,string> >& summary,
                        list<pair<health_status_t,string> > *detail) const;

  bool prepare_set_flag(MonOpRequestRef op, int flag);
  bool prepare_unset_flag(MonOpRequestRef op, int flag);

  void _pool_op_reply(MonOpRequestRef op,
                      int ret, epoch_t epoch, bufferlist *blp=NULL);
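
  // Completion contexts for deferred ops.  The contract implemented by each
  // _finish() below: r >= 0 carries out the deferred action, -ECANCELED
  // drops the op, and -EAGAIN re-dispatches it so it is re-evaluated against
  // the current state.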
  struct C_Booted : public C_MonOp {
    OSDMonitor *cmon;
    bool logit;
    C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true) :
      C_MonOp(op_), cmon(cm), logit(l) {}
    void _finish(int r) {
      if (r >= 0)
        cmon->_booted(op, logit);
      else if (r == -ECANCELED)
        return;
      else if (r == -EAGAIN)
        cmon->dispatch(op);
      else
        assert(0 == "bad C_Booted return value");
    }
  };

  struct C_ReplyMap : public C_MonOp {
    OSDMonitor *osdmon;
    epoch_t e;
    C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
      : C_MonOp(op_), osdmon(o), e(ee) {}
    void _finish(int r) {
      if (r >= 0)
        osdmon->_reply_map(op, e);
      else if (r == -ECANCELED)
        return;
      else if (r == -EAGAIN)
        osdmon->dispatch(op);
      else
        assert(0 == "bad C_ReplyMap return value");
    }
  };

  struct C_PoolOp : public C_MonOp {
    OSDMonitor *osdmon;
    int replyCode;
    int epoch;
    bufferlist reply_data;
    C_PoolOp(OSDMonitor *osd, MonOpRequestRef op_, int rc, int e, bufferlist *rd=NULL) :
      C_MonOp(op_), osdmon(osd), replyCode(rc), epoch(e) {
      if (rd)
        reply_data = *rd;
    }
    void _finish(int r) {
      if (r >= 0)
        osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
      else if (r == -ECANCELED)
        return;
      else if (r == -EAGAIN)
        osdmon->dispatch(op);
      else
        assert(0 == "bad C_PoolOp return value");
    }
  };
  bool preprocess_remove_snaps(MonOpRequestRef op);
  bool prepare_remove_snaps(MonOpRequestRef op);

  CephContext *cct;
  OpTracker op_tracker;

  int load_metadata(int osd, map<string, string>& m, ostream *err);

public:
  OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p, string service_name)
    : PaxosService(mn, p, service_name),
      thrash_map(0), thrash_last_up_osd(-1),
      op_tracker(cct, true, 1)
  { }
  void tick();  // check state, take actions

  int parse_osd_id(const char *s, stringstream *pss);

  void get_health(list<pair<health_status_t,string> >& summary,
                  list<pair<health_status_t,string> > *detail) const;
  bool preprocess_command(MonOpRequestRef op);
  bool prepare_command(MonOpRequestRef op);
  bool prepare_command_impl(MonOpRequestRef op, map<string,cmd_vartype>& cmdmap);

  int set_crash_replay_interval(const int64_t pool_id, const uint32_t cri);
  int prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
                               stringstream& ss);

  void handle_osd_timeouts(const utime_t &now,
                           std::map<int,utime_t> &last_osd_report);
  void mark_all_down();

  void send_latest(MonOpRequestRef op, epoch_t start=0);
  void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
    op->mark_osdmon_event(__func__);
    send_incremental(op, start);
  }

  epoch_t blacklist(const entity_addr_t& a, utime_t until);

  void dump_info(Formatter *f);
  int dump_osd_metadata(int osd, Formatter *f, ostream *err);
  void print_nodes(Formatter *f);
  void check_subs();
  void check_sub(Subscription *sub);
  void add_flag(int flag) {
    if (!(osdmap.flags & flag)) {
      if (pending_inc.new_flags < 0)
        pending_inc.new_flags = osdmap.flags;
      pending_inc.new_flags |= flag;
    }
  }

  void remove_flag(int flag) {
    if (osdmap.flags & flag) {
      if (pending_inc.new_flags < 0)
        pending_inc.new_flags = osdmap.flags;
      pending_inc.new_flags &= ~flag;
    }
  }
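
  // Illustrative sketch only (not part of the original header): new_flags is
  // negative while unset, so the first change seeds it from the committed
  // map's flags; later changes OR bits in or mask them out.  CEPH_OSDMAP_NOOUT
  // is used purely as an example flag.
  //
  //   add_flag(CEPH_OSDMAP_NOOUT);     // pending_inc.new_flags |= NOOUT
  //   remove_flag(CEPH_OSDMAP_NOOUT);  // pending_inc.new_flags &= ~NOOUT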
};
#endif