mirror of
https://github.com/ceph/ceph
synced 2025-01-03 17:42:36 +00:00
Merge pull request #8558 from branch-predictor/bp-mark-down-on-perm-rst
msg: mark daemons down on RST + ECONNREFUSED Reviewed-by: Sage Weil <sage@redhat.com> Reviewed-by: Gregory Farnum <gfarnum@redhat.com> Reviewed-by: Haomai Wang <haomai@xsky.com>
This commit is contained in:
commit
a033dc6f5b
@ -12380,6 +12380,12 @@ void Client::ms_handle_remote_reset(Connection *con)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Messenger callback for a peer that refuses our connection attempts.
 *
 * The client only writes the peer address to its log (level 1) and
 * returns false, i.e. it does not claim the event.
 */
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << "ms_handle_refused on "
                << con->get_peer_addr()
                << dendl;
  return false;
}
|
||||
|
||||
bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
|
||||
{
|
||||
if (dest_type == CEPH_ENTITY_TYPE_MON)
|
||||
|
@ -569,6 +569,7 @@ protected:
|
||||
void ms_handle_connect(Connection *con);
|
||||
bool ms_handle_reset(Connection *con);
|
||||
void ms_handle_remote_reset(Connection *con);
|
||||
bool ms_handle_refused(Connection *con);
|
||||
bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new);
|
||||
|
||||
int authenticate();
|
||||
|
@ -815,6 +815,7 @@ OPTION(osd_op_history_duration, OPT_U32, 600) // Oldest completed op to track
|
||||
OPTION(osd_target_transaction_size, OPT_INT, 30) // to adjust various transactions that batch smaller items
|
||||
OPTION(osd_failsafe_full_ratio, OPT_FLOAT, .97) // what % full makes an OSD "full" (failsafe)
|
||||
OPTION(osd_failsafe_nearfull_ratio, OPT_FLOAT, .90) // what % full makes an OSD near full (failsafe)
|
||||
OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL, true) // immediately mark OSDs as down once they refuse to accept connections
|
||||
|
||||
OPTION(osd_pg_object_context_cache_count, OPT_INT, 64)
|
||||
OPTION(osd_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled
|
||||
|
@ -478,6 +478,10 @@ void librados::RadosClient::ms_handle_remote_reset(Connection *con)
|
||||
{
|
||||
}
|
||||
|
||||
// Connection-refused events are not acted upon by RadosClient; returning
// false leaves the event unclaimed.
bool librados::RadosClient::ms_handle_refused(Connection *con)
{
  return false;
}
|
||||
|
||||
bool librados::RadosClient::_dispatch(Message *m)
|
||||
{
|
||||
|
@ -58,6 +58,7 @@ private:
|
||||
void ms_handle_connect(Connection *con);
|
||||
bool ms_handle_reset(Connection *con);
|
||||
void ms_handle_remote_reset(Connection *con);
|
||||
bool ms_handle_refused(Connection *con);
|
||||
|
||||
Objecter *objecter;
|
||||
|
||||
|
@ -94,6 +94,7 @@ public:
|
||||
void ms_handle_connect(Connection *c) {}
|
||||
bool ms_handle_reset(Connection *c) {return false;}
|
||||
void ms_handle_remote_reset(Connection *c) {}
|
||||
// Refused connections are not acted upon here; false reports the event as unhandled.
bool ms_handle_refused(Connection *c) {return false;}
|
||||
|
||||
void notify_mdsmap(MDSMap const *mdsmap);
|
||||
void notify_health(MDSRank const *mds);
|
||||
|
@ -1271,6 +1271,12 @@ void MDSDaemon::ms_handle_remote_reset(Connection *con)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Messenger callback for a refused connection attempt.
 *
 * Deliberately a no-op for now: the event is left unclaimed (false).
 */
bool MDSDaemon::ms_handle_refused(Connection *con)
{
  return false;
}
|
||||
|
||||
bool MDSDaemon::ms_verify_authorizer(Connection *con, int peer_type,
|
||||
int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
|
||||
bool& is_valid, CryptoKey& session_key)
|
||||
|
@ -138,6 +138,7 @@ class MDSDaemon : public Dispatcher, public md_config_obs_t {
|
||||
void ms_handle_connect(Connection *con);
|
||||
bool ms_handle_reset(Connection *con);
|
||||
void ms_handle_remote_reset(Connection *con);
|
||||
bool ms_handle_refused(Connection *con);
|
||||
|
||||
protected:
|
||||
// admin socket handling
|
||||
|
@ -24,22 +24,41 @@ class MOSDFailure : public PaxosServiceMessage {
|
||||
static const int HEAD_VERSION = 3;
|
||||
|
||||
public:
|
||||
enum {
|
||||
FLAG_ALIVE = 0, // use this on its own to mark as "I'm still alive"
|
||||
FLAG_FAILED = 1, // if set, failure; if not, recovery
|
||||
FLAG_IMMEDIATE = 2, // known failure, not a timeout
|
||||
};
|
||||
|
||||
uuid_d fsid;
|
||||
entity_inst_t target_osd;
|
||||
__u8 is_failed;
|
||||
__u8 flags;
|
||||
epoch_t epoch;
|
||||
int32_t failed_for; // known to be failed since at least this long
|
||||
|
||||
MOSDFailure() : PaxosServiceMessage(MSG_OSD_FAILURE, 0, HEAD_VERSION) { }
|
||||
MOSDFailure(const uuid_d &fs, const entity_inst_t& f, int duration, epoch_t e)
|
||||
: PaxosServiceMessage(MSG_OSD_FAILURE, e, HEAD_VERSION),
|
||||
fsid(fs), target_osd(f), is_failed(true), epoch(e), failed_for(duration) { }
|
||||
fsid(fs), target_osd(f),
|
||||
flags(FLAG_FAILED),
|
||||
epoch(e), failed_for(duration) { }
|
||||
MOSDFailure(const uuid_d &fs, const entity_inst_t& f, int duration,
|
||||
epoch_t e, __u8 extra_flags)
|
||||
: PaxosServiceMessage(MSG_OSD_FAILURE, e, HEAD_VERSION),
|
||||
fsid(fs), target_osd(f),
|
||||
flags(extra_flags),
|
||||
epoch(e), failed_for(duration) { }
|
||||
private:
|
||||
~MOSDFailure() {}
|
||||
|
||||
public:
|
||||
entity_inst_t get_target() { return target_osd; }
|
||||
bool if_osd_failed() { return is_failed; }
|
||||
bool if_osd_failed() const {
|
||||
return flags & FLAG_FAILED;
|
||||
}
|
||||
bool is_immediate() const {
|
||||
return flags & FLAG_IMMEDIATE;
|
||||
}
|
||||
epoch_t get_epoch() { return epoch; }
|
||||
|
||||
void decode_payload() {
|
||||
@ -49,9 +68,9 @@ public:
|
||||
::decode(target_osd, p);
|
||||
::decode(epoch, p);
|
||||
if (header.version >= 2)
|
||||
::decode(is_failed, p);
|
||||
::decode(flags, p);
|
||||
else
|
||||
is_failed = true;
|
||||
flags = FLAG_FAILED;
|
||||
if (header.version >= 3)
|
||||
::decode(failed_for, p);
|
||||
else
|
||||
@ -63,14 +82,15 @@ public:
|
||||
::encode(fsid, payload);
|
||||
::encode(target_osd, payload, features);
|
||||
::encode(epoch, payload);
|
||||
::encode(is_failed, payload);
|
||||
::encode(flags, payload);
|
||||
::encode(failed_for, payload);
|
||||
}
|
||||
|
||||
const char *get_type_name() const { return "osd_failure"; }
|
||||
void print(ostream& out) const {
|
||||
out << "osd_failure("
|
||||
<< (is_failed ? "failed " : "recovered ")
|
||||
<< (if_osd_failed() ? "failed " : "recovered ")
|
||||
<< (is_immediate() ? "immediate " : "timeout ")
|
||||
<< target_osd << " for " << failed_for << "sec e" << epoch
|
||||
<< " v" << version << ")";
|
||||
}
|
||||
|
@ -100,6 +100,9 @@ struct MonClientPinger : public Dispatcher {
|
||||
return true;
|
||||
}
|
||||
void ms_handle_remote_reset(Connection *con) {}
|
||||
// Connection-refused events are ignored here; false leaves them unhandled.
bool ms_handle_refused(Connection *con) { return false; }
|
||||
};
|
||||
|
||||
class MonClient : public Dispatcher {
|
||||
@ -140,6 +143,7 @@ private:
|
||||
bool ms_dispatch(Message *m);
|
||||
bool ms_handle_reset(Connection *con);
|
||||
void ms_handle_remote_reset(Connection *con) {}
|
||||
// Refused connections are not acted upon here; false reports the event as unhandled.
bool ms_handle_refused(Connection *con) { return false; }
|
||||
|
||||
void handle_monmap(MMonMap *m);
|
||||
|
||||
|
@ -4398,6 +4398,13 @@ bool Monitor::ms_handle_reset(Connection *con)
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Messenger callback for a peer that refuses our connection attempts.
 *
 * For the time being the monitor merely logs the connection and its peer
 * address at debug level 10 and leaves the event unclaimed.
 */
bool Monitor::ms_handle_refused(Connection *con)
{
  dout(10) << "ms_handle_refused " << con
           << " " << con->get_peer_addr()
           << dendl;
  return false;
}
|
||||
|
||||
void Monitor::check_subs()
|
||||
{
|
||||
string type = "monmap";
|
||||
|
@ -902,6 +902,7 @@ public:
|
||||
bool& isvalid, CryptoKey& session_key);
|
||||
bool ms_handle_reset(Connection *con);
|
||||
void ms_handle_remote_reset(Connection *con) {}
|
||||
bool ms_handle_refused(Connection *con);
|
||||
|
||||
int write_default_keyring(bufferlist& bl);
|
||||
void extract_save_mon_key(KeyRing& keyring);
|
||||
|
@ -1820,6 +1820,22 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Record a forced failure of an OSD in the pending incremental.
 *
 * Nothing is changed when the pending incremental already carries the
 * CEPH_OSD_UP bit for this OSD. Otherwise the bit is stored in
 * pending_inc.new_state and the event is written to the cluster log.
 *
 * @param now        current time (not used by this function)
 * @param target_osd id of the OSD to fail
 */
void OSDMonitor::force_failure(utime_t now, int target_osd)
{
  const bool already_pending =
    pending_inc.new_state.count(target_osd) &&
    (pending_inc.new_state[target_osd] & CEPH_OSD_UP);
  if (already_pending) {
    dout(10) << " already pending failure" << dendl;
    return;
  }

  dout(1) << " we're forcing failure of osd." << target_osd << dendl;
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;

  mon->clog->info() << osdmap.get_inst(target_osd) << " failed (forced)\n";
}
|
||||
|
||||
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
|
||||
{
|
||||
op->mark_osdmon_event(__func__);
|
||||
@ -1841,8 +1857,15 @@ bool OSDMonitor::prepare_failure(MonOpRequestRef op)
|
||||
|
||||
if (m->if_osd_failed()) {
|
||||
// add a report
|
||||
if (m->is_immediate()) {
|
||||
mon->clog->debug() << m->get_target() << " reported immediately failed by "
|
||||
<< m->get_orig_source_inst() << "\n";
|
||||
force_failure(now, target_osd);
|
||||
return true;
|
||||
}
|
||||
mon->clog->debug() << m->get_target() << " reported failed by "
|
||||
<< m->get_orig_source_inst() << "\n";
|
||||
<< m->get_orig_source_inst() << "\n";
|
||||
|
||||
failure_info_t& fi = failure_info[target_osd];
|
||||
MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
|
||||
if (old_op) {
|
||||
|
@ -128,6 +128,7 @@ private:
|
||||
|
||||
bool check_failures(utime_t now);
|
||||
bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
|
||||
void force_failure(utime_t now, int target_osd);
|
||||
|
||||
// map thrashing
|
||||
int thrash_map;
|
||||
|
@ -181,6 +181,9 @@ void DispatchQueue::entry()
|
||||
case D_BAD_RESET:
|
||||
msgr->ms_deliver_handle_reset(qitem.get_connection());
|
||||
break;
|
||||
case D_CONN_REFUSED:
|
||||
msgr->ms_deliver_handle_refused(qitem.get_connection());
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
@ -90,7 +90,7 @@ class DispatchQueue {
|
||||
|
||||
std::atomic<uint64_t> next_id;
|
||||
|
||||
enum { D_CONNECT = 1, D_ACCEPT, D_BAD_REMOTE_RESET, D_BAD_RESET, D_NUM_CODES };
|
||||
enum { D_CONNECT = 1, D_ACCEPT, D_BAD_REMOTE_RESET, D_BAD_RESET, D_CONN_REFUSED, D_NUM_CODES };
|
||||
|
||||
/**
|
||||
* The DispatchThread runs dispatch_entry to empty out the dispatch_queue.
|
||||
@ -185,6 +185,16 @@ class DispatchQueue {
|
||||
QueueItem(D_BAD_RESET, con));
|
||||
cond.Signal();
|
||||
}
|
||||
/**
 * Enqueue a D_CONN_REFUSED item for @p con at the highest message
 * priority and wake up whoever waits on the queue condition.
 *
 * Nothing is queued once the queue has been stopped.
 */
void queue_refused(Connection *con) {
  Mutex::Locker guard(lock);
  if (!stop) {
    mqueue.enqueue_strict(0,
                          CEPH_MSG_PRIO_HIGHEST,
                          QueueItem(D_CONN_REFUSED, con));
    cond.Signal();
  }
}
|
||||
|
||||
bool can_fast_dispatch(Message *m) const;
|
||||
void fast_dispatch(Message *m);
|
||||
|
@ -156,6 +156,16 @@ public:
|
||||
*/
|
||||
virtual void ms_handle_remote_reset(Connection *con) = 0;
|
||||
|
||||
/**
|
||||
* This indicates that the connection is both broken and further
|
||||
* connection attempts are failing because other side refuses
|
||||
* it.
|
||||
*
|
||||
* @param con The Connection which broke. You are not granted
|
||||
* a reference to it.
|
||||
*/
|
||||
virtual bool ms_handle_refused(Connection *con) = 0;
|
||||
|
||||
/**
|
||||
* @defgroup Authentication
|
||||
* @{
|
||||
|
@ -681,6 +681,24 @@ public:
|
||||
++p)
|
||||
(*p)->ms_handle_remote_reset(con);
|
||||
}
|
||||
|
||||
/**
|
||||
* Notify each Dispatcher of a Connection for which reconnection
|
||||
* attempts are being refused. Call this function whenever you
|
||||
* detect that a lossy Connection has been disconnected and it's
|
||||
* impossible to reconnect.
|
||||
*
|
||||
* @param con Pointer to the broken Connection.
|
||||
*/
|
||||
void ms_deliver_handle_refused(Connection *con) {
|
||||
for (list<Dispatcher*>::iterator p = dispatchers.begin();
|
||||
p != dispatchers.end();
|
||||
++p) {
|
||||
if ((*p)->ms_handle_refused(con))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the AuthAuthorizer for a new outgoing Connection.
|
||||
*
|
||||
|
@ -878,6 +878,10 @@ ssize_t AsyncConnection::_process_connection()
|
||||
r = cs.is_connected();
|
||||
if (r < 0) {
|
||||
ldout(async_msgr->cct, 1) << __func__ << " reconnect failed " << dendl;
|
||||
if (r == -ECONNREFUSED) {
|
||||
ldout(async_msgr->cct, 2) << __func__ << " connection refused!" << dendl;
|
||||
dispatch_queue->queue_refused(this);
|
||||
}
|
||||
goto fail;
|
||||
} else if (r == 0) {
|
||||
ldout(async_msgr->cct, 10) << __func__ << " nonblock connect inprogress" << dendl;
|
||||
|
@ -924,9 +924,13 @@ int Pipe::connect()
|
||||
ldout(msgr->cct,10) << "connecting to " << peer_addr << dendl;
|
||||
rc = ::connect(sd, peer_addr.get_sockaddr(), peer_addr.get_sockaddr_len());
|
||||
if (rc < 0) {
|
||||
rc = -errno;
|
||||
int stored_errno = errno;
|
||||
ldout(msgr->cct,2) << "connect error " << peer_addr
|
||||
<< ", " << cpp_strerror(rc) << dendl;
|
||||
<< ", " << cpp_strerror(stored_errno) << dendl;
|
||||
if (stored_errno == ECONNREFUSED) {
|
||||
ldout(msgr->cct, 2) << "connection refused!" << dendl;
|
||||
msgr->dispatch_queue.queue_refused(connection_state.get());
|
||||
}
|
||||
goto fail;
|
||||
}
|
||||
|
||||
|
@ -4776,6 +4776,37 @@ bool OSD::ms_handle_reset(Connection *con)
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Messenger callback for a peer that refuses our connection attempts.
 *
 * When osd_fast_fail_on_connection_refused is enabled and the refusing
 * peer is an OSD that the current map still lists as up, an immediate
 * failure report (FLAG_IMMEDIATE | FLAG_FAILED) is sent to the monitors.
 *
 * @param con the refused Connection
 * @return false when the feature is disabled or the connection has no
 *         session attached; true otherwise (even if no report was sent)
 */
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused) {
    return false;
  }

  // The reference obtained here is released by the put() further down.
  OSD::Session *session = static_cast<OSD::Session *>(con->get_priv());
  dout(1) << "ms_handle_refused con " << con << " session " << session << dendl;
  if (!session) {
    return false;
  }

  // handle only OSD failures here
  const bool peer_is_osd = (con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
  if (monc && peer_is_osd) {
    OSDMapRef cur_map = get_osdmap();
    if (cur_map) {
      const int peer_id =
        cur_map->identify_osd_on_all_channels(con->get_peer_addr());
      if (peer_id >= 0 && cur_map->is_up(peer_id)) {
        // The reported duration is deliberately osd_heartbeat_grace + 1:
        // this sidesteps the mon's heartbeat grace logic, since the peer is
        // known not to come back on its own; the +1 avoids the boundary case.
        MOSDFailure *report = new MOSDFailure(
          monc->get_fsid(),
          cur_map->get_inst(peer_id),
          cct->_conf->osd_heartbeat_grace + 1,
          cur_map->get_epoch(),
          MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED);
        monc->send_mon_message(report);
      }
    }
  }

  session->put();
  return true;
}
|
||||
|
||||
struct C_OSD_GetVersion : public Context {
|
||||
OSD *osd;
|
||||
uint64_t oldest, newest;
|
||||
@ -5085,8 +5116,7 @@ void OSD::send_failures()
|
||||
|
||||
void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
|
||||
{
|
||||
MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch);
|
||||
m->is_failed = false;
|
||||
MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
|
||||
monc->send_mon_message(m);
|
||||
}
|
||||
|
||||
|
@ -1702,6 +1702,9 @@ public:
|
||||
return osd->heartbeat_reset(con);
|
||||
}
|
||||
void ms_handle_remote_reset(Connection *con) {}
|
||||
// Hand connection-refused events over to the owning OSD and relay its verdict.
bool ms_handle_refused(Connection *con) { return osd->ms_handle_refused(con); }
|
||||
bool ms_verify_authorizer(Connection *con, int peer_type,
|
||||
int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
|
||||
bool& isvalid, CryptoKey& session_key) {
|
||||
@ -2395,6 +2398,7 @@ protected:
|
||||
void ms_handle_fast_accept(Connection *con);
|
||||
bool ms_handle_reset(Connection *con);
|
||||
void ms_handle_remote_reset(Connection *con) {}
|
||||
bool ms_handle_refused(Connection *con);
|
||||
|
||||
io_queue get_io_queue() const {
|
||||
if (cct->_conf->osd_op_queue == "debug_random") {
|
||||
|
@ -1007,6 +1007,15 @@ int OSDMap::identify_osd(const uuid_d& u) const
|
||||
return -1;
|
||||
}
|
||||
|
||||
int OSDMap::identify_osd_on_all_channels(const entity_addr_t& addr) const
|
||||
{
|
||||
for (int i=0; i<max_osd; i++)
|
||||
if (exists(i) && (get_addr(i) == addr || get_cluster_addr(i) == addr ||
|
||||
get_hb_back_addr(i) == addr || get_hb_front_addr(i) == addr))
|
||||
return i;
|
||||
return -1;
|
||||
}
|
||||
|
||||
int OSDMap::find_osd_on_ip(const entity_addr_t& ip) const
|
||||
{
|
||||
for (int i=0; i<max_osd; i++)
|
||||
|
@ -455,6 +455,7 @@ public:
|
||||
|
||||
int identify_osd(const entity_addr_t& addr) const;
|
||||
int identify_osd(const uuid_d& u) const;
|
||||
int identify_osd_on_all_channels(const entity_addr_t& addr) const;
|
||||
|
||||
bool have_addr(const entity_addr_t& addr) const {
|
||||
return identify_osd(addr) >= 0;
|
||||
|
@ -34,7 +34,6 @@
|
||||
#include "messages/MStatfs.h"
|
||||
#include "messages/MStatfsReply.h"
|
||||
|
||||
#include "messages/MOSDFailure.h"
|
||||
#include "messages/MMonCommand.h"
|
||||
|
||||
#include "messages/MCommand.h"
|
||||
@ -4336,6 +4335,18 @@ void Objecter::ms_handle_remote_reset(Connection *con)
|
||||
ms_handle_reset(con);
|
||||
}
|
||||
|
||||
/**
 * Messenger callback for a peer that refuses our connection attempts.
 *
 * For now this only logs: if an osdmap is available and the peer is an
 * OSD found in it, the OSD id is written to the log at level 1. The
 * event is never claimed.
 */
bool Objecter::ms_handle_refused(Connection *con)
{
  if (!osdmap || con->get_peer_type() != CEPH_ENTITY_TYPE_OSD) {
    return false;
  }

  const int refused_osd = osdmap->identify_osd(con->get_peer_addr());
  if (refused_osd >= 0) {
    ldout(cct, 1) << "ms_handle_refused on osd." << refused_osd << dendl;
  }
  return false;
}
|
||||
|
||||
bool Objecter::ms_get_authorizer(int dest_type,
|
||||
AuthAuthorizer **authorizer,
|
||||
bool force_new)
|
||||
@ -4348,7 +4359,6 @@ bool Objecter::ms_get_authorizer(int dest_type,
|
||||
return *authorizer != NULL;
|
||||
}
|
||||
|
||||
|
||||
void Objecter::op_target_t::dump(Formatter *f) const
|
||||
{
|
||||
f->dump_stream("pg") << pgid;
|
||||
|
@ -2929,6 +2929,7 @@ public:
|
||||
void ms_handle_connect(Connection *con);
|
||||
bool ms_handle_reset(Connection *con);
|
||||
void ms_handle_remote_reset(Connection *con);
|
||||
bool ms_handle_refused(Connection *con);
|
||||
bool ms_get_authorizer(int dest_type,
|
||||
AuthAuthorizer **authorizer,
|
||||
bool force_new);
|
||||
|
@ -74,6 +74,8 @@ public:
|
||||
* a reference to it.
|
||||
*/
|
||||
virtual void ms_handle_remote_reset(Connection *con);
|
||||
|
||||
// Default implementation: the refused connection is not acted upon and false is returned.
virtual bool ms_handle_refused(Connection *con) { return false; }
|
||||
|
||||
/**
|
||||
* @defgroup Authentication
|
||||
|
@ -74,6 +74,8 @@ public:
|
||||
* a reference to it.
|
||||
*/
|
||||
virtual void ms_handle_remote_reset(Connection *con);
|
||||
|
||||
// Default implementation: the refused connection is not acted upon and false is returned.
virtual bool ms_handle_refused(Connection *con) { return false; }
|
||||
|
||||
/**
|
||||
* @defgroup test_xio_dispatcher_h_auth Authentication
|
||||
|
@ -187,6 +187,7 @@ fail:
|
||||
void ms_handle_connect(Connection *con) { }
|
||||
void ms_handle_remote_reset(Connection *con) { }
|
||||
bool ms_handle_reset(Connection *con) { return false; }
|
||||
// Refused connections are not acted upon here; false reports the event as unhandled.
bool ms_handle_refused(Connection *con) { return false; }
|
||||
|
||||
bool is_wanted(Message *m) {
|
||||
dout(20) << __func__ << " " << *m << " type " << m->get_type() << dendl;
|
||||
|
@ -218,6 +218,10 @@ class ClientStub : public TestStub
|
||||
return false;
|
||||
}
|
||||
|
||||
// Connection-refused events are ignored by this stub; false leaves them unhandled.
bool ms_handle_refused(Connection *con) { return false; }
|
||||
|
||||
const string get_name() {
|
||||
return "client";
|
||||
}
|
||||
@ -903,6 +907,10 @@ class OSDStub : public TestStub
|
||||
return true;
|
||||
}
|
||||
|
||||
// Connection-refused events are ignored by this stub; false leaves them unhandled.
bool ms_handle_refused(Connection *con) { return false; }
|
||||
|
||||
const string get_name() {
|
||||
stringstream ss;
|
||||
ss << "osd." << whoami;
|
||||
|
@ -54,6 +54,7 @@ class MessengerClient {
|
||||
void ms_fast_dispatch(Message *m);
|
||||
bool ms_handle_reset(Connection *con) { return true; }
|
||||
void ms_handle_remote_reset(Connection *con) {}
|
||||
// Refused connections are not acted upon here; false reports the event as unhandled.
bool ms_handle_refused(Connection *con) { return false; }
|
||||
bool ms_verify_authorizer(Connection *con, int peer_type, int protocol,
|
||||
bufferlist& authorizer, bufferlist& authorizer_reply,
|
||||
bool& isvalid, CryptoKey& session_key) {
|
||||
|
@ -93,6 +93,7 @@ class ServerDispatcher : public Dispatcher {
|
||||
bool ms_dispatch(Message *m) { return true; }
|
||||
bool ms_handle_reset(Connection *con) { return true; }
|
||||
void ms_handle_remote_reset(Connection *con) {}
|
||||
// Refused connections are not acted upon here; false reports the event as unhandled.
bool ms_handle_refused(Connection *con) { return false; }
|
||||
void ms_fast_dispatch(Message *m) {
|
||||
usleep(think_time);
|
||||
//cerr << __func__ << " reply message=" << m << std::endl;
|
||||
|
@ -175,6 +175,9 @@ class FakeDispatcher : public Dispatcher {
|
||||
got_remote_reset = true;
|
||||
cond.Signal();
|
||||
}
|
||||
// Connection-refused events are ignored by this dispatcher; false leaves them unhandled.
bool ms_handle_refused(Connection *con) { return false; }
|
||||
void ms_fast_dispatch(Message *m) {
|
||||
Session *s = static_cast<Session*>(m->get_connection()->get_priv());
|
||||
if (!s) {
|
||||
@ -825,6 +828,9 @@ class SyntheticDispatcher : public Dispatcher {
|
||||
conn_sent.erase(con);
|
||||
got_remote_reset = true;
|
||||
}
|
||||
// Connection-refused events are ignored by this dispatcher; false leaves them unhandled.
bool ms_handle_refused(Connection *con) { return false; }
|
||||
void ms_fast_dispatch(Message *m) {
|
||||
// MSG_COMMAND is used to disorganize regular message flow
|
||||
if (m->get_type() == MSG_COMMAND) {
|
||||
@ -1408,6 +1414,9 @@ class MarkdownDispatcher : public Dispatcher {
|
||||
conns.erase(con);
|
||||
lderr(g_ceph_context) << __func__ << " " << con << dendl;
|
||||
}
|
||||
// Connection-refused events are ignored by this dispatcher; false leaves them unhandled.
bool ms_handle_refused(Connection *con) { return false; }
|
||||
void ms_fast_dispatch(Message *m) {
|
||||
assert(0);
|
||||
}
|
||||
|
@ -25,6 +25,7 @@ add_ceph_test(osd-reactivate.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-reactivate.sh)
|
||||
add_ceph_test(osd-reuse-id.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-reuse-id.sh)
|
||||
add_ceph_test(osd-scrub-snaps.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-scrub-snaps.sh)
|
||||
add_ceph_test(osd-copy-from.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-copy-from.sh)
|
||||
add_ceph_test(osd-fast-mark-down.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-fast-mark-down.sh)
|
||||
|
||||
# unittest_osdmap
|
||||
add_executable(unittest_osdmap
|
||||
|
96
src/test/osd/osd-fast-mark-down.sh
Executable file
96
src/test/osd/osd-fast-mark-down.sh
Executable file
@ -0,0 +1,96 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (C) 2016 Piotr Dałek <git@predictor.org.pl>
|
||||
# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
|
||||
#
|
||||
# Author: Piotr Dałek <git@predictor.org.pl>
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Library Public License as published by
|
||||
# the Free Software Foundation; either version 2, or (at your option)
|
||||
# any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Library Public License for more details.
|
||||
#
|
||||
|
||||
source $(dirname $0)/../detect-build-env-vars.sh
|
||||
source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
|
||||
|
||||
# Entry point of the test.
#
# Runs test_fast_kill three times:
#   1. with osd_fast_fail_on_connection_refused disabled, where killed OSDs
#      must NOT be marked down within a second (the pre-existing behavior);
#   2. with the option enabled on the simple messenger;
#   3. with the option enabled on the async messenger.
#
# @param $1 test directory
# @return 0 when all three phases behave as expected, 1 otherwise
function run() {
    local dir=$1
    shift
    rm -f $dir/*.pid
    export CEPH_MON="127.0.0.1:7126" # git grep '\<7126\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "
    local base_args=$CEPH_ARGS

    # The option defaults to true, so the old behavior has to be requested
    # explicitly.
    CEPH_ARGS=$base_args"--osd-fast-fail-on-connection-refused=false "
    echo "Ensuring old behavior is there..."
    # An if-block is used instead of "&& ( ...; return 1 )": a return inside
    # a subshell only leaves the subshell and would never fail the test.
    if test_fast_kill $dir ; then
        echo "OSDs died too early!"
        return 1
    fi

    # Trailing space keeps the following --ms_type option a separate argument.
    CEPH_ARGS=$base_args"--osd-fast-fail-on-connection-refused=true "
    OLD_ARGS=$CEPH_ARGS

    CEPH_ARGS+="--ms_type=simple"
    echo "Testing simple msgr..."
    test_fast_kill $dir || return 1

    CEPH_ARGS=$OLD_ARGS"--ms_type=async"
    echo "Testing async msgr..."
    test_fast_kill $dir || return 1

    return 0

}
|
||||
|
||||
# Bring up one mon and three OSDs, SIGKILL two different OSDs one after the
# other, and check one second after each kill that exactly that many OSDs
# show up as down in "ceph osd tree".
#
# @param $1 test directory handed to setup/run_mon/run_osd/teardown
# @return 0 when every killed OSD was marked down within the second,
#         1 otherwise (or when the cluster could not be set up or torn down)
function test_fast_kill() {
    local dir=$1
    local -a pids
    local oi i down_osds

    # create cluster with 3 osds
    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=3 || return 1
    for oi in {0..2}; do
        run_osd $dir $oi || return 1
        pids[$oi]=$(cat $dir/osd.$oi.pid)
    done

    # write some objects so that the osds establish connections to each other
    rados -p rbd bench 10 write -b 4096 --max-objects 128 --no-cleanup
    sleep 1

    # Plain assignments: "$killid=0" would expand the variable first and try
    # to run the result ("=0") as a command instead of assigning anything.
    local killid=0
    local previd=0

    # kill random osd and see if 1 sec after, the osd count decreased.
    for i in {1..2}; do
        # never pick the pid that was killed in the previous round
        while [ $killid -eq $previd ]; do
            killid=${pids[$RANDOM%${#pids[@]}]}
        done
        previd=$killid

        kill -9 $killid
        sleep 1

        down_osds=$(ceph osd tree | grep -c down)
        if [ $down_osds -lt $i ]; then
            echo Killed the OSD, yet it is not marked down
            ceph osd tree
            teardown $dir
            return 1
        elif [ $down_osds -gt $i ]; then
            echo Too many \($down_osds\) osds died!
            teardown $dir
            return 1
        fi

    done
    pkill -SIGTERM rados
    teardown $dir || return 1
}
|
||||
|
||||
main osd-fast-mark-down "$@"
|
||||
|
||||
# Local Variables:
|
||||
# compile-command: "cd ../.. ; make -j4 && test/osd/osd-fast-mark-down.sh"
|
||||
# End:
|
@ -62,6 +62,7 @@ private:
|
||||
|
||||
bool ms_handle_reset(Connection *con) { return false; }
|
||||
void ms_handle_remote_reset(Connection *con) {}
|
||||
// Refused connections are not acted upon here; false reports the event as unhandled.
bool ms_handle_refused(Connection *con) { return false; }
|
||||
|
||||
} dispatcher;
|
||||
|
||||
|
@ -50,6 +50,7 @@ public:
|
||||
bool ms_dispatch(Message *m);
|
||||
bool ms_handle_reset(Connection *con) { return false; }
|
||||
void ms_handle_remote_reset(Connection *con) {}
|
||||
// Refused connections are not acted upon here; false reports the event as unhandled.
bool ms_handle_refused(Connection *con) { return false; }
|
||||
bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer,
|
||||
bool force_new);
|
||||
int init();
|
||||
|
Loading…
Reference in New Issue
Block a user