From 4aa90a5e684f86b936e553218de003fed1a8bc55 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 24 Sep 2011 14:45:32 -0700 Subject: [PATCH] osd: don't finish boot unless instance in map is really us We were going BOOTING->ACTIVE as soon as we showed up in the map with the same client_addr. Also verify that our up_from epoch is later than the epoch at which we started or last rebound, to avoid the case where we rebind to the same ports for client_addr (but maybe not others) and get caught in a rebind loop. Signed-off-by: Sage Weil --- src/osd/OSD.cc | 13 +++++++++---- src/osd/OSD.h | 1 + 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 247be200b53..88daac5ac75 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -525,7 +525,7 @@ OSD::OSD(int id, Messenger *internal_messenger, Messenger *external_messenger, dev_path(dev), journal_path(jdev), dispatch_running(false), osd_compat(get_osd_compat_set()), - state(STATE_BOOTING), boot_epoch(0), up_epoch(0), + state(STATE_BOOTING), boot_epoch(0), up_epoch(0), bind_epoch(0), op_tp(external_messenger->cct, "OSD::op_tp", g_conf->osd_op_threads), recovery_tp(external_messenger->cct, "OSD::recovery_tp", g_conf->osd_recovery_threads), disk_tp(external_messenger->cct, "OSD::disk_tp", g_conf->osd_disk_threads), @@ -663,6 +663,8 @@ int OSD::init() } osdmap = get_map(superblock.current_epoch); + bind_epoch = osdmap->get_epoch(); + clear_temp(); // make sure (newish) temp dir exists @@ -3174,7 +3176,8 @@ void OSD::handle_osd_map(MOSDMap *m) C_Contexts *fin = new C_Contexts(g_ceph_context); if (osdmap->is_up(whoami) && - osdmap->get_addr(whoami) == client_messenger->get_myaddr()) { + osdmap->get_addr(whoami) == client_messenger->get_myaddr() && + bind_epoch < osdmap->get_up_from(whoami)) { if (is_booting()) { dout(1) << "state: booting -> active" << dendl; @@ -3205,16 +3208,17 @@ void OSD::handle_osd_map(MOSDMap *m) << " != my " << client_messenger->get_myaddr(); else if 
(!osdmap->get_cluster_addr(whoami).probably_equals(cluster_messenger->get_myaddr())) clog.error() << "map e" << osdmap->get_epoch() - << " had wrong client addr (" << osdmap->get_cluster_addr(whoami) + << " had wrong cluster addr (" << osdmap->get_cluster_addr(whoami) << " != my " << cluster_messenger->get_myaddr(); else if (!osdmap->get_hb_addr(whoami).probably_equals(hbout_messenger->get_myaddr())) clog.error() << "map e" << osdmap->get_epoch() - << " had wrong client addr (" << osdmap->get_hb_addr(whoami) + << " had wrong hb addr (" << osdmap->get_hb_addr(whoami) << " != my " << hbout_messenger->get_myaddr(); state = STATE_BOOTING; up_epoch = 0; do_restart = true; + bind_epoch = osdmap->get_epoch(); int cport = cluster_messenger->get_myaddr().get_port(); int hbport = hbout_messenger->get_myaddr().get_port(); @@ -3547,6 +3551,7 @@ void OSD::add_map_bl(epoch_t e, bufferlist& bl) dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl; map_bl[e] = bl; } + void OSD::add_map_inc_bl(epoch_t e, bufferlist& bl) { Mutex::Locker l(map_cache_lock); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 9f54484da21..3c177384fe8 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -206,6 +206,7 @@ private: int state; epoch_t boot_epoch; // _first_ epoch we were marked up (after this process started) epoch_t up_epoch; // _most_recent_ epoch we were marked up + epoch_t bind_epoch; // epoch we last did a bind to new ip:ports public: bool is_booting() { return state == STATE_BOOTING; }