From 4111729dda7437c23f59e7100b3c4a9ec4101dd0 Mon Sep 17 00:00:00 2001
From: Josh Durgin
Date: Thu, 5 Dec 2013 17:34:38 -0800
Subject: [PATCH] osd: drop writes when full instead of returning an error

There's a race between the client and osd with a newly marked full
osdmap. If the client gets the new map first, it blocks writes and
everything works as expected, with no errors from the osd.

If the osd gets the map first, however, it will respond to any writes
with -ENOSPC. Clients will pass this up the stack, and not retry these
writes later. -ENOSPC isn't handled well by all clients. RBD, for
example, may pass it on to qemu or kernel rbd which will both
interpret it as EIO. Filesystems on top of rbd will not behave well
when they receive EIOs like this, especially if the cluster oscillates
between full and not full, so some writes succeed.

To fix this, never return ENOSPC from the osd because of a map marked
full, and rely on the client to retry all writes when the map is no
longer marked full.

Old clients talking to osds with this fix will hang instead of
propagating an error, but only if they run into this race
condition. ceph-fuse and rbd with caching enabled are not affected,
since the ObjectCacher will retry writes that return errors.

Refs: #6938 Backport: dumpling, emperor Signed-off-by: Josh Durgin --- src/osd/OSD.cc | 13 +++++++++++-- src/osd/osd_types.cc | 10 ++++++++-- src/osd/osd_types.h | 3 ++- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 1a60de6bdfe..4ddaffe8c8e 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -5066,6 +5066,7 @@ void OSD::handle_osd_map(MOSDMap *m) ObjectStore::Transaction &t = *_t; // store new maps: queue for disk and put in the osdmap cache + epoch_t last_marked_full = 0; epoch_t start = MAX(osdmap->get_epoch() + 1, first); for (epoch_t e = start; e <= last; e++) { map::iterator p; @@ -5076,6 +5077,8 @@ void OSD::handle_osd_map(MOSDMap *m) bufferlist& bl = p->second; o->decode(bl); + if (o->test_flag(CEPH_OSDMAP_FULL)) + last_marked_full = e; pinned_maps.push_back(add_map(o)); hobject_t fulloid = get_osdmap_pobject_name(e); @@ -5108,6 +5111,8 @@ void OSD::handle_osd_map(MOSDMap *m) assert(0 == "bad fsid"); } + if (o->test_flag(CEPH_OSDMAP_FULL)) + last_marked_full = e; pinned_maps.push_back(add_map(o)); bufferlist fbl; @@ -5143,6 +5148,8 @@ void OSD::handle_osd_map(MOSDMap *m) superblock.oldest_map = first; superblock.newest_map = last; + if (last_marked_full > superblock.last_map_marked_full) + superblock.last_map_marked_full = last_marked_full; map_lock.get_write(); @@ -6937,9 +6944,11 @@ void OSD::handle_op(OpRequestRef op) if (op->may_write()) { // full? if ((service.check_failsafe_full() || - osdmap->test_flag(CEPH_OSDMAP_FULL)) && + osdmap->test_flag(CEPH_OSDMAP_FULL) || + m->get_map_epoch() < superblock.last_map_marked_full) && !m->get_source().is_mds()) { // FIXME: we'll exclude mds writes for now. - service.reply_op_error(op, -ENOSPC); + // Drop the request, since the client will retry when the full + // flag is unset. 
return; } diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 05b83c4af21..93a7095eed3 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -2684,7 +2684,7 @@ ostream& operator<<(ostream& out, const osd_peer_stat_t &stat) void OSDSuperblock::encode(bufferlist &bl) const { - ENCODE_START(5, 5, bl); + ENCODE_START(6, 5, bl); ::encode(cluster_fsid, bl); ::encode(whoami, bl); ::encode(current_epoch, bl); @@ -2695,12 +2695,13 @@ void OSDSuperblock::encode(bufferlist &bl) const ::encode(clean_thru, bl); ::encode(mounted, bl); ::encode(osd_fsid, bl); + ::encode(last_map_marked_full, bl); ENCODE_FINISH(bl); } void OSDSuperblock::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(5, 5, 5, bl); + DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl); if (struct_v < 3) { string magic; ::decode(magic, bl); @@ -2720,6 +2721,8 @@ void OSDSuperblock::decode(bufferlist::iterator &bl) ::decode(mounted, bl); if (struct_v >= 4) ::decode(osd_fsid, bl); + if (struct_v >= 6) + ::decode(last_map_marked_full, bl); DECODE_FINISH(bl); } @@ -2737,6 +2740,7 @@ void OSDSuperblock::dump(Formatter *f) const f->close_section(); f->dump_int("clean_thru", clean_thru); f->dump_int("last_epoch_mounted", mounted); + f->dump_int("last_map_marked_full", last_map_marked_full); } void OSDSuperblock::generate_test_instances(list& o) @@ -2752,6 +2756,8 @@ void OSDSuperblock::generate_test_instances(list& o) z.mounted = 8; z.clean_thru = 7; o.push_back(new OSDSuperblock(z)); + z.last_map_marked_full = 7; + o.push_back(new OSDSuperblock(z)); } // -- SnapSet -- diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 351d050c476..2b331492f72 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -1956,11 +1956,12 @@ public: // last interval over which i mounted and was then active epoch_t mounted; // last epoch i mounted epoch_t clean_thru; // epoch i was active and clean thru + epoch_t last_map_marked_full; // last epoch osdmap was marked full OSDSuperblock() : 
whoami(-1), current_epoch(0), oldest_map(0), newest_map(0), weight(0), - mounted(0), clean_thru(0) { + mounted(0), clean_thru(0), last_map_marked_full(0) { } void encode(bufferlist &bl) const;