osd: drop writes when full instead of returning an error

There's a race between the client and the osd when an osdmap is newly
marked full.  If the client gets the new map first, it blocks writes and
everything works as expected, with no errors from the osd.

If the osd gets the map first, however, it will respond to any writes
with -ENOSPC. Clients will pass this up the stack, and not retry these
writes later.  -ENOSPC isn't handled well by all clients. RBD, for
example, may pass it on to qemu or kernel rbd which will both
interpret it as EIO.  Filesystems on top of rbd will not behave well
when they receive EIOs like this, especially if the cluster oscillates
between full and not full, so that some writes succeed and others fail.

To fix this, never return -ENOSPC from the osd because of a map marked
full, and rely on the client to retry all writes when the map is no
longer marked full.

Old clients talking to osds with this fix will hang instead of
propagating an error, but only if they run into this race
condition. ceph-fuse and rbd with caching enabled are not affected,
since the ObjectCacher will retry writes that return errors.

Refs: #6938
Backport: dumpling, emperor
Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
This commit is contained in:
Josh Durgin 2013-12-05 17:34:38 -08:00
parent 5fe3dc647b
commit 4111729dda
3 changed files with 21 additions and 5 deletions

View File

@ -5066,6 +5066,7 @@ void OSD::handle_osd_map(MOSDMap *m)
ObjectStore::Transaction &t = *_t;
// store new maps: queue for disk and put in the osdmap cache
epoch_t last_marked_full = 0;
epoch_t start = MAX(osdmap->get_epoch() + 1, first);
for (epoch_t e = start; e <= last; e++) {
map<epoch_t,bufferlist>::iterator p;
@ -5076,6 +5077,8 @@ void OSD::handle_osd_map(MOSDMap *m)
bufferlist& bl = p->second;
o->decode(bl);
if (o->test_flag(CEPH_OSDMAP_FULL))
last_marked_full = e;
pinned_maps.push_back(add_map(o));
hobject_t fulloid = get_osdmap_pobject_name(e);
@ -5108,6 +5111,8 @@ void OSD::handle_osd_map(MOSDMap *m)
assert(0 == "bad fsid");
}
if (o->test_flag(CEPH_OSDMAP_FULL))
last_marked_full = e;
pinned_maps.push_back(add_map(o));
bufferlist fbl;
@ -5143,6 +5148,8 @@ void OSD::handle_osd_map(MOSDMap *m)
superblock.oldest_map = first;
superblock.newest_map = last;
if (last_marked_full > superblock.last_map_marked_full)
superblock.last_map_marked_full = last_marked_full;
map_lock.get_write();
@ -6937,9 +6944,11 @@ void OSD::handle_op(OpRequestRef op)
if (op->may_write()) {
// full?
if ((service.check_failsafe_full() ||
osdmap->test_flag(CEPH_OSDMAP_FULL)) &&
osdmap->test_flag(CEPH_OSDMAP_FULL) ||
m->get_map_epoch() < superblock.last_map_marked_full) &&
!m->get_source().is_mds()) { // FIXME: we'll exclude mds writes for now.
service.reply_op_error(op, -ENOSPC);
// Drop the request, since the client will retry when the full
// flag is unset.
return;
}

View File

@ -2684,7 +2684,7 @@ ostream& operator<<(ostream& out, const osd_peer_stat_t &stat)
void OSDSuperblock::encode(bufferlist &bl) const
{
ENCODE_START(5, 5, bl);
ENCODE_START(6, 5, bl);
::encode(cluster_fsid, bl);
::encode(whoami, bl);
::encode(current_epoch, bl);
@ -2695,12 +2695,13 @@ void OSDSuperblock::encode(bufferlist &bl) const
::encode(clean_thru, bl);
::encode(mounted, bl);
::encode(osd_fsid, bl);
::encode(last_map_marked_full, bl);
ENCODE_FINISH(bl);
}
void OSDSuperblock::decode(bufferlist::iterator &bl)
{
DECODE_START_LEGACY_COMPAT_LEN(5, 5, 5, bl);
DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
if (struct_v < 3) {
string magic;
::decode(magic, bl);
@ -2720,6 +2721,8 @@ void OSDSuperblock::decode(bufferlist::iterator &bl)
::decode(mounted, bl);
if (struct_v >= 4)
::decode(osd_fsid, bl);
if (struct_v >= 6)
::decode(last_map_marked_full, bl);
DECODE_FINISH(bl);
}
@ -2737,6 +2740,7 @@ void OSDSuperblock::dump(Formatter *f) const
f->close_section();
f->dump_int("clean_thru", clean_thru);
f->dump_int("last_epoch_mounted", mounted);
f->dump_int("last_map_marked_full", last_map_marked_full);
}
void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
@ -2752,6 +2756,8 @@ void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
z.mounted = 8;
z.clean_thru = 7;
o.push_back(new OSDSuperblock(z));
z.last_map_marked_full = 7;
o.push_back(new OSDSuperblock(z));
}
// -- SnapSet --

View File

@ -1956,11 +1956,12 @@ public:
// last interval over which i mounted and was then active
epoch_t mounted; // last epoch i mounted
epoch_t clean_thru; // epoch i was active and clean thru
epoch_t last_map_marked_full; // last epoch osdmap was marked full
OSDSuperblock() :
whoami(-1),
current_epoch(0), oldest_map(0), newest_map(0), weight(0),
mounted(0), clean_thru(0) {
mounted(0), clean_thru(0), last_map_marked_full(0) {
}
void encode(bufferlist &bl) const;