mirror of
https://github.com/ceph/ceph
synced 2024-12-30 07:23:11 +00:00
osd: drop writes when full instead of returning an error
There's a race between the client and osd with a newly marked full
osdmap. If the client gets the new map first, it blocks writes and
everything works as expected, with no errors from the osd.

If the osd gets the map first, however, it will respond to any writes
with -ENOSPC. Clients will pass this up the stack, and not retry these
writes later. -ENOSPC isn't handled well by all clients. RBD, for
example, may pass it on to qemu or kernel rbd which will both
interpret it as EIO. Filesystems on top of rbd will not behave well
when they receive EIOs like this, especially if the cluster oscillates
between full and not full, so some writes succeed.

To fix this, never return ENOSPC from the osd because of a map marked
full, and rely on the client to retry all writes when the map is no
longer marked full.

Old clients talking to osds with this fix will hang instead of
propagating an error, but only if they run into this race
condition. ceph-fuse and rbd with caching enabled are not affected,
since the ObjectCacher will retry writes that return errors.

Refs: #6938
Backport: dumpling, emperor
Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
This commit is contained in:
parent
5fe3dc647b
commit
4111729dda
@ -5066,6 +5066,7 @@ void OSD::handle_osd_map(MOSDMap *m)
|
||||
ObjectStore::Transaction &t = *_t;
|
||||
|
||||
// store new maps: queue for disk and put in the osdmap cache
|
||||
epoch_t last_marked_full = 0;
|
||||
epoch_t start = MAX(osdmap->get_epoch() + 1, first);
|
||||
for (epoch_t e = start; e <= last; e++) {
|
||||
map<epoch_t,bufferlist>::iterator p;
|
||||
@ -5076,6 +5077,8 @@ void OSD::handle_osd_map(MOSDMap *m)
|
||||
bufferlist& bl = p->second;
|
||||
|
||||
o->decode(bl);
|
||||
if (o->test_flag(CEPH_OSDMAP_FULL))
|
||||
last_marked_full = e;
|
||||
pinned_maps.push_back(add_map(o));
|
||||
|
||||
hobject_t fulloid = get_osdmap_pobject_name(e);
|
||||
@ -5108,6 +5111,8 @@ void OSD::handle_osd_map(MOSDMap *m)
|
||||
assert(0 == "bad fsid");
|
||||
}
|
||||
|
||||
if (o->test_flag(CEPH_OSDMAP_FULL))
|
||||
last_marked_full = e;
|
||||
pinned_maps.push_back(add_map(o));
|
||||
|
||||
bufferlist fbl;
|
||||
@ -5143,6 +5148,8 @@ void OSD::handle_osd_map(MOSDMap *m)
|
||||
superblock.oldest_map = first;
|
||||
superblock.newest_map = last;
|
||||
|
||||
if (last_marked_full > superblock.last_map_marked_full)
|
||||
superblock.last_map_marked_full = last_marked_full;
|
||||
|
||||
map_lock.get_write();
|
||||
|
||||
@ -6937,9 +6944,11 @@ void OSD::handle_op(OpRequestRef op)
|
||||
if (op->may_write()) {
|
||||
// full?
|
||||
if ((service.check_failsafe_full() ||
|
||||
osdmap->test_flag(CEPH_OSDMAP_FULL)) &&
|
||||
osdmap->test_flag(CEPH_OSDMAP_FULL) ||
|
||||
m->get_map_epoch() < superblock.last_map_marked_full) &&
|
||||
!m->get_source().is_mds()) { // FIXME: we'll exclude mds writes for now.
|
||||
service.reply_op_error(op, -ENOSPC);
|
||||
// Drop the request, since the client will retry when the full
|
||||
// flag is unset.
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -2684,7 +2684,7 @@ ostream& operator<<(ostream& out, const osd_peer_stat_t &stat)
|
||||
|
||||
void OSDSuperblock::encode(bufferlist &bl) const
|
||||
{
|
||||
ENCODE_START(5, 5, bl);
|
||||
ENCODE_START(6, 5, bl);
|
||||
::encode(cluster_fsid, bl);
|
||||
::encode(whoami, bl);
|
||||
::encode(current_epoch, bl);
|
||||
@ -2695,12 +2695,13 @@ void OSDSuperblock::encode(bufferlist &bl) const
|
||||
::encode(clean_thru, bl);
|
||||
::encode(mounted, bl);
|
||||
::encode(osd_fsid, bl);
|
||||
::encode(last_map_marked_full, bl);
|
||||
ENCODE_FINISH(bl);
|
||||
}
|
||||
|
||||
void OSDSuperblock::decode(bufferlist::iterator &bl)
|
||||
{
|
||||
DECODE_START_LEGACY_COMPAT_LEN(5, 5, 5, bl);
|
||||
DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
|
||||
if (struct_v < 3) {
|
||||
string magic;
|
||||
::decode(magic, bl);
|
||||
@ -2720,6 +2721,8 @@ void OSDSuperblock::decode(bufferlist::iterator &bl)
|
||||
::decode(mounted, bl);
|
||||
if (struct_v >= 4)
|
||||
::decode(osd_fsid, bl);
|
||||
if (struct_v >= 6)
|
||||
::decode(last_map_marked_full, bl);
|
||||
DECODE_FINISH(bl);
|
||||
}
|
||||
|
||||
@ -2737,6 +2740,7 @@ void OSDSuperblock::dump(Formatter *f) const
|
||||
f->close_section();
|
||||
f->dump_int("clean_thru", clean_thru);
|
||||
f->dump_int("last_epoch_mounted", mounted);
|
||||
f->dump_int("last_map_marked_full", last_map_marked_full);
|
||||
}
|
||||
|
||||
void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
|
||||
@ -2752,6 +2756,8 @@ void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
|
||||
z.mounted = 8;
|
||||
z.clean_thru = 7;
|
||||
o.push_back(new OSDSuperblock(z));
|
||||
z.last_map_marked_full = 7;
|
||||
o.push_back(new OSDSuperblock(z));
|
||||
}
|
||||
|
||||
// -- SnapSet --
|
||||
|
@ -1956,11 +1956,12 @@ public:
|
||||
// last interval over which i mounted and was then active
|
||||
epoch_t mounted; // last epoch i mounted
|
||||
epoch_t clean_thru; // epoch i was active and clean thru
|
||||
epoch_t last_map_marked_full; // last epoch osdmap was marked full
|
||||
|
||||
OSDSuperblock() :
|
||||
whoami(-1),
|
||||
current_epoch(0), oldest_map(0), newest_map(0), weight(0),
|
||||
mounted(0), clean_thru(0) {
|
||||
mounted(0), clean_thru(0), last_map_marked_full(0) {
|
||||
}
|
||||
|
||||
void encode(bufferlist &bl) const;
|
||||
|
Loading…
Reference in New Issue
Block a user