From 33279822eabb64380f5968cc645735a8f99a3ac1 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 18 Dec 2018 14:00:29 -0800 Subject: [PATCH 1/5] mds: delete on_error context on des Otherwise it leaks. Signed-off-by: Patrick Donnelly --- src/mds/PurgeQueue.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc index 1b053013775..d30294881d7 100644 --- a/src/mds/PurgeQueue.cc +++ b/src/mds/PurgeQueue.cc @@ -108,6 +108,7 @@ PurgeQueue::~PurgeQueue() if (logger) { g_ceph_context->get_perfcounters_collection()->remove(logger.get()); } + delete on_error; } void PurgeQueue::create_logger() From c7350ac23c73867b52cd9b7bb23b6c618eebe44d Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 18 Dec 2018 15:08:11 -0800 Subject: [PATCH 2/5] mds: add missing locks for PurgeQueue methods These could race with the asynchronous workings of the PQ. Signed-off-by: Patrick Donnelly --- src/mds/PurgeQueue.cc | 8 +++++--- src/mds/PurgeQueue.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc index d30294881d7..1c4577a3e54 100644 --- a/src/mds/PurgeQueue.cc +++ b/src/mds/PurgeQueue.cc @@ -287,7 +287,7 @@ void PurgeQueue::push(const PurgeItem &pi, Context *completion) if (!could_consume) { // Usually, it is not necessary to explicitly flush here, because the reader // will get flushes generated inside Journaler::is_readable. However, - // if we remain in a can_consume()==false state for a long period then + // if we remain in a _can_consume()==false state for a long period then // we should flush in order to allow MDCache to drop its strays rather // than having them wait for purgequeue to progress. if (!delayed_flush) { @@ -333,7 +333,7 @@ uint32_t PurgeQueue::_calculate_ops(const PurgeItem &item) const return ops_required; } -bool PurgeQueue::can_consume() +bool PurgeQueue::_can_consume() { dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, " << in_flight.size() << "/" << g_conf()->mds_max_purge_files @@ -367,7 +367,7 @@ bool PurgeQueue::_consume() ceph_assert(lock.is_locked_by_me()); bool could_consume = false; - while(can_consume()) { + while(_can_consume()) { if (delayed_flush) { // We are now going to read from the journal, so any proactive @@ -637,6 +637,8 @@ bool PurgeQueue::drain( size_t *in_flight_count ) { + std::lock_guard l(lock); + ceph_assert(progress != nullptr); ceph_assert(progress_total != nullptr); ceph_assert(in_flight_count != nullptr); diff --git a/src/mds/PurgeQueue.h b/src/mds/PurgeQueue.h index 037bda255c9..1c18482d401 100644 --- a/src/mds/PurgeQueue.h +++ b/src/mds/PurgeQueue.h @@ -134,7 +134,7 @@ protected: uint32_t _calculate_ops(const PurgeItem &item) const; - bool can_consume(); + bool _can_consume(); // How many bytes were remaining when drain() was first called, // used for indicating progress. From e540028cbac453b5461c4a9cad70df31fc4cd80d Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 18 Dec 2018 15:10:30 -0800 Subject: [PATCH 3/5] mds: return string_view for type str This is a cheap refactor. Signed-off-by: Patrick Donnelly --- src/mds/PurgeQueue.cc | 2 +- src/mds/PurgeQueue.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc index 1c4577a3e54..62c77184da2 100644 --- a/src/mds/PurgeQueue.cc +++ b/src/mds/PurgeQueue.cc @@ -671,7 +671,7 @@ bool PurgeQueue::drain( return false; } -std::string PurgeItem::get_type_str() const +std::string_view PurgeItem::get_type_str() const { switch(action) { case PurgeItem::NONE: return "NONE"; diff --git a/src/mds/PurgeQueue.h b/src/mds/PurgeQueue.h index 1c18482d401..f762a426f76 100644 --- a/src/mds/PurgeQueue.h +++ b/src/mds/PurgeQueue.h @@ -76,7 +76,7 @@ public: f->close_section(); } - std::string get_type_str() const; + std::string_view get_type_str() const; private: static const std::map actions; }; From 4cccc4dffb0915ef9e7d3b446e9a32f277646562 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 18 Dec 2018 15:11:02 -0800 Subject: [PATCH 4/5] mds: setup readonly mode for PurgeQueue If the PQ faces an error, it should go read-only along with the MDS rank. Fixes: http://tracker.ceph.com/issues/37543 Signed-off-by: Patrick Donnelly --- src/mds/PurgeQueue.cc | 65 +++++++++++++++++++++++++++++++++++++------ src/mds/PurgeQueue.h | 3 ++ 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/src/mds/PurgeQueue.cc b/src/mds/PurgeQueue.cc index 62c77184da2..a93c9d72181 100644 --- a/src/mds/PurgeQueue.cc +++ b/src/mds/PurgeQueue.cc @@ -19,6 +19,7 @@ #include "PurgeQueue.h" +#include #define dout_context cct #define dout_subsys ceph_subsys_mds @@ -139,6 +140,12 @@ void PurgeQueue::init() void PurgeQueue::activate() { std::lock_guard l(lock); + + if (readonly) { + dout(10) << "skipping activate: PurgeQueue is readonly" << dendl; + return; + } + if (journaler.get_read_pos() == journaler.get_write_pos()) return; @@ -193,7 +200,7 @@ void PurgeQueue::open(Context *completion) finish_contexts(g_ceph_context, waiting_for_recovery); } else { derr << "Error " << r << " loading Journaler" << dendl; - on_error->complete(r); + _go_readonly(r); } })); } @@ -201,10 +208,14 @@ void PurgeQueue::open(Context *completion) void PurgeQueue::wait_for_recovery(Context* c) { std::lock_guard l(lock); - if (recovered) + if (recovered) { c->complete(0); - else + } else if (readonly) { + dout(10) << "cannot wait for recovery: PurgeQueue is readonly" << dendl; + c->complete(-EROFS); + } else { waiting_for_recovery.push_back(c); + } } void PurgeQueue::_recover() @@ -226,7 +237,7 @@ void PurgeQueue::_recover() if (journaler.get_error()) { int r = journaler.get_error(); derr << "Error " << r << " recovering write_pos" << dendl; - on_error->complete(r); + _go_readonly(r); return; } @@ -260,8 +271,12 @@ void PurgeQueue::create(Context *fin) journaler.create(&layout, JOURNAL_FORMAT_RESILIENT); journaler.write_head(new FunctionContext([this](int r) { std::lock_guard l(lock); - recovered = true; - finish_contexts(g_ceph_context, waiting_for_recovery); + if (r) { + _go_readonly(r); + } else { + recovered = true; + finish_contexts(g_ceph_context, waiting_for_recovery); + } })); } @@ -273,6 +288,12 @@ void PurgeQueue::push(const PurgeItem &pi, Context *completion) dout(4) << "pushing inode " << pi.ino << dendl; std::lock_guard l(lock); + if (readonly) { + dout(10) << "cannot push inode: PurgeQueue is readonly" << dendl; + completion->complete(-EROFS); + return; + } + // Callers should have waited for open() before using us ceph_assert(!journaler.is_readonly()); @@ -335,6 +356,11 @@ uint32_t PurgeQueue::_calculate_ops(const PurgeItem &item) const bool PurgeQueue::_can_consume() { + if (readonly) { + dout(10) << "can't consume: PurgeQueue is readonly" << dendl; + return false; + } + dout(20) << ops_in_flight << "/" << max_purge_ops << " ops, " << in_flight.size() << "/" << g_conf()->mds_max_purge_files << " files" << dendl; @@ -362,6 +388,17 @@ bool PurgeQueue::_can_consume() } } +void PurgeQueue::_go_readonly(int r) +{ + if (readonly) return; + dout(1) << "going readonly because internal IO failed: " << strerror(-r) << dendl; + readonly = true; + on_error->complete(r); + on_error = nullptr; + journaler.set_readonly(); + finish_contexts(g_ceph_context, waiting_for_recovery, r); +} + bool PurgeQueue::_consume() { ceph_assert(lock.is_locked_by_me()); @@ -379,7 +416,7 @@ bool PurgeQueue::_consume() if (int r = journaler.get_error()) { derr << "Error " << r << " recovering write_pos" << dendl; - on_error->complete(r); + _go_readonly(r); return could_consume; } @@ -393,7 +430,7 @@ bool PurgeQueue::_consume() if (r == 0) { _consume(); } else if (r != -EAGAIN) { - on_error->complete(r); + _go_readonly(r); } })); } @@ -415,7 +452,7 @@ bool PurgeQueue::_consume() } catch (const buffer::error &err) { derr << "Decode error at read_pos=0x" << std::hex << journaler.get_read_pos() << dendl; - on_error->complete(0); + _go_readonly(EIO); } dout(20) << " executing item (" << item.ino << ")" << dendl; _execute_item(item, journaler.get_read_pos()); @@ -582,6 +619,11 @@ void PurgeQueue::update_op_limit(const MDSMap &mds_map) { std::lock_guard l(lock); + if (readonly) { + dout(10) << "skipping; PurgeQueue is readonly" << dendl; + return; + } + uint64_t pg_count = 0; objecter->with_osdmap([&](const OSDMap& o) { // Number of PGs across all data pools @@ -639,6 +681,11 @@ bool PurgeQueue::drain( { std::lock_guard l(lock); + if (readonly) { + dout(10) << "skipping drain; PurgeQueue is readonly" << dendl; + return true; + } + ceph_assert(progress != nullptr); ceph_assert(progress_total != nullptr); ceph_assert(in_flight_count != nullptr); diff --git a/src/mds/PurgeQueue.h b/src/mds/PurgeQueue.h index f762a426f76..feb304019bd 100644 --- a/src/mds/PurgeQueue.h +++ b/src/mds/PurgeQueue.h @@ -106,6 +106,7 @@ protected: CephContext *cct; const mds_rank_t rank; Mutex lock; + bool readonly = false; int64_t metadata_pool; @@ -164,6 +165,8 @@ protected: bool recovered; std::list waiting_for_recovery; + void _go_readonly(int r); + public: void init(); void activate(); From c7ce967b778a0b86b335f6801301e484aaf6ebc3 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Sun, 23 Dec 2018 14:22:49 -0800 Subject: [PATCH 5/5] mds: allow boot on read-only Signed-off-by: Patrick Donnelly --- qa/tasks/cephfs/fuse_mount.py | 11 ++++++++-- qa/tasks/cephfs/test_damage.py | 37 +++++++++++++++++++++++++--------- src/mds/MDSRank.cc | 2 ++ 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/qa/tasks/cephfs/fuse_mount.py b/qa/tasks/cephfs/fuse_mount.py index e76a0165028..0c0c84b7537 100644 --- a/qa/tasks/cephfs/fuse_mount.py +++ b/qa/tasks/cephfs/fuse_mount.py @@ -229,8 +229,15 @@ class FuseMount(CephFSMount): # Now that we're mounted, set permissions so that the rest of the test will have # unrestricted access to the filesystem mount. - self.client_remote.run( - args=['sudo', 'chmod', '1777', self.mountpoint], timeout=(15*60)) + try: + stderr = StringIO() + self.client_remote.run(args=['sudo', 'chmod', '1777', self.mountpoint], timeout=(15*60), stderr=stderr) + except run.CommandFailedError: + stderr = stderr.getvalue() + if "Read-only file system".lower() in stderr.lower(): + pass + else: + raise def _mountpoint_exists(self): return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False, timeout=(15*60)).exitstatus == 0 diff --git a/qa/tasks/cephfs/test_damage.py b/qa/tasks/cephfs/test_damage.py index 459077b0428..c6728880cc3 100644 --- a/qa/tasks/cephfs/test_damage.py +++ b/qa/tasks/cephfs/test_damage.py @@ -12,6 +12,7 @@ DAMAGED_ON_START = "damaged_on_start" DAMAGED_ON_LS = "damaged_on_ls" CRASHED = "server crashed" NO_DAMAGE = "no damage" +READONLY = "readonly" FAILED_CLIENT = "client failed" FAILED_SERVER = "server failed" @@ -161,14 +162,22 @@ class TestDamage(CephFSTestCase): )) # Blatant corruptions - mutations.extend([ - MetadataMutation( - o, - "Corrupt {0}".format(o), - lambda o=o: self.fs.rados(["put", o, "-"], stdin_data=junk), - DAMAGED_ON_START - ) for o in data_objects - ]) + for obj_id in data_objects: + if obj_id == "500.00000000": + # purge queue corruption results in read-only FS + mutations.append(MetadataMutation( + obj_id, + "Corrupt {0}".format(obj_id), + lambda o=obj_id: self.fs.rados(["put", o, "-"], stdin_data=junk), + READONLY + )) + else: + mutations.append(MetadataMutation( + obj_id, + "Corrupt {0}".format(obj_id), + lambda o=obj_id: self.fs.rados(["put", o, "-"], stdin_data=junk), + DAMAGED_ON_START + )) # Truncations for obj_id in data_objects: @@ -316,7 +325,17 @@ class TestDamage(CephFSTestCase): else: log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc)) results[mutation] = FAILED_SERVER - + elif mutation.expectation == READONLY: + proc = self.mount_a.run_shell(["mkdir", "foo"], wait=False) + try: + proc.wait() + except CommandFailedError: + stderr = proc.stderr.getvalue() + log.info(stderr) + if "Read-only file system".lower() in stderr.lower(): + pass + else: + raise else: try: wait([proc], 20) diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index d126c013198..4ecbc35fcde 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -1461,6 +1461,8 @@ void MDSRank::boot_start(BootStep step, int r) << cpp_strerror(r); damaged(); ceph_assert(r == 0); // Unreachable, damaged() calls respawn() + } else if (r == -EROFS) { + dout(0) << "boot error forcing transition to read-only; MDS will try to continue" << dendl; } else { // Completely unexpected error, give up and die dout(0) << "boot_start encountered an error, failing" << dendl;