From f8eed41ecb0d784619eab80e55a3c10b9d984d23 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 18 Jan 2018 09:45:22 +0800 Subject: [PATCH 01/14] mds: cleanup CInode/CDir states definition Signed-off-by: "Yan, Zheng" --- src/mds/CDir.h | 60 +++++++++++++++++++++++++----------------------- src/mds/CInode.h | 36 ++++++++++++++--------------- 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 3e6bc0eb2d2..a81053f6951 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -73,25 +73,25 @@ public: } // -- state -- - static const unsigned STATE_COMPLETE = (1<< 1); // the complete contents are in cache - static const unsigned STATE_FROZENTREE = (1<< 2); // root of tree (bounded by exports) - static const unsigned STATE_FREEZINGTREE = (1<< 3); // in process of freezing - static const unsigned STATE_FROZENDIR = (1<< 4); - static const unsigned STATE_FREEZINGDIR = (1<< 5); - static const unsigned STATE_COMMITTING = (1<< 6); // mid-commit - static const unsigned STATE_FETCHING = (1<< 7); // currenting fetching - static const unsigned STATE_CREATING = (1<< 8); - static const unsigned STATE_IMPORTBOUND = (1<<10); - static const unsigned STATE_EXPORTBOUND = (1<<11); - static const unsigned STATE_EXPORTING = (1<<12); - static const unsigned STATE_IMPORTING = (1<<13); - static const unsigned STATE_FRAGMENTING = (1<<14); - static const unsigned STATE_STICKY = (1<<15); // sticky pin due to inode stickydirs - static const unsigned STATE_DNPINNEDFRAG = (1<<16); // dir is refragmenting - static const unsigned STATE_ASSIMRSTAT = (1<<17); // assimilating inode->frag rstats - static const unsigned STATE_DIRTYDFT = (1<<18); // dirty dirfragtree - static const unsigned STATE_BADFRAG = (1<<19); // bad dirfrag - static const unsigned STATE_AUXSUBTREE = (1<<20); // no subtree merge + static const unsigned STATE_COMPLETE = (1<< 0); // the complete contents are in cache + static const unsigned STATE_FROZENTREE = (1<< 1); // root of tree 
(bounded by exports) + static const unsigned STATE_FREEZINGTREE = (1<< 2); // in process of freezing + static const unsigned STATE_FROZENDIR = (1<< 3); + static const unsigned STATE_FREEZINGDIR = (1<< 4); + static const unsigned STATE_COMMITTING = (1<< 5); // mid-commit + static const unsigned STATE_FETCHING = (1<< 6); // currenting fetching + static const unsigned STATE_CREATING = (1<< 7); + static const unsigned STATE_IMPORTBOUND = (1<< 8); + static const unsigned STATE_EXPORTBOUND = (1<< 9); + static const unsigned STATE_EXPORTING = (1<<10); + static const unsigned STATE_IMPORTING = (1<<11); + static const unsigned STATE_FRAGMENTING = (1<<12); + static const unsigned STATE_STICKY = (1<<13); // sticky pin due to inode stickydirs + static const unsigned STATE_DNPINNEDFRAG = (1<<14); // dir is refragmenting + static const unsigned STATE_ASSIMRSTAT = (1<<15); // assimilating inode->frag rstats + static const unsigned STATE_DIRTYDFT = (1<<16); // dirty dirfragtree + static const unsigned STATE_BADFRAG = (1<<17); // bad dirfrag + static const unsigned STATE_AUXSUBTREE = (1<<18); // no subtree merge // common states static const unsigned STATE_CLEAN = 0; @@ -102,18 +102,20 @@ public: (STATE_COMPLETE|STATE_DIRTY|STATE_DIRTYDFT|STATE_BADFRAG); static const unsigned MASK_STATE_IMPORT_KEPT = ( - STATE_IMPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_STICKY); + STATE_IMPORTING | + STATE_IMPORTBOUND | + STATE_EXPORTBOUND | + STATE_FROZENTREE | + STATE_STICKY); static const unsigned MASK_STATE_EXPORT_KEPT = - (STATE_EXPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_FROZENDIR - |STATE_STICKY); + (STATE_EXPORTING | + STATE_IMPORTBOUND | + STATE_EXPORTBOUND | + STATE_FROZENTREE | + STATE_FROZENDIR | + STATE_STICKY); static const unsigned MASK_STATE_FRAGMENT_KEPT = - (STATE_DIRTY| + (STATE_DIRTY | STATE_EXPORTBOUND | STATE_IMPORTBOUND | STATE_AUXSUBTREE | diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 
152456e8252..c7de90b2ff2 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -214,24 +214,24 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter Date: Thu, 25 Jan 2018 08:31:30 +0800 Subject: [PATCH 02/14] mds: cleanup MDCache::opening_inodes access Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index de600c84f02..055aa49752a 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -8379,8 +8379,8 @@ struct C_MDC_OpenInoTraverseDir : public MDCacheContext { mdcache->handle_open_ino(msg, r); return; } - assert(mdcache->opening_inodes.count(ino)); - mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r); + auto& info = mdcache->opening_inodes.at(ino); + mdcache->_open_ino_traverse_dir(ino, info, r); } }; @@ -8397,8 +8397,7 @@ void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err { dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl; - assert(opening_inodes.count(ino)); - open_ino_info_t& info = opening_inodes[ino]; + open_ino_info_t& info = opening_inodes.at(ino); CInode *in = get_inode(ino); if (in) { @@ -8473,8 +8472,7 @@ void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret) { dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl; - assert(opening_inodes.count(ino)); - open_ino_info_t& info = opening_inodes[ino]; + open_ino_info_t& info = opening_inodes.at(ino); CInode *in = get_inode(ino); if (in) { @@ -8830,8 +8828,9 @@ void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin, dout(10) << "open_ino " << ino << " pool " << pool << " want_replica " << want_replica << dendl; - if (opening_inodes.count(ino)) { - open_ino_info_t& info = opening_inodes[ino]; + auto it = opening_inodes.find(ino); + if (it != opening_inodes.end()) { + open_ino_info_t& info = it->second; if (want_replica) { 
info.want_replica = true; if (want_xlocked && !info.want_xlocked) { From a59f3536f48e3d59cdc1ca65f2d83cb8267bfd18 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 16 Jan 2018 15:16:45 +0800 Subject: [PATCH 03/14] mds: track how many clients/mds want caps for each inode Signed-off-by: "Yan, Zheng" --- src/mds/CInode.cc | 47 +++++++++++++++++++++++++++++++++++++++++-- src/mds/CInode.h | 10 +++++++-- src/mds/Capability.cc | 18 +++++++++++++++-- src/mds/Capability.h | 9 +++------ src/mds/Locker.cc | 11 ++++------ src/mds/MDCache.cc | 4 ++-- src/mds/Migrator.cc | 7 ++++++- 7 files changed, 84 insertions(+), 22 deletions(-) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 8c1719dc32e..dc71edff8cc 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -2730,7 +2730,7 @@ client_t CInode::calc_ideal_loner() { if (mdcache->is_readonly()) return -1; - if (!mds_caps_wanted.empty()) + if (!get_mds_caps_wanted().empty()) return -1; int n = 0; @@ -2844,6 +2844,43 @@ void CInode::choose_lock_states(int dirty_caps) choose_lock_state(&linklock, issued); } +void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map& m) +{ + bool old_empty = mds_caps_wanted.empty(); + mds_caps_wanted.swap(m); + if (old_empty != (bool)mds_caps_wanted.empty()) { + if (old_empty) + adjust_num_caps_wanted(1); + else + adjust_num_caps_wanted(-1); + } +} + +void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted) +{ + bool old_empty = mds_caps_wanted.empty(); + if (wanted) { + mds_caps_wanted[mds] = wanted; + if (old_empty) + adjust_num_caps_wanted(1); + } else if (!old_empty) { + mds_caps_wanted.erase(mds); + if (mds_caps_wanted.empty()) + adjust_num_caps_wanted(-1); + } +} + +void CInode::adjust_num_caps_wanted(int d) +{ + if (!num_caps_wanted && d > 0) + ; // add 'this' to open file table + else if (num_caps_wanted > 0 && num_caps_wanted == -d) + ; // remove 'this' from open file table + + num_caps_wanted +=d; + assert(num_caps_wanted >= 0); +} + Capability 
*CInode::add_client_cap(client_t client, Session *session, SnapRealm *conrealm) { assert(last == CEPH_NOSNAP); @@ -2888,6 +2925,9 @@ void CInode::remove_client_cap(client_t client) if (client == loner_cap) loner_cap = -1; + if (cap->wanted()) + adjust_num_caps_wanted(-1); + delete cap; client_caps.erase(client); if (client_caps.empty()) { @@ -2947,7 +2987,10 @@ void CInode::clear_client_caps_after_export() remove_client_cap(client_caps.begin()->first); loner_cap = -1; want_loner_cap = -1; - mds_caps_wanted.clear(); + if (!get_mds_caps_wanted().empty()) { + mempool::mds_co::compact_map empty; + set_mds_caps_wanted(empty); + } } void CInode::export_client_caps(map& cl) diff --git a/src/mds/CInode.h b/src/mds/CInode.h index c7de90b2ff2..55c07dcbf9f 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -564,7 +564,8 @@ protected: using cap_map = mempool::mds_co::map; cap_map client_caps; // client -> caps mempool::mds_co::compact_map mds_caps_wanted; // [auth] mds -> caps wanted - int replica_caps_wanted = 0; // [replica] what i've requested from auth + int replica_caps_wanted = 0; // [replica] what i've requested from auth + int num_caps_wanted = 0; public: mempool::mds_co::compact_map > client_snap_caps; // [auth] [snap] dirty metadata we still need from the head @@ -689,6 +690,7 @@ public: clear_file_locks(); assert(num_projected_xattrs == 0); assert(num_projected_srnodes == 0); + assert(num_caps_wanted == 0); } @@ -973,7 +975,8 @@ public: bool is_any_nonstale_caps() { return count_nonstale_caps(); } const mempool::mds_co::compact_map& get_mds_caps_wanted() const { return mds_caps_wanted; } - mempool::mds_co::compact_map& get_mds_caps_wanted() { return mds_caps_wanted; } + void set_mds_caps_wanted(mempool::mds_co::compact_map& m); + void set_mds_caps_wanted(mds_rank_t mds, int32_t wanted); const cap_map& get_client_caps() const { return client_caps; } Capability *get_client_cap(client_t client) { @@ -991,6 +994,9 @@ public: } } + int get_num_caps_wanted() const { 
return num_caps_wanted; } + void adjust_num_caps_wanted(int d); + Capability *add_client_cap(client_t client, Session *session, SnapRealm *conrealm=0); void remove_client_cap(client_t client); void move_to_realm(SnapRealm *realm); diff --git a/src/mds/Capability.cc b/src/mds/Capability.cc index 78c531865d8..3cd408937b3 100644 --- a/src/mds/Capability.cc +++ b/src/mds/Capability.cc @@ -13,6 +13,7 @@ */ #include "Capability.h" +#include "CInode.h" #include "common/Formatter.h" @@ -141,6 +142,17 @@ void Capability::revoke_info::generate_test_instances(listadjust_num_caps_wanted(1); + else if (_wanted && !w) + in->adjust_num_caps_wanted(-1); + } + _wanted = w; +} + void Capability::encode(bufferlist& bl) const { ENCODE_START(2, 2, bl) @@ -159,7 +171,9 @@ void Capability::decode(bufferlist::iterator &bl) decode(last_sent, bl); decode(last_issue_stamp, bl); - decode(_wanted, bl); + __u32 tmp_wanted; + decode(tmp_wanted, bl); + set_wanted(tmp_wanted); decode(_pending, bl); decode(_revokes, bl); DECODE_FINISH(bl); @@ -189,7 +203,7 @@ void Capability::generate_test_instances(list& ls) ls.push_back(new Capability); ls.back()->last_sent = 11; ls.back()->last_issue_stamp = utime_t(12, 13); - ls.back()->_wanted = 14; + ls.back()->set_wanted(14); ls.back()->_pending = 15; { auto &r = ls.back()->_revokes.emplace_back(); diff --git a/src/mds/Capability.h b/src/mds/Capability.h index 2934ab9ebd7..fc8e9b34fc7 100644 --- a/src/mds/Capability.h +++ b/src/mds/Capability.h @@ -257,10 +257,7 @@ public: // caps this client wants to hold int wanted() { return _wanted; } - void set_wanted(int w) { - _wanted = w; - //check_rdcaps_list(); - } + void set_wanted(int w); void inc_last_seq() { last_sent++; } ceph_seq_t get_last_seq() { return last_sent; } @@ -291,7 +288,7 @@ public: client_follows = other.client_follows; // wanted - _wanted = _wanted | other.wanted; + set_wanted(wanted() | other.wanted); if (auth_cap) mseq = other.mseq; } @@ -308,7 +305,7 @@ public: } // wanted - _wanted = 
_wanted | otherwanted; + set_wanted(wanted() | otherwanted); } void revoke() { diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 4ab6d6194fe..118fdee5fb8 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2263,10 +2263,7 @@ void Locker::handle_inode_file_caps(MInodeFileCaps *m) dout(7) << "handle_inode_file_caps replica mds." << from << " wants caps " << ccap_string(m->get_caps()) << " on " << *in << dendl; - if (m->get_caps()) - in->mds_caps_wanted[from] = m->get_caps(); - else - in->mds_caps_wanted.erase(from); + in->set_mds_caps_wanted(from, m->get_caps()); try_eval(in, CEPH_CAP_LOCKS); m->put(); @@ -3336,7 +3333,7 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap, bool need_issue = false; if (cap) cap->inc_suppress(); - if (in->mds_caps_wanted.empty() && + if (in->get_mds_caps_wanted().empty() && (in->get_loner() >= 0 || (in->get_wanted_loner() >= 0 && in->try_set_loner()))) { if (in->filelock.get_state() != LOCK_EXCL) file_excl(&in->filelock, &need_issue); @@ -5042,7 +5039,7 @@ void Locker::file_excl(ScatterLock *lock, bool *need_issue) assert(in->is_auth()); assert(lock->is_stable()); - assert((in->get_loner() >= 0 && in->mds_caps_wanted.empty()) || + assert((in->get_loner() >= 0 && in->get_mds_caps_wanted().empty()) || (lock->get_state() == LOCK_XSYN)); // must do xsyn -> excl -> switch (lock->get_state()) { @@ -5103,7 +5100,7 @@ void Locker::file_xsyn(SimpleLock *lock, bool *need_issue) dout(7) << "file_xsyn on " << *lock << " on " << *lock->get_parent() << dendl; CInode *in = static_cast(lock->get_parent()); assert(in->is_auth()); - assert(in->get_loner() >= 0 && in->mds_caps_wanted.empty()); + assert(in->get_loner() >= 0 && in->get_mds_caps_wanted().empty()); switch (lock->get_state()) { case LOCK_EXCL: lock->set_state(LOCK_EXCL_XSYN); break; diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 055aa49752a..c210eedd6ec 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -4803,7 +4803,7 @@ void 
MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) // caps_wanted if (is.caps_wanted) { - in->mds_caps_wanted[from] = is.caps_wanted; + in->set_mds_caps_wanted(from, is.caps_wanted); dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted) << " on " << *in << dendl; } @@ -7413,7 +7413,7 @@ void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin, set& gather_locks) { in->remove_replica(from); - in->mds_caps_wanted.erase(from); + in->set_mds_caps_wanted(from, 0); // note: this code calls _eval more often than it needs to! // fix lock diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index a28be44bc3d..4317893d62d 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -3047,7 +3047,12 @@ void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap, map cap_map; decode(cap_map, blp); if (auth_cap) - decode(in->get_mds_caps_wanted(), blp); + if (auth_cap) { + mempool::mds_co::compact_map mds_wanted; + decode(mds_wanted, blp); + mds_wanted.erase(mds->get_nodeid()); + in->set_mds_caps_wanted(mds_wanted); + } if (!cap_map.empty() || (auth_cap && (in->get_caps_wanted() & ~CEPH_CAP_PIN))) { peer_exports[in].swap(cap_map); From 01d90d0b32f36e1ffe3e666f7f525ef03709393a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 19 Jan 2018 10:47:40 +0800 Subject: [PATCH 04/14] mds: introduce open file table The open file table records linkage information for inodes on which clients or MDSs want caps, along with linkage information for their ancestor directories. Using this information, a recovering MDS can easily get the path to any inode in the table. MDLog saves the open file table to the object store at the same time as it trims log segments. MDLog only trims log segments that were filled before the last time the open file table was saved to the object store. This way, an 'open' inode is either in the log or has its path information saved in the object store. 
Signed-off-by: "Yan, Zheng" --- src/mds/Anchor.cc | 60 ++++++ src/mds/Anchor.h | 55 +++++ src/mds/CDir.cc | 6 + src/mds/CInode.cc | 4 +- src/mds/CInode.h | 4 +- src/mds/CMakeLists.txt | 2 + src/mds/MDCache.cc | 3 +- src/mds/MDCache.h | 3 + src/mds/MDLog.cc | 41 +++- src/mds/MDLog.h | 1 + src/mds/MDSRank.cc | 6 +- src/mds/OpenFileTable.cc | 432 +++++++++++++++++++++++++++++++++++++++ src/mds/OpenFileTable.h | 85 ++++++++ 13 files changed, 696 insertions(+), 6 deletions(-) create mode 100644 src/mds/Anchor.cc create mode 100644 src/mds/Anchor.h create mode 100644 src/mds/OpenFileTable.cc create mode 100644 src/mds/OpenFileTable.h diff --git a/src/mds/Anchor.cc b/src/mds/Anchor.cc new file mode 100644 index 00000000000..99f1504ea74 --- /dev/null +++ b/src/mds/Anchor.cc @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "mds/Anchor.h" + +#include "common/Formatter.h" + +void Anchor::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(ino, bl); + encode(dirino, bl); + encode(d_name, bl); + encode(d_type, bl); + ENCODE_FINISH(bl); +} + +void Anchor::decode(bufferlist::iterator &bl) +{ + DECODE_START(1, bl); + decode(ino, bl); + decode(dirino, bl); + decode(d_name, bl); + decode(d_type, bl); + DECODE_FINISH(bl); +} + +void Anchor::dump(Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->dump_unsigned("dirino", dirino); + f->dump_string("d_name", d_name); + f->dump_unsigned("d_type", d_type); +} + +void Anchor::generate_test_instances(list& ls) +{ + ls.push_back(new Anchor); + ls.push_back(new Anchor); + ls.back()->ino = 1; + ls.back()->dirino = 2; + ls.back()->d_name = "hello"; + ls.back()->d_type = DT_DIR; +} + +ostream& operator<<(ostream& out, const Anchor &a) +{ + return out << "a(" << a.ino << " " << a.dirino << "/'" << a.d_name << "' " << a.d_type << ")"; +} diff --git a/src/mds/Anchor.h b/src/mds/Anchor.h new file mode 100644 index 00000000000..7b6344af324 --- /dev/null +++ b/src/mds/Anchor.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_ANCHOR_H +#define CEPH_ANCHOR_H + +#include + +#include "include/types.h" +#include "mdstypes.h" +#include "include/buffer.h" + +/* + * Anchor represents primary linkage of an inode. When adding inode to an + * anchor table, MDS ensures that the table also contains inode's ancestor + * inodes. MDS can get inode's path by looking up anchor table recursively. 
+ */ +class Anchor { +public: + inodeno_t ino; // anchored ino + inodeno_t dirino; + std::string d_name; + __u8 d_type; + mutable int nref; // how many children + + Anchor() : d_type(0), nref(0) {} + Anchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp, int nr) : + ino(i), dirino(di), d_name(str), d_type(tp), nref(nr) { } + + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list& ls); +}; +WRITE_CLASS_ENCODER(Anchor) + +inline bool operator==(const Anchor &l, const Anchor &r) { + return l.ino == r.ino && l.dirino == r.dirino && + l.d_name == r.d_name && l.d_type == r.d_type; +} + +ostream& operator<<(ostream& out, const Anchor &a); + +#endif diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 9fb6a323fe8..5fffead64ab 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -566,6 +566,9 @@ void CDir::link_inode_work( CDentry *dn, CInode *in) // pin dentry? if (in->get_num_ref()) dn->get(CDentry::PIN_INODEPIN); + + if (in->state_test(CInode::STATE_TRACKEDBYOFT)) + inode->mdcache->open_file_table.notify_link(in); // adjust auth pin count if (in->auth_pins + in->nested_auth_pins) @@ -642,6 +645,9 @@ void CDir::unlink_inode_work( CDentry *dn ) // unpin dentry? 
if (in->get_num_ref()) dn->put(CDentry::PIN_INODEPIN); + + if (in->state_test(CInode::STATE_TRACKEDBYOFT)) + inode->mdcache->open_file_table.notify_unlink(in); // unlink auth_pin count if (in->auth_pins + in->nested_auth_pins) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index dc71edff8cc..1378e8b440e 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -2873,9 +2873,9 @@ void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted) void CInode::adjust_num_caps_wanted(int d) { if (!num_caps_wanted && d > 0) - ; // add 'this' to open file table + mdcache->open_file_table.add_inode(this); else if (num_caps_wanted > 0 && num_caps_wanted == -d) - ; // remove 'this' from open file table + mdcache->open_file_table.remove_inode(this); num_caps_wanted +=d; assert(num_caps_wanted >= 0); diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 55c07dcbf9f..213ec09a17c 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -232,13 +232,15 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counterobjecter, m->finisher), exceeded_size_limit(false), recovery_queue(m), - stray_manager(m, purge_queue_) + stray_manager(m, purge_queue_), + open_file_table(m) { migrator.reset(new Migrator(mds, this)); root = NULL; diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index bc551a15c68..34c92dd96e9 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -31,6 +31,7 @@ #include "events/EMetaBlob.h" #include "RecoveryQueue.h" #include "StrayManager.h" +#include "OpenFileTable.h" #include "MDSContext.h" #include "MDSMap.h" #include "Mutation.h" @@ -1227,6 +1228,8 @@ public: public: /* Because exports may fail, this set lets us keep track of inodes that need exporting. 
*/ std::set export_pin_queue; + + OpenFileTable open_file_table; }; class C_MDS_RetryRequest : public MDSInternalContext { diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index a38f2d0db20..4dc3e9fe134 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -566,6 +566,17 @@ void MDLog::_journal_segment_subtree_map(MDSInternalContextBase *onsync) _submit_entry(sle, new C_MDL_Flushed(this, onsync)); } +class C_OFT_Committed : public MDSInternalContext { + MDLog *mdlog; + uint64_t seq; +public: + C_OFT_Committed(MDLog *l, uint64_t s) : + MDSInternalContext(l->mds), mdlog(l), seq(s) {} + void finish(int ret) override { + mdlog->trim_expired_segments(); + } +}; + void MDLog::trim(int m) { unsigned max_segments = g_conf->mds_log_max_segments; @@ -636,6 +647,7 @@ void MDLog::trim(int m) << journaler->get_write_safe_pos() << " < end " << ls->end << dendl; break; } + if (expiring_segments.count(ls)) { dout(5) << "trim already expiring segment " << ls->seq << "/" << ls->offset << ", " << ls->num_events << " events" << dendl; @@ -657,6 +669,18 @@ void MDLog::trim(int m) } } + if (!capped && + !mds->mdcache->open_file_table.is_any_committing()) { + uint64_t last_seq = get_last_segment_seq(); + if (mds->mdcache->open_file_table.is_any_dirty() || + last_seq > mds->mdcache->open_file_table.get_committed_log_seq()) { + submit_mutex.Unlock(); + mds->mdcache->open_file_table.commit(new C_OFT_Committed(this, last_seq), + last_seq, CEPH_MSG_PRIO_HIGH); + submit_mutex.Lock(); + } + } + // discard expired segments and unlock submit_mutex _trim_expired_segments(); } @@ -689,8 +713,15 @@ int MDLog::trim_all() << "/" << expired_segments.size() << dendl; uint64_t last_seq = 0; - if (!segments.empty()) + if (!segments.empty()) { last_seq = get_last_segment_seq(); + if (last_seq > mds->mdcache->open_file_table.get_committing_log_seq()) { + submit_mutex.Unlock(); + mds->mdcache->open_file_table.commit(new C_OFT_Committed(this, last_seq), + last_seq, CEPH_MSG_PRIO_DEFAULT); + 
submit_mutex.Lock(); + } + } map::iterator p = segments.begin(); while (p != segments.end() && @@ -770,6 +801,8 @@ void MDLog::_trim_expired_segments() { assert(submit_mutex.is_locked_by_me()); + uint64_t oft_committed_seq = mds->mdcache->open_file_table.get_committed_log_seq(); + // trim expired segments? bool trimmed = false; while (!segments.empty()) { @@ -779,6 +812,12 @@ void MDLog::_trim_expired_segments() << " to expire" << dendl; break; } + + if (!capped && ls->seq >= oft_committed_seq) { + dout(10) << "_trim_expired_segments open file table committedseq " << oft_committed_seq + << " <= " << ls->seq << "/" << ls->offset << dendl; + break; + } dout(10) << "_trim_expired_segments trimming expired " << ls->seq << "/0x" << std::hex << ls->offset << std::dec << dendl; diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h index 5579e5abc00..fa3704be21a 100644 --- a/src/mds/MDLog.h +++ b/src/mds/MDLog.h @@ -306,6 +306,7 @@ private: friend class C_MaybeExpiredSegment; friend class C_MDL_Flushed; + friend class C_OFT_Committed; public: void trim_expired_segments(); diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 91d4e31d341..35ef49d4dc8 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -1075,6 +1075,8 @@ void MDSRank::boot_start(BootStep step, int r) } else if (!standby_replaying) { dout(2) << "boot_start " << step << ": opening purge queue (async)" << dendl; purge_queue.open(NULL); + dout(2) << "boot_start " << step << ": loading open file table (async)" << dendl; + mdcache->open_file_table.load(nullptr); } if (mdsmap->get_tableserver() == whoami) { @@ -1274,8 +1276,10 @@ void MDSRank::standby_replay_restart() this, mdlog->get_journaler()->get_read_pos())); - dout(1) << " opening purge queue (async)" << dendl; + dout(1) << " opening purge_queue (async)" << dendl; purge_queue.open(NULL); + dout(1) << " opening open_file_table (async)" << dendl; + mdcache->open_file_table.load(nullptr); } else { dout(1) << " waiting for osdmap " << 
mdsmap->get_last_failure_osd_epoch() << " (which blacklists prior instance)" << dendl; diff --git a/src/mds/OpenFileTable.cc b/src/mds/OpenFileTable.cc new file mode 100644 index 00000000000..b42d5667aeb --- /dev/null +++ b/src/mds/OpenFileTable.cc @@ -0,0 +1,432 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "mds/CInode.h" +#include "mds/MDSRank.h" +#include "mds/MDCache.h" +#include "osdc/Objecter.h" +#include "OpenFileTable.h" + +#include "common/config.h" +#include "common/errno.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix _prefix(_dout, mds) +static ostream& _prefix(std::ostream *_dout, MDSRank *mds) { + return *_dout << "mds." << mds->get_nodeid() << ".openfiles "; +} + +void OpenFileTable::get_ref(CInode *in) +{ + do { + auto p = anchor_map.find(in->ino()); + if (p != anchor_map.end()) { + assert(in->state_test(CInode::STATE_TRACKEDBYOFT)); + assert(p->second.nref > 0); + p->second.nref++; + break; + } + + CDentry *dn = in->get_parent_dn(); + CInode *pin = dn ? dn->get_dir()->get_inode() : nullptr; + + auto ret = anchor_map.emplace(std::piecewise_construct, std::forward_as_tuple(in->ino()), + std::forward_as_tuple(in->ino(), (pin ? pin->ino() : inodeno_t(0)), + (dn ? 
dn->get_name() : string()), in->d_type(), 1)); + assert(ret.second == true); + in->state_set(CInode::STATE_TRACKEDBYOFT); + + dirty_items.emplace(in->ino(), DIRTY_NEW); + + in = pin; + } while (in); +} + +void OpenFileTable::put_ref(CInode *in) +{ + do { + assert(in->state_test(CInode::STATE_TRACKEDBYOFT)); + auto p = anchor_map.find(in->ino()); + assert(p != anchor_map.end()); + assert(p->second.nref > 0); + + if (p->second.nref > 1) { + p->second.nref--; + break; + } + + CDentry *dn = in->get_parent_dn(); + CInode *pin = dn ? dn->get_dir()->get_inode() : nullptr; + if (dn) { + assert(p->second.dirino == pin->ino()); + assert(p->second.d_name == dn->get_name()); + } else { + assert(p->second.dirino == inodeno_t(0)); + assert(p->second.d_name == ""); + } + + anchor_map.erase(p); + in->state_clear(CInode::STATE_TRACKEDBYOFT); + + auto ret = dirty_items.emplace(in->ino(), 0); + if (!ret.second && (ret.first->second & DIRTY_NEW)) + dirty_items.erase(ret.first); + + in = pin; + } while (in); +} + +void OpenFileTable::add_inode(CInode *in) +{ + dout(10) << __func__ << " " << *in << dendl; + if (!in->is_dir()) { + auto p = anchor_map.find(in->ino()); + assert(p == anchor_map.end()); + } + get_ref(in); +} + +void OpenFileTable::remove_inode(CInode *in) +{ + dout(10) << __func__ << " " << *in << dendl; + if (!in->is_dir()) { + auto p = anchor_map.find(in->ino()); + assert(p != anchor_map.end()); + assert(p->second.nref == 1); + } + put_ref(in); +} + +void OpenFileTable::notify_link(CInode *in) +{ + dout(10) << __func__ << " " << *in << dendl; + auto p = anchor_map.find(in->ino()); + assert(p != anchor_map.end()); + assert(p->second.nref > 0); + assert(p->second.dirino == inodeno_t(0)); + assert(p->second.d_name == ""); + + CDentry *dn = in->get_parent_dn(); + CInode *pin = dn->get_dir()->get_inode(); + + p->second.dirino = pin->ino(); + p->second.d_name = dn->get_name(); + dirty_items.emplace(in->ino(), 0); + + get_ref(pin); +} + +void OpenFileTable::notify_unlink(CInode 
*in) +{ + dout(10) << __func__ << " " << *in << dendl; + auto p = anchor_map.find(in->ino()); + assert(p != anchor_map.end()); + assert(p->second.nref > 0); + + CDentry *dn = in->get_parent_dn(); + CInode *pin = dn->get_dir()->get_inode(); + assert(p->second.dirino == pin->ino()); + assert(p->second.d_name == dn->get_name()); + + p->second.dirino = inodeno_t(0); + p->second.d_name = ""; + dirty_items.emplace(in->ino(), 0); + + put_ref(pin); +} + +object_t OpenFileTable::get_object_name() const +{ + char s[30]; + snprintf(s, sizeof(s), "mds%d_openfiles", int(mds->get_nodeid())); + return object_t(s); +} + +class C_IO_OFT_Save : public MDSIOContextBase { +protected: + OpenFileTable *oft; + uint64_t log_seq; + MDSInternalContextBase *fin; + MDSRank *get_mds() override { return oft->mds; } +public: + C_IO_OFT_Save(OpenFileTable *t, uint64_t s, MDSInternalContextBase *c) : + oft(t), log_seq(s), fin(c) {} + void finish(int r) { + oft->_commit_finish(r, log_seq, fin); + } +}; + +void OpenFileTable::_commit_finish(int r, uint64_t log_seq, MDSInternalContextBase *fin) +{ + dout(10) << __func__ << " log_seq " << log_seq << dendl; + if (r < 0) { + mds->handle_write_error(r); + return; + } + + assert(log_seq <= committing_log_seq); + assert(log_seq >= committed_log_seq); + committed_log_seq = log_seq; + num_pending_commit--; + + if (fin) + fin->complete(r); +} + +void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_prio) +{ + dout(10) << __func__ << " log_seq " << log_seq << dendl; + const unsigned max_write_size = mds->mdcache->max_dir_commit_size; + + assert(log_seq >= committing_log_seq); + committing_log_seq = log_seq; + + C_GatherBuilder gather(g_ceph_context, + new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c), + mds->finisher)); + + SnapContext snapc; + object_t oid = get_object_name(); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + + bool first = true; + unsigned write_size = 0; + std::map to_update; + std::set to_remove; + + 
auto commit_func = [&](bool last) { + ObjectOperation op; + op.priority = op_prio; + + if (clear_on_commit) { + op.omap_clear(); + op.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK); + clear_on_commit = false; + } + + if (last) { + bufferlist header; + encode(log_seq, header); + op.omap_set_header(header); + } else if (first) { + // make incomplete + bufferlist header; + encode((uint64_t)0, header); + op.omap_set_header(header); + } + + if (!to_update.empty()) + op.omap_set(to_update); + if (!to_remove.empty()) + op.omap_rm_keys(to_remove); + + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), 0, + gather.new_sub()); + + first = false; + write_size = 0; + to_update.clear(); + to_remove.clear(); + }; + + bool first_commit = !loaded_anchor_map.empty(); + + using ceph::encode; + for (auto& it : dirty_items) { + auto p = anchor_map.find(it.first); + if (first_commit) { + auto q = loaded_anchor_map.find(it.first); + if (q != loaded_anchor_map.end()) { + bool same = (p != anchor_map.end() && p->second == q->second); + loaded_anchor_map.erase(q); + if (same) + continue; + } + } + + char key[32]; + int len = snprintf(key, sizeof(key), "%llx", (unsigned long long)it.first.val); + write_size += len + sizeof(__u32); + + if (p != anchor_map.end()) { + bufferlist bl; + encode(p->second, bl); + + write_size += bl.length() + sizeof(__u32); + + to_update[key].swap(bl); + } else { + to_remove.emplace(key); + } + + if (write_size >= max_write_size) { + commit_func(false); + } + } + dirty_items.clear(); + + if (first_commit) { + for (auto& it : loaded_anchor_map) { + char key[32]; + int len = snprintf(key, sizeof(key), "%llx", (unsigned long long)it.first.val); + write_size += len + sizeof(__u32); + to_remove.emplace(key); + + if (write_size >= max_write_size) { + commit_func(false); + } + } + loaded_anchor_map.clear(); + } + + commit_func(true); + + num_pending_commit++; + gather.activate(); +} + +class C_IO_OFT_Load : public MDSIOContextBase { +protected: + 
OpenFileTable *oft; + MDSRank *get_mds() override { return oft->mds; } + +public: + int header_r = 0; //< Return value from OMAP header read + int values_r = 0; //< Return value from OMAP value read + bufferlist header_bl; + std::map values; + bool more = false; + bool first; + + C_IO_OFT_Load(OpenFileTable *t, bool f) : oft(t), first(f) {} + void finish(int r) { + oft->_load_finish(r, header_r, values_r, first, more, header_bl, values); + } +}; + +void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, + bool first, bool more, + bufferlist &header_bl, + std::map &values) +{ + if (op_r < 0) { + derr << __func__ << " got " << cpp_strerror(op_r) << dendl; + clear_on_commit = true; + if (!first) + loaded_anchor_map.clear(); + goto out; + } + + try { + using ceph::decode; + if (first) { + bufferlist::iterator p = header_bl.begin(); + uint64_t log_seq; + decode(log_seq, p); + committed_log_seq = committing_log_seq = log_seq; + if (log_seq == 0) { + dout(1) << __func__ << ": incomplete values" << dendl; + goto out; + } + } + + for (auto& it : values) { + inodeno_t ino; + sscanf(it.first.c_str(), "%llx", (unsigned long long*)&ino.val); + + auto r = loaded_anchor_map.emplace_hint(loaded_anchor_map.end(), + std::piecewise_construct, + std::make_tuple(ino), + std::make_tuple()); + Anchor& anchor = r->second; + + bufferlist::iterator p = it.second.begin(); + decode(anchor, p); + assert(ino == anchor.ino); + } + } catch (buffer::error &e) { + derr << __func__ << ": corrupted header/values: " << e.what() << dendl; + clear_on_commit = true; + loaded_anchor_map.clear(); + goto out; + } + + if (more) { + // Issue another read if we're not at the end of the omap + const std::string last_key = values.rbegin()->first; + dout(10) << __func__ << ": continue to load from '" << last_key << "'" << dendl; + object_t oid = get_object_name(); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + C_IO_OFT_Load *c = new C_IO_OFT_Load(this, false); + ObjectOperation op; + 
op.omap_get_vals(last_key, "", uint64_t(-1), + &c->values, &c->more, &c->values_r); + mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, nullptr, 0, + new C_OnFinisher(c, mds->finisher)); + return; + } + + dout(10) << __func__ << ": load complete" << dendl; +out: + load_done = true; + finish_contexts(g_ceph_context, waiting_for_load); + waiting_for_load.clear(); +} + +void OpenFileTable::load(MDSInternalContextBase *onload) +{ + dout(10) << __func__ << dendl; + assert(!load_done); + if (onload) + waiting_for_load.push_back(onload); + + C_IO_OFT_Load *c = new C_IO_OFT_Load(this, true); + object_t oid = get_object_name(); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + + ObjectOperation op; + op.omap_get_header(&c->header_bl, &c->header_r); + op.omap_get_vals("", "", uint64_t(-1), + &c->values, &c->more, &c->values_r); + + mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, nullptr, 0, + new C_OnFinisher(c, mds->finisher)); +} + +bool OpenFileTable::get_ancestors(inodeno_t ino, vector& ancestors) +{ + auto p = loaded_anchor_map.find(ino); + if (p == loaded_anchor_map.end()) + return false; + + inodeno_t dirino = p->second.dirino; + if (dirino == inodeno_t(0)) + return false; + + ancestors.clear(); + while (true) { + ancestors.push_back(inode_backpointer_t(dirino, p->second.d_name, 0)); + + p = loaded_anchor_map.find(dirino); + if (p == loaded_anchor_map.end()) + break; + + dirino = p->second.dirino; + if (dirino == inodeno_t(0)) + break; + } + return true; +} diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h new file mode 100644 index 00000000000..5ed1600d8b6 --- /dev/null +++ b/src/mds/OpenFileTable.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as 
published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef OPEN_FILE_TABLE_H +#define OPEN_FILE_TABLE_H + +#include "mdstypes.h" +#include "Anchor.h" + +class CDir; +class CInode; +class MDSRank; +class MDSInternalContextBase; + +class OpenFileTable +{ +public: + OpenFileTable(MDSRank *m) : mds(m) {} + + void add_inode(CInode *in); + void remove_inode(CInode *in); + void notify_link(CInode *in); + void notify_unlink(CInode *in); + bool is_any_dirty() const { return !dirty_items.empty(); } + + void commit(MDSInternalContextBase *c, uint64_t log_seq, int op_prio); + uint64_t get_committed_log_seq() const { return committed_log_seq; } + uint64_t get_committing_log_seq() const { return committing_log_seq; } + bool is_any_committing() const { return num_pending_commit > 0; } + + void load(MDSInternalContextBase *c); + bool is_loaded() const { return load_done; } + void wait_for_load(MDSInternalContextBase *c) { + assert(!load_done); + waiting_for_load.push_back(c); + } + + bool get_ancestors(inodeno_t ino, vector& ancestors); + +protected: + MDSRank *mds; + + map anchor_map; + + std::map dirty_items; // ino -> dirty state + static const unsigned DIRTY_NEW = 1; + + uint64_t committed_log_seq = 0; + uint64_t committing_log_seq = 0; + + void get_ref(CInode *in); + void put_ref(CInode *in); + + object_t get_object_name() const; + + bool clear_on_commit = false; + unsigned num_pending_commit = 0; + void _commit_finish(int r, uint64_t log_seq, MDSInternalContextBase *fin); + + map loaded_anchor_map; + list waiting_for_load; + bool load_done = false; + + void _load_finish(int op_r, int header_r, int values_r, + bool first, bool more, + bufferlist &header_bl, + std::map &values); + + + friend class C_IO_OFT_Load; + friend class C_IO_OFT_Save; +}; + +#endif From c8f06bd953f959501701dd5dc462ce238cfb4a04 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 20 Jan 2018 08:40:17 +0800 Subject: [PATCH 05/14] mds: use open file table to speed up mds 
recovery For inodes that need to be opened for reconnected/imported caps: first open directory inodes that are in the open file table, then open regular inodes that are in the open file table, and finally open the remaining ones. When opening inodes in the open file table, make the "open inode by ino" function first try the paths recorded in the open file table. Signed-off-by: "Yan, Zheng" --- src/mds/Anchor.h | 5 +- src/mds/MDCache.cc | 47 ++++++++++++++-- src/mds/MDCache.h | 4 ++ src/mds/OpenFileTable.cc | 113 ++++++++++++++++++++++++++++++++++++++- src/mds/OpenFileTable.h | 22 +++++++- 5 files changed, 184 insertions(+), 7 deletions(-) diff --git a/src/mds/Anchor.h b/src/mds/Anchor.h index 7b6344af324..0054a850087 100644 --- a/src/mds/Anchor.h +++ b/src/mds/Anchor.h @@ -32,7 +32,10 @@ public: inodeno_t dirino; std::string d_name; __u8 d_type; - mutable int nref; // how many children + union { + mutable int nref; // how many children + mutable mds_rank_t auth; // auth hint + }; Anchor() : d_type(0), nref(0) {} Anchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp, int nr) : diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index a9cb464ad9d..fd9c580cffe 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -5294,10 +5294,41 @@ void MDCache::rejoin_open_sessions_finish(map client_map rejoin_gather_finish(); } +void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret) +{ + auto p = cap_imports.find(ino); + if (p != cap_imports.end()) { + dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl; + if (ret < 0) { + cap_imports_missing.insert(ino); + } else if (ret != mds->get_nodeid()) { + for (auto q = p->second.begin(); q != p->second.end(); ++q) { + assert(q->second.count(MDS_RANK_NONE)); + assert(q->second.size() == 1); + rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret); + } + cap_imports.erase(p); + } + } +} + bool MDCache::process_imported_caps() { dout(10) << "process_imported_caps" << dendl; + if
(!open_file_table.is_prefetched() && + open_file_table.prefetch_inodes()) { + open_file_table.wait_for_prefetch( + new MDSInternalContextWrapper(mds, + new FunctionContext([this](int r) { + assert(rejoin_gather.count(mds->get_nodeid())); + process_imported_caps(); + }) + ) + ); + return true; + } + for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) { CInode *in = get_inode(p->first); if (in) { @@ -8687,21 +8718,22 @@ void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info) dout(10) << "do_open_ino_peer " << ino << " active " << active << " all " << all << " checked " << info.checked << dendl; + mds_rank_t whoami = mds->get_nodeid(); mds_rank_t peer = MDS_RANK_NONE; - if (info.auth_hint >= 0) { + if (info.auth_hint >= 0 && info.auth_hint != whoami) { if (active.count(info.auth_hint)) { peer = info.auth_hint; info.auth_hint = MDS_RANK_NONE; } } else { for (set::iterator p = active.begin(); p != active.end(); ++p) - if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) { + if (*p != whoami && info.checked.count(*p) == 0) { peer = *p; break; } } if (peer < 0) { - all.erase(mds->get_nodeid()); + all.erase(whoami); if (all != info.checked) { dout(10) << " waiting for more peers to be active" << dendl; } else { @@ -8857,7 +8889,14 @@ void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin, info.tid = ++open_ino_last_tid; info.pool = pool >= 0 ? 
pool : default_file_layout.pool_id; info.waiters.push_back(fin); - do_open_ino(ino, info, 0); + if (mds->is_rejoin() && + open_file_table.get_ancestors(ino, info.ancestors, info.auth_hint)) { + info.fetch_backtrace = false; + info.checking = mds->get_nodeid(); + _open_ino_traverse_dir(ino, info, 0); + } else { + do_open_ino(ino, info, 0); + } } } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 34c92dd96e9..ed138f3078a 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -597,6 +597,9 @@ public: mds_rank_t frommds=MDS_RANK_NONE) { cap_imports[ino][client][frommds] = icr; } + bool rejoin_has_cap_reconnect(inodeno_t ino) const { + return cap_imports.count(ino); + } const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) { if (cap_imports.count(ino) && cap_imports[ino].count(client) && @@ -644,6 +647,7 @@ public: friend class C_MDC_RejoinOpenInoFinish; friend class C_MDC_RejoinSessionsOpened; void rejoin_open_ino_finish(inodeno_t ino, int ret); + void rejoin_prefetch_ino_finish(inodeno_t ino, int ret); void rejoin_open_sessions_finish(map client_map, map& sseqmap); bool process_imported_caps(); diff --git a/src/mds/OpenFileTable.cc b/src/mds/OpenFileTable.cc index b42d5667aeb..54d3bc5b3b6 100644 --- a/src/mds/OpenFileTable.cc +++ b/src/mds/OpenFileTable.cc @@ -356,6 +356,7 @@ void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, bufferlist::iterator p = it.second.begin(); decode(anchor, p); assert(ino == anchor.ino); + anchor.auth = MDS_RANK_NONE; } } catch (buffer::error &e) { derr << __func__ << ": corrupted header/values: " << e.what() << dendl; @@ -406,7 +407,8 @@ void OpenFileTable::load(MDSInternalContextBase *onload) new C_OnFinisher(c, mds->finisher)); } -bool OpenFileTable::get_ancestors(inodeno_t ino, vector& ancestors) +bool OpenFileTable::get_ancestors(inodeno_t ino, vector& ancestors, + mds_rank_t& auth_hint) { auto p = loaded_anchor_map.find(ino); if (p == loaded_anchor_map.end()) @@ -416,6 +418,7 @@ 
bool OpenFileTable::get_ancestors(inodeno_t ino, vector& an if (dirino == inodeno_t(0)) return false; + bool first = true; ancestors.clear(); while (true) { ancestors.push_back(inode_backpointer_t(dirino, p->second.d_name, 0)); @@ -424,9 +427,117 @@ bool OpenFileTable::get_ancestors(inodeno_t ino, vector& an if (p == loaded_anchor_map.end()) break; + if (first) + auth_hint = p->second.auth; + dirino = p->second.dirino; if (dirino == inodeno_t(0)) break; + + first = false; } return true; } + +class C_OFT_OpenInoFinish: public MDSInternalContextBase { + OpenFileTable *oft; + inodeno_t ino; + MDSRank *get_mds() override { return oft->mds; } +public: + C_OFT_OpenInoFinish(OpenFileTable *t, inodeno_t i) : oft(t), ino(i) {} + void finish(int r) override { + oft->_open_ino_finish(ino, r); + } +}; + +void OpenFileTable::_open_ino_finish(inodeno_t ino, int r) +{ + if (prefetch_state == DIR_INODES && r >= 0 && ino != inodeno_t(0)) { + auto p = loaded_anchor_map.find(ino); + assert(p != loaded_anchor_map.end()); + p->second.auth = mds_rank_t(r); + } + + if (r != mds->get_nodeid()) + mds->mdcache->rejoin_prefetch_ino_finish(ino, r); + + num_opening_inodes--; + if (num_opening_inodes == 0) { + if (prefetch_state == DIR_INODES) { + prefetch_state = FILE_INODES; + _prefetch_inodes(); + } else if (prefetch_state == FILE_INODES) { + prefetch_state = DONE; + finish_contexts(g_ceph_context, waiting_for_prefetch); + waiting_for_prefetch.clear(); + } else { + assert(0); + } + } +} + +void OpenFileTable::_prefetch_inodes() +{ + dout(10) << __func__ << " state " << prefetch_state << dendl; + assert(!num_opening_inodes); + num_opening_inodes = 1; + + int64_t pool; + if (prefetch_state == DIR_INODES) + pool = mds->mdsmap->get_metadata_pool(); + else if (prefetch_state == FILE_INODES) + pool = mds->mdsmap->get_first_data_pool(); + else + assert(0); + + MDCache *mdcache = mds->mdcache; + + for (auto& it : loaded_anchor_map) { + if (it.second.d_type == DT_DIR) { + if (prefetch_state != 
DIR_INODES) + continue; + if (MDS_INO_IS_MDSDIR(it.first)) { + it.second.auth = MDS_INO_MDSDIR_OWNER(it.first); + continue; + } + if (MDS_INO_IS_STRAY(it.first)) { + it.second.auth = MDS_INO_STRAY_OWNER(it.first); + continue; + } + } else { + if (prefetch_state != FILE_INODES) + continue; + if (!mdcache->rejoin_has_cap_reconnect(it.first)) + continue; + } + CInode *in = mdcache->get_inode(it.first); + if (in) + continue; + + num_opening_inodes++; + mdcache->open_ino(it.first, pool, new C_OFT_OpenInoFinish(this, it.first), false); + } + + _open_ino_finish(inodeno_t(0), 0); +} + +bool OpenFileTable::prefetch_inodes() +{ + dout(10) << __func__ << dendl; + assert(!prefetch_state); + prefetch_state = DIR_INODES; + + if (!load_done) { + wait_for_load( + new MDSInternalContextWrapper(mds, + new FunctionContext([this](int r) { + _prefetch_inodes(); + }) + ) + ); + return true; + } + + _prefetch_inodes(); + return !is_prefetched(); +} diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h index 5ed1600d8b6..cca7044f0d3 100644 --- a/src/mds/OpenFileTable.h +++ b/src/mds/OpenFileTable.h @@ -46,7 +46,16 @@ public: waiting_for_load.push_back(c); } - bool get_ancestors(inodeno_t ino, vector& ancestors); + bool get_ancestors(inodeno_t ino, vector& ancestors, + mds_rank_t& auth_hint); + + bool prefetch_inodes(); + bool is_prefetched() const { return prefetch_state == DONE; } + void wait_for_prefetch(MDSInternalContextBase *c) { + assert(!is_prefetched()); + waiting_for_prefetch.push_back(c); + } + protected: MDSRank *mds; @@ -77,9 +86,20 @@ protected: bufferlist &header_bl, std::map &values); + enum { + DIR_INODES = 1, + FILE_INODES = 2, + DONE = 3, + }; + unsigned prefetch_state = 0; + unsigned num_opening_inodes = 0; + list waiting_for_prefetch; + void _open_ino_finish(inodeno_t ino, int r); + void _prefetch_inodes(); friend class C_IO_OFT_Load; friend class C_IO_OFT_Save; + friend class C_OFT_OpenInoFinish; }; #endif From 9641adcd063f15b57a88f743e916f7b7161ed591 Mon Sep 
17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 19 Jan 2018 17:10:36 +0800 Subject: [PATCH 06/14] mds: don't re-requeue open files to head of log When the MDS tries to expire old log segments, it re-queues the open files tracked by these log segments to the head of the log. When there are lots of open files, this creates high overhead on the log. The previous patch introduced the open file table. The MDS can get open files' paths from the table and load the corresponding inodes. Signed-off-by: "Yan, Zheng" --- src/mds/CInode.cc | 1 - src/mds/Locker.cc | 14 ++------------ src/mds/MDCache.cc | 6 ++---- src/mds/Migrator.cc | 2 -- src/mds/OpenFileTable.cc | 17 +++++++++++++++-- src/mds/OpenFileTable.h | 1 + src/mds/Server.cc | 10 +--------- src/mds/journal.cc | 28 ++-------------------------- 8 files changed, 23 insertions(+), 56 deletions(-) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 1378e8b440e..017e2863e73 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -2935,7 +2935,6 @@ void CInode::remove_client_cap(client_t client) put(PIN_CAPS); item_caps.remove_myself(); containing_realm = NULL; - item_open_file.remove_myself(); // unpin logsegment mdcache->num_inodes_with_caps--; } diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 118fdee5fb8..d4590a2451d 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2432,7 +2432,6 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock, eo->add_ino(in->ino()); metablob = &eo->metablob; le = eo; - mut->ls->open_files.push_back(&in->item_open_file); } else { EUpdate *eu = new EUpdate(mds->mdlog, "check_inode_max_size"); metablob = &eu->metablob; @@ -2538,27 +2537,18 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq) return; } - if (cap->wanted() == 0) { - if (cur->item_open_file.is_on_list() && - !cur->is_any_caps_wanted()) { - dout(10) << " removing unwanted file from open file list " << *cur << dendl; - cur->item_open_file.remove_myself(); - } - } else { + if (cap->wanted()) { if
(cur->state_test(CInode::STATE_RECOVERING) && (cap->wanted() & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) { mds->mdcache->recovery_queue.prioritize(cur); } - if (!cur->item_open_file.is_on_list()) { - dout(10) << " adding to open file list " << *cur << dendl; + if (mdcache->open_file_table.should_log_open(cur)) { assert(cur->last == CEPH_NOSNAP); - LogSegment *ls = mds->mdlog->get_current_segment(); EOpen *le = new EOpen(mds->mdlog); mds->mdlog->start_entry(le); le->add_clean_inode(cur); - ls->open_files.push_back(&cur->item_open_file); mds->mdlog->submit_entry(le); } } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index fd9c580cffe..590fab102f9 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -5634,10 +5634,8 @@ void MDCache::clean_open_file_lists() CInode *in = *q; ++q; if (in->last == CEPH_NOSNAP) { - if (!in->is_any_caps_wanted()) { - dout(10) << " unlisting unwanted/capless inode " << *in << dendl; - in->item_open_file.remove_myself(); - } + dout(10) << " unlisting unwanted/capless inode " << *in << dendl; + in->item_open_file.remove_myself(); } else { if (in->client_snap_caps.empty()) { dout(10) << " unlisting flushed snap inode " << *in << dendl; diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 4317893d62d..dc25b5e0e2e 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1553,8 +1553,6 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, mds_rank_t peer, if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) in->clear_scatter_dirty(); - in->item_open_file.remove_myself(); - in->clear_dirty_parent(); in->clear_file_locks(); diff --git a/src/mds/OpenFileTable.cc b/src/mds/OpenFileTable.cc index 54d3bc5b3b6..e49ce887132 100644 --- a/src/mds/OpenFileTable.cc +++ b/src/mds/OpenFileTable.cc @@ -507,8 +507,7 @@ void OpenFileTable::_prefetch_inodes() } else { if (prefetch_state != FILE_INODES) continue; - if (!mdcache->rejoin_has_cap_reconnect(it.first)) - continue; + // load all file inodes for 
MDCache::identify_files_to_recover() } CInode *in = mdcache->get_inode(it.first); if (in) @@ -541,3 +540,17 @@ bool OpenFileTable::prefetch_inodes() _prefetch_inodes(); return !is_prefetched(); } + +bool OpenFileTable::should_log_open(CInode *in) +{ + if (in->state_test(CInode::STATE_TRACKEDBYOFT)) { + // inode just journaled + if (in->last_journaled >= committing_log_seq) + return false; + // item not dirty. it means the item has already been saved + auto p = dirty_items.find(in->ino()); + if (p == dirty_items.end()) + return false; + } + return true; +} diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h index cca7044f0d3..6f77f4bf362 100644 --- a/src/mds/OpenFileTable.h +++ b/src/mds/OpenFileTable.h @@ -56,6 +56,7 @@ public: waiting_for_prefetch.push_back(c); } + bool should_log_open(CInode *in); protected: MDSRank *mds; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 70f7bd0ad30..b78c04ec71c 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -3417,12 +3417,10 @@ void Server::handle_client_open(MDRequestRef& mdr) // make sure this inode gets into the journal if (cur->is_auth() && cur->last == CEPH_NOSNAP && - !cur->item_open_file.is_on_list()) { - LogSegment *ls = mds->mdlog->get_current_segment(); + mdcache->open_file_table.should_log_open(cur)) { EOpen *le = new EOpen(mds->mdlog); mdlog->start_entry(le); le->add_clean_inode(cur); - ls->open_files.push_back(&cur->item_open_file); mdlog->submit_entry(le); } @@ -3637,8 +3635,6 @@ void Server::handle_client_openc(MDRequestRef& mdr) // make sure this inode gets into the journal le->metablob.add_opened_ino(in->ino()); - LogSegment *ls = mds->mdlog->get_current_segment(); - ls->open_files.push_back(&in->item_open_file); C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in, follows); @@ -4273,8 +4269,6 @@ void Server::do_open_truncate(MDRequestRef& mdr, int cmode) // make sure ino gets into the journal le->metablob.add_opened_ino(in->ino()); - LogSegment *ls = 
mds->mdlog->get_current_segment(); - ls->open_files.push_back(&in->item_open_file); mdr->o_trunc = true; @@ -5262,8 +5256,6 @@ void Server::handle_client_mkdir(MDRequestRef& mdr) // make sure this inode gets into the journal le->metablob.add_opened_ino(newi->ino()); - LogSegment *ls = mds->mdlog->get_current_segment(); - ls->open_files.push_back(&newi->item_open_file); journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi)); } diff --git a/src/mds/journal.cc b/src/mds/journal.cc index ed796b9b071..46ed982427d 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -155,22 +155,7 @@ void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int o while (!p.end()) { CInode *in = *p; ++p; - if (in->last == CEPH_NOSNAP && in->is_auth() && - !in->is_ambiguous_auth() && in->is_any_caps()) { - if (in->is_any_caps_wanted()) { - dout(20) << "try_to_expire requeueing open file " << *in << dendl; - if (!le) { - le = new EOpen(mds->mdlog); - mds->mdlog->start_entry(le); - } - le->add_clean_inode(in); - ls->open_files.push_back(&in->item_open_file); - } else { - // drop inodes that aren't wanted - dout(20) << "try_to_expire not requeueing and delisting unwanted file " << *in << dendl; - in->item_open_file.remove_myself(); - } - } else if (in->last != CEPH_NOSNAP && !in->client_snap_caps.empty()) { + if (in->last != CEPH_NOSNAP && in->is_auth() && !in->client_snap_caps.empty()) { // journal snap inodes that need flush. This simplify the mds failover hanlding dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl; if (!le) { @@ -180,16 +165,7 @@ void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int o le->add_clean_inode(in); ls->open_files.push_back(&in->item_open_file); } else { - /* - * we can get a capless inode here if we replay an open file, the client fails to - * reconnect it, but does REPLAY an open request (that adds it to the logseg). 
AFAICS - * it's ok for the client to replay an open on a file it doesn't have in it's cache - * anymore. - * - * this makes the mds less sensitive to strict open_file consistency, although it does - * make it easier to miss subtle problems. - */ - dout(20) << "try_to_expire not requeueing and delisting capless file " << *in << dendl; + // open files are tracked by open file table, no need to journal them again in->item_open_file.remove_myself(); } } From 2154c2bca256d7885d4d36db23aec10ff5b6b96f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 20 Jan 2018 07:07:57 +0800 Subject: [PATCH 07/14] mds: don't try opening inodes that haven't been created Signed-off-by: Yan, Zheng --- src/mds/MDCache.h | 3 +++ src/mds/Server.cc | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index ed138f3078a..f271592384c 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -600,6 +600,9 @@ public: bool rejoin_has_cap_reconnect(inodeno_t ino) const { return cap_imports.count(ino); } + void add_replay_ino_alloc(inodeno_t ino) { + cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin + } const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) { if (cap_imports.count(ino) && cap_imports[ino].count(client) && diff --git a/src/mds/Server.cc b/src/mds/Server.cc index b78c04ec71c..8f365378ac3 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -214,6 +214,10 @@ void Server::dispatch(Message *m) if (req->is_replay()) { dout(3) << "queuing replayed op" << dendl; queue_replay = true; + if (req->head.ino && + !session->have_completed_request(req->get_reqid().tid, nullptr)) { + mdcache->add_replay_ino_alloc(inodeno_t(req->head.ino)); + } } else if (req->get_retry_attempt()) { // process completed request in clientreplay stage. The completed request // might have created new file/directorie. 
This guarantees MDS sends a reply From 50e2d2d3db25239133f3755e82c6cd5dce9ef58d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 26 Jan 2018 15:02:04 +0800 Subject: [PATCH 08/14] mds: don't try prefetching destroyed inodes Signed-off-by: Yan, Zheng --- src/mds/MDLog.cc | 3 +++ src/mds/OpenFileTable.cc | 25 +++++++++++++++++++++++++ src/mds/OpenFileTable.h | 6 ++++++ src/mds/journal.cc | 29 ++++++++++++++++------------- 4 files changed, 50 insertions(+), 13 deletions(-) diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 4dc3e9fe134..3ddc26522c6 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -1468,6 +1468,9 @@ void MDLog::standby_trim_segments() dout(10) << "standby_trim_segments" << dendl; uint64_t expire_pos = journaler->get_expire_pos(); dout(10) << " expire_pos=" << expire_pos << dendl; + + mds->mdcache->open_file_table.trim_destroyed_inos(expire_pos); + bool removed_segment = false; while (have_any_segments()) { LogSegment *seg = get_oldest_segment(); diff --git a/src/mds/OpenFileTable.cc b/src/mds/OpenFileTable.cc index e49ce887132..7ae14c41c92 100644 --- a/src/mds/OpenFileTable.cc +++ b/src/mds/OpenFileTable.cc @@ -468,6 +468,8 @@ void OpenFileTable::_open_ino_finish(inodeno_t ino, int r) _prefetch_inodes(); } else if (prefetch_state == FILE_INODES) { prefetch_state = DONE; + logseg_destroyed_inos.clear(); + destroyed_inos_set.clear(); finish_contexts(g_ceph_context, waiting_for_prefetch); waiting_for_prefetch.clear(); } else { @@ -492,7 +494,14 @@ void OpenFileTable::_prefetch_inodes() MDCache *mdcache = mds->mdcache; + if (destroyed_inos_set.empty()) { + for (auto& it : logseg_destroyed_inos) + destroyed_inos_set.insert(it.second.begin(), it.second.end()); + } + for (auto& it : loaded_anchor_map) { + if (destroyed_inos_set.count(it.first)) + continue; if (it.second.d_type == DT_DIR) { if (prefetch_state != DIR_INODES) continue; @@ -554,3 +563,19 @@ bool OpenFileTable::should_log_open(CInode *in) } return true; } + +void 
OpenFileTable::note_destroyed_inos(uint64_t seq, const vector& inos) +{ + auto& vec = logseg_destroyed_inos[seq]; + vec.insert(vec.end(), inos.begin(), inos.end()); +} + +void OpenFileTable::trim_destroyed_inos(uint64_t seq) +{ + auto p = logseg_destroyed_inos.begin(); + while (p != logseg_destroyed_inos.end()) { + if (p->first >= seq) + break; + logseg_destroyed_inos.erase(p++); + } +} diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h index 6f77f4bf362..0a51f0fff2a 100644 --- a/src/mds/OpenFileTable.h +++ b/src/mds/OpenFileTable.h @@ -58,6 +58,9 @@ public: bool should_log_open(CInode *in); + void note_destroyed_inos(uint64_t seq, const vector& inos); + void trim_destroyed_inos(uint64_t seq); + protected: MDSRank *mds; @@ -98,6 +101,9 @@ protected: void _open_ino_finish(inodeno_t ino, int r); void _prefetch_inodes(); + std::map > logseg_destroyed_inos; + std::set destroyed_inos_set; + friend class C_IO_OFT_Load; friend class C_IO_OFT_Save; friend class C_OFT_OpenInoFinish; diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 46ed982427d..f90dfb92bc6 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -1595,21 +1595,24 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) } // destroyed inodes - for (vector::iterator p = destroyed_inodes.begin(); - p != destroyed_inodes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(*p); - if (in) { - dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl; - CDentry *parent = in->get_parent_dn(); - mds->mdcache->remove_inode(in); - if (parent) { - dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl; - assert(parent->get_linkage()->is_null()); + if (!destroyed_inodes.empty()) { + for (vector::iterator p = destroyed_inodes.begin(); + p != destroyed_inodes.end(); + ++p) { + CInode *in = mds->mdcache->get_inode(*p); + if (in) { + dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl; + CDentry 
*parent = in->get_parent_dn(); + mds->mdcache->remove_inode(in); + if (parent) { + dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl; + assert(parent->get_linkage()->is_null()); + } + } else { + dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl; } - } else { - dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl; } + mds->mdcache->open_file_table.note_destroyed_inos(logseg->seq, destroyed_inodes); } // client requests From 242ed5bba8c160677adb2edf80d8df84bf37c1c3 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 26 Jan 2018 09:18:30 +0800 Subject: [PATCH 09/14] mds: add dirfrags whose child inodes have caps to open file table When opening a file in RO mode, if client has the corresponding inode in its cache and issued caps of the inode already include caps that RO file needs, client does not immediately tell mds what caps it wants. From mds' point of view, the inode just has client caps. Open file table should handle above scenario. But adding all inodes with caps to the open file table can be expensive. So we only add dirfrags whose child inodes have caps to the open file table. MDS will fetch these dirfrags during recovery. Hopefully, most RO opened inodes will be populated into the cache. 
Signed-off-by: Yan, Zheng --- src/mds/CDir.cc | 20 +++++++ src/mds/CDir.h | 13 ++++- src/mds/CInode.cc | 7 ++- src/mds/OpenFileTable.cc | 123 +++++++++++++++++++++++++++++++++++++-- src/mds/OpenFileTable.h | 10 +++- 5 files changed, 162 insertions(+), 11 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 5fffead64ab..ce8eb31cca7 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -289,6 +289,18 @@ bool CDir::check_rstats(bool scrub) return good; } +void CDir::adjust_num_inodes_with_caps(int d) +{ + // FIXME: smarter way to decide if adding 'this' to open file table + if (num_inodes_with_caps == 0 && d > 0) + cache->open_file_table.add_dirfrag(this); + else if (num_inodes_with_caps > 0 && num_inodes_with_caps == -d) + cache->open_file_table.remove_dirfrag(this); + + num_inodes_with_caps += d; + assert(num_inodes_with_caps >= 0); +} + CDentry *CDir::lookup(std::string_view name, snapid_t snap) { dout(20) << "lookup (" << snap << ", '" << name << "')" << dendl; @@ -569,6 +581,8 @@ void CDir::link_inode_work( CDentry *dn, CInode *in) if (in->state_test(CInode::STATE_TRACKEDBYOFT)) inode->mdcache->open_file_table.notify_link(in); + if (in->is_any_caps()) + adjust_num_inodes_with_caps(1); // adjust auth pin count if (in->auth_pins + in->nested_auth_pins) @@ -648,6 +662,8 @@ void CDir::unlink_inode_work( CDentry *dn ) if (in->state_test(CInode::STATE_TRACKEDBYOFT)) inode->mdcache->open_file_table.notify_unlink(in); + if (in->is_any_caps()) + adjust_num_inodes_with_caps(-1); // unlink auth_pin count if (in->auth_pins + in->nested_auth_pins) @@ -840,6 +856,9 @@ void CDir::steal_dentry(CDentry *dn) if (pi->accounted_rstat.rctime > fnode.rstat.rctime) fnode.rstat.rctime = pi->accounted_rstat.rctime; + if (in->is_any_caps()) + adjust_num_inodes_with_caps(1); + // move dirty inode rstat to new dirfrag if (in->is_dirty_rstat()) dirty_rstat_inodes.push_back(&in->dirty_rstat_item); @@ -921,6 +940,7 @@ void CDir::finish_old_fragment(list& waiters, bool repl 
num_head_items = num_head_null = 0; num_snap_items = num_snap_null = 0; + adjust_num_inodes_with_caps(-num_inodes_with_caps); // this mirrors init_fragment_pins() if (is_auth()) diff --git a/src/mds/CDir.h b/src/mds/CDir.h index a81053f6951..e8877a81351 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -91,7 +91,8 @@ public: static const unsigned STATE_ASSIMRSTAT = (1<<15); // assimilating inode->frag rstats static const unsigned STATE_DIRTYDFT = (1<<16); // dirty dirfragtree static const unsigned STATE_BADFRAG = (1<<17); // bad dirfrag - static const unsigned STATE_AUXSUBTREE = (1<<18); // no subtree merge + static const unsigned STATE_TRACKEDBYOFT = (1<<18); // tracked by open file table + static const unsigned STATE_AUXSUBTREE = (1<<19); // no subtree merge // common states static const unsigned STATE_CLEAN = 0; @@ -106,14 +107,16 @@ public: STATE_IMPORTBOUND | STATE_EXPORTBOUND | STATE_FROZENTREE | - STATE_STICKY); + STATE_STICKY | + STATE_TRACKEDBYOFT); static const unsigned MASK_STATE_EXPORT_KEPT = (STATE_EXPORTING | STATE_IMPORTBOUND | STATE_EXPORTBOUND | STATE_FROZENTREE | STATE_FROZENDIR | - STATE_STICKY); + STATE_STICKY | + STATE_TRACKEDBYOFT); static const unsigned MASK_STATE_FRAGMENT_KEPT = (STATE_DIRTY | STATE_EXPORTBOUND | @@ -337,6 +340,8 @@ protected: int num_dirty; + int num_inodes_with_caps = 0; + // state version_t committing_version; version_t committed_version; @@ -429,6 +434,8 @@ protected: return num_dirty; } + void adjust_num_inodes_with_caps(int d); + int64_t get_frag_size() const { return get_projected_fnode()->fragstat.size(); } diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 017e2863e73..27a46093084 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -2892,10 +2892,11 @@ Capability *CInode::add_client_cap(client_t client, Session *session, SnapRealm containing_realm = find_snaprealm(); containing_realm->inodes_with_caps.push_back(&item_caps); dout(10) << __func__ << " first cap, joining realm " << *containing_realm << 
dendl; - } - if (client_caps.empty()) mdcache->num_inodes_with_caps++; + if (parent) + parent->dir->adjust_num_inodes_with_caps(1); + } Capability *cap = new Capability(this, ++mdcache->last_cap_id, client); assert(client_caps.count(client) == 0); @@ -2936,6 +2937,8 @@ void CInode::remove_client_cap(client_t client) item_caps.remove_myself(); containing_realm = NULL; mdcache->num_inodes_with_caps--; + if (parent) + parent->dir->adjust_num_inodes_with_caps(-1); } //clean up advisory locks diff --git a/src/mds/OpenFileTable.cc b/src/mds/OpenFileTable.cc index 7ae14c41c92..10c6b2c566c 100644 --- a/src/mds/OpenFileTable.cc +++ b/src/mds/OpenFileTable.cc @@ -13,6 +13,7 @@ */ #include "mds/CInode.h" +#include "mds/CDir.h" #include "mds/MDSRank.h" #include "mds/MDCache.h" #include "osdc/Objecter.h" @@ -110,6 +111,29 @@ void OpenFileTable::remove_inode(CInode *in) put_ref(in); } +void OpenFileTable::add_dirfrag(CDir *dir) +{ + dout(10) << __func__ << " " << *dir << dendl; + assert(!dir->state_test(CDir::STATE_TRACKEDBYOFT)); + dir->state_set(CDir::STATE_TRACKEDBYOFT); + auto ret = dirfrags.insert(dir->dirfrag()); + assert(ret.second); + get_ref(dir->get_inode()); + dirty_items.emplace(dir->ino(), 0); +} + +void OpenFileTable::remove_dirfrag(CDir *dir) +{ + dout(10) << __func__ << " " << *dir << dendl; + assert(dir->state_test(CDir::STATE_TRACKEDBYOFT)); + dir->state_clear(CDir::STATE_TRACKEDBYOFT); + auto p = dirfrags.find(dir->dirfrag()); + assert(p != dirfrags.end()); + dirfrags.erase(p); + dirty_items.emplace(dir->ino(), 0); + put_ref(dir->get_inode()); +} + void OpenFileTable::notify_link(CInode *in) { dout(10) << __func__ << " " << *in << dendl; @@ -246,11 +270,31 @@ void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_p using ceph::encode; for (auto& it : dirty_items) { + list fgls; auto p = anchor_map.find(it.first); + if (p != anchor_map.end()) { + for (auto q = dirfrags.lower_bound(dirfrag_t(it.first, 0)); + q != dirfrags.end() && q->ino 
== it.first; + ++q) + fgls.push_back(q->frag); + } + if (first_commit) { auto q = loaded_anchor_map.find(it.first); if (q != loaded_anchor_map.end()) { bool same = (p != anchor_map.end() && p->second == q->second); + if (same) { + auto r = loaded_dirfrags.lower_bound(dirfrag_t(it.first, 0)); + for (auto fg : fgls) { + if (r == loaded_dirfrags.end() || !(*r == dirfrag_t(it.first, fg))) { + same = false; + break; + } + ++r; + } + if (same && r != loaded_dirfrags.end() && r->ino == it.first) + same = false; + } loaded_anchor_map.erase(q); if (same) continue; @@ -264,9 +308,9 @@ void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_p if (p != anchor_map.end()) { bufferlist bl; encode(p->second, bl); + encode(fgls, bl); write_size += bl.length() + sizeof(__u32); - to_update[key].swap(bl); } else { to_remove.emplace(key); @@ -290,6 +334,7 @@ void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_p } } loaded_anchor_map.clear(); + loaded_dirfrags.clear(); } commit_func(true); @@ -325,8 +370,10 @@ void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, if (op_r < 0) { derr << __func__ << " got " << cpp_strerror(op_r) << dendl; clear_on_commit = true; - if (!first) + if (!first) { loaded_anchor_map.clear(); + loaded_dirfrags.clear(); + } goto out; } @@ -357,11 +404,17 @@ void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, decode(anchor, p); assert(ino == anchor.ino); anchor.auth = MDS_RANK_NONE; + + list fgls; + decode(fgls, p); + for (auto fg : fgls) + loaded_dirfrags.insert(loaded_dirfrags.end(), dirfrag_t(anchor.ino, fg)); } } catch (buffer::error &e) { derr << __func__ << ": corrupted header/values: " << e.what() << dendl; clear_on_commit = true; loaded_anchor_map.clear(); + loaded_dirfrags.clear(); goto out; } @@ -464,8 +517,8 @@ void OpenFileTable::_open_ino_finish(inodeno_t ino, int r) num_opening_inodes--; if (num_opening_inodes == 0) { if (prefetch_state == DIR_INODES) { - 
prefetch_state = FILE_INODES; - _prefetch_inodes(); + prefetch_state = DIRFRAGS; + _prefetch_dirfrags(); } else if (prefetch_state == FILE_INODES) { prefetch_state = DONE; logseg_destroyed_inos.clear(); @@ -478,6 +531,68 @@ void OpenFileTable::_open_ino_finish(inodeno_t ino, int r) } } +void OpenFileTable::_prefetch_dirfrags() +{ + dout(10) << __func__ << dendl; + assert(prefetch_state == DIRFRAGS); + + MDCache *mdcache = mds->mdcache; + list fetch_queue; + + CInode *last_in = nullptr; + for (auto df : loaded_dirfrags) { + CInode *diri; + if (last_in && last_in->ino() == df.ino) { + diri = last_in; + } else { + diri = mdcache->get_inode(df.ino); + if (!diri) + continue; + last_in = diri; + } + if (diri->state_test(CInode::STATE_REJOINUNDEF)) + continue; + + CDir *dir = diri->get_dirfrag(df.frag); + if (dir) { + if (dir->is_auth() && !dir->is_complete()) + fetch_queue.push_back(dir); + } else { + list fgls; + diri->dirfragtree.get_leaves_under(df.frag, fgls); + for (auto fg : fgls) { + if (diri->is_auth()) { + dir = diri->get_or_open_dirfrag(mdcache, fg); + } else { + dir = diri->get_dirfrag(fg); + } + if (dir && dir->is_auth() && !dir->is_complete()) + fetch_queue.push_back(dir); + } + } + } + + MDSGatherBuilder gather(g_ceph_context); + for (auto dir : fetch_queue) { + if (dir->state_test(CDir::STATE_REJOINUNDEF)) + assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag())); + dir->fetch(gather.new_sub()); + } + + auto finish_func = [this](int r) { + prefetch_state = FILE_INODES; + _prefetch_inodes(); + }; + if (gather.has_subs()) { + gather.set_finisher( + new MDSInternalContextWrapper(mds, + new FunctionContext(finish_func))); + gather.activate(); + } else { + finish_func(0); + } +} + void OpenFileTable::_prefetch_inodes() { dout(10) << __func__ << " state " << prefetch_state << dendl; diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h index 0a51f0fff2a..243caefa2ea 100644 --- a/src/mds/OpenFileTable.h +++ b/src/mds/OpenFileTable.h @@ -30,6 +30,8 
@@ public: void add_inode(CInode *in); void remove_inode(CInode *in); + void add_dirfrag(CDir *dir); + void remove_dirfrag(CDir *dir); void notify_link(CInode *in); void notify_unlink(CInode *in); bool is_any_dirty() const { return !dirty_items.empty(); } @@ -65,6 +67,7 @@ protected: MDSRank *mds; map anchor_map; + set dirfrags; std::map dirty_items; // ino -> dirty state static const unsigned DIRTY_NEW = 1; @@ -82,6 +85,7 @@ protected: void _commit_finish(int r, uint64_t log_seq, MDSInternalContextBase *fin); map loaded_anchor_map; + set loaded_dirfrags; list waiting_for_load; bool load_done = false; @@ -92,14 +96,16 @@ protected: enum { DIR_INODES = 1, - FILE_INODES = 2, - DONE = 3, + DIRFRAGS = 2, + FILE_INODES = 3, + DONE = 4, }; unsigned prefetch_state = 0; unsigned num_opening_inodes = 0; list waiting_for_prefetch; void _open_ino_finish(inodeno_t ino, int r); void _prefetch_inodes(); + void _prefetch_dirfrags(); std::map > logseg_destroyed_inos; std::set destroyed_inos_set; From f5f137c8590a9c591e86d3d5c6d3a02aa7be66ed Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 27 Jan 2018 13:29:17 +0800 Subject: [PATCH 10/14] mds: protect open file table against partial omap update When omap update is too large for single osd request, mds first stores the update in special journal keys in the omap, then sends requests to update the original omap. 
The first request of the second phase also updates a flag in omap header, indicating that update is journaled. The last request of the second phase removes journal keys and removes the journal flag in omap header. Signed-off-by: Yan, Zheng --- src/mds/OpenFileTable.cc | 292 ++++++++++++++++++++++++++++++++------- src/mds/OpenFileTable.h | 20 +++ 2 files changed, 264 insertions(+), 48 deletions(-) diff --git a/src/mds/OpenFileTable.cc b/src/mds/OpenFileTable.cc index 10c6b2c566c..9dc262b5653 100644 --- a/src/mds/OpenFileTable.cc +++ b/src/mds/OpenFileTable.cc @@ -179,6 +179,12 @@ object_t OpenFileTable::get_object_name() const return object_t(s); } +void OpenFileTable::_encode_header(bufferlist &bl) +{ + encode(omap_version, bl); + encode((__u8)journal_state, bl); +} + class C_IO_OFT_Save : public MDSIOContextBase { protected: OpenFileTable *oft; @@ -213,11 +219,11 @@ void OpenFileTable::_commit_finish(int r, uint64_t log_seq, MDSInternalContextBa void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_prio) { dout(10) << __func__ << " log_seq " << log_seq << dendl; - const unsigned max_write_size = mds->mdcache->max_dir_commit_size; - assert(log_seq >= committing_log_seq); committing_log_seq = log_seq; + omap_version++; + C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c), mds->finisher)); @@ -226,12 +232,14 @@ void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_p object_t oid = get_object_name(); object_locator_t oloc(mds->mdsmap->get_metadata_pool()); - bool first = true; + const unsigned max_write_size = mds->mdcache->max_dir_commit_size; unsigned write_size = 0; - std::map to_update; - std::set to_remove; + unsigned journal_idx = 0; + std::set journal_keys; + std::map to_update, journaled_update; + std::set to_remove, journaled_remove; - auto commit_func = [&](bool last) { + auto journal_func = [&]() { ObjectOperation op; op.priority = op_prio; @@ -241,14 +249,48 @@ void
OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_p clear_on_commit = false; } - if (last) { + if (journal_idx == 0) { + assert(journal_state == JOURNAL_NONE); + journal_state = JOURNAL_START; bufferlist header; - encode(log_seq, header); + _encode_header(header); op.omap_set_header(header); - } else if (first) { - // make incomplete + } + + bufferlist bl; + encode(omap_version, bl); + encode(to_update, bl); + encode(to_remove, bl); + + char key[32]; + snprintf(key, sizeof(key), "_journal.%x", journal_idx++); + journal_keys.insert(key); + std::map tmp_map; + tmp_map[key].swap(bl); + op.omap_set(tmp_map); + + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), 0, + gather.new_sub()); + + journaled_update.merge(to_update); + journaled_remove.merge(to_remove); + to_update.clear(); + to_remove.clear(); + }; + + auto commit_func = [&](bool update_header) { + ObjectOperation op; + op.priority = op_prio; + + if (clear_on_commit) { + op.omap_clear(); + op.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK); + clear_on_commit = false; + } + + if (update_header) { bufferlist header; - encode((uint64_t)0, header); + _encode_header(header); op.omap_set_header(header); } @@ -260,8 +302,6 @@ void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_p mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), 0, gather.new_sub()); - first = false; - write_size = 0; to_update.clear(); to_remove.clear(); }; @@ -317,9 +357,11 @@ void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_p } if (write_size >= max_write_size) { - commit_func(false); + journal_func(); + write_size = 0; } } + dirty_items.clear(); if (first_commit) { @@ -330,14 +372,53 @@ void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_p to_remove.emplace(key); if (write_size >= max_write_size) { - commit_func(false); + journal_func(); + write_size = 0; } } loaded_anchor_map.clear(); loaded_dirfrags.clear(); 
} - commit_func(true); + if (journal_idx > 0 && (!to_update.empty() || !to_remove.empty())) { + journal_func(); + write_size = 0; + } + + if (journaled_update.empty() && journaled_remove.empty()) { + assert(journal_state == JOURNAL_NONE); + commit_func(true); + } else { + assert(to_update.empty() && to_remove.empty()); + assert(journal_state == JOURNAL_START); + journal_state = JOURNAL_FINISH; + bool first = true; + for (auto& it : journaled_update) { + write_size += it.first.length() + sizeof(__u32); + write_size += it.second.length() + sizeof(__u32); + to_update[it.first].swap(it.second); + + if (write_size >= max_write_size) { + commit_func(first); + first = false; + write_size = 0; + } + } + for (auto& key : journaled_remove) { + write_size += key.length() + sizeof(__u32); + to_remove.emplace(key); + + if (write_size >= max_write_size) { + commit_func(first); + first = false; + write_size = 0; + } + } + + journal_state = JOURNAL_NONE; + to_remove.insert(journal_keys.begin(), journal_keys.end()); + commit_func(true); + } num_pending_commit++; gather.activate(); @@ -362,59 +443,88 @@ public: } }; +class C_IO_OFT_Recover : public MDSIOContextBase { +protected: + OpenFileTable *oft; + MDSRank *get_mds() override { return oft->mds; } +public: + C_IO_OFT_Recover(OpenFileTable *t) : oft(t) {} + void finish(int r) { + oft->_recover_finish(r); + } +}; + +void OpenFileTable::_recover_finish(int r) +{ + if (r < 0) { + derr << __func__ << " got " << cpp_strerror(r) << dendl; + _reset_states(); + } else { + dout(10) << __func__ << ": load complete" << dendl; + } + + load_done = true; + finish_contexts(g_ceph_context, waiting_for_load); + waiting_for_load.clear(); +} + void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, bool first, bool more, bufferlist &header_bl, std::map &values) { + using ceph::decode; + int err = -EINVAL; + + auto decode_func = [this](inodeno_t ino, bufferlist &bl) { + bufferlist::iterator p = bl.begin(); + + auto it = 
loaded_anchor_map.emplace_hint(loaded_anchor_map.end(), + std::piecewise_construct, + std::make_tuple(ino), + std::make_tuple()); + Anchor& anchor = it->second; + decode(anchor, p); + assert(ino == anchor.ino); + anchor.auth = MDS_RANK_NONE; + + list fgls; + decode(fgls, p); + for (auto fg : fgls) + loaded_dirfrags.insert(loaded_dirfrags.end(), dirfrag_t(anchor.ino, fg)); + }; + if (op_r < 0) { derr << __func__ << " got " << cpp_strerror(op_r) << dendl; - clear_on_commit = true; - if (!first) { - loaded_anchor_map.clear(); - loaded_dirfrags.clear(); - } + err = op_r; goto out; } try { - using ceph::decode; if (first) { bufferlist::iterator p = header_bl.begin(); - uint64_t log_seq; - decode(log_seq, p); - committed_log_seq = committing_log_seq = log_seq; - if (log_seq == 0) { - dout(1) << __func__ << ": incomplete values" << dendl; - goto out; - } + decode(omap_version, p); + __u8 tmp_state; + decode(tmp_state, p); + journal_state = tmp_state; } for (auto& it : values) { + if (it.first.compare(0, 9, "_journal.") == 0) { + if (journal_state == JOURNAL_FINISH) { + loaded_journal[it.first].swap(it.second); + } else { // incomplete journal + loaded_journal[it.first].length(); + } + continue; + } + inodeno_t ino; sscanf(it.first.c_str(), "%llx", (unsigned long long*)&ino.val); - - auto r = loaded_anchor_map.emplace_hint(loaded_anchor_map.end(), - std::piecewise_construct, - std::make_tuple(ino), - std::make_tuple()); - Anchor& anchor = r->second; - - bufferlist::iterator p = it.second.begin(); - decode(anchor, p); - assert(ino == anchor.ino); - anchor.auth = MDS_RANK_NONE; - - list fgls; - decode(fgls, p); - for (auto fg : fgls) - loaded_dirfrags.insert(loaded_dirfrags.end(), dirfrag_t(anchor.ino, fg)); + decode_func(ino, it.second); } } catch (buffer::error &e) { derr << __func__ << ": corrupted header/values: " << e.what() << dendl; - clear_on_commit = true; - loaded_anchor_map.clear(); - loaded_dirfrags.clear(); goto out; } @@ -433,8 +543,94 @@ void 
OpenFileTable::_load_finish(int op_r, int header_r, int values_r, return; } + // replay journal + if (loaded_journal.empty()) { + assert(journal_state == JOURNAL_NONE); + } else { + dout(10) << __func__ << ": recover journal" << dendl; + std::vector op_vec; + try { + for (auto& it : loaded_journal) { + if (journal_state != JOURNAL_FINISH) + continue; + bufferlist::iterator p = it.second.begin(); + version_t version; + std::map to_update; + std::set to_remove; + decode(version, p); + if (version != omap_version) + continue; + decode(to_update, p); + decode(to_remove, p); + it.second.clear(); + + for (auto& q : to_update) { + inodeno_t ino; + sscanf(q.first.c_str(), "%llx", (unsigned long long*)&ino.val); + decode_func(ino, q.second); + } + for (auto& q : to_remove) { + inodeno_t ino; + sscanf(q.c_str(), "%llx",(unsigned long long*)&ino.val); + assert(ino.val > 0); + loaded_anchor_map.erase(ino); + auto r = loaded_dirfrags.lower_bound(dirfrag_t(ino, 0)); + while (r != loaded_dirfrags.end() && r->ino == ino) + loaded_dirfrags.erase(r++); + } + + op_vec.resize(op_vec.size() + 1); + ObjectOperation& op = op_vec.back(); + op.priority = CEPH_MSG_PRIO_HIGH; + if (!to_update.empty()) + op.omap_set(to_update); + if (!to_remove.empty()) + op.omap_rm_keys(to_remove); + } + } catch (buffer::error &e) { + derr << __func__ << ": corrupted journal: " << e.what() << dendl; + goto out; + } + + journal_state = JOURNAL_NONE; + op_vec.resize(op_vec.size() + 1); + ObjectOperation& op = op_vec.back(); + { + bufferlist header; + _encode_header(header); + op.omap_set_header(header); + } + { + // remove journal + std::set to_remove; + for (auto &it : loaded_journal) + to_remove.emplace(it.first); + op.omap_rm_keys(to_remove); + } + loaded_journal.clear(); + + C_GatherBuilder gather(g_ceph_context, + new C_OnFinisher(new C_IO_OFT_Recover(this), + mds->finisher)); + object_t oid = get_object_name(); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + SnapContext snapc; + + for (auto& 
op : op_vec) { + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), + 0, gather.new_sub()); + } + gather.activate(); + return; + } + + err = 0; dout(10) << __func__ << ": load complete" << dendl; out: + + if (err < 0) + _reset_states(); + load_done = true; finish_contexts(g_ceph_context, waiting_for_load); waiting_for_load.clear(); diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h index 243caefa2ea..38e12b38280 100644 --- a/src/mds/OpenFileTable.h +++ b/src/mds/OpenFileTable.h @@ -66,6 +66,8 @@ public: protected: MDSRank *mds; + version_t omap_version = 0; + map anchor_map; set dirfrags; @@ -80,19 +82,36 @@ protected: object_t get_object_name() const; + enum { + JOURNAL_NONE = 0, + JOURNAL_START = 1, + JOURNAL_FINISH = 2, + }; + int journal_state = 0; + bool clear_on_commit = false; unsigned num_pending_commit = 0; + void _encode_header(bufferlist& bl); void _commit_finish(int r, uint64_t log_seq, MDSInternalContextBase *fin); + std::map loaded_journal; map loaded_anchor_map; set loaded_dirfrags; list waiting_for_load; bool load_done = false; + void _reset_states() { + journal_state = JOURNAL_NONE; + clear_on_commit = true; + loaded_journal.clear(); + loaded_anchor_map.clear(); + loaded_dirfrags.clear(); + } void _load_finish(int op_r, int header_r, int values_r, bool first, bool more, bufferlist &header_bl, std::map &values); + void _recover_finish(int r); enum { DIR_INODES = 1, @@ -110,6 +129,7 @@ protected: std::map > logseg_destroyed_inos; std::set destroyed_inos_set; + friend class C_IO_OFT_Recover; friend class C_IO_OFT_Load; friend class C_IO_OFT_Save; friend class C_OFT_OpenInoFinish; From a09d7b7da43471286f16e68144aaa805578c4c1c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sat, 27 Jan 2018 19:39:54 +0800 Subject: [PATCH 11/14] mds: add perf counter for 'open ino' operation Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 6 ++++++ src/mds/MDSRank.cc | 6 ++++++ src/mds/MDSRank.h | 3 +++ 3 files changed, 15 insertions(+) 
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 590fab102f9..16ce99ff7df 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -8555,6 +8555,8 @@ void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool if (dir->state_test(CDir::STATE_REJOINUNDEF)) assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag())); dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent)); + if (mds->logger) + mds->logger->inc(l_mds_openino_dir_fetch); } int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, @@ -8745,6 +8747,8 @@ void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info) if (info.discover || !info.fetch_backtrace) pa = &info.ancestors; mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer); + if (mds->logger) + mds->logger->inc(l_mds_openino_peer_discover); } } @@ -9619,6 +9623,8 @@ void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Conte { object_t oid = CInode::get_object_name(ino, frag_t(), ""); mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin); + if (mds->logger) + mds->logger->inc(l_mds_openino_backtrace_fetch); } diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 35ef49d4dc8..ff265333bc3 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -2630,6 +2630,12 @@ void MDSRank::create_logger() mds_plb.add_u64_counter( l_mds_imported_inodes, "imported_inodes", "Imported inodes", "imi", PerfCountersBuilder::PRIO_INTERESTING); + mds_plb.add_u64_counter(l_mds_openino_dir_fetch, "openino_dir_fetch", + "OpenIno incomplete directory fetchings"); + mds_plb.add_u64_counter(l_mds_openino_backtrace_fetch, "openino_backtrace_fetch", + "OpenIno backtrace fetchings"); + mds_plb.add_u64_counter(l_mds_openino_peer_discover, "openino_peer_discover", + "OpenIno peer inode discovers"); logger = mds_plb.create_perf_counters(); g_ceph_context->get_perfcounters_collection()->add(logger); } diff --git a/src/mds/MDSRank.h 
b/src/mds/MDSRank.h index 39e09c7b0c0..33a8ce765fa 100644 --- a/src/mds/MDSRank.h +++ b/src/mds/MDSRank.h @@ -71,6 +71,9 @@ enum { l_mds_exported_inodes, l_mds_imported, l_mds_imported_inodes, + l_mds_openino_dir_fetch, + l_mds_openino_backtrace_fetch, + l_mds_openino_peer_discover, l_mds_last, }; From 542fbd3933763933bf3af15e8f626c6d2391b0c6 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 12 Feb 2018 10:27:51 -0800 Subject: [PATCH 12/14] mds: differentiate Anchor types to clarify purpose * A RecoveredAnchor has an mds auth hint and is used during recovery only. * An OpenedAnchor has a reference count for the number of children and is used for constructing the current OpenFileTable on disk. Signed-off-by: Patrick Donnelly --- src/mds/Anchor.h | 29 +++++++++++++++++++++-------- src/mds/OpenFileTable.cc | 2 +- src/mds/OpenFileTable.h | 4 ++-- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/mds/Anchor.h b/src/mds/Anchor.h index 0054a850087..9f6e82a5fc2 100644 --- a/src/mds/Anchor.h +++ b/src/mds/Anchor.h @@ -31,15 +31,11 @@ public: inodeno_t ino; // anchored ino inodeno_t dirino; std::string d_name; - __u8 d_type; - union { - mutable int nref; // how many children - mutable mds_rank_t auth; // auth hint - }; + __u8 d_type = 0; - Anchor() : d_type(0), nref(0) {} - Anchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp, int nr) : - ino(i), dirino(di), d_name(str), d_type(tp), nref(nr) { } + Anchor() {} + Anchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp) : + ino(i), dirino(di), d_name(str), d_type(tp) {} void encode(bufferlist &bl) const; void decode(bufferlist::iterator &bl); @@ -55,4 +51,21 @@ inline bool operator==(const Anchor &l, const Anchor &r) { ostream& operator<<(ostream& out, const Anchor &a); +class RecoveredAnchor : public Anchor { +public: + RecoveredAnchor() {} + + mds_rank_t auth = MDS_RANK_NONE; // auth hint +}; + +class OpenedAnchor : public Anchor { +public: + OpenedAnchor(inodeno_t i, inodeno_t 
di, std::string_view str, __u8 tp, int nr) : + Anchor(i, di, str, tp), + nref(nr) + {} + + mutable int nref = 0; // how many children +}; + #endif diff --git a/src/mds/OpenFileTable.cc b/src/mds/OpenFileTable.cc index 9dc262b5653..398cc9f531f 100644 --- a/src/mds/OpenFileTable.cc +++ b/src/mds/OpenFileTable.cc @@ -483,7 +483,7 @@ void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, std::piecewise_construct, std::make_tuple(ino), std::make_tuple()); - Anchor& anchor = it->second; + RecoveredAnchor& anchor = it->second; decode(anchor, p); assert(ino == anchor.ino); anchor.auth = MDS_RANK_NONE; diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h index 38e12b38280..b5cb151ee04 100644 --- a/src/mds/OpenFileTable.h +++ b/src/mds/OpenFileTable.h @@ -68,7 +68,7 @@ protected: version_t omap_version = 0; - map anchor_map; + map anchor_map; set dirfrags; std::map dirty_items; // ino -> dirty state @@ -95,7 +95,7 @@ protected: void _commit_finish(int r, uint64_t log_seq, MDSInternalContextBase *fin); std::map loaded_journal; - map loaded_anchor_map; + map loaded_anchor_map; set loaded_dirfrags; list waiting_for_load; bool load_done = false; From 9242ce9013024edcccdc2e375bbeede9ccc6b02c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 15 Feb 2018 17:58:44 +0800 Subject: [PATCH 13/14] mds: allow storing open file table in multiple omaps Adjust number of objects to store open file table according to size of the open file table. The basic idea is, when saving an entry for the first time, assign it to an omap object. If all existing omap objects are too full, allocate a new object. The total number of objects is stored in the first object's omap header. To guarantee consistency of the open file table when updating multiple objects, mds needs to first journal the updates, then update the original omaps.
Signed-off-by: "Yan, Zheng" --- src/mds/Anchor.h | 2 + src/mds/MDLog.cc | 3 +- src/mds/OpenFileTable.cc | 506 ++++++++++++++++++++++++++++----------- src/mds/OpenFileTable.h | 24 +- 4 files changed, 389 insertions(+), 146 deletions(-) diff --git a/src/mds/Anchor.h b/src/mds/Anchor.h index 9f6e82a5fc2..eb2d65b18e2 100644 --- a/src/mds/Anchor.h +++ b/src/mds/Anchor.h @@ -33,6 +33,8 @@ public: std::string d_name; __u8 d_type = 0; + int omap_idx = -1; // stored in which omap object + Anchor() {} Anchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp) : ino(i), dirino(di), d_name(str), d_type(tp) {} diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index 3ddc26522c6..785a7d1b41f 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -715,7 +715,8 @@ int MDLog::trim_all() uint64_t last_seq = 0; if (!segments.empty()) { last_seq = get_last_segment_seq(); - if (last_seq > mds->mdcache->open_file_table.get_committing_log_seq()) { + if (!mds->mdcache->open_file_table.is_any_committing() && + last_seq > mds->mdcache->open_file_table.get_committing_log_seq()) { submit_mutex.Unlock(); mds->mdcache->open_file_table.commit(new C_OFT_Committed(this, last_seq), last_seq, CEPH_MSG_PRIO_DEFAULT); diff --git a/src/mds/OpenFileTable.cc b/src/mds/OpenFileTable.cc index 398cc9f531f..4bb52ba0054 100644 --- a/src/mds/OpenFileTable.cc +++ b/src/mds/OpenFileTable.cc @@ -50,7 +50,12 @@ void OpenFileTable::get_ref(CInode *in) assert(ret.second == true); in->state_set(CInode::STATE_TRACKEDBYOFT); - dirty_items.emplace(in->ino(), DIRTY_NEW); + auto ret1 = dirty_items.emplace(in->ino(), (int)DIRTY_NEW); + if (!ret1.second) { + int omap_idx = ret1.first->second; + assert(omap_idx >= 0); + ret.first->second.omap_idx = omap_idx; + } in = pin; } while (in); @@ -79,12 +84,20 @@ void OpenFileTable::put_ref(CInode *in) assert(p->second.d_name == ""); } + int omap_idx = p->second.omap_idx; anchor_map.erase(p); in->state_clear(CInode::STATE_TRACKEDBYOFT); - auto ret = 
dirty_items.emplace(in->ino(), 0); - if (!ret.second && (ret.first->second & DIRTY_NEW)) - dirty_items.erase(ret.first); + auto ret = dirty_items.emplace(in->ino(), omap_idx); + if (!ret.second) { + if (ret.first->second == DIRTY_NEW) { + assert(omap_idx < 0); + dirty_items.erase(ret.first); + } else { + assert(omap_idx >= 0); + ret.first->second = omap_idx; + } + } in = pin; } while (in); @@ -119,7 +132,7 @@ void OpenFileTable::add_dirfrag(CDir *dir) auto ret = dirfrags.insert(dir->dirfrag()); assert(ret.second); get_ref(dir->get_inode()); - dirty_items.emplace(dir->ino(), 0); + dirty_items.emplace(dir->ino(), (int)DIRTY_UNDEF); } void OpenFileTable::remove_dirfrag(CDir *dir) @@ -130,7 +143,7 @@ void OpenFileTable::remove_dirfrag(CDir *dir) auto p = dirfrags.find(dir->dirfrag()); assert(p != dirfrags.end()); dirfrags.erase(p); - dirty_items.emplace(dir->ino(), 0); + dirty_items.emplace(dir->ino(), (int)DIRTY_UNDEF); put_ref(dir->get_inode()); } @@ -148,7 +161,7 @@ void OpenFileTable::notify_link(CInode *in) p->second.dirino = pin->ino(); p->second.d_name = dn->get_name(); - dirty_items.emplace(in->ino(), 0); + dirty_items.emplace(in->ino(), (int)DIRTY_UNDEF); get_ref(pin); } @@ -167,22 +180,23 @@ void OpenFileTable::notify_unlink(CInode *in) p->second.dirino = inodeno_t(0); p->second.d_name = ""; - dirty_items.emplace(in->ino(), 0); + dirty_items.emplace(in->ino(), (int)DIRTY_UNDEF); put_ref(pin); } -object_t OpenFileTable::get_object_name() const +object_t OpenFileTable::get_object_name(unsigned idx) const { char s[30]; - snprintf(s, sizeof(s), "mds%d_openfiles", int(mds->get_nodeid())); + snprintf(s, sizeof(s), "mds%d_openfiles.%x", int(mds->get_nodeid()), idx); return object_t(s); } -void OpenFileTable::_encode_header(bufferlist &bl) +void OpenFileTable::_encode_header(bufferlist &bl, int j_state) { encode(omap_version, bl); - encode((__u8)journal_state, bl); + encode(omap_num_objs, bl); + encode((__u8)j_state, bl); } class C_IO_OFT_Save : public 
MDSIOContextBase { @@ -216,99 +230,180 @@ void OpenFileTable::_commit_finish(int r, uint64_t log_seq, MDSInternalContextBa fin->complete(r); } +class C_IO_OFT_Journal : public MDSIOContextBase { +protected: + OpenFileTable *oft; + uint64_t log_seq; + MDSInternalContextBase *fin; + std::map > ops_map; + MDSRank *get_mds() override { return oft->mds; } +public: + C_IO_OFT_Journal(OpenFileTable *t, uint64_t s, MDSInternalContextBase *c, + std::map >& ops) : + oft(t), log_seq(s), fin(c) { + ops_map.swap(ops); + } + void finish(int r) { + oft->_journal_finish(r, log_seq, fin, ops_map); + } +}; + +void OpenFileTable::_journal_finish(int r, uint64_t log_seq, MDSInternalContextBase *c, + std::map >& ops_map) +{ + dout(10) << __func__ << " log_seq " << log_seq << dendl; + if (r < 0) { + mds->handle_write_error(r); + return; + } + + C_GatherBuilder gather(g_ceph_context, + new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c), + mds->finisher)); + SnapContext snapc; + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + for (auto& it : ops_map) { + object_t oid = get_object_name(it.first); + for (auto& op : it.second) { + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), + 0, gather.new_sub()); + } + } + gather.activate(); + + journal_state = JOURNAL_NONE; + return; +} + void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_prio) { dout(10) << __func__ << " log_seq " << log_seq << dendl; + + assert(num_pending_commit == 0); + num_pending_commit++; assert(log_seq >= committing_log_seq); committing_log_seq = log_seq; omap_version++; - C_GatherBuilder gather(g_ceph_context, - new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c), - mds->finisher)); + C_GatherBuilder gather(g_ceph_context); SnapContext snapc; - object_t oid = get_object_name(); object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + const unsigned max_items_per_obj = 1024 * 1024; const unsigned max_write_size = mds->mdcache->max_dir_commit_size; - unsigned 
write_size = 0; - unsigned journal_idx = 0; - std::set journal_keys; - std::map to_update, journaled_update; - std::set to_remove, journaled_remove; - auto journal_func = [&]() { + struct omap_update_ctl { + unsigned write_size = 0; + unsigned journal_idx = 0; + bool clear = false; + std::map to_update, journaled_update; + std::set to_remove, journaled_remove; + }; + std::vector omap_updates(omap_num_objs); + + using ceph::encode; + auto journal_func = [&](unsigned idx) { + auto& ctl = omap_updates.at(idx); + ObjectOperation op; op.priority = op_prio; - if (clear_on_commit) { + if (ctl.clear) { + ctl.clear = false; op.omap_clear(); op.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK); - clear_on_commit = false; } - if (journal_idx == 0) { - assert(journal_state == JOURNAL_NONE); - journal_state = JOURNAL_START; + if (ctl.journal_idx == 0) { + if (journal_state == JOURNAL_NONE) + journal_state = JOURNAL_START; + else + assert(journal_state == JOURNAL_START); + bufferlist header; - _encode_header(header); + _encode_header(header, journal_state); op.omap_set_header(header); } bufferlist bl; encode(omap_version, bl); - encode(to_update, bl); - encode(to_remove, bl); + encode(ctl.to_update, bl); + encode(ctl.to_remove, bl); char key[32]; - snprintf(key, sizeof(key), "_journal.%x", journal_idx++); - journal_keys.insert(key); + snprintf(key, sizeof(key), "_journal.%x", ctl.journal_idx++); std::map tmp_map; tmp_map[key].swap(bl); op.omap_set(tmp_map); + object_t oid = get_object_name(idx); mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), 0, gather.new_sub()); - journaled_update.merge(to_update); - journaled_remove.merge(to_remove); - to_update.clear(); - to_remove.clear(); + ctl.journaled_update.merge(ctl.to_update); + ctl.journaled_remove.merge(ctl.to_remove); + ctl.to_update.clear(); + ctl.to_remove.clear(); }; - auto commit_func = [&](bool update_header) { - ObjectOperation op; + std::map > ops_map; + + auto create_op_func = [&](unsigned idx, bool 
update_header) { + auto& ctl = omap_updates.at(idx); + + auto& op_vec = ops_map[idx]; + op_vec.resize(op_vec.size() + 1); + ObjectOperation& op = op_vec.back(); op.priority = op_prio; - if (clear_on_commit) { + if (ctl.clear) { + ctl.clear = false; op.omap_clear(); op.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK); - clear_on_commit = false; } if (update_header) { bufferlist header; - _encode_header(header); + _encode_header(header, journal_state); op.omap_set_header(header); } - if (!to_update.empty()) - op.omap_set(to_update); - if (!to_remove.empty()) - op.omap_rm_keys(to_remove); + if (!ctl.to_update.empty()) { + op.omap_set(ctl.to_update); + ctl.to_update.clear(); + } + if (!ctl.to_remove.empty()) { + op.omap_rm_keys(ctl.to_remove); + ctl.to_remove.clear(); + } + }; - mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), 0, - gather.new_sub()); - - to_update.clear(); - to_remove.clear(); + auto submit_ops_func = [&]() { + gather.set_finisher(new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c), + mds->finisher)); + for (auto& it : ops_map) { + object_t oid = get_object_name(it.first); + for (auto& op : it.second) { + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), + 0, gather.new_sub()); + } + } + gather.activate(); }; bool first_commit = !loaded_anchor_map.empty(); - using ceph::encode; + unsigned first_free_idx = 0; + unsigned old_num_objs = omap_num_objs; + if (omap_num_objs == 0) { + omap_num_objs = 1; + omap_num_items.resize(omap_num_objs); + omap_updates.resize(omap_num_objs); + omap_updates.back().clear = true; + } + for (auto& it : dirty_items) { list fgls; auto p = anchor_map.find(it.first); @@ -322,7 +417,9 @@ void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_p if (first_commit) { auto q = loaded_anchor_map.find(it.first); if (q != loaded_anchor_map.end()) { - bool same = (p != anchor_map.end() && p->second == q->second); + assert(p != anchor_map.end()); + p->second.omap_idx = 
q->second.omap_idx; + bool same = p->second == q->second; if (same) { auto r = loaded_dirfrags.lower_bound(dirfrag_t(it.first, 0)); for (auto fg : fgls) { @@ -343,22 +440,54 @@ void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_p char key[32]; int len = snprintf(key, sizeof(key), "%llx", (unsigned long long)it.first.val); - write_size += len + sizeof(__u32); + + int omap_idx; + if (p != anchor_map.end()) { + omap_idx = p->second.omap_idx; + if (omap_idx < 0) { + assert(it.second == DIRTY_NEW); + // find omap object to store the key + for (unsigned i = first_free_idx; i < omap_num_objs; i++) { + if (omap_num_items[i] < max_items_per_obj) + omap_idx = i; + } + if (omap_idx < 0) { + ++omap_num_objs; + omap_num_items.resize(omap_num_objs); + omap_updates.resize(omap_num_objs); + omap_updates.back().clear = true; + omap_idx = omap_num_objs - 1; + } + first_free_idx = omap_idx; + + p->second.omap_idx = omap_idx; + ++omap_num_items[omap_idx]; + } + } else { + omap_idx = it.second; + unsigned& count = omap_num_items.at(omap_idx); + assert(count > 0); + --count; + if ((unsigned)omap_idx < first_free_idx && count < max_items_per_obj) + first_free_idx = omap_idx; + } + auto& ctl = omap_updates.at(omap_idx); if (p != anchor_map.end()) { bufferlist bl; encode(p->second, bl); encode(fgls, bl); - write_size += bl.length() + sizeof(__u32); - to_update[key].swap(bl); + ctl.write_size += bl.length() + len + 2 * sizeof(__u32); + ctl.to_update[key].swap(bl); } else { - to_remove.emplace(key); + ctl.write_size += len + sizeof(__u32); + ctl.to_remove.emplace(key); } - if (write_size >= max_write_size) { - journal_func(); - write_size = 0; + if (ctl.write_size >= max_write_size) { + journal_func(omap_idx); + ctl.write_size = 0; } } @@ -368,60 +497,125 @@ void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_p for (auto& it : loaded_anchor_map) { char key[32]; int len = snprintf(key, sizeof(key), "%llx", (unsigned long 
long)it.first.val); - write_size += len + sizeof(__u32); - to_remove.emplace(key); - if (write_size >= max_write_size) { - journal_func(); - write_size = 0; + int omap_idx = it.second.omap_idx; + unsigned& count = omap_num_items.at(omap_idx); + assert(count > 0); + --count; + + auto& ctl = omap_updates.at(omap_idx); + ctl.write_size += len + sizeof(__u32); + ctl.to_remove.emplace(key); + + if (ctl.write_size >= max_write_size) { + journal_func(omap_idx); + ctl.write_size = 0; } } loaded_anchor_map.clear(); loaded_dirfrags.clear(); } - if (journal_idx > 0 && (!to_update.empty() || !to_remove.empty())) { - journal_func(); - write_size = 0; + { + size_t total_items = 0; + unsigned used_objs = 1; + std::list objs_to_write; + bool journaled = false; + for (unsigned i = 0; i < omap_num_objs; i++) { + total_items += omap_num_items[i]; + if (omap_updates[i].journal_idx) + journaled = true; + else if (omap_updates[i].write_size) + objs_to_write.push_back(i); + + if (omap_num_items[i] > 0) + used_objs = i + 1; + } + assert(total_items == anchor_map.size()); + // adjust omap object count + if (used_objs < omap_num_objs) { + omap_num_objs = used_objs; + omap_num_items.resize(omap_num_objs); + } + // skip journal if only one osd request is required and object count + // does not change. + if (!journaled && old_num_objs == omap_num_objs && + objs_to_write.size() <= 1) { + assert(journal_state == JOURNAL_NONE); + assert(!gather.has_subs()); + + unsigned omap_idx = objs_to_write.empty() ? 
0 : objs_to_write.front(); + create_op_func(omap_idx, true); + submit_ops_func(); + return; + } } - if (journaled_update.empty() && journaled_remove.empty()) { - assert(journal_state == JOURNAL_NONE); - commit_func(true); - } else { - assert(to_update.empty() && to_remove.empty()); - assert(journal_state == JOURNAL_START); + for (unsigned omap_idx = 0; omap_idx < omap_updates.size(); omap_idx++) { + auto& ctl = omap_updates[omap_idx]; + if (ctl.write_size > 0) { + journal_func(omap_idx); + ctl.write_size = 0; + } + } + + if (journal_state == JOURNAL_START) { + assert(gather.has_subs()); journal_state = JOURNAL_FINISH; - bool first = true; - for (auto& it : journaled_update) { - write_size += it.first.length() + sizeof(__u32); - write_size += it.second.length() + sizeof(__u32); - to_update[it.first].swap(it.second); - - if (write_size >= max_write_size) { - commit_func(first); - first = false; - write_size = 0; - } - } - for (auto& key : journaled_remove) { - write_size += key.length() + sizeof(__u32); - to_remove.emplace(key); - - if (write_size >= max_write_size) { - commit_func(first); - first = false; - write_size = 0; - } - } - - journal_state = JOURNAL_NONE; - to_remove.insert(journal_keys.begin(), journal_keys.end()); - commit_func(true); + } else { + // only object count changes + assert(journal_state == JOURNAL_NONE); + assert(!gather.has_subs()); } - num_pending_commit++; - gather.activate(); + for (unsigned omap_idx = 0; omap_idx < omap_updates.size(); omap_idx++) { + auto& ctl = omap_updates[omap_idx]; + assert(ctl.to_update.empty() && ctl.to_remove.empty()); + if (ctl.journal_idx == 0) + assert(ctl.journaled_update.empty() && ctl.journaled_remove.empty()); + + bool first = true; + for (auto& it : ctl.journaled_update) { + ctl.write_size += it.first.length() + it.second.length() + 2 * sizeof(__u32); + ctl.to_update[it.first].swap(it.second); + if (ctl.write_size >= max_write_size) { + create_op_func(omap_idx, first); + ctl.write_size = 0; + first = 
false; + } + } + + for (auto& key : ctl.journaled_remove) { + ctl.write_size += key.length() + sizeof(__u32); + ctl.to_remove.emplace(key); + if (ctl.write_size >= max_write_size) { + create_op_func(omap_idx, first); + ctl.write_size = 0; + first = false; + } + } + + for (unsigned i = 0; i < ctl.journal_idx; ++i) { + char key[32]; + snprintf(key, sizeof(key), "_journal.%x", i); + ctl.to_remove.emplace(key); + } + + // update first object's omap header if object count changes + if (ctl.clear || + ctl.journal_idx > 0 || + (omap_idx == 0 && old_num_objs != omap_num_objs)) + create_op_func(omap_idx, first); + } + + assert(!ops_map.empty()); + if (journal_state == JOURNAL_FINISH) { + gather.set_finisher(new C_OnFinisher(new C_IO_OFT_Journal(this, log_seq, c, ops_map), + mds->finisher)); + gather.activate(); + } else { + submit_ops_func(); + } } class C_IO_OFT_Load : public MDSIOContextBase { @@ -434,12 +628,14 @@ public: int values_r = 0; //< Return value from OMAP value read bufferlist header_bl; std::map values; - bool more = false; + unsigned index; bool first; + bool more = false; - C_IO_OFT_Load(OpenFileTable *t, bool f) : oft(t), first(f) {} + C_IO_OFT_Load(OpenFileTable *t, unsigned i, bool f) : + oft(t), index(i), first(f) {} void finish(int r) { - oft->_load_finish(r, header_r, values_r, first, more, header_bl, values); + oft->_load_finish(r, header_r, values_r, index, first, more, header_bl, values); } }; @@ -463,22 +659,24 @@ void OpenFileTable::_recover_finish(int r) dout(10) << __func__ << ": load complete" << dendl; } + journal_state = JOURNAL_NONE; load_done = true; finish_contexts(g_ceph_context, waiting_for_load); waiting_for_load.clear(); } void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, - bool first, bool more, + unsigned idx, bool first, bool more, bufferlist &header_bl, std::map &values) { using ceph::decode; int err = -EINVAL; - auto decode_func = [this](inodeno_t ino, bufferlist &bl) { + auto decode_func = [this](unsigned 
idx, inodeno_t ino, bufferlist &bl) { bufferlist::iterator p = bl.begin(); + size_t count = loaded_anchor_map.size(); auto it = loaded_anchor_map.emplace_hint(loaded_anchor_map.end(), std::piecewise_construct, std::make_tuple(ino), @@ -486,12 +684,16 @@ void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, RecoveredAnchor& anchor = it->second; decode(anchor, p); assert(ino == anchor.ino); + anchor.omap_idx = idx; anchor.auth = MDS_RANK_NONE; list fgls; decode(fgls, p); for (auto fg : fgls) loaded_dirfrags.insert(loaded_dirfrags.end(), dirfrag_t(anchor.ino, fg)); + + if (loaded_anchor_map.size() > count) + ++omap_num_items[idx]; }; if (op_r < 0) { @@ -503,39 +705,61 @@ void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, try { if (first) { bufferlist::iterator p = header_bl.begin(); - decode(omap_version, p); - __u8 tmp_state; - decode(tmp_state, p); - journal_state = tmp_state; + version_t version; + unsigned num_objs; + __u8 jstate; + decode(version, p); + decode(num_objs, p); + decode(jstate, p); + + if (version > omap_version) { + omap_version = version; + omap_num_objs = num_objs; + omap_num_items.resize(omap_num_objs); + journal_state = jstate; + } else if (version == omap_version) { + assert(omap_num_objs == num_objs); + if (jstate > journal_state) + journal_state = jstate; + } } for (auto& it : values) { if (it.first.compare(0, 9, "_journal.") == 0) { + if (idx >= loaded_journals.size()) + loaded_journals.resize(idx + 1); + if (journal_state == JOURNAL_FINISH) { - loaded_journal[it.first].swap(it.second); + loaded_journals[idx][it.first].swap(it.second); } else { // incomplete journal - loaded_journal[it.first].length(); + loaded_journals[idx][it.first].length(); } continue; } inodeno_t ino; sscanf(it.first.c_str(), "%llx", (unsigned long long*)&ino.val); - decode_func(ino, it.second); + decode_func(idx, ino, it.second); } } catch (buffer::error &e) { derr << __func__ << ": corrupted header/values: " << e.what() << dendl; 
goto out; } - if (more) { + if (more || idx + 1 < omap_num_objs) { // Issue another read if we're not at the end of the omap - const std::string last_key = values.rbegin()->first; + std::string last_key; + if (more) + last_key = values.rbegin()->first; + else + idx++; dout(10) << __func__ << ": continue to load from '" << last_key << "'" << dendl; - object_t oid = get_object_name(); + object_t oid = get_object_name(idx); object_locator_t oloc(mds->mdsmap->get_metadata_pool()); - C_IO_OFT_Load *c = new C_IO_OFT_Load(this, false); + C_IO_OFT_Load *c = new C_IO_OFT_Load(this, idx, !more); ObjectOperation op; + if (!more) + op.omap_get_header(&c->header_bl, &c->header_r); op.omap_get_vals(last_key, "", uint64_t(-1), &c->values, &c->more, &c->values_r); mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, nullptr, 0, @@ -544,10 +768,18 @@ void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, } // replay journal - if (loaded_journal.empty()) { - assert(journal_state == JOURNAL_NONE); - } else { - dout(10) << __func__ << ": recover journal" << dendl; + if (loaded_journals.size() > 0) { + dout(10) << __func__ << ": recover journal" << dendl; + + C_GatherBuilder gather(g_ceph_context, + new C_OnFinisher(new C_IO_OFT_Recover(this), + mds->finisher)); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + SnapContext snapc; + + for (unsigned omap_idx = 0; omap_idx < loaded_journals.size(); omap_idx++) { + auto& loaded_journal = loaded_journals[omap_idx]; + std::vector op_vec; try { for (auto& it : loaded_journal) { @@ -567,13 +799,17 @@ void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, for (auto& q : to_update) { inodeno_t ino; sscanf(q.first.c_str(), "%llx", (unsigned long long*)&ino.val); - decode_func(ino, q.second); + decode_func(omap_idx, ino, q.second); } for (auto& q : to_remove) { inodeno_t ino; sscanf(q.c_str(), "%llx",(unsigned long long*)&ino.val); assert(ino.val > 0); - loaded_anchor_map.erase(ino); + if 
(loaded_anchor_map.erase(ino)) { + unsigned& count = omap_num_items[omap_idx]; + assert(count > 0); + --count; + } auto r = loaded_dirfrags.lower_bound(dirfrag_t(ino, 0)); while (r != loaded_dirfrags.end() && r->ino == ino) loaded_dirfrags.erase(r++); @@ -592,12 +828,14 @@ void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, goto out; } - journal_state = JOURNAL_NONE; op_vec.resize(op_vec.size() + 1); ObjectOperation& op = op_vec.back(); { bufferlist header; - _encode_header(header); + if (journal_state == JOURNAL_FINISH) + _encode_header(header, JOURNAL_FINISH); + else + _encode_header(header, JOURNAL_NONE); op.omap_set_header(header); } { @@ -609,21 +847,17 @@ void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, } loaded_journal.clear(); - C_GatherBuilder gather(g_ceph_context, - new C_OnFinisher(new C_IO_OFT_Recover(this), - mds->finisher)); - object_t oid = get_object_name(); - object_locator_t oloc(mds->mdsmap->get_metadata_pool()); - SnapContext snapc; - + object_t oid = get_object_name(omap_idx); for (auto& op : op_vec) { mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), 0, gather.new_sub()); } - gather.activate(); - return; + } + gather.activate(); + return; } + journal_state = JOURNAL_NONE; err = 0; dout(10) << __func__ << ": load complete" << dendl; out: @@ -643,8 +877,8 @@ void OpenFileTable::load(MDSInternalContextBase *onload) if (onload) waiting_for_load.push_back(onload); - C_IO_OFT_Load *c = new C_IO_OFT_Load(this, true); - object_t oid = get_object_name(); + C_IO_OFT_Load *c = new C_IO_OFT_Load(this, 0, true); + object_t oid = get_object_name(0); object_locator_t oloc(mds->mdsmap->get_metadata_pool()); ObjectOperation op; diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h index b5cb151ee04..db949d9f888 100644 --- a/src/mds/OpenFileTable.h +++ b/src/mds/OpenFileTable.h @@ -67,12 +67,15 @@ protected: MDSRank *mds; version_t omap_version = 0; + unsigned omap_num_objs = 0; + std::vector 
omap_num_items; map anchor_map; set dirfrags; - std::map dirty_items; // ino -> dirty state - static const unsigned DIRTY_NEW = 1; + std::map dirty_items; // ino -> dirty state + static const int DIRTY_NEW = -1; + static const int DIRTY_UNDEF = -2; uint64_t committed_log_seq = 0; uint64_t committing_log_seq = 0; @@ -80,7 +83,7 @@ protected: void get_ref(CInode *in); void put_ref(CInode *in); - object_t get_object_name() const; + object_t get_object_name(unsigned idx) const; enum { JOURNAL_NONE = 0, @@ -89,26 +92,28 @@ protected: }; int journal_state = 0; - bool clear_on_commit = false; unsigned num_pending_commit = 0; - void _encode_header(bufferlist& bl); + void _encode_header(bufferlist& bl, int j_state); void _commit_finish(int r, uint64_t log_seq, MDSInternalContextBase *fin); + void _journal_finish(int r, uint64_t log_seq, MDSInternalContextBase *fin, + std::map >& ops); - std::map loaded_journal; + std::vector > loaded_journals; map loaded_anchor_map; set loaded_dirfrags; list waiting_for_load; bool load_done = false; void _reset_states() { + omap_num_objs = 0; + omap_num_items.resize(0); journal_state = JOURNAL_NONE; - clear_on_commit = true; - loaded_journal.clear(); + loaded_journals.clear(); loaded_anchor_map.clear(); loaded_dirfrags.clear(); } void _load_finish(int op_r, int header_r, int values_r, - bool first, bool more, + unsigned idx, bool first, bool more, bufferlist &header_bl, std::map &values); void _recover_finish(int r); @@ -132,6 +137,7 @@ protected: friend class C_IO_OFT_Recover; friend class C_IO_OFT_Load; friend class C_IO_OFT_Save; + friend class C_IO_OFT_Journal; friend class C_OFT_OpenInoFinish; }; From ffebd0854e135020b61c319e10b6365a12a679a0 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 21 Mar 2018 21:08:58 +0800 Subject: [PATCH 14/14] qa/cephfs: update TestDamage for open file table Signed-off-by: Yan, Zheng --- qa/tasks/cephfs/test_damage.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/qa/tasks/cephfs/test_damage.py b/qa/tasks/cephfs/test_damage.py index 380b49c4b65..d56f39eed01 100644 --- a/qa/tasks/cephfs/test_damage.py +++ b/qa/tasks/cephfs/test_damage.py @@ -141,7 +141,9 @@ class TestDamage(CephFSTestCase): # Missing dirfrags for non-system dirs result in empty directory "10000000000.00000000", # PurgeQueue is auto-created if not found on startup - "500.00000000" + "500.00000000", + # open file table is auto-created if not found on startup + "mds0_openfiles.0" ]: expectation = NO_DAMAGE else: