diff --git a/qa/tasks/cephfs/test_damage.py b/qa/tasks/cephfs/test_damage.py index 380b49c4b65..d56f39eed01 100644 --- a/qa/tasks/cephfs/test_damage.py +++ b/qa/tasks/cephfs/test_damage.py @@ -141,7 +141,9 @@ class TestDamage(CephFSTestCase): # Missing dirfrags for non-system dirs result in empty directory "10000000000.00000000", # PurgeQueue is auto-created if not found on startup - "500.00000000" + "500.00000000", + # open file table is auto-created if not found on startup + "mds0_openfiles.0" ]: expectation = NO_DAMAGE else: diff --git a/src/mds/Anchor.cc b/src/mds/Anchor.cc new file mode 100644 index 00000000000..99f1504ea74 --- /dev/null +++ b/src/mds/Anchor.cc @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "mds/Anchor.h" + +#include "common/Formatter.h" + +void Anchor::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + encode(ino, bl); + encode(dirino, bl); + encode(d_name, bl); + encode(d_type, bl); + ENCODE_FINISH(bl); +} + +void Anchor::decode(bufferlist::iterator &bl) +{ + DECODE_START(1, bl); + decode(ino, bl); + decode(dirino, bl); + decode(d_name, bl); + decode(d_type, bl); + DECODE_FINISH(bl); +} + +void Anchor::dump(Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->dump_unsigned("dirino", dirino); + f->dump_string("d_name", d_name); + f->dump_unsigned("d_type", d_type); +} + +void Anchor::generate_test_instances(list& ls) +{ + ls.push_back(new Anchor); + ls.push_back(new Anchor); + ls.back()->ino = 1; + ls.back()->dirino = 2; + ls.back()->d_name = "hello"; + ls.back()->d_type = DT_DIR; +} + +ostream& operator<<(ostream& out, const Anchor &a) +{ + return out << "a(" << a.ino << " " << a.dirino << "/'" << a.d_name << "' " << a.d_type << ")"; +} diff --git a/src/mds/Anchor.h b/src/mds/Anchor.h new file mode 100644 index 00000000000..eb2d65b18e2 --- /dev/null +++ b/src/mds/Anchor.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_ANCHOR_H +#define CEPH_ANCHOR_H + +#include + +#include "include/types.h" +#include "mdstypes.h" +#include "include/buffer.h" + +/* + * Anchor represents primary linkage of an inode. When adding inode to an + * anchor table, MDS ensures that the table also contains inode's ancestor + * inodes. MDS can get inode's path by looking up anchor table recursively. + */ +class Anchor { +public: + inodeno_t ino; // anchored ino + inodeno_t dirino; + std::string d_name; + __u8 d_type = 0; + + int omap_idx = -1; // stored in which omap object + + Anchor() {} + Anchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp) : + ino(i), dirino(di), d_name(str), d_type(tp) {} + + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list& ls); +}; +WRITE_CLASS_ENCODER(Anchor) + +inline bool operator==(const Anchor &l, const Anchor &r) { + return l.ino == r.ino && l.dirino == r.dirino && + l.d_name == r.d_name && l.d_type == r.d_type; +} + +ostream& operator<<(ostream& out, const Anchor &a); + +class RecoveredAnchor : public Anchor { +public: + RecoveredAnchor() {} + + mds_rank_t auth = MDS_RANK_NONE; // auth hint +}; + +class OpenedAnchor : public Anchor { +public: + OpenedAnchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp, int nr) : + Anchor(i, di, str, tp), + nref(nr) + {} + + mutable int nref = 0; // how many children +}; + +#endif diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index e261e4ecc03..4e9ba084c96 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -296,6 +296,18 @@ bool CDir::check_rstats(bool scrub) return good; } +void CDir::adjust_num_inodes_with_caps(int d) +{ + // FIXME: smarter way to decide if adding 'this' to open file table + if (num_inodes_with_caps == 0 && d > 0) + cache->open_file_table.add_dirfrag(this); + else if (num_inodes_with_caps > 0 && num_inodes_with_caps == -d) + cache->open_file_table.remove_dirfrag(this); + + num_inodes_with_caps += d; + assert(num_inodes_with_caps >= 0); +} + CDentry *CDir::lookup(std::string_view name, snapid_t snap) { dout(20) << "lookup (" << snap << ", '" << name << "')" << dendl; @@ -574,6 +586,11 @@ void CDir::link_inode_work( CDentry *dn, CInode *in) // pin dentry? if (in->get_num_ref()) dn->get(CDentry::PIN_INODEPIN); + + if (in->state_test(CInode::STATE_TRACKEDBYOFT)) + inode->mdcache->open_file_table.notify_link(in); + if (in->is_any_caps()) + adjust_num_inodes_with_caps(1); // adjust auth pin count if (in->auth_pins + in->nested_auth_pins) @@ -650,6 +667,11 @@ void CDir::unlink_inode_work( CDentry *dn ) // unpin dentry? if (in->get_num_ref()) dn->put(CDentry::PIN_INODEPIN); + + if (in->state_test(CInode::STATE_TRACKEDBYOFT)) + inode->mdcache->open_file_table.notify_unlink(in); + if (in->is_any_caps()) + adjust_num_inodes_with_caps(-1); // unlink auth_pin count if (in->auth_pins + in->nested_auth_pins) @@ -842,6 +864,9 @@ void CDir::steal_dentry(CDentry *dn) if (pi->accounted_rstat.rctime > fnode.rstat.rctime) fnode.rstat.rctime = pi->accounted_rstat.rctime; + if (in->is_any_caps()) + adjust_num_inodes_with_caps(1); + // move dirty inode rstat to new dirfrag if (in->is_dirty_rstat()) dirty_rstat_inodes.push_back(&in->dirty_rstat_item); @@ -923,6 +948,7 @@ void CDir::finish_old_fragment(list& waiters, bool repl num_head_items = num_head_null = 0; num_snap_items = num_snap_null = 0; + adjust_num_inodes_with_caps(-num_inodes_with_caps); // this mirrors init_fragment_pins() if (is_auth()) diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 7274798c392..fab780d2fd4 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -73,25 +73,26 @@ public: } // -- state -- - static const unsigned STATE_COMPLETE = (1<< 1); // the complete contents are in cache - static const unsigned STATE_FROZENTREE = (1<< 2); // root of tree (bounded by exports) - static const unsigned STATE_FREEZINGTREE = (1<< 3); // in process of freezing - static const unsigned STATE_FROZENDIR = (1<< 4); - static const unsigned STATE_FREEZINGDIR = (1<< 5); - static const unsigned STATE_COMMITTING = (1<< 6); // mid-commit - static const unsigned STATE_FETCHING = (1<< 7); // currenting fetching - static const unsigned STATE_CREATING = (1<< 8); - static const unsigned STATE_IMPORTBOUND = (1<<10); - static const unsigned STATE_EXPORTBOUND = (1<<11); - static const unsigned STATE_EXPORTING = (1<<12); - static const unsigned STATE_IMPORTING = (1<<13); - static const unsigned STATE_FRAGMENTING = (1<<14); - static const unsigned STATE_STICKY = (1<<15); // sticky pin due to inode stickydirs - static const unsigned STATE_DNPINNEDFRAG = (1<<16); // dir is refragmenting - static const unsigned STATE_ASSIMRSTAT = (1<<17); // assimilating inode->frag rstats - static const unsigned STATE_DIRTYDFT = (1<<18); // dirty dirfragtree - static const unsigned STATE_BADFRAG = (1<<19); // bad dirfrag - static const unsigned STATE_AUXSUBTREE = (1<<20); // no subtree merge + static const unsigned STATE_COMPLETE = (1<< 0); // the complete contents are in cache + static const unsigned STATE_FROZENTREE = (1<< 1); // root of tree (bounded by exports) + static const unsigned STATE_FREEZINGTREE = (1<< 2); // in process of freezing + static const unsigned STATE_FROZENDIR = (1<< 3); + static const unsigned STATE_FREEZINGDIR = (1<< 4); + static const unsigned STATE_COMMITTING = (1<< 5); // mid-commit + static const unsigned STATE_FETCHING = (1<< 6); // currenting fetching + static const unsigned STATE_CREATING = (1<< 7); + static const unsigned STATE_IMPORTBOUND = (1<< 8); + static const unsigned STATE_EXPORTBOUND = (1<< 9); + static const unsigned STATE_EXPORTING = (1<<10); + static const unsigned STATE_IMPORTING = (1<<11); + static const unsigned STATE_FRAGMENTING = (1<<12); + static const unsigned STATE_STICKY = (1<<13); // sticky pin due to inode stickydirs + static const unsigned STATE_DNPINNEDFRAG = (1<<14); // dir is refragmenting + static const unsigned STATE_ASSIMRSTAT = (1<<15); // assimilating inode->frag rstats + static const unsigned STATE_DIRTYDFT = (1<<16); // dirty dirfragtree + static const unsigned STATE_BADFRAG = (1<<17); // bad dirfrag + static const unsigned STATE_TRACKEDBYOFT = (1<<18); // tracked by open file table + static const unsigned STATE_AUXSUBTREE = (1<<19); // no subtree merge // common states static const unsigned STATE_CLEAN = 0; @@ -102,18 +103,22 @@ public: (STATE_COMPLETE|STATE_DIRTY|STATE_DIRTYDFT|STATE_BADFRAG); static const unsigned MASK_STATE_IMPORT_KEPT = ( - STATE_IMPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_STICKY); + STATE_IMPORTING | + STATE_IMPORTBOUND | + STATE_EXPORTBOUND | + STATE_FROZENTREE | + STATE_STICKY | + STATE_TRACKEDBYOFT); static const unsigned MASK_STATE_EXPORT_KEPT = - (STATE_EXPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_FROZENDIR - |STATE_STICKY); + (STATE_EXPORTING | + STATE_IMPORTBOUND | + STATE_EXPORTBOUND | + STATE_FROZENTREE | + STATE_FROZENDIR | + STATE_STICKY | + STATE_TRACKEDBYOFT); static const unsigned MASK_STATE_FRAGMENT_KEPT = - (STATE_DIRTY| + (STATE_DIRTY | STATE_EXPORTBOUND | STATE_IMPORTBOUND | STATE_AUXSUBTREE | @@ -345,6 +350,8 @@ protected: int num_dirty; + int num_inodes_with_caps = 0; + // state version_t committing_version; version_t committed_version; @@ -437,6 +444,8 @@ protected: return num_dirty; } + void adjust_num_inodes_with_caps(int d); + int64_t get_frag_size() const { return get_projected_fnode()->fragstat.size(); } diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 236c0ec95df..9d3b036beac 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -2730,7 +2730,7 @@ client_t CInode::calc_ideal_loner() { if (mdcache->is_readonly()) return -1; - if (!mds_caps_wanted.empty()) + if (!get_mds_caps_wanted().empty()) return -1; int n = 0; @@ -2844,6 +2844,43 @@ void CInode::choose_lock_states(int dirty_caps) choose_lock_state(&linklock, issued); } +void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map& m) +{ + bool old_empty = mds_caps_wanted.empty(); + mds_caps_wanted.swap(m); + if (old_empty != (bool)mds_caps_wanted.empty()) { + if (old_empty) + adjust_num_caps_wanted(1); + else + adjust_num_caps_wanted(-1); + } +} + +void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted) +{ + bool old_empty = mds_caps_wanted.empty(); + if (wanted) { + mds_caps_wanted[mds] = wanted; + if (old_empty) + adjust_num_caps_wanted(1); + } else if (!old_empty) { + mds_caps_wanted.erase(mds); + if (mds_caps_wanted.empty()) + adjust_num_caps_wanted(-1); + } +} + +void CInode::adjust_num_caps_wanted(int d) +{ + if (!num_caps_wanted && d > 0) + mdcache->open_file_table.add_inode(this); + else if (num_caps_wanted > 0 && num_caps_wanted == -d) + mdcache->open_file_table.remove_inode(this); + + num_caps_wanted +=d; + assert(num_caps_wanted >= 0); +} + Capability *CInode::add_client_cap(client_t client, Session *session, SnapRealm *conrealm) { assert(last == CEPH_NOSNAP); @@ -2855,10 +2892,11 @@ Capability *CInode::add_client_cap(client_t client, Session *session, SnapRealm containing_realm = find_snaprealm(); containing_realm->inodes_with_caps.push_back(&item_caps); dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl; - } - if (client_caps.empty()) mdcache->num_inodes_with_caps++; + if (parent) + parent->dir->adjust_num_inodes_with_caps(1); + } Capability *cap = new Capability(this, ++mdcache->last_cap_id, client); assert(client_caps.count(client) == 0); @@ -2888,6 +2926,9 @@ void CInode::remove_client_cap(client_t client) if (client == loner_cap) loner_cap = -1; + if (cap->wanted()) + adjust_num_caps_wanted(-1); + delete cap; client_caps.erase(client); if (client_caps.empty()) { @@ -2895,8 +2936,9 @@ void CInode::remove_client_cap(client_t client) put(PIN_CAPS); item_caps.remove_myself(); containing_realm = NULL; - item_open_file.remove_myself(); // unpin logsegment mdcache->num_inodes_with_caps--; + if (parent) + parent->dir->adjust_num_inodes_with_caps(-1); } //clean up advisory locks @@ -2947,7 +2989,10 @@ void CInode::clear_client_caps_after_export() remove_client_cap(client_caps.begin()->first); loner_cap = -1; want_loner_cap = -1; - mds_caps_wanted.clear(); + if (!get_mds_caps_wanted().empty()) { + mempool::mds_co::compact_map empty; + set_mds_caps_wanted(empty); + } } void CInode::export_client_caps(map& cl) diff --git a/src/mds/CInode.h b/src/mds/CInode.h index b0adb7f54dd..dcf6d67ddd5 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -215,31 +215,33 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter; cap_map client_caps; // client -> caps mempool::mds_co::compact_map mds_caps_wanted; // [auth] mds -> caps wanted - int replica_caps_wanted = 0; // [replica] what i've requested from auth + int replica_caps_wanted = 0; // [replica] what i've requested from auth + int num_caps_wanted = 0; public: mempool::mds_co::compact_map > client_snap_caps; // [auth] [snap] dirty metadata we still need from the head @@ -690,6 +693,7 @@ public: clear_file_locks(); assert(num_projected_xattrs == 0); assert(num_projected_srnodes == 0); + assert(num_caps_wanted == 0); } @@ -974,7 +978,8 @@ public: bool is_any_nonstale_caps() { return count_nonstale_caps(); } const mempool::mds_co::compact_map& get_mds_caps_wanted() const { return mds_caps_wanted; } - mempool::mds_co::compact_map& get_mds_caps_wanted() { return mds_caps_wanted; } + void set_mds_caps_wanted(mempool::mds_co::compact_map& m); + void set_mds_caps_wanted(mds_rank_t mds, int32_t wanted); const cap_map& get_client_caps() const { return client_caps; } Capability *get_client_cap(client_t client) { @@ -992,6 +997,9 @@ public: } } + int get_num_caps_wanted() const { return num_caps_wanted; } + void adjust_num_caps_wanted(int d); + Capability *add_client_cap(client_t client, Session *session, SnapRealm *conrealm=0); void remove_client_cap(client_t client); void move_to_realm(SnapRealm *realm); diff --git a/src/mds/CMakeLists.txt b/src/mds/CMakeLists.txt index f9e71ae9675..92ac56b7b79 100644 --- a/src/mds/CMakeLists.txt +++ b/src/mds/CMakeLists.txt @@ -36,6 +36,8 @@ set(mds_srcs MDLog.cc MDSCacheObject.cc Mantle.cc + Anchor.cc + OpenFileTable.cc ${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc ${CMAKE_SOURCE_DIR}/src/common/MemoryModel.cc ${CMAKE_SOURCE_DIR}/src/osdc/Journaler.cc) diff --git a/src/mds/Capability.cc b/src/mds/Capability.cc index 78c531865d8..3cd408937b3 100644 --- a/src/mds/Capability.cc +++ b/src/mds/Capability.cc @@ -13,6 +13,7 @@ */ #include "Capability.h" +#include "CInode.h" #include "common/Formatter.h" @@ -141,6 +142,17 @@ void Capability::revoke_info::generate_test_instances(listadjust_num_caps_wanted(1); + else if (_wanted && !w) + in->adjust_num_caps_wanted(-1); + } + _wanted = w; +} + void Capability::encode(bufferlist& bl) const { ENCODE_START(2, 2, bl) @@ -159,7 +171,9 @@ void Capability::decode(bufferlist::iterator &bl) decode(last_sent, bl); decode(last_issue_stamp, bl); - decode(_wanted, bl); + __u32 tmp_wanted; + decode(tmp_wanted, bl); + set_wanted(tmp_wanted); decode(_pending, bl); decode(_revokes, bl); DECODE_FINISH(bl); @@ -189,7 +203,7 @@ void Capability::generate_test_instances(list& ls) ls.push_back(new Capability); ls.back()->last_sent = 11; ls.back()->last_issue_stamp = utime_t(12, 13); - ls.back()->_wanted = 14; + ls.back()->set_wanted(14); ls.back()->_pending = 15; { auto &r = ls.back()->_revokes.emplace_back(); diff --git a/src/mds/Capability.h b/src/mds/Capability.h index 2934ab9ebd7..fc8e9b34fc7 100644 --- a/src/mds/Capability.h +++ b/src/mds/Capability.h @@ -257,10 +257,7 @@ public: // caps this client wants to hold int wanted() { return _wanted; } - void set_wanted(int w) { - _wanted = w; - //check_rdcaps_list(); - } + void set_wanted(int w); void inc_last_seq() { last_sent++; } ceph_seq_t get_last_seq() { return last_sent; } @@ -291,7 +288,7 @@ public: client_follows = other.client_follows; // wanted - _wanted = _wanted | other.wanted; + set_wanted(wanted() | other.wanted); if (auth_cap) mseq = other.mseq; } @@ -308,7 +305,7 @@ public: } // wanted - _wanted = _wanted | otherwanted; + set_wanted(wanted() | otherwanted); } void revoke() { diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 4ab6d6194fe..d4590a2451d 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2263,10 +2263,7 @@ void Locker::handle_inode_file_caps(MInodeFileCaps *m) dout(7) << "handle_inode_file_caps replica mds." << from << " wants caps " << ccap_string(m->get_caps()) << " on " << *in << dendl; - if (m->get_caps()) - in->mds_caps_wanted[from] = m->get_caps(); - else - in->mds_caps_wanted.erase(from); + in->set_mds_caps_wanted(from, m->get_caps()); try_eval(in, CEPH_CAP_LOCKS); m->put(); @@ -2435,7 +2432,6 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock, eo->add_ino(in->ino()); metablob = &eo->metablob; le = eo; - mut->ls->open_files.push_back(&in->item_open_file); } else { EUpdate *eu = new EUpdate(mds->mdlog, "check_inode_max_size"); metablob = &eu->metablob; @@ -2541,27 +2537,18 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq) return; } - if (cap->wanted() == 0) { - if (cur->item_open_file.is_on_list() && - !cur->is_any_caps_wanted()) { - dout(10) << " removing unwanted file from open file list " << *cur << dendl; - cur->item_open_file.remove_myself(); - } - } else { + if (cap->wanted()) { if (cur->state_test(CInode::STATE_RECOVERING) && (cap->wanted() & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) { mds->mdcache->recovery_queue.prioritize(cur); } - if (!cur->item_open_file.is_on_list()) { - dout(10) << " adding to open file list " << *cur << dendl; + if (mdcache->open_file_table.should_log_open(cur)) { assert(cur->last == CEPH_NOSNAP); - LogSegment *ls = mds->mdlog->get_current_segment(); EOpen *le = new EOpen(mds->mdlog); mds->mdlog->start_entry(le); le->add_clean_inode(cur); - ls->open_files.push_back(&cur->item_open_file); mds->mdlog->submit_entry(le); } } @@ -3336,7 +3323,7 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap, bool need_issue = false; if (cap) cap->inc_suppress(); - if (in->mds_caps_wanted.empty() && + if (in->get_mds_caps_wanted().empty() && (in->get_loner() >= 0 || (in->get_wanted_loner() >= 0 && in->try_set_loner()))) { if (in->filelock.get_state() != LOCK_EXCL) file_excl(&in->filelock, &need_issue); @@ -5042,7 +5029,7 @@ void Locker::file_excl(ScatterLock *lock, bool *need_issue) assert(in->is_auth()); assert(lock->is_stable()); - assert((in->get_loner() >= 0 && in->mds_caps_wanted.empty()) || + assert((in->get_loner() >= 0 && in->get_mds_caps_wanted().empty()) || (lock->get_state() == LOCK_XSYN)); // must do xsyn -> excl -> switch (lock->get_state()) { @@ -5103,7 +5090,7 @@ void Locker::file_xsyn(SimpleLock *lock, bool *need_issue) dout(7) << "file_xsyn on " << *lock << " on " << *lock->get_parent() << dendl; CInode *in = static_cast(lock->get_parent()); assert(in->is_auth()); - assert(in->get_loner() >= 0 && in->mds_caps_wanted.empty()); + assert(in->get_loner() >= 0 && in->get_mds_caps_wanted().empty()); switch (lock->get_state()) { case LOCK_EXCL: lock->set_state(LOCK_EXCL_XSYN); break; diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index dfa7966baa3..d226d2aee0c 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -171,7 +171,8 @@ MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) : filer(m->objecter, m->finisher), exceeded_size_limit(false), recovery_queue(m), - stray_manager(m, purge_queue_) + stray_manager(m, purge_queue_), + open_file_table(m) { migrator.reset(new Migrator(mds, this)); root = NULL; @@ -4803,7 +4804,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) // caps_wanted if (is.caps_wanted) { - in->mds_caps_wanted[from] = is.caps_wanted; + in->set_mds_caps_wanted(from, is.caps_wanted); dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted) << " on " << *in << dendl; } @@ -5293,10 +5294,41 @@ void MDCache::rejoin_open_sessions_finish(map client_map rejoin_gather_finish(); } +void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret) +{ + auto p = cap_imports.find(ino); + if (p != cap_imports.end()) { + dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl; + if (ret < 0) { + cap_imports_missing.insert(ino); + } else if (ret != mds->get_nodeid()) { + for (auto q = p->second.begin(); q != p->second.end(); ++q) { + assert(q->second.count(MDS_RANK_NONE)); + assert(q->second.size() == 1); + rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret); + } + cap_imports.erase(p); + } + } +} + bool MDCache::process_imported_caps() { dout(10) << "process_imported_caps" << dendl; + if (!open_file_table.is_prefetched() && + open_file_table.prefetch_inodes()) { + open_file_table.wait_for_prefetch( + new MDSInternalContextWrapper(mds, + new FunctionContext([this](int r) { + assert(rejoin_gather.count(mds->get_nodeid())); + process_imported_caps(); + }) + ) + ); + return true; + } + for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) { CInode *in = get_inode(p->first); if (in) { @@ -5602,10 +5634,8 @@ void MDCache::clean_open_file_lists() CInode *in = *q; ++q; if (in->last == CEPH_NOSNAP) { - if (!in->is_any_caps_wanted()) { - dout(10) << " unlisting unwanted/capless inode " << *in << dendl; - in->item_open_file.remove_myself(); - } + dout(10) << " unlisting unwanted/capless inode " << *in << dendl; + in->item_open_file.remove_myself(); } else { if (in->client_snap_caps.empty()) { dout(10) << " unlisting flushed snap inode " << *in << dendl; @@ -7413,7 +7443,7 @@ void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin, set& gather_locks) { in->remove_replica(from); - in->mds_caps_wanted.erase(from); + in->set_mds_caps_wanted(from, 0); // note: this code calls _eval more often than it needs to! // fix lock @@ -8379,8 +8409,8 @@ struct C_MDC_OpenInoTraverseDir : public MDCacheContext { mdcache->handle_open_ino(msg, r); return; } - assert(mdcache->opening_inodes.count(ino)); - mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r); + auto& info = mdcache->opening_inodes.at(ino); + mdcache->_open_ino_traverse_dir(ino, info, r); } }; @@ -8397,8 +8427,7 @@ void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err { dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl; - assert(opening_inodes.count(ino)); - open_ino_info_t& info = opening_inodes[ino]; + open_ino_info_t& info = opening_inodes.at(ino); CInode *in = get_inode(ino); if (in) { @@ -8473,8 +8502,7 @@ void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret) { dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl; - assert(opening_inodes.count(ino)); - open_ino_info_t& info = opening_inodes[ino]; + open_ino_info_t& info = opening_inodes.at(ino); CInode *in = get_inode(ino); if (in) { @@ -8527,6 +8555,8 @@ void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool if (dir->state_test(CDir::STATE_REJOINUNDEF)) assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag())); dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent)); + if (mds->logger) + mds->logger->inc(l_mds_openino_dir_fetch); } int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, @@ -8688,21 +8718,22 @@ void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info) dout(10) << "do_open_ino_peer " << ino << " active " << active << " all " << all << " checked " << info.checked << dendl; + mds_rank_t whoami = mds->get_nodeid(); mds_rank_t peer = MDS_RANK_NONE; - if (info.auth_hint >= 0) { + if (info.auth_hint >= 0 && info.auth_hint != whoami) { if (active.count(info.auth_hint)) { peer = info.auth_hint; info.auth_hint = MDS_RANK_NONE; } } else { for (set::iterator p = active.begin(); p != active.end(); ++p) - if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) { + if (*p != whoami && info.checked.count(*p) == 0) { peer = *p; break; } } if (peer < 0) { - all.erase(mds->get_nodeid()); + all.erase(whoami); if (all != info.checked) { dout(10) << " waiting for more peers to be active" << dendl; } else { @@ -8716,6 +8747,8 @@ void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info) if (info.discover || !info.fetch_backtrace) pa = &info.ancestors; mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer); + if (mds->logger) + mds->logger->inc(l_mds_openino_peer_discover); } } @@ -8830,8 +8863,9 @@ void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin, dout(10) << "open_ino " << ino << " pool " << pool << " want_replica " << want_replica << dendl; - if (opening_inodes.count(ino)) { - open_ino_info_t& info = opening_inodes[ino]; + auto it = opening_inodes.find(ino); + if (it != opening_inodes.end()) { + open_ino_info_t& info = it->second; if (want_replica) { info.want_replica = true; if (want_xlocked && !info.want_xlocked) { @@ -8857,7 +8891,14 @@ void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin, info.tid = ++open_ino_last_tid; info.pool = pool >= 0 ? pool : default_file_layout.pool_id; info.waiters.push_back(fin); - do_open_ino(ino, info, 0); + if (mds->is_rejoin() && + open_file_table.get_ancestors(ino, info.ancestors, info.auth_hint)) { + info.fetch_backtrace = false; + info.checking = mds->get_nodeid(); + _open_ino_traverse_dir(ino, info, 0); + } else { + do_open_ino(ino, info, 0); + } } } @@ -9588,6 +9629,8 @@ void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Conte { object_t oid = CInode::get_object_name(ino, frag_t(), ""); mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin); + if (mds->logger) + mds->logger->inc(l_mds_openino_backtrace_fetch); } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 660b0e0f60a..81635637847 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -31,6 +31,7 @@ #include "events/EMetaBlob.h" #include "RecoveryQueue.h" #include "StrayManager.h" +#include "OpenFileTable.h" #include "MDSContext.h" #include "MDSMap.h" #include "Mutation.h" @@ -596,6 +597,12 @@ public: mds_rank_t frommds=MDS_RANK_NONE) { cap_imports[ino][client][frommds] = icr; } + bool rejoin_has_cap_reconnect(inodeno_t ino) const { + return cap_imports.count(ino); + } + void add_replay_ino_alloc(inodeno_t ino) { + cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin + } const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) { if (cap_imports.count(ino) && cap_imports[ino].count(client) && @@ -643,6 +650,7 @@ public: friend class C_MDC_RejoinOpenInoFinish; friend class C_MDC_RejoinSessionsOpened; void rejoin_open_ino_finish(inodeno_t ino, int ret); + void rejoin_prefetch_ino_finish(inodeno_t ino, int ret); void rejoin_open_sessions_finish(map client_map, map& sseqmap); bool process_imported_caps(); @@ -1225,6 +1233,8 @@ public: public: /* Because exports may fail, this set lets us keep track of inodes that need exporting. */ std::set export_pin_queue; + + OpenFileTable open_file_table; }; class C_MDS_RetryRequest : public MDSInternalContext { diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index a38f2d0db20..785a7d1b41f 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -566,6 +566,17 @@ void MDLog::_journal_segment_subtree_map(MDSInternalContextBase *onsync) _submit_entry(sle, new C_MDL_Flushed(this, onsync)); } +class C_OFT_Committed : public MDSInternalContext { + MDLog *mdlog; + uint64_t seq; +public: + C_OFT_Committed(MDLog *l, uint64_t s) : + MDSInternalContext(l->mds), mdlog(l), seq(s) {} + void finish(int ret) override { + mdlog->trim_expired_segments(); + } +}; + void MDLog::trim(int m) { unsigned max_segments = g_conf->mds_log_max_segments; @@ -636,6 +647,7 @@ void MDLog::trim(int m) << journaler->get_write_safe_pos() << " < end " << ls->end << dendl; break; } + if (expiring_segments.count(ls)) { dout(5) << "trim already expiring segment " << ls->seq << "/" << ls->offset << ", " << ls->num_events << " events" << dendl; @@ -657,6 +669,18 @@ void MDLog::trim(int m) } } + if (!capped && + !mds->mdcache->open_file_table.is_any_committing()) { + uint64_t last_seq = get_last_segment_seq(); + if (mds->mdcache->open_file_table.is_any_dirty() || + last_seq > mds->mdcache->open_file_table.get_committed_log_seq()) { + submit_mutex.Unlock(); + mds->mdcache->open_file_table.commit(new C_OFT_Committed(this, last_seq), + last_seq, CEPH_MSG_PRIO_HIGH); + submit_mutex.Lock(); + } + } + // discard expired segments and unlock submit_mutex _trim_expired_segments(); } @@ -689,8 +713,16 @@ int MDLog::trim_all() << "/" << expired_segments.size() << dendl; uint64_t last_seq = 0; - if (!segments.empty()) + if (!segments.empty()) { last_seq = get_last_segment_seq(); + if (!mds->mdcache->open_file_table.is_any_committing() && + last_seq > mds->mdcache->open_file_table.get_committing_log_seq()) { + submit_mutex.Unlock(); + mds->mdcache->open_file_table.commit(new C_OFT_Committed(this, last_seq), + last_seq, CEPH_MSG_PRIO_DEFAULT); + submit_mutex.Lock(); + } + } map::iterator p = segments.begin(); while (p != segments.end() && @@ -770,6 +802,8 @@ void MDLog::_trim_expired_segments() { assert(submit_mutex.is_locked_by_me()); + uint64_t oft_committed_seq = mds->mdcache->open_file_table.get_committed_log_seq(); + // trim expired segments? bool trimmed = false; while (!segments.empty()) { @@ -779,6 +813,12 @@ void MDLog::_trim_expired_segments() << " to expire" << dendl; break; } + + if (!capped && ls->seq >= oft_committed_seq) { + dout(10) << "_trim_expired_segments open file table committedseq " << oft_committed_seq + << " <= " << ls->seq << "/" << ls->offset << dendl; + break; + } dout(10) << "_trim_expired_segments trimming expired " << ls->seq << "/0x" << std::hex << ls->offset << std::dec << dendl; @@ -1429,6 +1469,9 @@ void MDLog::standby_trim_segments() dout(10) << "standby_trim_segments" << dendl; uint64_t expire_pos = journaler->get_expire_pos(); dout(10) << " expire_pos=" << expire_pos << dendl; + + mds->mdcache->open_file_table.trim_destroyed_inos(expire_pos); + bool removed_segment = false; while (have_any_segments()) { LogSegment *seg = get_oldest_segment(); diff --git a/src/mds/MDLog.h b/src/mds/MDLog.h index 5579e5abc00..fa3704be21a 100644 --- a/src/mds/MDLog.h +++ b/src/mds/MDLog.h @@ -306,6 +306,7 @@ private: friend class C_MaybeExpiredSegment; friend class C_MDL_Flushed; + friend class C_OFT_Committed; public: void trim_expired_segments(); diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index d36d680d570..468c3e1dc50 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -1076,6 +1076,8 @@ void MDSRank::boot_start(BootStep step, int r) } else if (!standby_replaying) { dout(2) << "boot_start " << step << ": opening purge queue (async)" << dendl; purge_queue.open(NULL); + dout(2) << "boot_start " << step << ": loading open file table (async)" << dendl; + mdcache->open_file_table.load(nullptr); } if (mdsmap->get_tableserver() == whoami) { @@ -1275,8 +1277,10 @@ void MDSRank::standby_replay_restart() this, mdlog->get_journaler()->get_read_pos())); - dout(1) << " opening purge queue (async)" << dendl; + dout(1) << " opening purge_queue (async)" << dendl; purge_queue.open(NULL); + dout(1) << " opening open_file_table (async)" << dendl; + mdcache->open_file_table.load(nullptr); } else { dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch() << " (which blacklists prior instance)" << dendl; @@ -2635,6 +2639,12 @@ void MDSRank::create_logger() mds_plb.add_u64_counter( l_mds_imported_inodes, "imported_inodes", "Imported inodes", "imi", PerfCountersBuilder::PRIO_INTERESTING); + mds_plb.add_u64_counter(l_mds_openino_dir_fetch, "openino_dir_fetch", + "OpenIno incomplete directory fetchings"); + mds_plb.add_u64_counter(l_mds_openino_backtrace_fetch, "openino_backtrace_fetch", + "OpenIno backtrace fetchings"); + mds_plb.add_u64_counter(l_mds_openino_peer_discover, "openino_peer_discover", + "OpenIno peer inode discovers"); logger = mds_plb.create_perf_counters(); g_ceph_context->get_perfcounters_collection()->add(logger); } diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h index f2872286742..ff10dac6daa 100644 --- a/src/mds/MDSRank.h +++ b/src/mds/MDSRank.h @@ -71,6 +71,9 @@ enum { l_mds_exported_inodes, l_mds_imported, l_mds_imported_inodes, + l_mds_openino_dir_fetch, + l_mds_openino_backtrace_fetch, + l_mds_openino_peer_discover, l_mds_last, }; diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 4bcb2193bf7..78b6db4cfea 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1553,8 +1553,6 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, mds_rank_t peer, if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) in->clear_scatter_dirty(); - in->item_open_file.remove_myself(); - in->clear_dirty_parent(); in->clear_file_locks(); @@ -3047,7 +3045,12 @@ void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap, map cap_map; decode(cap_map, blp); if (auth_cap) - decode(in->get_mds_caps_wanted(), blp); + if (auth_cap) { + mempool::mds_co::compact_map mds_wanted; + decode(mds_wanted, blp); + mds_wanted.erase(mds->get_nodeid()); + in->set_mds_caps_wanted(mds_wanted); + } if (!cap_map.empty() || (auth_cap && (in->get_caps_wanted() & ~CEPH_CAP_PIN))) { peer_exports[in].swap(cap_map); diff --git a/src/mds/OpenFileTable.cc b/src/mds/OpenFileTable.cc new file mode 100644 index 00000000000..4bb52ba0054 --- /dev/null +++ b/src/mds/OpenFileTable.cc @@ -0,0 +1,1126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "mds/CInode.h" +#include "mds/CDir.h" +#include "mds/MDSRank.h" +#include "mds/MDCache.h" +#include "osdc/Objecter.h" +#include "OpenFileTable.h" + +#include "common/config.h" +#include "common/errno.h" + +#define dout_context g_ceph_context +#define dout_subsys ceph_subsys_mds +#undef dout_prefix +#define dout_prefix _prefix(_dout, mds) +static ostream& _prefix(std::ostream *_dout, MDSRank *mds) { + return *_dout << "mds." << mds->get_nodeid() << ".openfiles "; +} + +void OpenFileTable::get_ref(CInode *in) +{ + do { + auto p = anchor_map.find(in->ino()); + if (p != anchor_map.end()) { + assert(in->state_test(CInode::STATE_TRACKEDBYOFT)); + assert(p->second.nref > 0); + p->second.nref++; + break; + } + + CDentry *dn = in->get_parent_dn(); + CInode *pin = dn ? dn->get_dir()->get_inode() : nullptr; + + auto ret = anchor_map.emplace(std::piecewise_construct, std::forward_as_tuple(in->ino()), + std::forward_as_tuple(in->ino(), (pin ? pin->ino() : inodeno_t(0)), + (dn ? dn->get_name() : string()), in->d_type(), 1)); + assert(ret.second == true); + in->state_set(CInode::STATE_TRACKEDBYOFT); + + auto ret1 = dirty_items.emplace(in->ino(), (int)DIRTY_NEW); + if (!ret1.second) { + int omap_idx = ret1.first->second; + assert(omap_idx >= 0); + ret.first->second.omap_idx = omap_idx; + } + + in = pin; + } while (in); +} + +void OpenFileTable::put_ref(CInode *in) +{ + do { + assert(in->state_test(CInode::STATE_TRACKEDBYOFT)); + auto p = anchor_map.find(in->ino()); + assert(p != anchor_map.end()); + assert(p->second.nref > 0); + + if (p->second.nref > 1) { + p->second.nref--; + break; + } + + CDentry *dn = in->get_parent_dn(); + CInode *pin = dn ? dn->get_dir()->get_inode() : nullptr; + if (dn) { + assert(p->second.dirino == pin->ino()); + assert(p->second.d_name == dn->get_name()); + } else { + assert(p->second.dirino == inodeno_t(0)); + assert(p->second.d_name == ""); + } + + int omap_idx = p->second.omap_idx; + anchor_map.erase(p); + in->state_clear(CInode::STATE_TRACKEDBYOFT); + + auto ret = dirty_items.emplace(in->ino(), omap_idx); + if (!ret.second) { + if (ret.first->second == DIRTY_NEW) { + assert(omap_idx < 0); + dirty_items.erase(ret.first); + } else { + assert(omap_idx >= 0); + ret.first->second = omap_idx; + } + } + + in = pin; + } while (in); +} + +void OpenFileTable::add_inode(CInode *in) +{ + dout(10) << __func__ << " " << *in << dendl; + if (!in->is_dir()) { + auto p = anchor_map.find(in->ino()); + assert(p == anchor_map.end()); + } + get_ref(in); +} + +void OpenFileTable::remove_inode(CInode *in) +{ + dout(10) << __func__ << " " << *in << dendl; + if (!in->is_dir()) { + auto p = anchor_map.find(in->ino()); + assert(p != anchor_map.end()); + assert(p->second.nref == 1); + } + put_ref(in); +} + +void OpenFileTable::add_dirfrag(CDir *dir) +{ + dout(10) << __func__ << " " << *dir << dendl; + assert(!dir->state_test(CDir::STATE_TRACKEDBYOFT)); + dir->state_set(CDir::STATE_TRACKEDBYOFT); + auto ret = dirfrags.insert(dir->dirfrag()); + assert(ret.second); + get_ref(dir->get_inode()); + dirty_items.emplace(dir->ino(), (int)DIRTY_UNDEF); +} + +void OpenFileTable::remove_dirfrag(CDir *dir) +{ + dout(10) << __func__ << " " << *dir << dendl; + assert(dir->state_test(CDir::STATE_TRACKEDBYOFT)); + dir->state_clear(CDir::STATE_TRACKEDBYOFT); + auto p = dirfrags.find(dir->dirfrag()); + assert(p != dirfrags.end()); + dirfrags.erase(p); + dirty_items.emplace(dir->ino(), (int)DIRTY_UNDEF); + put_ref(dir->get_inode()); +} + +void OpenFileTable::notify_link(CInode *in) +{ + dout(10) << __func__ << " " << *in << dendl; + auto p = anchor_map.find(in->ino()); + assert(p != anchor_map.end()); + assert(p->second.nref > 0); + assert(p->second.dirino == inodeno_t(0)); + assert(p->second.d_name == ""); + + CDentry *dn = in->get_parent_dn(); + CInode *pin = dn->get_dir()->get_inode(); + + p->second.dirino = pin->ino(); + p->second.d_name = dn->get_name(); + dirty_items.emplace(in->ino(), (int)DIRTY_UNDEF); + + get_ref(pin); +} + +void OpenFileTable::notify_unlink(CInode *in) +{ + dout(10) << __func__ << " " << *in << dendl; + auto p = anchor_map.find(in->ino()); + assert(p != anchor_map.end()); + assert(p->second.nref > 0); + + CDentry *dn = in->get_parent_dn(); + CInode *pin = dn->get_dir()->get_inode(); + assert(p->second.dirino == pin->ino()); + assert(p->second.d_name == dn->get_name()); + + p->second.dirino = inodeno_t(0); + p->second.d_name = ""; + dirty_items.emplace(in->ino(), (int)DIRTY_UNDEF); + + put_ref(pin); +} + +object_t OpenFileTable::get_object_name(unsigned idx) const +{ + char s[30]; + snprintf(s, sizeof(s), "mds%d_openfiles.%x", int(mds->get_nodeid()), idx); + return object_t(s); +} + +void OpenFileTable::_encode_header(bufferlist &bl, int j_state) +{ + encode(omap_version, bl); + encode(omap_num_objs, bl); + encode((__u8)j_state, bl); +} + +class C_IO_OFT_Save : public MDSIOContextBase { +protected: + OpenFileTable *oft; + uint64_t log_seq; + MDSInternalContextBase *fin; + MDSRank *get_mds() override { return oft->mds; } +public: + C_IO_OFT_Save(OpenFileTable *t, uint64_t s, MDSInternalContextBase *c) : + oft(t), log_seq(s), fin(c) {} + void finish(int r) { + oft->_commit_finish(r, log_seq, fin); + } +}; + +void OpenFileTable::_commit_finish(int r, uint64_t log_seq, MDSInternalContextBase *fin) +{ + dout(10) << __func__ << " log_seq " << log_seq << dendl; + if (r < 0) { + mds->handle_write_error(r); + return; + } + + assert(log_seq <= committing_log_seq); + assert(log_seq >= committed_log_seq); + committed_log_seq = log_seq; + num_pending_commit--; + + if (fin) + fin->complete(r); +} + +class C_IO_OFT_Journal : public MDSIOContextBase { +protected: + OpenFileTable *oft; + uint64_t log_seq; + MDSInternalContextBase *fin; + std::map > ops_map; + MDSRank *get_mds() override { return oft->mds; } +public: + C_IO_OFT_Journal(OpenFileTable *t, uint64_t s, MDSInternalContextBase *c, + std::map >& ops) : + oft(t), log_seq(s), fin(c) { + ops_map.swap(ops); + } + void finish(int r) { + oft->_journal_finish(r, log_seq, fin, ops_map); + } +}; + +void OpenFileTable::_journal_finish(int r, uint64_t log_seq, MDSInternalContextBase *c, + std::map >& ops_map) +{ + dout(10) << __func__ << " log_seq " << log_seq << dendl; + if (r < 0) { + mds->handle_write_error(r); + return; + } + + C_GatherBuilder gather(g_ceph_context, + new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c), + mds->finisher)); + SnapContext snapc; + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + for (auto& it : ops_map) { + object_t oid = get_object_name(it.first); + for (auto& op : it.second) { + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), + 0, gather.new_sub()); + } + } + gather.activate(); + + journal_state = JOURNAL_NONE; + return; +} + +void OpenFileTable::commit(MDSInternalContextBase *c, uint64_t log_seq, int op_prio) +{ + dout(10) << __func__ << " log_seq " << log_seq << dendl; + + assert(num_pending_commit == 0); + num_pending_commit++; + assert(log_seq >= committing_log_seq); + committing_log_seq = log_seq; + + omap_version++; + + C_GatherBuilder gather(g_ceph_context); + + SnapContext snapc; + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + + const unsigned max_items_per_obj = 1024 * 1024; + const unsigned max_write_size = mds->mdcache->max_dir_commit_size; + + struct omap_update_ctl { + unsigned write_size = 0; + unsigned journal_idx = 0; + bool clear = false; + std::map to_update, journaled_update; + std::set to_remove, journaled_remove; + }; + std::vector omap_updates(omap_num_objs); + + using ceph::encode; + auto journal_func = [&](unsigned idx) { + auto& ctl = omap_updates.at(idx); + + ObjectOperation op; + op.priority = op_prio; + + if (ctl.clear) { + ctl.clear = false; + op.omap_clear(); + op.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK); + } + + if (ctl.journal_idx == 0) { + if (journal_state == JOURNAL_NONE) + journal_state = JOURNAL_START; + else + assert(journal_state == JOURNAL_START); + + bufferlist header; + _encode_header(header, journal_state); + op.omap_set_header(header); + } + + bufferlist bl; + encode(omap_version, bl); + encode(ctl.to_update, bl); + encode(ctl.to_remove, bl); + + char key[32]; + snprintf(key, sizeof(key), "_journal.%x", ctl.journal_idx++); + std::map tmp_map; + tmp_map[key].swap(bl); + op.omap_set(tmp_map); + + object_t oid = get_object_name(idx); + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), 0, + gather.new_sub()); + + ctl.journaled_update.merge(ctl.to_update); + ctl.journaled_remove.merge(ctl.to_remove); + ctl.to_update.clear(); + ctl.to_remove.clear(); + }; + + std::map > ops_map; + + auto create_op_func = [&](unsigned idx, bool update_header) { + auto& ctl = omap_updates.at(idx); + + auto& op_vec = ops_map[idx]; + op_vec.resize(op_vec.size() + 1); + ObjectOperation& op = op_vec.back(); + op.priority = op_prio; + + if (ctl.clear) { + ctl.clear = false; + op.omap_clear(); + op.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK); + } + + if (update_header) { + bufferlist header; + _encode_header(header, journal_state); + op.omap_set_header(header); + } + + if (!ctl.to_update.empty()) { + op.omap_set(ctl.to_update); + ctl.to_update.clear(); + } + if (!ctl.to_remove.empty()) { + op.omap_rm_keys(ctl.to_remove); + ctl.to_remove.clear(); + } + }; + + auto submit_ops_func = [&]() { + gather.set_finisher(new C_OnFinisher(new C_IO_OFT_Save(this, log_seq, c), + mds->finisher)); + for (auto& it : ops_map) { + object_t oid = get_object_name(it.first); + for (auto& op : it.second) { + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), + 0, gather.new_sub()); + } + } + gather.activate(); + }; + + bool first_commit = !loaded_anchor_map.empty(); + + unsigned first_free_idx = 0; + unsigned old_num_objs = omap_num_objs; + if (omap_num_objs == 0) { + omap_num_objs = 1; + omap_num_items.resize(omap_num_objs); + omap_updates.resize(omap_num_objs); + omap_updates.back().clear = true; + } + + for (auto& it : dirty_items) { + list fgls; + auto p = anchor_map.find(it.first); + if (p != anchor_map.end()) { + for (auto q = dirfrags.lower_bound(dirfrag_t(it.first, 0)); + q != dirfrags.end() && q->ino == it.first; + ++q) + fgls.push_back(q->frag); + } + + if (first_commit) { + auto q = loaded_anchor_map.find(it.first); + if (q != loaded_anchor_map.end()) { + assert(p != anchor_map.end()); + p->second.omap_idx = q->second.omap_idx; + bool same = p->second == q->second; + if (same) { + auto r = loaded_dirfrags.lower_bound(dirfrag_t(it.first, 0)); + for (auto fg : fgls) { + if (r == loaded_dirfrags.end() || !(*r == dirfrag_t(it.first, fg))) { + same = false; + break; + } + ++r; + } + if (same && r != loaded_dirfrags.end() && r->ino == it.first) + same = false; + } + loaded_anchor_map.erase(q); + if (same) + continue; + } + } + + char key[32]; + int len = snprintf(key, sizeof(key), "%llx", (unsigned long long)it.first.val); + + int omap_idx; + if (p != anchor_map.end()) { + omap_idx = p->second.omap_idx; + if (omap_idx < 0) { + assert(it.second == DIRTY_NEW); + // find omap object to store the key + for (unsigned i = first_free_idx; i < omap_num_objs; i++) { + if (omap_num_items[i] < max_items_per_obj) + omap_idx = i; + } + if (omap_idx < 0) { + ++omap_num_objs; + omap_num_items.resize(omap_num_objs); + omap_updates.resize(omap_num_objs); + omap_updates.back().clear = true; + omap_idx = omap_num_objs - 1; + } + first_free_idx = omap_idx; + + p->second.omap_idx = omap_idx; + ++omap_num_items[omap_idx]; + } + } else { + omap_idx = it.second; + unsigned& count = omap_num_items.at(omap_idx); + assert(count > 0); + --count; + if ((unsigned)omap_idx < first_free_idx && count < max_items_per_obj) + first_free_idx = omap_idx; + } + auto& ctl = omap_updates.at(omap_idx); + + if (p != anchor_map.end()) { + bufferlist bl; + encode(p->second, bl); + encode(fgls, bl); + + ctl.write_size += bl.length() + len + 2 * sizeof(__u32); + ctl.to_update[key].swap(bl); + } else { + ctl.write_size += len + sizeof(__u32); + ctl.to_remove.emplace(key); + } + + if (ctl.write_size >= max_write_size) { + journal_func(omap_idx); + ctl.write_size = 0; + } + } + + dirty_items.clear(); + + if (first_commit) { + for (auto& it : loaded_anchor_map) { + char key[32]; + int len = snprintf(key, sizeof(key), "%llx", (unsigned long long)it.first.val); + + int omap_idx = it.second.omap_idx; + unsigned& count = omap_num_items.at(omap_idx); + assert(count > 0); + --count; + + auto& ctl = omap_updates.at(omap_idx); + ctl.write_size += len + sizeof(__u32); + ctl.to_remove.emplace(key); + + if (ctl.write_size >= max_write_size) { + journal_func(omap_idx); + ctl.write_size = 0; + } + } + loaded_anchor_map.clear(); + loaded_dirfrags.clear(); + } + + { + size_t total_items = 0; + unsigned used_objs = 1; + std::list objs_to_write; + bool journaled = false; + for (unsigned i = 0; i < omap_num_objs; i++) { + total_items += omap_num_items[i]; + if (omap_updates[i].journal_idx) + journaled = true; + else if (omap_updates[i].write_size) + objs_to_write.push_back(i); + + if (omap_num_items[i] > 0) + used_objs = i + 1; + } + assert(total_items == anchor_map.size()); + // adjust omap object count + if (used_objs < omap_num_objs) { + omap_num_objs = used_objs; + omap_num_items.resize(omap_num_objs); + } + // skip journal if only one osd request is required and object count + // does not change. + if (!journaled && old_num_objs == omap_num_objs && + objs_to_write.size() <= 1) { + assert(journal_state == JOURNAL_NONE); + assert(!gather.has_subs()); + + unsigned omap_idx = objs_to_write.empty() ? 0 : objs_to_write.front(); + create_op_func(omap_idx, true); + submit_ops_func(); + return; + } + } + + for (unsigned omap_idx = 0; omap_idx < omap_updates.size(); omap_idx++) { + auto& ctl = omap_updates[omap_idx]; + if (ctl.write_size > 0) { + journal_func(omap_idx); + ctl.write_size = 0; + } + } + + if (journal_state == JOURNAL_START) { + assert(gather.has_subs()); + journal_state = JOURNAL_FINISH; + } else { + // only object count changes + assert(journal_state == JOURNAL_NONE); + assert(!gather.has_subs()); + } + + for (unsigned omap_idx = 0; omap_idx < omap_updates.size(); omap_idx++) { + auto& ctl = omap_updates[omap_idx]; + assert(ctl.to_update.empty() && ctl.to_remove.empty()); + if (ctl.journal_idx == 0) + assert(ctl.journaled_update.empty() && ctl.journaled_remove.empty()); + + bool first = true; + for (auto& it : ctl.journaled_update) { + ctl.write_size += it.first.length() + it.second.length() + 2 * sizeof(__u32); + ctl.to_update[it.first].swap(it.second); + if (ctl.write_size >= max_write_size) { + create_op_func(omap_idx, first); + ctl.write_size = 0; + first = false; + } + } + + for (auto& key : ctl.journaled_remove) { + ctl.write_size += key.length() + sizeof(__u32); + ctl.to_remove.emplace(key); + if (ctl.write_size >= max_write_size) { + create_op_func(omap_idx, first); + ctl.write_size = 0; + first = false; + } + } + + for (unsigned i = 0; i < ctl.journal_idx; ++i) { + char key[32]; + snprintf(key, sizeof(key), "_journal.%x", i); + ctl.to_remove.emplace(key); + } + + // update first object's omap header if object count changes + if (ctl.clear || + ctl.journal_idx > 0 || + (omap_idx == 0 && old_num_objs != omap_num_objs)) + create_op_func(omap_idx, first); + } + + assert(!ops_map.empty()); + if (journal_state == JOURNAL_FINISH) { + gather.set_finisher(new C_OnFinisher(new C_IO_OFT_Journal(this, log_seq, c, ops_map), + mds->finisher)); + gather.activate(); + } else { + submit_ops_func(); + } +} + +class C_IO_OFT_Load : public MDSIOContextBase { +protected: + OpenFileTable *oft; + MDSRank *get_mds() override { return oft->mds; } + +public: + int header_r = 0; //< Return value from OMAP header read + int values_r = 0; //< Return value from OMAP value read + bufferlist header_bl; + std::map values; + unsigned index; + bool first; + bool more = false; + + C_IO_OFT_Load(OpenFileTable *t, unsigned i, bool f) : + oft(t), index(i), first(f) {} + void finish(int r) { + oft->_load_finish(r, header_r, values_r, index, first, more, header_bl, values); + } +}; + +class C_IO_OFT_Recover : public MDSIOContextBase { +protected: + OpenFileTable *oft; + MDSRank *get_mds() override { return oft->mds; } +public: + C_IO_OFT_Recover(OpenFileTable *t) : oft(t) {} + void finish(int r) { + oft->_recover_finish(r); + } +}; + +void OpenFileTable::_recover_finish(int r) +{ + if (r < 0) { + derr << __func__ << " got " << cpp_strerror(r) << dendl; + _reset_states(); + } else { + dout(10) << __func__ << ": load complete" << dendl; + } + + journal_state = JOURNAL_NONE; + load_done = true; + finish_contexts(g_ceph_context, waiting_for_load); + waiting_for_load.clear(); +} + +void OpenFileTable::_load_finish(int op_r, int header_r, int values_r, + unsigned idx, bool first, bool more, + bufferlist &header_bl, + std::map &values) +{ + using ceph::decode; + int err = -EINVAL; + + auto decode_func = [this](unsigned idx, inodeno_t ino, bufferlist &bl) { + bufferlist::iterator p = bl.begin(); + + size_t count = loaded_anchor_map.size(); + auto it = loaded_anchor_map.emplace_hint(loaded_anchor_map.end(), + std::piecewise_construct, + std::make_tuple(ino), + std::make_tuple()); + RecoveredAnchor& anchor = it->second; + decode(anchor, p); + assert(ino == anchor.ino); + anchor.omap_idx = idx; + anchor.auth = MDS_RANK_NONE; + + list fgls; + decode(fgls, p); + for (auto fg : fgls) + loaded_dirfrags.insert(loaded_dirfrags.end(), dirfrag_t(anchor.ino, fg)); + + if (loaded_anchor_map.size() > count) + ++omap_num_items[idx]; + }; + + if (op_r < 0) { + derr << __func__ << " got " << cpp_strerror(op_r) << dendl; + err = op_r; + goto out; + } + + try { + if (first) { + bufferlist::iterator p = header_bl.begin(); + version_t version; + unsigned num_objs; + __u8 jstate; + decode(version, p); + decode(num_objs, p); + decode(jstate, p); + + if (version > omap_version) { + omap_version = version; + omap_num_objs = num_objs; + omap_num_items.resize(omap_num_objs); + journal_state = jstate; + } else if (version == omap_version) { + assert(omap_num_objs == num_objs); + if (jstate > journal_state) + journal_state = jstate; + } + } + + for (auto& it : values) { + if (it.first.compare(0, 9, "_journal.") == 0) { + if (idx >= loaded_journals.size()) + loaded_journals.resize(idx + 1); + + if (journal_state == JOURNAL_FINISH) { + loaded_journals[idx][it.first].swap(it.second); + } else { // incomplete journal + loaded_journals[idx][it.first].length(); + } + continue; + } + + inodeno_t ino; + sscanf(it.first.c_str(), "%llx", (unsigned long long*)&ino.val); + decode_func(idx, ino, it.second); + } + } catch (buffer::error &e) { + derr << __func__ << ": corrupted header/values: " << e.what() << dendl; + goto out; + } + + if (more || idx + 1 < omap_num_objs) { + // Issue another read if we're not at the end of the omap + std::string last_key; + if (more) + last_key = values.rbegin()->first; + else + idx++; + dout(10) << __func__ << ": continue to load from '" << last_key << "'" << dendl; + object_t oid = get_object_name(idx); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + C_IO_OFT_Load *c = new C_IO_OFT_Load(this, idx, !more); + ObjectOperation op; + if (!more) + op.omap_get_header(&c->header_bl, &c->header_r); + op.omap_get_vals(last_key, "", uint64_t(-1), + &c->values, &c->more, &c->values_r); + mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, nullptr, 0, + new C_OnFinisher(c, mds->finisher)); + return; + } + + // replay journal + if (loaded_journals.size() > 0) { + dout(10) << __func__ << ": recover journal" << dendl; + + C_GatherBuilder gather(g_ceph_context, + new C_OnFinisher(new C_IO_OFT_Recover(this), + mds->finisher)); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + SnapContext snapc; + + for (unsigned omap_idx = 0; omap_idx < loaded_journals.size(); omap_idx++) { + auto& loaded_journal = loaded_journals[omap_idx]; + + std::vector op_vec; + try { + for (auto& it : loaded_journal) { + if (journal_state != JOURNAL_FINISH) + continue; + bufferlist::iterator p = it.second.begin(); + version_t version; + std::map to_update; + std::set to_remove; + decode(version, p); + if (version != omap_version) + continue; + decode(to_update, p); + decode(to_remove, p); + it.second.clear(); + + for (auto& q : to_update) { + inodeno_t ino; + sscanf(q.first.c_str(), "%llx", (unsigned long long*)&ino.val); + decode_func(omap_idx, ino, q.second); + } + for (auto& q : to_remove) { + inodeno_t ino; + sscanf(q.c_str(), "%llx",(unsigned long long*)&ino.val); + assert(ino.val > 0); + if (loaded_anchor_map.erase(ino)) { + unsigned& count = omap_num_items[omap_idx]; + assert(count > 0); + --count; + } + auto r = loaded_dirfrags.lower_bound(dirfrag_t(ino, 0)); + while (r != loaded_dirfrags.end() && r->ino == ino) + loaded_dirfrags.erase(r++); + } + + op_vec.resize(op_vec.size() + 1); + ObjectOperation& op = op_vec.back(); + op.priority = CEPH_MSG_PRIO_HIGH; + if (!to_update.empty()) + op.omap_set(to_update); + if (!to_remove.empty()) + op.omap_rm_keys(to_remove); + } + } catch (buffer::error &e) { + derr << __func__ << ": corrupted journal: " << e.what() << dendl; + goto out; + } + + op_vec.resize(op_vec.size() + 1); + ObjectOperation& op = op_vec.back(); + { + bufferlist header; + if (journal_state == JOURNAL_FINISH) + _encode_header(header, JOURNAL_FINISH); + else + _encode_header(header, JOURNAL_NONE); + op.omap_set_header(header); + } + { + // remove journal + std::set to_remove; + for (auto &it : loaded_journal) + to_remove.emplace(it.first); + op.omap_rm_keys(to_remove); + } + loaded_journal.clear(); + + object_t oid = get_object_name(omap_idx); + for (auto& op : op_vec) { + mds->objecter->mutate(oid, oloc, op, snapc, ceph::real_clock::now(), + 0, gather.new_sub()); + } + } + gather.activate(); + return; + } + + journal_state = JOURNAL_NONE; + err = 0; + dout(10) << __func__ << ": load complete" << dendl; +out: + + if (err < 0) + _reset_states(); + + load_done = true; + finish_contexts(g_ceph_context, waiting_for_load); + waiting_for_load.clear(); +} + +void OpenFileTable::load(MDSInternalContextBase *onload) +{ + dout(10) << __func__ << dendl; + assert(!load_done); + if (onload) + waiting_for_load.push_back(onload); + + C_IO_OFT_Load *c = new C_IO_OFT_Load(this, 0, true); + object_t oid = get_object_name(0); + object_locator_t oloc(mds->mdsmap->get_metadata_pool()); + + ObjectOperation op; + op.omap_get_header(&c->header_bl, &c->header_r); + op.omap_get_vals("", "", uint64_t(-1), + &c->values, &c->more, &c->values_r); + + mds->objecter->read(oid, oloc, op, CEPH_NOSNAP, nullptr, 0, + new C_OnFinisher(c, mds->finisher)); +} + +bool OpenFileTable::get_ancestors(inodeno_t ino, vector& ancestors, + mds_rank_t& auth_hint) +{ + auto p = loaded_anchor_map.find(ino); + if (p == loaded_anchor_map.end()) + return false; + + inodeno_t dirino = p->second.dirino; + if (dirino == inodeno_t(0)) + return false; + + bool first = true; + ancestors.clear(); + while (true) { + ancestors.push_back(inode_backpointer_t(dirino, p->second.d_name, 0)); + + p = loaded_anchor_map.find(dirino); + if (p == loaded_anchor_map.end()) + break; + + if (first) + auth_hint = p->second.auth; + + dirino = p->second.dirino; + if (dirino == inodeno_t(0)) + break; + + first = false; + } + return true; +} + +class C_OFT_OpenInoFinish: public MDSInternalContextBase { + OpenFileTable *oft; + inodeno_t ino; + MDSRank *get_mds() override { return oft->mds; } +public: + C_OFT_OpenInoFinish(OpenFileTable *t, inodeno_t i) : oft(t), ino(i) {} + void finish(int r) override { + oft->_open_ino_finish(ino, r); + } +}; + +void OpenFileTable::_open_ino_finish(inodeno_t ino, int r) +{ + if (prefetch_state == DIR_INODES && r >= 0 && ino != inodeno_t(0)) { + auto p = loaded_anchor_map.find(ino); + assert(p != loaded_anchor_map.end()); + p->second.auth = mds_rank_t(r); + } + + if (r != mds->get_nodeid()) + mds->mdcache->rejoin_prefetch_ino_finish(ino, r); + + num_opening_inodes--; + if (num_opening_inodes == 0) { + if (prefetch_state == DIR_INODES) { + prefetch_state = DIRFRAGS; + _prefetch_dirfrags(); + } else if (prefetch_state == FILE_INODES) { + prefetch_state = DONE; + logseg_destroyed_inos.clear(); + destroyed_inos_set.clear(); + finish_contexts(g_ceph_context, waiting_for_prefetch); + waiting_for_prefetch.clear(); + } else { + assert(0); + } + } +} + +void OpenFileTable::_prefetch_dirfrags() +{ + dout(10) << __func__ << dendl; + assert(prefetch_state == DIRFRAGS); + + MDCache *mdcache = mds->mdcache; + list fetch_queue; + + CInode *last_in = nullptr; + for (auto df : loaded_dirfrags) { + CInode *diri; + if (last_in && last_in->ino() == df.ino) { + diri = last_in; + } else { + diri = mdcache->get_inode(df.ino); + if (!diri) + continue; + last_in = diri; + } + if (diri->state_test(CInode::STATE_REJOINUNDEF)) + continue; + + CDir *dir = diri->get_dirfrag(df.frag); + if (dir) { + if (dir->is_auth() && !dir->is_complete()) + fetch_queue.push_back(dir); + } else { + list fgls; + diri->dirfragtree.get_leaves_under(df.frag, fgls); + for (auto fg : fgls) { + if (diri->is_auth()) { + dir = diri->get_or_open_dirfrag(mdcache, fg); + } else { + dir = diri->get_dirfrag(fg); + } + if (dir && dir->is_auth() && !dir->is_complete()) + fetch_queue.push_back(dir); + } + } + } + + MDSGatherBuilder gather(g_ceph_context); + for (auto dir : fetch_queue) { + if (dir->state_test(CDir::STATE_REJOINUNDEF)) + assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag())); + dir->fetch(gather.new_sub()); + } + + auto finish_func = [this](int r) { + prefetch_state = FILE_INODES; + _prefetch_inodes(); + }; + if (gather.has_subs()) { + gather.set_finisher( + new MDSInternalContextWrapper(mds, + new FunctionContext(finish_func))); + gather.activate(); + } else { + finish_func(0); + } +} + +void OpenFileTable::_prefetch_inodes() +{ + dout(10) << __func__ << " state " << prefetch_state << dendl; + assert(!num_opening_inodes); + num_opening_inodes = 1; + + int64_t pool; + if (prefetch_state == DIR_INODES) + pool = mds->mdsmap->get_metadata_pool(); + else if (prefetch_state == FILE_INODES) + pool = mds->mdsmap->get_first_data_pool(); + else + assert(0); + + MDCache *mdcache = mds->mdcache; + + if (destroyed_inos_set.empty()) { + for (auto& it : logseg_destroyed_inos) + destroyed_inos_set.insert(it.second.begin(), it.second.end()); + } + + for (auto& it : loaded_anchor_map) { + if (destroyed_inos_set.count(it.first)) + continue; + if (it.second.d_type == DT_DIR) { + if (prefetch_state != DIR_INODES) + continue; + if (MDS_INO_IS_MDSDIR(it.first)) { + it.second.auth = MDS_INO_MDSDIR_OWNER(it.first); + continue; + } + if (MDS_INO_IS_STRAY(it.first)) { + it.second.auth = MDS_INO_STRAY_OWNER(it.first); + continue; + } + } else { + if (prefetch_state != FILE_INODES) + continue; + // load all file inodes for MDCache::identify_files_to_recover() + } + CInode *in = mdcache->get_inode(it.first); + if (in) + continue; + + num_opening_inodes++; + mdcache->open_ino(it.first, pool, new C_OFT_OpenInoFinish(this, it.first), false); + } + + _open_ino_finish(inodeno_t(0), 0); +} + +bool OpenFileTable::prefetch_inodes() +{ + dout(10) << __func__ << dendl; + assert(!prefetch_state); + prefetch_state = DIR_INODES; + + if (!load_done) { + wait_for_load( + new MDSInternalContextWrapper(mds, + new FunctionContext([this](int r) { + _prefetch_inodes(); + }) + ) + ); + return true; + } + + _prefetch_inodes(); + return !is_prefetched(); +} + +bool OpenFileTable::should_log_open(CInode *in) +{ + if (in->state_test(CInode::STATE_TRACKEDBYOFT)) { + // inode just journaled + if (in->last_journaled >= committing_log_seq) + return false; + // item not dirty. it means the item has already been saved + auto p = dirty_items.find(in->ino()); + if (p == dirty_items.end()) + return false; + } + return true; +} + +void OpenFileTable::note_destroyed_inos(uint64_t seq, const vector& inos) +{ + auto& vec = logseg_destroyed_inos[seq]; + vec.insert(vec.end(), inos.begin(), inos.end()); +} + +void OpenFileTable::trim_destroyed_inos(uint64_t seq) +{ + auto p = logseg_destroyed_inos.begin(); + while (p != logseg_destroyed_inos.end()) { + if (p->first >= seq) + break; + logseg_destroyed_inos.erase(p++); + } +} diff --git a/src/mds/OpenFileTable.h b/src/mds/OpenFileTable.h new file mode 100644 index 00000000000..db949d9f888 --- /dev/null +++ b/src/mds/OpenFileTable.h @@ -0,0 +1,144 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef OPEN_FILE_TABLE_H +#define OPEN_FILE_TABLE_H + +#include "mdstypes.h" +#include "Anchor.h" + +class CDir; +class CInode; +class MDSRank; +class MDSInternalContextBase; + +class OpenFileTable +{ +public: + OpenFileTable(MDSRank *m) : mds(m) {} + + void add_inode(CInode *in); + void remove_inode(CInode *in); + void add_dirfrag(CDir *dir); + void remove_dirfrag(CDir *dir); + void notify_link(CInode *in); + void notify_unlink(CInode *in); + bool is_any_dirty() const { return !dirty_items.empty(); } + + void commit(MDSInternalContextBase *c, uint64_t log_seq, int op_prio); + uint64_t get_committed_log_seq() const { return committed_log_seq; } + uint64_t get_committing_log_seq() const { return committing_log_seq; } + bool is_any_committing() const { return num_pending_commit > 0; } + + void load(MDSInternalContextBase *c); + bool is_loaded() const { return load_done; } + void wait_for_load(MDSInternalContextBase *c) { + assert(!load_done); + waiting_for_load.push_back(c); + } + + bool get_ancestors(inodeno_t ino, vector& ancestors, + mds_rank_t& auth_hint); + + bool prefetch_inodes(); + bool is_prefetched() const { return prefetch_state == DONE; } + void wait_for_prefetch(MDSInternalContextBase *c) { + assert(!is_prefetched()); + waiting_for_prefetch.push_back(c); + } + + bool should_log_open(CInode *in); + + void note_destroyed_inos(uint64_t seq, const vector& inos); + void trim_destroyed_inos(uint64_t seq); + +protected: + MDSRank *mds; + + version_t omap_version = 0; + unsigned omap_num_objs = 0; + std::vector omap_num_items; + + map anchor_map; + set dirfrags; + + std::map dirty_items; // ino -> dirty state + static const int DIRTY_NEW = -1; + static const int DIRTY_UNDEF = -2; + + uint64_t committed_log_seq = 0; + uint64_t committing_log_seq = 0; + + void get_ref(CInode *in); + void put_ref(CInode *in); + + object_t get_object_name(unsigned idx) const; + + enum { + JOURNAL_NONE = 0, + JOURNAL_START = 1, + JOURNAL_FINISH = 2, + }; + int journal_state = 0; + + unsigned num_pending_commit = 0; + void _encode_header(bufferlist& bl, int j_state); + void _commit_finish(int r, uint64_t log_seq, MDSInternalContextBase *fin); + void _journal_finish(int r, uint64_t log_seq, MDSInternalContextBase *fin, + std::map >& ops); + + std::vector > loaded_journals; + map loaded_anchor_map; + set loaded_dirfrags; + list waiting_for_load; + bool load_done = false; + + void _reset_states() { + omap_num_objs = 0; + omap_num_items.resize(0); + journal_state = JOURNAL_NONE; + loaded_journals.clear(); + loaded_anchor_map.clear(); + loaded_dirfrags.clear(); + } + void _load_finish(int op_r, int header_r, int values_r, + unsigned idx, bool first, bool more, + bufferlist &header_bl, + std::map &values); + void _recover_finish(int r); + + enum { + DIR_INODES = 1, + DIRFRAGS = 2, + FILE_INODES = 3, + DONE = 4, + }; + unsigned prefetch_state = 0; + unsigned num_opening_inodes = 0; + list waiting_for_prefetch; + void _open_ino_finish(inodeno_t ino, int r); + void _prefetch_inodes(); + void _prefetch_dirfrags(); + + std::map > logseg_destroyed_inos; + std::set destroyed_inos_set; + + friend class C_IO_OFT_Recover; + friend class C_IO_OFT_Load; + friend class C_IO_OFT_Save; + friend class C_IO_OFT_Journal; + friend class C_OFT_OpenInoFinish; +}; + +#endif diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 1e5854a0b42..d2c29d8ebd0 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -214,6 +214,10 @@ void Server::dispatch(Message *m) if (req->is_replay()) { dout(3) << "queuing replayed op" << dendl; queue_replay = true; + if (req->head.ino && + !session->have_completed_request(req->get_reqid().tid, nullptr)) { + mdcache->add_replay_ino_alloc(inodeno_t(req->head.ino)); + } } else if (req->get_retry_attempt()) { // process completed request in clientreplay stage. The completed request // might have created new file/directorie. This guarantees MDS sends a reply @@ -3441,12 +3445,10 @@ void Server::handle_client_open(MDRequestRef& mdr) // make sure this inode gets into the journal if (cur->is_auth() && cur->last == CEPH_NOSNAP && - !cur->item_open_file.is_on_list()) { - LogSegment *ls = mds->mdlog->get_current_segment(); + mdcache->open_file_table.should_log_open(cur)) { EOpen *le = new EOpen(mds->mdlog); mdlog->start_entry(le); le->add_clean_inode(cur); - ls->open_files.push_back(&cur->item_open_file); mdlog->submit_entry(le); } @@ -3661,8 +3663,6 @@ void Server::handle_client_openc(MDRequestRef& mdr) // make sure this inode gets into the journal le->metablob.add_opened_ino(in->ino()); - LogSegment *ls = mds->mdlog->get_current_segment(); - ls->open_files.push_back(&in->item_open_file); C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in, follows); @@ -4297,8 +4297,6 @@ void Server::do_open_truncate(MDRequestRef& mdr, int cmode) // make sure ino gets into the journal le->metablob.add_opened_ino(in->ino()); - LogSegment *ls = mds->mdlog->get_current_segment(); - ls->open_files.push_back(&in->item_open_file); mdr->o_trunc = true; @@ -5286,8 +5284,6 @@ void Server::handle_client_mkdir(MDRequestRef& mdr) // make sure this inode gets into the journal le->metablob.add_opened_ino(newi->ino()); - LogSegment *ls = mds->mdlog->get_current_segment(); - ls->open_files.push_back(&newi->item_open_file); journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi)); } diff --git a/src/mds/journal.cc b/src/mds/journal.cc index ed796b9b071..f90dfb92bc6 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -155,22 +155,7 @@ void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int o while (!p.end()) { CInode *in = *p; ++p; - if (in->last == CEPH_NOSNAP && in->is_auth() && - !in->is_ambiguous_auth() && in->is_any_caps()) { - if (in->is_any_caps_wanted()) { - dout(20) << "try_to_expire requeueing open file " << *in << dendl; - if (!le) { - le = new EOpen(mds->mdlog); - mds->mdlog->start_entry(le); - } - le->add_clean_inode(in); - ls->open_files.push_back(&in->item_open_file); - } else { - // drop inodes that aren't wanted - dout(20) << "try_to_expire not requeueing and delisting unwanted file " << *in << dendl; - in->item_open_file.remove_myself(); - } - } else if (in->last != CEPH_NOSNAP && !in->client_snap_caps.empty()) { + if (in->last != CEPH_NOSNAP && in->is_auth() && !in->client_snap_caps.empty()) { // journal snap inodes that need flush. This simplify the mds failover hanlding dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl; if (!le) { @@ -180,16 +165,7 @@ void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int o le->add_clean_inode(in); ls->open_files.push_back(&in->item_open_file); } else { - /* - * we can get a capless inode here if we replay an open file, the client fails to - * reconnect it, but does REPLAY an open request (that adds it to the logseg). AFAICS - * it's ok for the client to replay an open on a file it doesn't have in it's cache - * anymore. - * - * this makes the mds less sensitive to strict open_file consistency, although it does - * make it easier to miss subtle problems. - */ - dout(20) << "try_to_expire not requeueing and delisting capless file " << *in << dendl; + // open files are tracked by open file table, no need to journal them again in->item_open_file.remove_myself(); } } @@ -1619,21 +1595,24 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup) } // destroyed inodes - for (vector::iterator p = destroyed_inodes.begin(); - p != destroyed_inodes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(*p); - if (in) { - dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl; - CDentry *parent = in->get_parent_dn(); - mds->mdcache->remove_inode(in); - if (parent) { - dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl; - assert(parent->get_linkage()->is_null()); + if (!destroyed_inodes.empty()) { + for (vector::iterator p = destroyed_inodes.begin(); + p != destroyed_inodes.end(); + ++p) { + CInode *in = mds->mdcache->get_inode(*p); + if (in) { + dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl; + CDentry *parent = in->get_parent_dn(); + mds->mdcache->remove_inode(in); + if (parent) { + dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl; + assert(parent->get_linkage()->is_null()); + } + } else { + dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl; } - } else { - dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl; } + mds->mdcache->open_file_table.note_destroyed_inos(logseg->seq, destroyed_inodes); } // client requests