Merge PR #20132 into master

* refs/pull/20132/head:
	qa/cephfs: update TestDamage for open file table
	mds: allow storing open file table in multiple omaps
	mds: differentiate Anchor types to clarify purpose
	mds: add perf counter for 'open ino' operation
	mds: protect open file table against partial omap update
	mds: add dirfrags whose child inodes have caps to open file table
	mds: don't try prefetching destroyed inodes
	mds: don't try opening inodes that haven't been created
	mds: don't re-requeue open files to head of log
	mds: use open file table to speed up mds recovery
	mds: introduce open file table
	mds: track how many clients/mds want caps for each inode
	mds: cleanup MDCache::opening_inodes access
	mds: cleanup CInode/CDir states definition

Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
Patrick Donnelly 2018-03-30 22:25:10 -07:00
commit b7fce64601
GPG Key ID: 3A2A7E25BEA8AADB
22 changed files with 1736 additions and 155 deletions

qa/tasks/cephfs/test_damage.py

@ -141,7 +141,9 @@ class TestDamage(CephFSTestCase):
# Missing dirfrags for non-system dirs result in empty directory
"10000000000.00000000",
# PurgeQueue is auto-created if not found on startup
"500.00000000"
"500.00000000",
# open file table is auto-created if not found on startup
"mds0_openfiles.0"
]:
expectation = NO_DAMAGE
else:

src/mds/Anchor.cc (new file, 60 lines)

@ -0,0 +1,60 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2018 Red Hat
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#include "mds/Anchor.h"
#include "common/Formatter.h"
void Anchor::encode(bufferlist &bl) const
{
ENCODE_START(1, 1, bl);
encode(ino, bl);
encode(dirino, bl);
encode(d_name, bl);
encode(d_type, bl);
ENCODE_FINISH(bl);
}
void Anchor::decode(bufferlist::iterator &bl)
{
DECODE_START(1, bl);
decode(ino, bl);
decode(dirino, bl);
decode(d_name, bl);
decode(d_type, bl);
DECODE_FINISH(bl);
}
void Anchor::dump(Formatter *f) const
{
f->dump_unsigned("ino", ino);
f->dump_unsigned("dirino", dirino);
f->dump_string("d_name", d_name);
f->dump_unsigned("d_type", d_type);
}
void Anchor::generate_test_instances(list<Anchor*>& ls)
{
ls.push_back(new Anchor);
ls.push_back(new Anchor);
ls.back()->ino = 1;
ls.back()->dirino = 2;
ls.back()->d_name = "hello";
ls.back()->d_type = DT_DIR;
}
ostream& operator<<(ostream& out, const Anchor &a)
{
return out << "a(" << a.ino << " " << a.dirino << "/'" << a.d_name << "' " << a.d_type << ")";
}

src/mds/Anchor.h (new file, 73 lines)

@ -0,0 +1,73 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2018 Red Hat
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#ifndef CEPH_ANCHOR_H
#define CEPH_ANCHOR_H
#include <string>
#include "include/types.h"
#include "mdstypes.h"
#include "include/buffer.h"
/*
* Anchor represents primary linkage of an inode. When adding inode to an
* anchor table, MDS ensures that the table also contains inode's ancestor
* inodes. MDS can get inode's path by looking up anchor table recursively.
*/
class Anchor {
public:
inodeno_t ino; // anchored ino
inodeno_t dirino;
std::string d_name;
__u8 d_type = 0;
int omap_idx = -1; // stored in which omap object
Anchor() {}
Anchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp) :
ino(i), dirino(di), d_name(str), d_type(tp) {}
void encode(bufferlist &bl) const;
void decode(bufferlist::iterator &bl);
void dump(Formatter *f) const;
static void generate_test_instances(list<Anchor*>& ls);
};
WRITE_CLASS_ENCODER(Anchor)
inline bool operator==(const Anchor &l, const Anchor &r) {
return l.ino == r.ino && l.dirino == r.dirino &&
l.d_name == r.d_name && l.d_type == r.d_type;
}
ostream& operator<<(ostream& out, const Anchor &a);
class RecoveredAnchor : public Anchor {
public:
RecoveredAnchor() {}
mds_rank_t auth = MDS_RANK_NONE; // auth hint
};
class OpenedAnchor : public Anchor {
public:
OpenedAnchor(inodeno_t i, inodeno_t di, std::string_view str, __u8 tp, int nr) :
Anchor(i, di, str, tp),
nref(nr)
{}
mutable int nref = 0; // how many children
};
#endif
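
The comment at the top of Anchor.h describes how the MDS can rebuild an inode's path by looking up anchors recursively toward the root. The following is a minimal standalone sketch of that walk; it uses simplified stand-in types (plain integers instead of inodeno_t, a bare std::map instead of the real table) and is purely illustrative, not Ceph code.

#include <cstdint>
#include <iostream>
#include <map>
#include <string>

// Simplified stand-in for Anchor: inode, parent directory inode, dentry name.
struct SimpleAnchor {
  uint64_t ino;
  uint64_t dirino;
  std::string d_name;
};

// Walk parent links until we run off the table (the root), prepending names.
static std::string build_path(const std::map<uint64_t, SimpleAnchor>& table,
                              uint64_t ino)
{
  std::string path;
  for (auto it = table.find(ino); it != table.end(); it = table.find(it->second.dirino))
    path = "/" + it->second.d_name + path;
  return path.empty() ? "/" : path;
}

int main()
{
  // The table is assumed to contain every ancestor of an anchored inode,
  // which is the invariant the Anchor.h comment describes.
  std::map<uint64_t, SimpleAnchor> table = {
    {2, {2, 1, "dir"}},
    {3, {3, 2, "file"}},
  };
  std::cout << build_path(table, 3) << std::endl;  // prints /dir/file
  return 0;
}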

src/mds/CDir.cc

@ -296,6 +296,18 @@ bool CDir::check_rstats(bool scrub)
return good;
}
void CDir::adjust_num_inodes_with_caps(int d)
{
// FIXME: smarter way to decide if adding 'this' to open file table
if (num_inodes_with_caps == 0 && d > 0)
cache->open_file_table.add_dirfrag(this);
else if (num_inodes_with_caps > 0 && num_inodes_with_caps == -d)
cache->open_file_table.remove_dirfrag(this);
num_inodes_with_caps += d;
assert(num_inodes_with_caps >= 0);
}
CDentry *CDir::lookup(std::string_view name, snapid_t snap)
{
dout(20) << "lookup (" << snap << ", '" << name << "')" << dendl;
@ -574,6 +586,11 @@ void CDir::link_inode_work( CDentry *dn, CInode *in)
// pin dentry?
if (in->get_num_ref())
dn->get(CDentry::PIN_INODEPIN);
if (in->state_test(CInode::STATE_TRACKEDBYOFT))
inode->mdcache->open_file_table.notify_link(in);
if (in->is_any_caps())
adjust_num_inodes_with_caps(1);
// adjust auth pin count
if (in->auth_pins + in->nested_auth_pins)
@ -650,6 +667,11 @@ void CDir::unlink_inode_work( CDentry *dn )
// unpin dentry?
if (in->get_num_ref())
dn->put(CDentry::PIN_INODEPIN);
if (in->state_test(CInode::STATE_TRACKEDBYOFT))
inode->mdcache->open_file_table.notify_unlink(in);
if (in->is_any_caps())
adjust_num_inodes_with_caps(-1);
// unlink auth_pin count
if (in->auth_pins + in->nested_auth_pins)
@ -842,6 +864,9 @@ void CDir::steal_dentry(CDentry *dn)
if (pi->accounted_rstat.rctime > fnode.rstat.rctime)
fnode.rstat.rctime = pi->accounted_rstat.rctime;
if (in->is_any_caps())
adjust_num_inodes_with_caps(1);
// move dirty inode rstat to new dirfrag
if (in->is_dirty_rstat())
dirty_rstat_inodes.push_back(&in->dirty_rstat_item);
@ -923,6 +948,7 @@ void CDir::finish_old_fragment(list<MDSInternalContextBase*>& waiters, bool repl
num_head_items = num_head_null = 0;
num_snap_items = num_snap_null = 0;
adjust_num_inodes_with_caps(-num_inodes_with_caps);
// this mirrors init_fragment_pins()
if (is_auth())

src/mds/CDir.h

@ -73,25 +73,26 @@ public:
}
// -- state --
static const unsigned STATE_COMPLETE = (1<< 1); // the complete contents are in cache
static const unsigned STATE_FROZENTREE = (1<< 2); // root of tree (bounded by exports)
static const unsigned STATE_FREEZINGTREE = (1<< 3); // in process of freezing
static const unsigned STATE_FROZENDIR = (1<< 4);
static const unsigned STATE_FREEZINGDIR = (1<< 5);
static const unsigned STATE_COMMITTING = (1<< 6); // mid-commit
static const unsigned STATE_FETCHING = (1<< 7); // currenting fetching
static const unsigned STATE_CREATING = (1<< 8);
static const unsigned STATE_IMPORTBOUND = (1<<10);
static const unsigned STATE_EXPORTBOUND = (1<<11);
static const unsigned STATE_EXPORTING = (1<<12);
static const unsigned STATE_IMPORTING = (1<<13);
static const unsigned STATE_FRAGMENTING = (1<<14);
static const unsigned STATE_STICKY = (1<<15); // sticky pin due to inode stickydirs
static const unsigned STATE_DNPINNEDFRAG = (1<<16); // dir is refragmenting
static const unsigned STATE_ASSIMRSTAT = (1<<17); // assimilating inode->frag rstats
static const unsigned STATE_DIRTYDFT = (1<<18); // dirty dirfragtree
static const unsigned STATE_BADFRAG = (1<<19); // bad dirfrag
static const unsigned STATE_AUXSUBTREE = (1<<20); // no subtree merge
static const unsigned STATE_COMPLETE = (1<< 0); // the complete contents are in cache
static const unsigned STATE_FROZENTREE = (1<< 1); // root of tree (bounded by exports)
static const unsigned STATE_FREEZINGTREE = (1<< 2); // in process of freezing
static const unsigned STATE_FROZENDIR = (1<< 3);
static const unsigned STATE_FREEZINGDIR = (1<< 4);
static const unsigned STATE_COMMITTING = (1<< 5); // mid-commit
static const unsigned STATE_FETCHING = (1<< 6); // currenting fetching
static const unsigned STATE_CREATING = (1<< 7);
static const unsigned STATE_IMPORTBOUND = (1<< 8);
static const unsigned STATE_EXPORTBOUND = (1<< 9);
static const unsigned STATE_EXPORTING = (1<<10);
static const unsigned STATE_IMPORTING = (1<<11);
static const unsigned STATE_FRAGMENTING = (1<<12);
static const unsigned STATE_STICKY = (1<<13); // sticky pin due to inode stickydirs
static const unsigned STATE_DNPINNEDFRAG = (1<<14); // dir is refragmenting
static const unsigned STATE_ASSIMRSTAT = (1<<15); // assimilating inode->frag rstats
static const unsigned STATE_DIRTYDFT = (1<<16); // dirty dirfragtree
static const unsigned STATE_BADFRAG = (1<<17); // bad dirfrag
static const unsigned STATE_TRACKEDBYOFT = (1<<18); // tracked by open file table
static const unsigned STATE_AUXSUBTREE = (1<<19); // no subtree merge
// common states
static const unsigned STATE_CLEAN = 0;
@ -102,18 +103,22 @@ public:
(STATE_COMPLETE|STATE_DIRTY|STATE_DIRTYDFT|STATE_BADFRAG);
static const unsigned MASK_STATE_IMPORT_KEPT =
(
STATE_IMPORTING
|STATE_IMPORTBOUND|STATE_EXPORTBOUND
|STATE_FROZENTREE
|STATE_STICKY);
STATE_IMPORTING |
STATE_IMPORTBOUND |
STATE_EXPORTBOUND |
STATE_FROZENTREE |
STATE_STICKY |
STATE_TRACKEDBYOFT);
static const unsigned MASK_STATE_EXPORT_KEPT =
(STATE_EXPORTING
|STATE_IMPORTBOUND|STATE_EXPORTBOUND
|STATE_FROZENTREE
|STATE_FROZENDIR
|STATE_STICKY);
(STATE_EXPORTING |
STATE_IMPORTBOUND |
STATE_EXPORTBOUND |
STATE_FROZENTREE |
STATE_FROZENDIR |
STATE_STICKY |
STATE_TRACKEDBYOFT);
static const unsigned MASK_STATE_FRAGMENT_KEPT =
(STATE_DIRTY|
(STATE_DIRTY |
STATE_EXPORTBOUND |
STATE_IMPORTBOUND |
STATE_AUXSUBTREE |
@ -345,6 +350,8 @@ protected:
int num_dirty;
int num_inodes_with_caps = 0;
// state
version_t committing_version;
version_t committed_version;
@ -437,6 +444,8 @@ protected:
return num_dirty;
}
void adjust_num_inodes_with_caps(int d);
int64_t get_frag_size() const {
return get_projected_fnode()->fragstat.size();
}

src/mds/CInode.cc

@ -2730,7 +2730,7 @@ client_t CInode::calc_ideal_loner()
{
if (mdcache->is_readonly())
return -1;
if (!mds_caps_wanted.empty())
if (!get_mds_caps_wanted().empty())
return -1;
int n = 0;
@ -2844,6 +2844,43 @@ void CInode::choose_lock_states(int dirty_caps)
choose_lock_state(&linklock, issued);
}
void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m)
{
bool old_empty = mds_caps_wanted.empty();
mds_caps_wanted.swap(m);
if (old_empty != (bool)mds_caps_wanted.empty()) {
if (old_empty)
adjust_num_caps_wanted(1);
else
adjust_num_caps_wanted(-1);
}
}
void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted)
{
bool old_empty = mds_caps_wanted.empty();
if (wanted) {
mds_caps_wanted[mds] = wanted;
if (old_empty)
adjust_num_caps_wanted(1);
} else if (!old_empty) {
mds_caps_wanted.erase(mds);
if (mds_caps_wanted.empty())
adjust_num_caps_wanted(-1);
}
}
void CInode::adjust_num_caps_wanted(int d)
{
if (!num_caps_wanted && d > 0)
mdcache->open_file_table.add_inode(this);
else if (num_caps_wanted > 0 && num_caps_wanted == -d)
mdcache->open_file_table.remove_inode(this);
num_caps_wanted +=d;
assert(num_caps_wanted >= 0);
}
Capability *CInode::add_client_cap(client_t client, Session *session, SnapRealm *conrealm)
{
assert(last == CEPH_NOSNAP);
@ -2855,10 +2892,11 @@ Capability *CInode::add_client_cap(client_t client, Session *session, SnapRealm
containing_realm = find_snaprealm();
containing_realm->inodes_with_caps.push_back(&item_caps);
dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl;
}
if (client_caps.empty())
mdcache->num_inodes_with_caps++;
if (parent)
parent->dir->adjust_num_inodes_with_caps(1);
}
Capability *cap = new Capability(this, ++mdcache->last_cap_id, client);
assert(client_caps.count(client) == 0);
@ -2888,6 +2926,9 @@ void CInode::remove_client_cap(client_t client)
if (client == loner_cap)
loner_cap = -1;
if (cap->wanted())
adjust_num_caps_wanted(-1);
delete cap;
client_caps.erase(client);
if (client_caps.empty()) {
@ -2895,8 +2936,9 @@ void CInode::remove_client_cap(client_t client)
put(PIN_CAPS);
item_caps.remove_myself();
containing_realm = NULL;
item_open_file.remove_myself(); // unpin logsegment
mdcache->num_inodes_with_caps--;
if (parent)
parent->dir->adjust_num_inodes_with_caps(-1);
}
//clean up advisory locks
@ -2947,7 +2989,10 @@ void CInode::clear_client_caps_after_export()
remove_client_cap(client_caps.begin()->first);
loner_cap = -1;
want_loner_cap = -1;
mds_caps_wanted.clear();
if (!get_mds_caps_wanted().empty()) {
mempool::mds_co::compact_map<int32_t,int32_t> empty;
set_mds_caps_wanted(empty);
}
}
void CInode::export_client_caps(map<client_t,Capability::Export>& cl)

src/mds/CInode.h

@ -215,31 +215,33 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
static const int DUMP_DEFAULT = DUMP_ALL & (~DUMP_PATH) & (~DUMP_DIRFRAGS);
// -- state --
static const int STATE_EXPORTING = (1<<2); // on nonauth bystander.
static const int STATE_OPENINGDIR = (1<<5);
static const int STATE_FREEZING = (1<<7);
static const int STATE_FROZEN = (1<<8);
static const int STATE_AMBIGUOUSAUTH = (1<<9);
static const int STATE_EXPORTINGCAPS = (1<<10);
static const int STATE_NEEDSRECOVER = (1<<11);
static const int STATE_RECOVERING = (1<<12);
static const int STATE_PURGING = (1<<13);
static const int STATE_DIRTYPARENT = (1<<14);
static const int STATE_DIRTYRSTAT = (1<<15);
static const int STATE_STRAYPINNED = (1<<16);
static const int STATE_FROZENAUTHPIN = (1<<17);
static const int STATE_DIRTYPOOL = (1<<18);
static const int STATE_REPAIRSTATS = (1<<19);
static const int STATE_MISSINGOBJS = (1<<20);
static const int STATE_EVALSTALECAPS = (1<<21);
static const int STATE_QUEUEDEXPORTPIN = (1<<22);
static const int STATE_EXPORTING = (1<<0); // on nonauth bystander.
static const int STATE_OPENINGDIR = (1<<1);
static const int STATE_FREEZING = (1<<2);
static const int STATE_FROZEN = (1<<3);
static const int STATE_AMBIGUOUSAUTH = (1<<4);
static const int STATE_EXPORTINGCAPS = (1<<5);
static const int STATE_NEEDSRECOVER = (1<<6);
static const int STATE_RECOVERING = (1<<7);
static const int STATE_PURGING = (1<<8);
static const int STATE_DIRTYPARENT = (1<<9);
static const int STATE_DIRTYRSTAT = (1<<10);
static const int STATE_STRAYPINNED = (1<<11);
static const int STATE_FROZENAUTHPIN = (1<<12);
static const int STATE_DIRTYPOOL = (1<<13);
static const int STATE_REPAIRSTATS = (1<<14);
static const int STATE_MISSINGOBJS = (1<<15);
static const int STATE_EVALSTALECAPS = (1<<16);
static const int STATE_QUEUEDEXPORTPIN = (1<<17);
static const int STATE_TRACKEDBYOFT = (1<<18); // tracked by open file table
// orphan inode needs notification of releasing reference
static const int STATE_ORPHAN = STATE_NOTIFYREF;
static const int MASK_STATE_EXPORTED =
(STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL);
static const int MASK_STATE_EXPORT_KEPT =
(STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS|STATE_QUEUEDEXPORTPIN);
(STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS|
STATE_QUEUEDEXPORTPIN|STATE_TRACKEDBYOFT);
// -- waiters --
static const uint64_t WAIT_DIR = (1<<0);
@ -565,7 +567,8 @@ protected:
using cap_map = mempool::mds_co::map<client_t, Capability*>;
cap_map client_caps; // client -> caps
mempool::mds_co::compact_map<int32_t, int32_t> mds_caps_wanted; // [auth] mds -> caps wanted
int replica_caps_wanted = 0; // [replica] what i've requested from auth
int replica_caps_wanted = 0; // [replica] what i've requested from auth
int num_caps_wanted = 0;
public:
mempool::mds_co::compact_map<int, mempool::mds_co::set<client_t> > client_snap_caps; // [auth] [snap] dirty metadata we still need from the head
@ -690,6 +693,7 @@ public:
clear_file_locks();
assert(num_projected_xattrs == 0);
assert(num_projected_srnodes == 0);
assert(num_caps_wanted == 0);
}
@ -974,7 +978,8 @@ public:
bool is_any_nonstale_caps() { return count_nonstale_caps(); }
const mempool::mds_co::compact_map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; }
mempool::mds_co::compact_map<int32_t,int32_t>& get_mds_caps_wanted() { return mds_caps_wanted; }
void set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m);
void set_mds_caps_wanted(mds_rank_t mds, int32_t wanted);
const cap_map& get_client_caps() const { return client_caps; }
Capability *get_client_cap(client_t client) {
@ -992,6 +997,9 @@ public:
}
}
int get_num_caps_wanted() const { return num_caps_wanted; }
void adjust_num_caps_wanted(int d);
Capability *add_client_cap(client_t client, Session *session, SnapRealm *conrealm=0);
void remove_client_cap(client_t client);
void move_to_realm(SnapRealm *realm);

src/mds/CMakeLists.txt

@ -36,6 +36,8 @@ set(mds_srcs
MDLog.cc
MDSCacheObject.cc
Mantle.cc
Anchor.cc
OpenFileTable.cc
${CMAKE_SOURCE_DIR}/src/common/TrackedOp.cc
${CMAKE_SOURCE_DIR}/src/common/MemoryModel.cc
${CMAKE_SOURCE_DIR}/src/osdc/Journaler.cc)

src/mds/Capability.cc

@ -13,6 +13,7 @@
*/
#include "Capability.h"
#include "CInode.h"
#include "common/Formatter.h"
@ -141,6 +142,17 @@ void Capability::revoke_info::generate_test_instances(list<Capability::revoke_in
* Capability
*/
void Capability::set_wanted(int w) {
CInode *in = get_inode();
if (in) {
if (!_wanted && w)
in->adjust_num_caps_wanted(1);
else if (_wanted && !w)
in->adjust_num_caps_wanted(-1);
}
_wanted = w;
}
void Capability::encode(bufferlist& bl) const
{
ENCODE_START(2, 2, bl)
@ -159,7 +171,9 @@ void Capability::decode(bufferlist::iterator &bl)
decode(last_sent, bl);
decode(last_issue_stamp, bl);
decode(_wanted, bl);
__u32 tmp_wanted;
decode(tmp_wanted, bl);
set_wanted(tmp_wanted);
decode(_pending, bl);
decode(_revokes, bl);
DECODE_FINISH(bl);
@ -189,7 +203,7 @@ void Capability::generate_test_instances(list<Capability*>& ls)
ls.push_back(new Capability);
ls.back()->last_sent = 11;
ls.back()->last_issue_stamp = utime_t(12, 13);
ls.back()->_wanted = 14;
ls.back()->set_wanted(14);
ls.back()->_pending = 15;
{
auto &r = ls.back()->_revokes.emplace_back();

src/mds/Capability.h

@ -257,10 +257,7 @@ public:
// caps this client wants to hold
int wanted() { return _wanted; }
void set_wanted(int w) {
_wanted = w;
//check_rdcaps_list();
}
void set_wanted(int w);
void inc_last_seq() { last_sent++; }
ceph_seq_t get_last_seq() { return last_sent; }
@ -291,7 +288,7 @@ public:
client_follows = other.client_follows;
// wanted
_wanted = _wanted | other.wanted;
set_wanted(wanted() | other.wanted);
if (auth_cap)
mseq = other.mseq;
}
@ -308,7 +305,7 @@ public:
}
// wanted
_wanted = _wanted | otherwanted;
set_wanted(wanted() | otherwanted);
}
void revoke() {

src/mds/Locker.cc

@ -2263,10 +2263,7 @@ void Locker::handle_inode_file_caps(MInodeFileCaps *m)
dout(7) << "handle_inode_file_caps replica mds." << from << " wants caps " << ccap_string(m->get_caps()) << " on " << *in << dendl;
if (m->get_caps())
in->mds_caps_wanted[from] = m->get_caps();
else
in->mds_caps_wanted.erase(from);
in->set_mds_caps_wanted(from, m->get_caps());
try_eval(in, CEPH_CAP_LOCKS);
m->put();
@ -2435,7 +2432,6 @@ bool Locker::check_inode_max_size(CInode *in, bool force_wrlock,
eo->add_ino(in->ino());
metablob = &eo->metablob;
le = eo;
mut->ls->open_files.push_back(&in->item_open_file);
} else {
EUpdate *eu = new EUpdate(mds->mdlog, "check_inode_max_size");
metablob = &eu->metablob;
@ -2541,27 +2537,18 @@ void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq)
return;
}
if (cap->wanted() == 0) {
if (cur->item_open_file.is_on_list() &&
!cur->is_any_caps_wanted()) {
dout(10) << " removing unwanted file from open file list " << *cur << dendl;
cur->item_open_file.remove_myself();
}
} else {
if (cap->wanted()) {
if (cur->state_test(CInode::STATE_RECOVERING) &&
(cap->wanted() & (CEPH_CAP_FILE_RD |
CEPH_CAP_FILE_WR))) {
mds->mdcache->recovery_queue.prioritize(cur);
}
if (!cur->item_open_file.is_on_list()) {
dout(10) << " adding to open file list " << *cur << dendl;
if (mdcache->open_file_table.should_log_open(cur)) {
assert(cur->last == CEPH_NOSNAP);
LogSegment *ls = mds->mdlog->get_current_segment();
EOpen *le = new EOpen(mds->mdlog);
mds->mdlog->start_entry(le);
le->add_clean_inode(cur);
ls->open_files.push_back(&cur->item_open_file);
mds->mdlog->submit_entry(le);
}
}
@ -3336,7 +3323,7 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap,
bool need_issue = false;
if (cap)
cap->inc_suppress();
if (in->mds_caps_wanted.empty() &&
if (in->get_mds_caps_wanted().empty() &&
(in->get_loner() >= 0 || (in->get_wanted_loner() >= 0 && in->try_set_loner()))) {
if (in->filelock.get_state() != LOCK_EXCL)
file_excl(&in->filelock, &need_issue);
@ -5042,7 +5029,7 @@ void Locker::file_excl(ScatterLock *lock, bool *need_issue)
assert(in->is_auth());
assert(lock->is_stable());
assert((in->get_loner() >= 0 && in->mds_caps_wanted.empty()) ||
assert((in->get_loner() >= 0 && in->get_mds_caps_wanted().empty()) ||
(lock->get_state() == LOCK_XSYN)); // must do xsyn -> excl -> <anything else>
switch (lock->get_state()) {
@ -5103,7 +5090,7 @@ void Locker::file_xsyn(SimpleLock *lock, bool *need_issue)
dout(7) << "file_xsyn on " << *lock << " on " << *lock->get_parent() << dendl;
CInode *in = static_cast<CInode *>(lock->get_parent());
assert(in->is_auth());
assert(in->get_loner() >= 0 && in->mds_caps_wanted.empty());
assert(in->get_loner() >= 0 && in->get_mds_caps_wanted().empty());
switch (lock->get_state()) {
case LOCK_EXCL: lock->set_state(LOCK_EXCL_XSYN); break;

src/mds/MDCache.cc

@ -171,7 +171,8 @@ MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
filer(m->objecter, m->finisher),
exceeded_size_limit(false),
recovery_queue(m),
stray_manager(m, purge_queue_)
stray_manager(m, purge_queue_),
open_file_table(m)
{
migrator.reset(new Migrator(mds, this));
root = NULL;
@ -4803,7 +4804,7 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong)
// caps_wanted
if (is.caps_wanted) {
in->mds_caps_wanted[from] = is.caps_wanted;
in->set_mds_caps_wanted(from, is.caps_wanted);
dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
<< " on " << *in << dendl;
}
@ -5293,10 +5294,41 @@ void MDCache::rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map
rejoin_gather_finish();
}
void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
{
auto p = cap_imports.find(ino);
if (p != cap_imports.end()) {
dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
if (ret < 0) {
cap_imports_missing.insert(ino);
} else if (ret != mds->get_nodeid()) {
for (auto q = p->second.begin(); q != p->second.end(); ++q) {
assert(q->second.count(MDS_RANK_NONE));
assert(q->second.size() == 1);
rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
}
cap_imports.erase(p);
}
}
}
bool MDCache::process_imported_caps()
{
dout(10) << "process_imported_caps" << dendl;
if (!open_file_table.is_prefetched() &&
open_file_table.prefetch_inodes()) {
open_file_table.wait_for_prefetch(
new MDSInternalContextWrapper(mds,
new FunctionContext([this](int r) {
assert(rejoin_gather.count(mds->get_nodeid()));
process_imported_caps();
})
)
);
return true;
}
for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
CInode *in = get_inode(p->first);
if (in) {
@ -5602,10 +5634,8 @@ void MDCache::clean_open_file_lists()
CInode *in = *q;
++q;
if (in->last == CEPH_NOSNAP) {
if (!in->is_any_caps_wanted()) {
dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
in->item_open_file.remove_myself();
}
dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
in->item_open_file.remove_myself();
} else {
if (in->client_snap_caps.empty()) {
dout(10) << " unlisting flushed snap inode " << *in << dendl;
@ -7413,7 +7443,7 @@ void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
set<SimpleLock *>& gather_locks)
{
in->remove_replica(from);
in->mds_caps_wanted.erase(from);
in->set_mds_caps_wanted(from, 0);
// note: this code calls _eval more often than it needs to!
// fix lock
@ -8379,8 +8409,8 @@ struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
mdcache->handle_open_ino(msg, r);
return;
}
assert(mdcache->opening_inodes.count(ino));
mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r);
auto& info = mdcache->opening_inodes.at(ino);
mdcache->_open_ino_traverse_dir(ino, info, r);
}
};
@ -8397,8 +8427,7 @@ void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err
{
dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
assert(opening_inodes.count(ino));
open_ino_info_t& info = opening_inodes[ino];
open_ino_info_t& info = opening_inodes.at(ino);
CInode *in = get_inode(ino);
if (in) {
@ -8473,8 +8502,7 @@ void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
{
dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
assert(opening_inodes.count(ino));
open_ino_info_t& info = opening_inodes[ino];
open_ino_info_t& info = opening_inodes.at(ino);
CInode *in = get_inode(ino);
if (in) {
@ -8527,6 +8555,8 @@ void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool
if (dir->state_test(CDir::STATE_REJOINUNDEF))
assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
if (mds->logger)
mds->logger->inc(l_mds_openino_dir_fetch);
}
int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
@ -8688,21 +8718,22 @@ void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
dout(10) << "do_open_ino_peer " << ino << " active " << active
<< " all " << all << " checked " << info.checked << dendl;
mds_rank_t whoami = mds->get_nodeid();
mds_rank_t peer = MDS_RANK_NONE;
if (info.auth_hint >= 0) {
if (info.auth_hint >= 0 && info.auth_hint != whoami) {
if (active.count(info.auth_hint)) {
peer = info.auth_hint;
info.auth_hint = MDS_RANK_NONE;
}
} else {
for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) {
if (*p != whoami && info.checked.count(*p) == 0) {
peer = *p;
break;
}
}
if (peer < 0) {
all.erase(mds->get_nodeid());
all.erase(whoami);
if (all != info.checked) {
dout(10) << " waiting for more peers to be active" << dendl;
} else {
@ -8716,6 +8747,8 @@ void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
if (info.discover || !info.fetch_backtrace)
pa = &info.ancestors;
mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer);
if (mds->logger)
mds->logger->inc(l_mds_openino_peer_discover);
}
}
@ -8830,8 +8863,9 @@ void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
<< want_replica << dendl;
if (opening_inodes.count(ino)) {
open_ino_info_t& info = opening_inodes[ino];
auto it = opening_inodes.find(ino);
if (it != opening_inodes.end()) {
open_ino_info_t& info = it->second;
if (want_replica) {
info.want_replica = true;
if (want_xlocked && !info.want_xlocked) {
@ -8857,7 +8891,14 @@ void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin,
info.tid = ++open_ino_last_tid;
info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
info.waiters.push_back(fin);
do_open_ino(ino, info, 0);
if (mds->is_rejoin() &&
open_file_table.get_ancestors(ino, info.ancestors, info.auth_hint)) {
info.fetch_backtrace = false;
info.checking = mds->get_nodeid();
_open_ino_traverse_dir(ino, info, 0);
} else {
do_open_ino(ino, info, 0);
}
}
}
@ -9588,6 +9629,8 @@ void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Conte
{
object_t oid = CInode::get_object_name(ino, frag_t(), "");
mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
if (mds->logger)
mds->logger->inc(l_mds_openino_backtrace_fetch);
}

src/mds/MDCache.h

@ -31,6 +31,7 @@
#include "events/EMetaBlob.h"
#include "RecoveryQueue.h"
#include "StrayManager.h"
#include "OpenFileTable.h"
#include "MDSContext.h"
#include "MDSMap.h"
#include "Mutation.h"
@ -596,6 +597,12 @@ public:
mds_rank_t frommds=MDS_RANK_NONE) {
cap_imports[ino][client][frommds] = icr;
}
bool rejoin_has_cap_reconnect(inodeno_t ino) const {
return cap_imports.count(ino);
}
void add_replay_ino_alloc(inodeno_t ino) {
cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin
}
const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
if (cap_imports.count(ino) &&
cap_imports[ino].count(client) &&
@ -643,6 +650,7 @@ public:
friend class C_MDC_RejoinOpenInoFinish;
friend class C_MDC_RejoinSessionsOpened;
void rejoin_open_ino_finish(inodeno_t ino, int ret);
void rejoin_prefetch_ino_finish(inodeno_t ino, int ret);
void rejoin_open_sessions_finish(map<client_t,entity_inst_t> client_map,
map<client_t,uint64_t>& sseqmap);
bool process_imported_caps();
@ -1225,6 +1233,8 @@ public:
public:
/* Because exports may fail, this set lets us keep track of inodes that need exporting. */
std::set<CInode *> export_pin_queue;
OpenFileTable open_file_table;
};
class C_MDS_RetryRequest : public MDSInternalContext {

src/mds/MDLog.cc

@ -566,6 +566,17 @@ void MDLog::_journal_segment_subtree_map(MDSInternalContextBase *onsync)
_submit_entry(sle, new C_MDL_Flushed(this, onsync));
}
class C_OFT_Committed : public MDSInternalContext {
MDLog *mdlog;
uint64_t seq;
public:
C_OFT_Committed(MDLog *l, uint64_t s) :
MDSInternalContext(l->mds), mdlog(l), seq(s) {}
void finish(int ret) override {
mdlog->trim_expired_segments();
}
};
void MDLog::trim(int m)
{
unsigned max_segments = g_conf->mds_log_max_segments;
@ -636,6 +647,7 @@ void MDLog::trim(int m)
<< journaler->get_write_safe_pos() << " < end " << ls->end << dendl;
break;
}
if (expiring_segments.count(ls)) {
dout(5) << "trim already expiring segment " << ls->seq << "/" << ls->offset
<< ", " << ls->num_events << " events" << dendl;
@ -657,6 +669,18 @@ void MDLog::trim(int m)
}
}
if (!capped &&
!mds->mdcache->open_file_table.is_any_committing()) {
uint64_t last_seq = get_last_segment_seq();
if (mds->mdcache->open_file_table.is_any_dirty() ||
last_seq > mds->mdcache->open_file_table.get_committed_log_seq()) {
submit_mutex.Unlock();
mds->mdcache->open_file_table.commit(new C_OFT_Committed(this, last_seq),
last_seq, CEPH_MSG_PRIO_HIGH);
submit_mutex.Lock();
}
}
// discard expired segments and unlock submit_mutex
_trim_expired_segments();
}
@ -689,8 +713,16 @@ int MDLog::trim_all()
<< "/" << expired_segments.size() << dendl;
uint64_t last_seq = 0;
if (!segments.empty())
if (!segments.empty()) {
last_seq = get_last_segment_seq();
if (!mds->mdcache->open_file_table.is_any_committing() &&
last_seq > mds->mdcache->open_file_table.get_committing_log_seq()) {
submit_mutex.Unlock();
mds->mdcache->open_file_table.commit(new C_OFT_Committed(this, last_seq),
last_seq, CEPH_MSG_PRIO_DEFAULT);
submit_mutex.Lock();
}
}
map<uint64_t,LogSegment*>::iterator p = segments.begin();
while (p != segments.end() &&
@ -770,6 +802,8 @@ void MDLog::_trim_expired_segments()
{
assert(submit_mutex.is_locked_by_me());
uint64_t oft_committed_seq = mds->mdcache->open_file_table.get_committed_log_seq();
// trim expired segments?
bool trimmed = false;
while (!segments.empty()) {
@ -779,6 +813,12 @@ void MDLog::_trim_expired_segments()
<< " to expire" << dendl;
break;
}
if (!capped && ls->seq >= oft_committed_seq) {
dout(10) << "_trim_expired_segments open file table committedseq " << oft_committed_seq
<< " <= " << ls->seq << "/" << ls->offset << dendl;
break;
}
dout(10) << "_trim_expired_segments trimming expired "
<< ls->seq << "/0x" << std::hex << ls->offset << std::dec << dendl;
@ -1429,6 +1469,9 @@ void MDLog::standby_trim_segments()
dout(10) << "standby_trim_segments" << dendl;
uint64_t expire_pos = journaler->get_expire_pos();
dout(10) << " expire_pos=" << expire_pos << dendl;
mds->mdcache->open_file_table.trim_destroyed_inos(expire_pos);
bool removed_segment = false;
while (have_any_segments()) {
LogSegment *seg = get_oldest_segment();
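
The two checks added above tie journal trimming to the open file table: MDLog::trim() kicks off an OpenFileTable::commit() up to the last segment's sequence number, and _trim_expired_segments() refuses to drop a segment whose seq is at or beyond the table's committed seq, so the open-file information stays recoverable from either the journal or the table. Below is a toy model of that gate, with made-up types; it is not Ceph code.

#include <cstdint>
#include <deque>
#include <iostream>

struct Segment { uint64_t seq; };

// Count how many expired segments could be dropped, mirroring the
// "ls->seq >= oft_committed_seq" check added to _trim_expired_segments().
static size_t trimmable(const std::deque<Segment>& segments,
                        uint64_t oft_committed_seq)
{
  size_t n = 0;
  for (const auto& s : segments) {
    if (s.seq >= oft_committed_seq)
      break;
    ++n;
  }
  return n;
}

int main()
{
  std::deque<Segment> segs = {{1}, {2}, {3}, {4}};
  // Open file table committed through seq 3: only segments 1 and 2 may go.
  std::cout << trimmable(segs, 3) << std::endl;  // prints 2
  return 0;
}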

src/mds/MDLog.h

@ -306,6 +306,7 @@ private:
friend class C_MaybeExpiredSegment;
friend class C_MDL_Flushed;
friend class C_OFT_Committed;
public:
void trim_expired_segments();

src/mds/MDSRank.cc

@ -1076,6 +1076,8 @@ void MDSRank::boot_start(BootStep step, int r)
} else if (!standby_replaying) {
dout(2) << "boot_start " << step << ": opening purge queue (async)" << dendl;
purge_queue.open(NULL);
dout(2) << "boot_start " << step << ": loading open file table (async)" << dendl;
mdcache->open_file_table.load(nullptr);
}
if (mdsmap->get_tableserver() == whoami) {
@ -1275,8 +1277,10 @@ void MDSRank::standby_replay_restart()
this,
mdlog->get_journaler()->get_read_pos()));
dout(1) << " opening purge queue (async)" << dendl;
dout(1) << " opening purge_queue (async)" << dendl;
purge_queue.open(NULL);
dout(1) << " opening open_file_table (async)" << dendl;
mdcache->open_file_table.load(nullptr);
} else {
dout(1) << " waiting for osdmap " << mdsmap->get_last_failure_osd_epoch()
<< " (which blacklists prior instance)" << dendl;
@ -2635,6 +2639,12 @@ void MDSRank::create_logger()
mds_plb.add_u64_counter(
l_mds_imported_inodes, "imported_inodes", "Imported inodes", "imi",
PerfCountersBuilder::PRIO_INTERESTING);
mds_plb.add_u64_counter(l_mds_openino_dir_fetch, "openino_dir_fetch",
"OpenIno incomplete directory fetchings");
mds_plb.add_u64_counter(l_mds_openino_backtrace_fetch, "openino_backtrace_fetch",
"OpenIno backtrace fetchings");
mds_plb.add_u64_counter(l_mds_openino_peer_discover, "openino_peer_discover",
"OpenIno peer inode discovers");
logger = mds_plb.create_perf_counters();
g_ceph_context->get_perfcounters_collection()->add(logger);
}

src/mds/MDSRank.h

@ -71,6 +71,9 @@ enum {
l_mds_exported_inodes,
l_mds_imported,
l_mds_imported_inodes,
l_mds_openino_dir_fetch,
l_mds_openino_backtrace_fetch,
l_mds_openino_peer_discover,
l_mds_last,
};

src/mds/Migrator.cc

@ -1553,8 +1553,6 @@ void Migrator::finish_export_inode(CInode *in, utime_t now, mds_rank_t peer,
if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
in->clear_scatter_dirty();
in->item_open_file.remove_myself();
in->clear_dirty_parent();
in->clear_file_locks();
@ -3047,7 +3045,12 @@ void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
map<client_t,Capability::Export> cap_map;
decode(cap_map, blp);
if (auth_cap)
decode(in->get_mds_caps_wanted(), blp);
if (auth_cap) {
mempool::mds_co::compact_map<int32_t,int32_t> mds_wanted;
decode(mds_wanted, blp);
mds_wanted.erase(mds->get_nodeid());
in->set_mds_caps_wanted(mds_wanted);
}
if (!cap_map.empty() ||
(auth_cap && (in->get_caps_wanted() & ~CEPH_CAP_PIN))) {
peer_exports[in].swap(cap_map);

src/mds/OpenFileTable.cc (new file, 1126 lines)
File diff suppressed because it is too large.

src/mds/OpenFileTable.h (new file, 144 lines)

@ -0,0 +1,144 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2018 Red Hat
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#ifndef OPEN_FILE_TABLE_H
#define OPEN_FILE_TABLE_H
#include "mdstypes.h"
#include "Anchor.h"
class CDir;
class CInode;
class MDSRank;
class MDSInternalContextBase;
class OpenFileTable
{
public:
OpenFileTable(MDSRank *m) : mds(m) {}
void add_inode(CInode *in);
void remove_inode(CInode *in);
void add_dirfrag(CDir *dir);
void remove_dirfrag(CDir *dir);
void notify_link(CInode *in);
void notify_unlink(CInode *in);
bool is_any_dirty() const { return !dirty_items.empty(); }
void commit(MDSInternalContextBase *c, uint64_t log_seq, int op_prio);
uint64_t get_committed_log_seq() const { return committed_log_seq; }
uint64_t get_committing_log_seq() const { return committing_log_seq; }
bool is_any_committing() const { return num_pending_commit > 0; }
void load(MDSInternalContextBase *c);
bool is_loaded() const { return load_done; }
void wait_for_load(MDSInternalContextBase *c) {
assert(!load_done);
waiting_for_load.push_back(c);
}
bool get_ancestors(inodeno_t ino, vector<inode_backpointer_t>& ancestors,
mds_rank_t& auth_hint);
bool prefetch_inodes();
bool is_prefetched() const { return prefetch_state == DONE; }
void wait_for_prefetch(MDSInternalContextBase *c) {
assert(!is_prefetched());
waiting_for_prefetch.push_back(c);
}
bool should_log_open(CInode *in);
void note_destroyed_inos(uint64_t seq, const vector<inodeno_t>& inos);
void trim_destroyed_inos(uint64_t seq);
protected:
MDSRank *mds;
version_t omap_version = 0;
unsigned omap_num_objs = 0;
std::vector<unsigned> omap_num_items;
map<inodeno_t, OpenedAnchor> anchor_map;
set<dirfrag_t> dirfrags;
std::map<inodeno_t, int> dirty_items; // ino -> dirty state
static const int DIRTY_NEW = -1;
static const int DIRTY_UNDEF = -2;
uint64_t committed_log_seq = 0;
uint64_t committing_log_seq = 0;
void get_ref(CInode *in);
void put_ref(CInode *in);
object_t get_object_name(unsigned idx) const;
enum {
JOURNAL_NONE = 0,
JOURNAL_START = 1,
JOURNAL_FINISH = 2,
};
int journal_state = 0;
unsigned num_pending_commit = 0;
void _encode_header(bufferlist& bl, int j_state);
void _commit_finish(int r, uint64_t log_seq, MDSInternalContextBase *fin);
void _journal_finish(int r, uint64_t log_seq, MDSInternalContextBase *fin,
std::map<unsigned, std::vector<ObjectOperation> >& ops);
std::vector<std::map<std::string, bufferlist> > loaded_journals;
map<inodeno_t, RecoveredAnchor> loaded_anchor_map;
set<dirfrag_t> loaded_dirfrags;
list<MDSInternalContextBase*> waiting_for_load;
bool load_done = false;
void _reset_states() {
omap_num_objs = 0;
omap_num_items.resize(0);
journal_state = JOURNAL_NONE;
loaded_journals.clear();
loaded_anchor_map.clear();
loaded_dirfrags.clear();
}
void _load_finish(int op_r, int header_r, int values_r,
unsigned idx, bool first, bool more,
bufferlist &header_bl,
std::map<std::string, bufferlist> &values);
void _recover_finish(int r);
enum {
DIR_INODES = 1,
DIRFRAGS = 2,
FILE_INODES = 3,
DONE = 4,
};
unsigned prefetch_state = 0;
unsigned num_opening_inodes = 0;
list<MDSInternalContextBase*> waiting_for_prefetch;
void _open_ino_finish(inodeno_t ino, int r);
void _prefetch_inodes();
void _prefetch_dirfrags();
std::map<uint64_t, vector<inodeno_t> > logseg_destroyed_inos;
std::set<inodeno_t> destroyed_inos_set;
friend class C_IO_OFT_Recover;
friend class C_IO_OFT_Load;
friend class C_IO_OFT_Save;
friend class C_IO_OFT_Journal;
friend class C_OFT_OpenInoFinish;
};
#endif
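
OpenFileTable.h above exposes per-object bookkeeping (omap_num_objs, omap_num_items, Anchor::omap_idx, get_object_name()), matching the "allow storing open file table in multiple omaps" commit and the mds0_openfiles.0 object the TestDamage change expects. Below is a rough, self-contained sketch of how anchors might be spread across several omap objects; the naming of objects beyond index 0 and the placement policy are assumptions for illustration only, not the actual implementation (which lives in the suppressed OpenFileTable.cc diff).

#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Hypothetical naming; only "mds0_openfiles.0" is confirmed by the test above.
static std::string object_name(int mds_rank, unsigned idx)
{
  char buf[64];
  std::snprintf(buf, sizeof(buf), "mds%d_openfiles.%u", mds_rank, idx);
  return buf;
}

int main()
{
  const unsigned max_items_per_obj = 2;          // tiny cap, just for the demo
  std::vector<unsigned> omap_num_items;          // items currently held per object
  std::map<uint64_t, unsigned> anchor_omap_idx;  // ino -> omap object index

  for (uint64_t ino = 0x10000000000; ino < 0x10000000005; ++ino) {
    // Place the anchor in the first object that still has room,
    // creating a new omap object when all existing ones are full.
    unsigned idx = 0;
    while (idx < omap_num_items.size() && omap_num_items[idx] >= max_items_per_obj)
      ++idx;
    if (idx == omap_num_items.size())
      omap_num_items.push_back(0);
    ++omap_num_items[idx];
    anchor_omap_idx[ino] = idx;
    std::printf("ino 0x%llx -> %s\n", (unsigned long long)ino,
                object_name(0, idx).c_str());
  }
  return 0;
}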

src/mds/Server.cc

@ -214,6 +214,10 @@ void Server::dispatch(Message *m)
if (req->is_replay()) {
dout(3) << "queuing replayed op" << dendl;
queue_replay = true;
if (req->head.ino &&
!session->have_completed_request(req->get_reqid().tid, nullptr)) {
mdcache->add_replay_ino_alloc(inodeno_t(req->head.ino));
}
} else if (req->get_retry_attempt()) {
// process completed request in clientreplay stage. The completed request
// might have created a new file/directory. This guarantees the MDS sends a reply
@ -3441,12 +3445,10 @@ void Server::handle_client_open(MDRequestRef& mdr)
// make sure this inode gets into the journal
if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
!cur->item_open_file.is_on_list()) {
LogSegment *ls = mds->mdlog->get_current_segment();
mdcache->open_file_table.should_log_open(cur)) {
EOpen *le = new EOpen(mds->mdlog);
mdlog->start_entry(le);
le->add_clean_inode(cur);
ls->open_files.push_back(&cur->item_open_file);
mdlog->submit_entry(le);
}
@ -3661,8 +3663,6 @@ void Server::handle_client_openc(MDRequestRef& mdr)
// make sure this inode gets into the journal
le->metablob.add_opened_ino(in->ino());
LogSegment *ls = mds->mdlog->get_current_segment();
ls->open_files.push_back(&in->item_open_file);
C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in, follows);
@ -4297,8 +4297,6 @@ void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
// make sure ino gets into the journal
le->metablob.add_opened_ino(in->ino());
LogSegment *ls = mds->mdlog->get_current_segment();
ls->open_files.push_back(&in->item_open_file);
mdr->o_trunc = true;
@ -5286,8 +5284,6 @@ void Server::handle_client_mkdir(MDRequestRef& mdr)
// make sure this inode gets into the journal
le->metablob.add_opened_ino(newi->ino());
LogSegment *ls = mds->mdlog->get_current_segment();
ls->open_files.push_back(&newi->item_open_file);
journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
}

src/mds/journal.cc

@ -155,22 +155,7 @@ void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int o
while (!p.end()) {
CInode *in = *p;
++p;
if (in->last == CEPH_NOSNAP && in->is_auth() &&
!in->is_ambiguous_auth() && in->is_any_caps()) {
if (in->is_any_caps_wanted()) {
dout(20) << "try_to_expire requeueing open file " << *in << dendl;
if (!le) {
le = new EOpen(mds->mdlog);
mds->mdlog->start_entry(le);
}
le->add_clean_inode(in);
ls->open_files.push_back(&in->item_open_file);
} else {
// drop inodes that aren't wanted
dout(20) << "try_to_expire not requeueing and delisting unwanted file " << *in << dendl;
in->item_open_file.remove_myself();
}
} else if (in->last != CEPH_NOSNAP && !in->client_snap_caps.empty()) {
if (in->last != CEPH_NOSNAP && in->is_auth() && !in->client_snap_caps.empty()) {
// journal snap inodes that need flush. This simplifies mds failover handling
dout(20) << "try_to_expire requeueing snap needflush inode " << *in << dendl;
if (!le) {
@ -180,16 +165,7 @@ void LogSegment::try_to_expire(MDSRank *mds, MDSGatherBuilder &gather_bld, int o
le->add_clean_inode(in);
ls->open_files.push_back(&in->item_open_file);
} else {
/*
* we can get a capless inode here if we replay an open file, the client fails to
* reconnect it, but does REPLAY an open request (that adds it to the logseg). AFAICS
* it's ok for the client to replay an open on a file it doesn't have in it's cache
* anymore.
*
* this makes the mds less sensitive to strict open_file consistency, although it does
* make it easier to miss subtle problems.
*/
dout(20) << "try_to_expire not requeueing and delisting capless file " << *in << dendl;
// open files are tracked by open file table, no need to journal them again
in->item_open_file.remove_myself();
}
}
@ -1619,21 +1595,24 @@ void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
}
// destroyed inodes
for (vector<inodeno_t>::iterator p = destroyed_inodes.begin();
p != destroyed_inodes.end();
++p) {
CInode *in = mds->mdcache->get_inode(*p);
if (in) {
dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl;
CDentry *parent = in->get_parent_dn();
mds->mdcache->remove_inode(in);
if (parent) {
dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl;
assert(parent->get_linkage()->is_null());
if (!destroyed_inodes.empty()) {
for (vector<inodeno_t>::iterator p = destroyed_inodes.begin();
p != destroyed_inodes.end();
++p) {
CInode *in = mds->mdcache->get_inode(*p);
if (in) {
dout(10) << "EMetaBlob.replay destroyed " << *p << ", dropping " << *in << dendl;
CDentry *parent = in->get_parent_dn();
mds->mdcache->remove_inode(in);
if (parent) {
dout(10) << "EMetaBlob.replay unlinked from dentry " << *parent << dendl;
assert(parent->get_linkage()->is_null());
}
} else {
dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl;
}
} else {
dout(10) << "EMetaBlob.replay destroyed " << *p << ", not in cache" << dendl;
}
mds->mdcache->open_file_table.note_destroyed_inos(logseg->seq, destroyed_inodes);
}
// client requests