mds: introduce fine-grained discover dirfrag wait queue

The current discover dirfrag code only allows discovering one dirfrag at
a time. This can cause a deadlock if there are directories that are
fragmented into several dirfrags. For example:

mds.0                        mds.1
-----------------------------------------------------------------
                             freeze subtree (1.*) with bound (2.1*)
discover (2.0*) ->
                             handle discover (2.0*), frozen tree, wait
                          <- export subtree (1.*) to mds.0 with bound (2.1*)
discover (2.1*), wait

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
This commit is contained in:
Yan, Zheng 2014-01-13 10:26:08 +08:00
parent 2c909cda0e
commit 1ff776669b
4 changed files with 87 additions and 44 deletions

View File

@ -531,7 +531,7 @@ protected:
map< inodeno_t, list<Context*> > waiting_on_ino;
public:
bool is_waiting_for_dentry(const char *dname, snapid_t snap) {
bool is_waiting_for_dentry(const string& dname, snapid_t snap) {
return waiting_on_dentry.count(string_snap_t(dname, snap));
}
void add_dentry_waiter(const string& dentry, snapid_t snap, Context *c);

View File

@ -1959,6 +1959,30 @@ bool CInode::is_freezing()
return false;
}
// Append context `c` to the wait list kept for dirfrag `fg` of this inode.
// If no dirfrag waiter was registered before this call, a PIN_DIRWAITER
// reference is taken first.
void CInode::add_dir_waiter(frag_t fg, Context *c)
{
  const bool no_waiters_yet = waiting_on_dir.empty();
  if (no_waiters_yet) {
    get(PIN_DIRWAITER);
  }

  // operator[] creates the per-frag list on first use.
  list<Context*>& frag_waiters = waiting_on_dir[fg];
  frag_waiters.push_back(c);

  dout(10) << "add_dir_waiter frag " << fg << " " << c << " on " << *this << dendl;
}
// Move every context waiting on dirfrag `fg` to the tail of `ls` and drop
// the entry for `fg`. Does nothing when no waiter is registered for `fg`.
// The PIN_DIRWAITER reference is released once the last per-frag entry
// has been removed.
void CInode::take_dir_waiting(frag_t fg, list<Context*>& ls)
{
  map<frag_t, list<Context*> >::iterator entry = waiting_on_dir.find(fg);
  if (entry == waiting_on_dir.end()) {
    // Also covers the case of an empty map.
    return;
  }

  dout(10) << "take_dir_waiting frag " << fg << " on " << *this << dendl;
  list<Context*>& frag_waiters = entry->second;
  ls.splice(ls.end(), frag_waiters);
  waiting_on_dir.erase(entry);

  const bool drained = waiting_on_dir.empty();
  if (drained) {
    put(PIN_DIRWAITER);
  }
}
void CInode::add_waiter(uint64_t tag, Context *c)
{
dout(10) << "add_waiter tag " << std::hex << tag << std::dec << " " << c
@ -1979,6 +2003,23 @@ void CInode::add_waiter(uint64_t tag, Context *c)
MDSCacheObject::add_waiter(tag, c);
}
// Collect waiters matching `mask` into `ls`.
//
// When `mask` includes WAIT_DIR, every per-dirfrag wait list held in
// waiting_on_dir is appended to `ls` (in map order), the map is emptied and
// the PIN_DIRWAITER reference is released. Afterwards the generic
// MDSCacheObject waiters for `mask` are collected as well.
void CInode::take_waiting(uint64_t mask, list<Context*>& ls)
{
  if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) {
    // take all dirfrag waiters
    for (map<frag_t, list<Context*> >::iterator p = waiting_on_dir.begin();
         p != waiting_on_dir.end();
         ++p) {
      dout(10) << "take_waiting dirfrag " << p->first << " on " << *this << dendl;
      ls.splice(ls.end(), p->second);
    }
    waiting_on_dir.clear();
    // the map is empty now, so drop the pin that add_dir_waiter() took
    put(PIN_DIRWAITER);
  }

  // waiting
  MDSCacheObject::take_waiting(mask, ls);
}
bool CInode::freeze_inode(int auth_pin_allowance)
{
assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins

View File

@ -110,6 +110,7 @@ public:
static const int PIN_DIRTYRSTAT = 21;
static const int PIN_EXPORTINGCAPS = 22;
static const int PIN_DIRTYPARENT = 23;
static const int PIN_DIRWAITER = 24;
const char *pin_name(int p) {
switch (p) {
@ -135,6 +136,7 @@ public:
case PIN_NEEDSNAPFLUSH: return "needsnapflush";
case PIN_DIRTYRSTAT: return "dirtyrstat";
case PIN_DIRTYPARENT: return "dirtyparent";
case PIN_DIRWAITER: return "dirwaiter";
default: return generic_pin_name(p);
}
}
@ -570,10 +572,17 @@ private:
_decode_locks_state(p, is_new);
}
// -- waiting --
protected:
map<frag_t, list<Context*> > waiting_on_dir;
public:
void add_dir_waiter(frag_t fg, Context *c);
void take_dir_waiting(frag_t fg, list<Context*>& ls);
// Returns true when waiting_on_dir holds an entry for dirfrag `fg`.
bool is_waiting_for_dir(frag_t fg) {
  return waiting_on_dir.find(fg) != waiting_on_dir.end();
}
void add_waiter(uint64_t tag, Context *c);
void take_waiting(uint64_t tag, list<Context*>& ls);
// -- encode/decode helpers --
void _encode_base(bufferlist& bl);
@ -584,7 +593,6 @@ private:
void _decode_locks_state(bufferlist::iterator& p, bool is_new);
void _decode_locks_rejoin(bufferlist::iterator& p, list<Context*>& waiters);
// -- import/export --
void encode_export(bufferlist& bl);
void finish_export(utime_t now);

View File

@ -9727,7 +9727,7 @@ void MDCache::discover_dir_frag(CInode *base,
dout(7) << "discover_dir_frag " << df
<< " from mds." << from << dendl;
if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // FIXME: this is kind of weak!
if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
discover_info_t& d = _create_discover(from);
d.ino = base->ino();
d.frag = approx_fg;
@ -9736,7 +9736,7 @@ void MDCache::discover_dir_frag(CInode *base,
}
if (onfinish)
base->add_waiter(CInode::WAIT_DIR, onfinish);
base->add_dir_waiter(approx_fg, onfinish);
}
struct C_MDC_RetryDiscoverPath : public Context {
@ -9779,10 +9779,12 @@ void MDCache::discover_path(CInode *base,
return;
}
frag_t fg = base->pick_dirfrag(want_path[0]);
if ((want_xlocked && want_path.depth() == 1) ||
!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // FIXME: weak!
!base->is_waiting_for_dir(fg) || !onfinish) {
discover_info_t& d = _create_discover(from);
d.ino = base->ino();
d.frag = fg;
d.snap = snap;
d.want_path = want_path;
d.want_base_dir = true;
@ -9792,7 +9794,7 @@ void MDCache::discover_path(CInode *base,
// register + wait
if (onfinish)
base->add_waiter(CInode::WAIT_DIR, onfinish);
base->add_dir_waiter(fg, onfinish);
}
struct C_MDC_RetryDiscoverPath2 : public Context {
@ -10329,46 +10331,36 @@ void MDCache::handle_discover_reply(MDiscoverReply *m)
if (who >= 0)
dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;
// try again?
if (m->get_error_dentry().length()) {
// wanted a dentry
frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
CDir *dir = cur->get_dirfrag(fg);
filepath relpath(m->get_error_dentry(), 0);
frag_t fg = m->get_base_dir_frag();
CDir *dir = cur->get_dirfrag(fg);
if (cur->is_waiter_for(CInode::WAIT_DIR)) {
if (cur->is_auth() || dir)
cur->take_waiting(CInode::WAIT_DIR, finished);
else
discover_path(cur, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked(), who);
} else
dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
if (dir) {
// don't actaully need the hint, now
if (dir->is_waiting_for_dentry(m->get_error_dentry().c_str(), m->get_wanted_snapid())) {
if (dir->is_auth() || dir->lookup(m->get_error_dentry()))
dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
m->get_wanted_snapid(), finished);
else
discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
} else
dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
<< m->get_error_dentry() << dendl;
}
} else {
// wanted dir or ino
frag_t fg = m->get_base_dir_frag();
CDir *dir = cur->get_dirfrag(fg);
if (cur->is_waiter_for(CInode::WAIT_DIR)) {
if (cur->is_auth() || dir)
if (m->get_wanted_base_dir()) {
if (cur->is_waiting_for_dir(fg)) {
if (cur->is_auth())
cur->take_waiting(CInode::WAIT_DIR, finished);
else if (dir)
cur->take_dir_waiting(fg, finished);
else
discover_dir_frag(cur, fg, 0, who);
} else
dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
}
// try again?
if (m->get_error_dentry().length()) {
// wanted a dentry
if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
m->get_wanted_snapid(), finished);
} else {
filepath relpath(m->get_error_dentry(), 0);
discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked());
}
} else
dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
<< m->get_error_dentry() << dendl;
} else {
if (dir && m->get_wanted_ino() && dir->is_waiting_for_ino(m->get_wanted_ino())) {
if (dir->is_auth() || get_inode(m->get_wanted_ino()))
dir->take_ino_waiting(m->get_wanted_ino(), finished);
@ -10428,7 +10420,7 @@ CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, int from,
dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl;
// get waiters
diri->take_waiting(CInode::WAIT_DIR, finished);
diri->take_dir_waiting(df.frag, finished);
}
return dir;
@ -11510,11 +11502,13 @@ void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
// refragment
list<Context*> waiters;
list<CDir*> resultfrags;
adjust_dir_fragments(diri, base, bits,
resultfrags, waiters, false);
adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false);
if (g_conf->mds_debug_frag)
diri->verify_dirfrags();
for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p)
diri->take_dir_waiting((*p)->get_frag(), waiters);
/*
// add new replica dirs values
bufferlist::iterator p = notify->basebl.begin();