ceph/branches/riccardo/monitor2/mds/MDCache.cc

// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#include "MDCache.h"
#include "MDStore.h"
#include "MDS.h"
#include "Server.h"
#include "Locker.h"
#include "MDLog.h"
#include "MDBalancer.h"
#include "AnchorClient.h"
#include "Migrator.h"
#include "Renamer.h"
#include "MDSMap.h"
#include "CInode.h"
#include "CDir.h"
#include "include/filepath.h"
#include "msg/Message.h"
#include "msg/Messenger.h"
#include "common/Logger.h"
#include "osdc/Filer.h"
#include "events/EUnlink.h"
#include "events/EPurgeFinish.h"
#include "messages/MGenericMessage.h"
#include "messages/MDiscover.h"
#include "messages/MDiscoverReply.h"
//#include "messages/MInodeUpdate.h"
#include "messages/MDirUpdate.h"
#include "messages/MCacheExpire.h"
#include "messages/MInodeFileCaps.h"
#include "messages/MInodeLink.h"
#include "messages/MInodeLinkAck.h"
#include "messages/MInodeUnlink.h"
#include "messages/MInodeUnlinkAck.h"
#include "messages/MLock.h"
#include "messages/MDentryUnlink.h"
#include "messages/MClientRequest.h"
#include "messages/MClientFileCaps.h"
#include "IdAllocator.h"
#include "common/Timer.h"
#include <assert.h>
#include <errno.h>
#include <iostream>
#include <string>
#include <map>
using namespace std;
#include "config.h"
#undef dout
#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache "
MDCache::MDCache(MDS *m)
{
mds = m;
migrator = new Migrator(mds, this);
renamer = new Renamer(mds, this);
root = NULL;
lru.lru_set_max(g_conf.mds_cache_size);
lru.lru_set_midpoint(g_conf.mds_cache_mid);
did_shutdown_exports = false;
shutdown_commits = 0;
}
MDCache::~MDCache()
{
delete migrator;
delete renamer;
}
void MDCache::log_stat(Logger *logger)
{
if (get_root()) {
logger->set("popanyd", (int)get_root()->popularity[MDS_POP_ANYDOM].meta_load());
logger->set("popnest", (int)get_root()->popularity[MDS_POP_NESTED].meta_load());
}
logger->set("c", lru.lru_get_size());
logger->set("cpin", lru.lru_get_num_pinned());
logger->set("ctop", lru.lru_get_top());
logger->set("cbot", lru.lru_get_bot());
logger->set("cptail", lru.lru_get_pintail());
}
//
bool MDCache::shutdown()
{
if (lru.lru_get_size() > 0) {
dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << endl;
//show_cache();
show_imports();
//dump();
}
return true;
}
// MDCache
CInode *MDCache::create_inode()
{
CInode *in = new CInode(this);
// zero
memset(&in->inode, 0, sizeof(inode_t));
// assign ino
in->inode.ino = mds->idalloc->alloc_id();
in->inode.nlink = 1; // FIXME
in->inode.layout = g_OSD_FileLayout;
add_inode(in); // add
return in;
}
void MDCache::destroy_inode(CInode *in)
{
mds->idalloc->reclaim_id(in->ino());
remove_inode(in);
}
void MDCache::add_inode(CInode *in)
{
// add to lru, inode map
assert(inode_map.size() == lru.lru_get_size());
lru.lru_insert_mid(in);
assert(inode_map.count(in->ino()) == 0); // should be no dup inos!
inode_map[ in->ino() ] = in;
assert(inode_map.size() == lru.lru_get_size());
}
void MDCache::remove_inode(CInode *o)
{
dout(14) << "remove_inode " << *o << endl;
if (o->get_parent_dn()) {
// FIXME: multiple parents?
CDentry *dn = o->get_parent_dn();
assert(!dn->is_dirty());
if (dn->is_sync())
dn->dir->remove_dentry(dn); // unlink inode AND hose dentry
else
dn->dir->unlink_inode(dn); // leave dentry
}
inode_map.erase(o->ino()); // remove from map
lru.lru_remove(o); // remove from lru
}
void MDCache::rename_file(CDentry *srcdn,
CDentry *destdn)
{
CInode *in = srcdn->inode;
// unlink src
srcdn->dir->unlink_inode(srcdn);
// unlink old inode?
if (destdn->inode) destdn->dir->unlink_inode(destdn);
// link inode w/ dentry
destdn->dir->link_inode( destdn, in );
}
void MDCache::set_root(CInode *in)
{
assert(root == 0);
root = in;
root->state_set(CINODE_STATE_ROOT);
}
void MDCache::add_import(CDir *dir)
{
imports.insert(dir);
dir->state_set(CDIR_STATE_IMPORT);
dir->get(CDIR_PIN_IMPORT);
}
// **************
// Inode purging -- reliably removing deleted file's objects
class C_MDC_PurgeFinish : public Context {
MDCache *mdc;
inodeno_t ino;
public:
C_MDC_PurgeFinish(MDCache *c, inodeno_t i) : mdc(c), ino(i) {}
void finish(int r) {
mdc->purge_inode_finish(ino);
}
};
class C_MDC_PurgeFinish2 : public Context {
MDCache *mdc;
inodeno_t ino;
public:
C_MDC_PurgeFinish2(MDCache *c, inodeno_t i) : mdc(c), ino(i) {}
void finish(int r) {
mdc->purge_inode_finish_2(ino);
}
};
/* purge_inode
* called on unlink or rmdir to remove a deleted file's objects.
* caller is responsible for journaling an appropriate EUnlink or ERmdir.
*/
void MDCache::purge_inode(inode_t &inode)
{
dout(10) << "purge_inode " << inode.ino << " size " << inode.size << endl;
// take note
assert(purging.count(inode.ino) == 0);
purging[inode.ino] = inode;
// remove
mds->filer->remove(inode, 0, inode.size,
0, new C_MDC_PurgeFinish(this, inode.ino));
}
void MDCache::purge_inode_finish(inodeno_t ino)
{
dout(10) << "purge_inode_finish " << ino << " - logging our completion" << endl;
// log completion
mds->mdlog->submit_entry(new EPurgeFinish(ino),
new C_MDC_PurgeFinish2(this, ino));
}
void MDCache::purge_inode_finish_2(inodeno_t ino)
{
dout(10) << "purge_inode_finish_2 " << ino << endl;
// remove from purging list
purging.erase(ino);
// tell anyone who cares (log flusher?)
list<Context*> ls;
ls.swap(waiting_for_purge[ino]);
waiting_for_purge.erase(ino);
finish_contexts(ls, 0);
// reclaim ino?
}
void MDCache::start_recovered_purges()
{
for (map<inodeno_t,inode_t>::iterator p = purging.begin();
p != purging.end();
++p) {
dout(10) << "start_recovered_purges " << p->first << " size " << p->second.size << endl;
mds->filer->remove(p->second, 0, p->second.size,
0, new C_MDC_PurgeFinish(this, p->first));
}
}
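/* trim
* Expire inodes from the cold end of the LRU until the cache holds at most
* 'max' entries (lru_get_max() if max < 0).  For each expired inode we queue
* a cache-expire to the dir and/or inode authority when that's another mds
* (batched into one MCacheExpire per peer), mark the containing dir
* incomplete, and re-export any import that becomes empty.  The batched
* expire messages are sent once the loop finishes.
*/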
bool MDCache::trim(int max)
{
// empty? short cut.
if (lru.lru_get_size() == 0) return true;
if (max < 0) {
max = lru.lru_get_max();
if (!max) return false;
}
map<int, MCacheExpire*> expiremap;
dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << endl;
assert(expiremap.empty());
while (lru.lru_get_size() > (unsigned)max) {
CInode *in = (CInode*)lru.lru_expire();
if (!in) break; //return false;
if (in->dir) {
// notify dir authority?
int auth = in->dir->authority();
if (auth != mds->get_nodeid()) {
dout(17) << "sending expire to mds" << auth << " on " << *in->dir << endl;
if (expiremap.count(auth) == 0) expiremap[auth] = new MCacheExpire(mds->get_nodeid());
expiremap[auth]->add_dir(in->ino(), in->dir->replica_nonce);
}
}
// notify inode authority?
{
int auth = in->authority();
if (auth != mds->get_nodeid()) {
assert(!in->is_auth());
dout(17) << "sending expire to mds" << auth << " on " << *in << endl;
if (expiremap.count(auth) == 0) expiremap[auth] = new MCacheExpire(mds->get_nodeid());
expiremap[auth]->add_inode(in->ino(), in->replica_nonce);
} else {
assert(in->is_auth());
}
}
CInode *diri = NULL;
if (in->parent)
diri = in->parent->dir->inode;
if (in->is_root()) {
dout(7) << "just trimmed root, cache now empty." << endl;
root = NULL;
}
// last link?
if (in->inode.nlink == 0) {
dout(17) << "last link, removing file content " << *in << endl; // FIXME THIS IS WRONG PLACE FOR THIS!
mds->filer->zero(in->inode,
0, in->inode.size,
NULL, NULL); // FIXME
}
// remove it
dout(15) << "trim removing " << *in << " " << in << endl;
remove_inode(in);
delete in;
if (diri) {
// dir incomplete!
diri->dir->state_clear(CDIR_STATE_COMPLETE);
// reexport?
if (diri->dir->is_import() && // import
diri->dir->get_size() == 0 && // no children
!diri->is_root()) // not root
migrator->export_empty_import(diri->dir);
}
if (mds->logger) mds->logger->inc("cex");
}
/* hack
if (lru.lru_get_size() == max) {
int i;
dout(1) << "lru_top " << lru.lru_ntop << "/" << lru.lru_num << endl;
CInode *cur = (CInode*)lru.lru_tophead;
i = 1;
while (cur) {
dout(1) << " top " << i++ << "/" << lru.lru_ntop << " " << cur->lru_is_expireable() << " " << *cur << endl;
cur = (CInode*)cur->lru_next;
}
dout(1) << "lru_bot " << lru.lru_nbot << "/" << lru.lru_num << endl;
cur = (CInode*)lru.lru_bothead;
i = 1;
while (cur) {
dout(1) << " bot " << i++ << "/" << lru.lru_nbot << " " << cur->lru_is_expireable() << " " << *cur << endl;
cur = (CInode*)cur->lru_next;
}
}
*/
// send expires
for (map<int, MCacheExpire*>::iterator it = expiremap.begin();
it != expiremap.end();
it++) {
dout(7) << "sending cache_expire to " << it->first << endl;
mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE);
}
return true;
}
class C_MDC_ShutdownCommit : public Context {
MDCache *mdc;
public:
C_MDC_ShutdownCommit(MDCache *mdc) {
this->mdc = mdc;
}
void finish(int r) {
mdc->shutdown_commits--;
}
};
class C_MDC_ShutdownCheck : public Context {
MDCache *mdc;
Mutex *lock;
public:
C_MDC_ShutdownCheck(MDCache *m, Mutex *l) : mdc(m), lock(l) {}
void finish(int) {
lock->Lock();
mdc->shutdown_check();
lock->Unlock();
}
};
void MDCache::shutdown_check()
{
dout(0) << "shutdown_check at " << g_clock.now() << endl;
// cache
int o = g_conf.debug_mds;
g_conf.debug_mds = 10;
show_cache();
g_conf.debug_mds = o;
g_timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this, &mds->mds_lock));
// this
dout(0) << "lru size now " << lru.lru_get_size() << endl;
dout(0) << "log len " << mds->mdlog->get_num_events() << endl;
if (exports.size())
dout(0) << "still have " << exports.size() << " exports" << endl;
if (mds->filer->is_active())
dout(0) << "filer still active" << endl;
}
void MDCache::shutdown_start()
{
dout(1) << "shutdown_start" << endl;
if (g_conf.mds_shutdown_check)
g_timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this, &mds->mds_lock));
}
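/* shutdown_pass
* One pass of the shutdown sequence; returns true once the cache is empty.
* In order: unhash any hashed dirs we own, commit dirty dirs (if configured),
* trim the cache, wait for the log to flush, export our imports back to mds0,
* wait for outstanding exports and filer I/O, and finally drop the root
* import once only root remains.
*/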
bool MDCache::shutdown_pass()
{
dout(7) << "shutdown_pass" << endl;
//assert(mds->is_shutting_down());
if (mds->is_stopped()) {
dout(7) << " already shut down" << endl;
show_cache();
show_imports();
return true;
}
// unhash dirs?
if (!hashdirs.empty()) {
// unhash any of my dirs?
for (set<CDir*>::iterator it = hashdirs.begin();
it != hashdirs.end();
it++) {
CDir *dir = *it;
if (!dir->is_auth()) continue;
if (dir->is_unhashing()) continue;
migrator->unhash_dir(dir);
}
dout(7) << "waiting for dirs to unhash" << endl;
return false;
}
// commit dirs?
if (g_conf.mds_commit_on_shutdown) {
if (shutdown_commits < 0) {
dout(1) << "shutdown_pass committing all dirty dirs" << endl;
shutdown_commits = 0;
for (hash_map<inodeno_t, CInode*>::iterator it = inode_map.begin();
it != inode_map.end();
it++) {
CInode *in = it->second;
// commit any dirty dir that's ours
if (in->is_dir() && in->dir && in->dir->is_auth() && in->dir->is_dirty()) {
mds->mdstore->commit_dir(in->dir, new C_MDC_ShutdownCommit(this));
shutdown_commits++;
}
}
}
// commits?
if (shutdown_commits > 0) {
dout(7) << "shutdown_commits still waiting for " << shutdown_commits << endl;
return false;
}
}
// flush anything we can from the cache
trim(0);
dout(5) << "cache size now " << lru.lru_get_size() << endl;
// (wait for) flush log?
if (g_conf.mds_log_flush_on_shutdown &&
mds->mdlog->get_num_events()) {
dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() << endl;
return false;
}
// send all imports back to 0.
if (mds->get_nodeid() != 0 && !did_shutdown_exports) {
// flush what i can from the cache first..
trim(0);
// export to root
for (set<CDir*>::iterator it = imports.begin();
it != imports.end();
) {
CDir *im = *it;
it++;
if (im->inode->is_root()) continue;
if (im->is_frozen() || im->is_freezing()) continue;
dout(7) << "sending " << *im << " back to mds0" << endl;
migrator->export_dir(im,0);
}
did_shutdown_exports = true;
}
// waiting for imports? (e.g. root?)
if (exports.size()) {
dout(7) << "still have " << exports.size() << " exports" << endl;
//show_cache();
return false;
}
// filer active?
if (mds->filer->is_active()) {
dout(7) << "filer still active" << endl;
return false;
}
// close root?
if (mds->get_nodeid() == 0 &&
lru.lru_get_size() == 1 &&
root &&
root->dir &&
root->dir->is_import() &&
root->dir->get_ref() == 1) { // 1 is the import!
// un-import
dout(7) << "removing root import" << endl;
imports.erase(root->dir);
root->dir->state_clear(CDIR_STATE_IMPORT);
root->dir->put(CDIR_PIN_IMPORT);
if (root->is_pinned_by(CINODE_PIN_DIRTY)) {
dout(7) << "clearing root dirty flag" << endl;
root->put(CINODE_PIN_DIRTY);
}
trim(0);
assert(inode_map.size() == lru.lru_get_size());
}
// imports?
if (!imports.empty()) {
dout(7) << "still have " << imports.size() << " imports" << endl;
show_cache();
return false;
}
// done?
if (lru.lru_get_size() > 0) {
dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << endl;
show_cache();
//dump();
return false;
}
// done!
dout(1) << "shutdown done." << endl;
return true;
}
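/* open_root
* mds0 fabricates the root inode and its CDir and treats the dir as an
* import; any other node sends an MDiscover for root to mds0, and only the
* first caller sends it -- everyone else just queues on waiting_for_root.
*/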
int MDCache::open_root(Context *c)
{
int whoami = mds->get_nodeid();
// open root inode
if (whoami == 0) {
// i am root inode
CInode *root = new CInode(this);
memset(&root->inode, 0, sizeof(inode_t));
root->inode.ino = 1;
root->inode.hash_seed = 0; // not hashed!
// make it up (FIXME)
root->inode.mode = 0755 | INODE_MODE_DIR;
root->inode.size = 0;
root->inode.ctime = 0;
root->inode.mtime = g_clock.gettime();
root->inode.nlink = 1;
root->inode.layout = g_OSD_MDDirLayout;
set_root( root );
add_inode( root );
// root directory too
assert(root->dir == NULL);
root->set_dir( new CDir(root, mds, true) );
root->dir->set_dir_auth( 0 ); // me!
root->dir->dir_rep = CDIR_REP_ALL; //NONE;
// root is sort of technically an import (from a vacuum)
imports.insert( root->dir );
root->dir->state_set(CDIR_STATE_IMPORT);
root->dir->get(CDIR_PIN_IMPORT);
if (c) {
c->finish(0);
delete c;
}
} else {
// request inode from root mds
if (waiting_for_root.empty()) {
dout(7) << "discovering root" << endl;
filepath want;
MDiscover *req = new MDiscover(whoami,
0,
want,
false); // there _is_ no base dir for the root inode
mds->send_message_mds(req, 0, MDS_PORT_CACHE);
} else {
dout(7) << "waiting for root" << endl;
}
// wait
waiting_for_root.push_back(c);
}
return 0;
}
// ========= messaging ==============
void MDCache::dispatch(Message *m)
{
switch (m->get_type()) {
case MSG_MDS_DISCOVER:
handle_discover((MDiscover*)m);
break;
case MSG_MDS_DISCOVERREPLY:
handle_discover_reply((MDiscoverReply*)m);
break;
/*
case MSG_MDS_INODEUPDATE:
handle_inode_update((MInodeUpdate*)m);
break;
*/
case MSG_MDS_INODELINK:
handle_inode_link((MInodeLink*)m);
break;
case MSG_MDS_INODELINKACK:
handle_inode_link_ack((MInodeLinkAck*)m);
break;
case MSG_MDS_DIRUPDATE:
handle_dir_update((MDirUpdate*)m);
break;
case MSG_MDS_CACHEEXPIRE:
handle_cache_expire((MCacheExpire*)m);
break;
case MSG_MDS_DENTRYUNLINK:
handle_dentry_unlink((MDentryUnlink*)m);
break;
default:
dout(7) << "cache unknown message " << m->get_type() << endl;
assert(0);
break;
}
}
/* path_traverse
*
* return values:
* <0 : traverse error (ENOTDIR, ENOENT)
* 0 : success
* >0 : delayed or forwarded
*
* Notes:
* onfinish context is only needed if you specify MDS_TRAVERSE_DISCOVER _and_
* you aren't absolutely certain that the path actually exists. If it doesn't,
* the context is needed to pass a (failure) result code.
*/
class C_MDC_TraverseDiscover : public Context {
Context *onfinish, *ondelay;
public:
C_MDC_TraverseDiscover(Context *onfinish, Context *ondelay) {
this->ondelay = ondelay;
this->onfinish = onfinish;
}
void finish(int r) {
//dout(10) << "TraverseDiscover r = " << r << endl;
if (r < 0 && onfinish) { // ENOENT on discover, pass back to caller.
onfinish->finish(r);
} else {
ondelay->finish(r); // retry as usual
}
delete onfinish;
delete ondelay;
}
};
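// On a miss for a dentry we are not the authority for, 'onfail' picks the
// behavior: MDS_TRAVERSE_DISCOVER / _DISCOVERXLOCK send an MDiscover to the
// dentry authority and retry via C_MDC_TraverseDiscover; MDS_TRAVERSE_FORWARD
// forwards the request to that authority; MDS_TRAVERSE_FAIL returns -ENOENT.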
int MDCache::path_traverse(filepath& origpath,
vector<CDentry*>& trace,
bool follow_trailing_symlink,
Message *req,
Context *ondelay,
int onfail,
Context *onfinish,
bool is_client_req) // true if req is MClientRequest .. gross, FIXME
{
int whoami = mds->get_nodeid();
set< pair<CInode*, string> > symlinks_resolved; // keep a list of symlinks we touch to avoid loops
bool noperm = false;
if (onfail == MDS_TRAVERSE_DISCOVER ||
onfail == MDS_TRAVERSE_DISCOVERXLOCK) noperm = true;
// root
CInode *cur = get_root();
if (cur == NULL) {
dout(7) << "traverse: i don't have root" << endl;
open_root(ondelay);
if (onfinish) delete onfinish;
return 1;
}
// start trace
trace.clear();
// make our own copy, since we'll modify when we hit symlinks
filepath path = origpath;
unsigned depth = 0;
while (depth < path.depth()) {
dout(12) << "traverse: path seg depth " << depth << " = " << path[depth] << endl;
// ENOTDIR?
if (!cur->is_dir()) {
dout(7) << "traverse: " << *cur << " not a dir " << endl;
delete ondelay;
if (onfinish) {
onfinish->finish(-ENOTDIR);
delete onfinish;
}
return -ENOTDIR;
}
// open dir
if (!cur->dir) {
if (cur->dir_is_auth()) {
// parent dir frozen_dir?
if (cur->is_frozen_dir()) {
dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << endl;
cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, ondelay);
if (onfinish) delete onfinish;
return 1;
}
cur->get_or_open_dir(mds);
assert(cur->dir);
} else {
// discover dir from/via inode auth
assert(!cur->is_auth());
if (cur->waiting_for(CINODE_WAIT_DIR)) {
dout(10) << "traverse: need dir for " << *cur << ", already doing discover" << endl;
} else {
filepath want = path.postfixpath(depth);
dout(10) << "traverse: need dir for " << *cur << ", doing discover, want " << want.get_path() << endl;
mds->send_message_mds(new MDiscover(mds->get_nodeid(),
cur->ino(),
want,
true), // need this dir too
cur->authority(), MDS_PORT_CACHE);
}
cur->add_waiter(CINODE_WAIT_DIR, ondelay);
if (onfinish) delete onfinish;
return 1;
}
}
// frozen?
/*
if (cur->dir->is_frozen()) {
// doh!
// FIXME: traverse is allowed?
dout(7) << "traverse: " << *cur->dir << " is frozen, waiting" << endl;
cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, ondelay);
if (onfinish) delete onfinish;
return 1;
}
*/
// must read directory hard data (permissions, x bit) to traverse
if (!noperm && !mds->locker->inode_hard_read_try(cur, ondelay)) {
if (onfinish) delete onfinish;
return 1;
}
// check permissions?
// XXX
// ..?
if (path[depth] == "..") {
trace.pop_back();
depth++;
cur = cur->get_parent_inode();
dout(10) << "traverse: following .. back to " << *cur << endl;
continue;
}
// dentry
CDentry *dn = cur->dir->lookup(path[depth]);
// null and last_bit and xlocked by me?
if (dn && dn->is_null() &&
dn->is_xlockedbyme(req) &&
depth == path.depth()-1) {
dout(10) << "traverse: hit (my) xlocked dentry at tail of traverse, succeeding" << endl;
trace.push_back(dn);
break; // done!
}
if (dn && !dn->is_null()) {
// dentry exists. xlocked?
if (!noperm && dn->is_xlockedbyother(req)) {
dout(10) << "traverse: xlocked dentry at " << *dn << endl;
cur->dir->add_waiter(CDIR_WAIT_DNREAD,
path[depth],
ondelay);
if (onfinish) delete onfinish;
return 1;
}
// do we have inode?
if (!dn->inode) {
assert(dn->is_remote());
// do i have it?
CInode *in = get_inode(dn->get_remote_ino());
if (in) {
dout(7) << "linking in remote in " << *in << endl;
dn->link_remote(in);
} else {
dout(7) << "remote link to " << dn->get_remote_ino() << ", which i don't have" << endl;
open_remote_ino(dn->get_remote_ino(), req,
ondelay);
return 1;
}
}
// symlink?
if (dn->inode->is_symlink() &&
(follow_trailing_symlink || depth < path.depth()-1)) {
// symlink, resolve!
filepath sym = dn->inode->symlink;
dout(10) << "traverse: hit symlink " << *dn->inode << " to " << sym << endl;
// break up path components
// /head/symlink/tail
filepath head = path.prefixpath(depth);
filepath tail = path.postfixpath(depth+1);
dout(10) << "traverse: path head = " << head << endl;
dout(10) << "traverse: path tail = " << tail << endl;
if (symlinks_resolved.count(pair<CInode*,string>(dn->inode, tail.get_path()))) {
dout(10) << "already hit this symlink, bailing to avoid the loop" << endl;
return -ELOOP;
}
symlinks_resolved.insert(pair<CInode*,string>(dn->inode, tail.get_path()));
// start at root?
if (dn->inode->symlink[0] == '/') {
// absolute
trace.clear();
depth = 0;
path = tail;
dout(10) << "traverse: absolute symlink, path now " << path << " depth " << depth << endl;
} else {
// relative
path = head;
path.append(sym);
path.append(tail);
dout(10) << "traverse: relative symlink, path now " << path << " depth " << depth << endl;
}
continue;
} else {
// keep going.
// forwarder wants replicas?
if (is_client_req && ((MClientRequest*)req)->get_mds_wants_replica_in_dirino()) {
dout(30) << "traverse: REP is here, " << ((MClientRequest*)req)->get_mds_wants_replica_in_dirino() << " vs " << cur->dir->ino() << endl;
if (((MClientRequest*)req)->get_mds_wants_replica_in_dirino() == cur->dir->ino() &&
cur->dir->is_auth() &&
cur->dir->is_rep() &&
cur->dir->is_open_by(req->get_source().num()) &&
dn->get_inode()->is_auth()
) {
assert(req->get_source().is_mds());
int from = req->get_source().num();
if (dn->get_inode()->is_cached_by(from)) {
dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by "
<< req->get_source() << " dn " << *dn << endl;
} else {
dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << endl;
MDiscoverReply *reply = new MDiscoverReply(cur->dir->ino());
reply->add_dentry( dn->get_name(), !dn->can_read());
reply->add_inode( dn->inode->replicate_to( from ) );
mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE);
}
}
}
trace.push_back(dn);
cur = dn->inode;
touch_inode(cur);
depth++;
continue;
}
}
// MISS. don't have it.
int dauth = cur->dir->dentry_authority( path[depth] );
dout(12) << "traverse: miss on dentry " << path[depth] << " dauth " << dauth << " in " << *cur->dir << endl;
if (dauth == whoami) {
// dentry is mine.
if (cur->dir->is_complete()) {
// file not found
delete ondelay;
if (onfinish) {
onfinish->finish(-ENOENT);
delete onfinish;
}
return -ENOENT;
} else {
//wrong?
//if (onfail == MDS_TRAVERSE_DISCOVER)
// return -1;
// directory isn't complete; reload
dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << endl;
touch_inode(cur);
mds->mdstore->fetch_dir(cur->dir, ondelay);
if (mds->logger) mds->logger->inc("cmiss");
if (onfinish) delete onfinish;
return 1;
}
} else {
// dentry is not mine.
/* no, let's let auth handle the discovery/replication ..
if (onfail == MDS_TRAVERSE_FORWARD &&
onfinish == 0 && // no funnyness
cur->dir->is_rep()) {
dout(5) << "trying to discover in popular dir " << *cur->dir << endl;
onfail = MDS_TRAVERSE_DISCOVER;
}
*/
if ((onfail == MDS_TRAVERSE_DISCOVER ||
onfail == MDS_TRAVERSE_DISCOVERXLOCK)) {
// discover
filepath want = path.postfixpath(depth);
if (cur->dir->waiting_for(CDIR_WAIT_DENTRY, path[depth])) {
dout(7) << "traverse: already waiting for discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl;
} else {
dout(7) << "traverse: discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl;
touch_inode(cur);
mds->send_message_mds(new MDiscover(mds->get_nodeid(),
cur->ino(),
want,
false),
dauth, MDS_PORT_CACHE);
if (mds->logger) mds->logger->inc("dis");
}
// delay processing of current request.
// delay finish vs ondelay until result of traverse, so that ENOENT can be
// passed to onfinish if necessary
cur->dir->add_waiter(CDIR_WAIT_DENTRY,
path[depth],
new C_MDC_TraverseDiscover(onfinish, ondelay));
if (mds->logger) mds->logger->inc("cmiss");
return 1;
}
if (onfail == MDS_TRAVERSE_FORWARD) {
// forward
dout(7) << "traverse: not auth for " << path << " at " << path[depth] << ", fwd to mds" << dauth << endl;
if (is_client_req && cur->dir->is_rep()) {
dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " << *cur->dir << " req " << *(MClientRequest*)req << endl;
((MClientRequest*)req)->set_mds_wants_replica_in_dirino(cur->dir->ino());
req->clear_payload(); // reencode!
}
mds->send_message_mds(req, dauth, req->get_dest_port());
//show_imports();
if (mds->logger) mds->logger->inc("cfw");
if (onfinish) delete onfinish;
delete ondelay;
return 2;
}
if (onfail == MDS_TRAVERSE_FAIL) {
delete ondelay;
if (onfinish) {
onfinish->finish(-ENOENT); // -ENOENT, but only because i'm not the authority!
delete onfinish;
}
return -ENOENT; // not necessarily exactly true....
}
}
assert(0); // i shouldn't get here
}
// success.
delete ondelay;
if (onfinish) {
onfinish->finish(0);
delete onfinish;
}
return 0;
}
void MDCache::open_remote_dir(CInode *diri,
Context *fin)
{
dout(10) << "open_remote_dir on " << *diri << endl;
assert(diri->is_dir());
assert(!diri->dir_is_auth());
assert(!diri->is_auth());
assert(diri->dir == 0);
filepath want; // no dentries, i just want the dir open
mds->send_message_mds(new MDiscover(mds->get_nodeid(),
diri->ino(),
want,
true), // need the dir open
diri->authority(), MDS_PORT_CACHE);
diri->add_waiter(CINODE_WAIT_DIR, fin);
}
class C_MDC_OpenRemoteInoLookup : public Context {
MDCache *mdc;
inodeno_t ino;
Message *req;
Context *onfinish;
public:
vector<Anchor*> anchortrace;
C_MDC_OpenRemoteInoLookup(MDCache *mdc, inodeno_t ino, Message *req, Context *onfinish) {
this->mdc = mdc;
this->ino = ino;
this->req = req;
this->onfinish = onfinish;
}
void finish(int r) {
assert(r == 0);
if (r == 0)
mdc->open_remote_ino_2(ino, req, anchortrace, onfinish);
else {
onfinish->finish(r);
delete onfinish;
}
}
};
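// open_remote_ino: resolve an inode we only know by number.  Look up its
// anchor trace in the anchor table, then (in open_remote_ino_2) build a path
// from the trace's ref_dn components and traverse it with MDS_TRAVERSE_DISCOVER.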
void MDCache::open_remote_ino(inodeno_t ino,
Message *req,
Context *onfinish)
{
dout(7) << "open_remote_ino on " << ino << endl;
C_MDC_OpenRemoteInoLookup *c = new C_MDC_OpenRemoteInoLookup(this, ino, req, onfinish);
mds->anchorclient->lookup(ino, c->anchortrace, c);
}
void MDCache::open_remote_ino_2(inodeno_t ino,
Message *req,
vector<Anchor*>& anchortrace,
Context *onfinish)
{
dout(7) << "open_remote_ino_2 on " << ino << ", trace depth is " << anchortrace.size() << endl;
// construct path
filepath path;
for (unsigned i=0; i<anchortrace.size(); i++)
path.add_dentry(anchortrace[i]->ref_dn);
dout(7) << " path is " << path << endl;
vector<CDentry*> trace;
int r = path_traverse(path, trace, false,
req,
onfinish, // delay actually
MDS_TRAVERSE_DISCOVER);
if (r > 0) return;
onfinish->finish(r);
delete onfinish;
}
// path pins
bool MDCache::path_pin(vector<CDentry*>& trace,
Message *m,
Context *c)
{
// verify everything is pinnable
for (vector<CDentry*>::iterator it = trace.begin();
it != trace.end();
it++) {
CDentry *dn = *it;
if (!dn->is_pinnable(m)) {
// wait
if (c) {
dout(10) << "path_pin can't pin " << *dn << ", waiting" << endl;
dn->dir->add_waiter(CDIR_WAIT_DNPINNABLE,
dn->name,
c);
} else {
dout(10) << "path_pin can't pin, no waiter, failing." << endl;
}
return false;
}
}
// pin!
for (vector<CDentry*>::iterator it = trace.begin();
it != trace.end();
it++) {
(*it)->pin(m);
dout(11) << "path_pinned " << *(*it) << endl;
}
delete c;
return true;
}
void MDCache::path_unpin(vector<CDentry*>& trace,
Message *m)
{
for (vector<CDentry*>::iterator it = trace.begin();
it != trace.end();
it++) {
CDentry *dn = *it;
dn->unpin(m);
dout(11) << "path_unpinned " << *dn << endl;
// did we completely unpin a waiter?
if (dn->lockstate == DN_LOCK_UNPINNING && !dn->is_pinned()) {
// return state to sync, in case the unpinner flails
dn->lockstate = DN_LOCK_SYNC;
// run finisher right now to give them a fair shot.
dn->dir->finish_waiting(CDIR_WAIT_DNUNPINNED, dn->name);
}
}
}
void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
{
CInode *parent = in->get_parent_inode();
if (parent) {
make_trace(trace, parent);
CDentry *dn = in->get_parent_dn();
dout(15) << "make_trace adding " << *dn << endl;
trace.push_back(dn);
}
}
bool MDCache::request_start(Message *req,
CInode *ref,
vector<CDentry*>& trace)
{
assert(active_requests.count(req) == 0);
// pin path
if (trace.size()) {
if (!path_pin(trace, req, new C_MDS_RetryMessage(mds,req))) return false;
}
dout(7) << "request_start " << *req << endl;
// add to map
active_requests[req].ref = ref;
if (trace.size()) active_requests[req].traces[trace[trace.size()-1]] = trace;
// request pins
request_pin_inode(req, ref);
if (mds->logger) mds->logger->inc("req");
return true;
}
void MDCache::request_pin_inode(Message *req, CInode *in)
{
if (active_requests[req].request_pins.count(in) == 0) {
in->request_pin_get();
active_requests[req].request_pins.insert(in);
}
}
void MDCache::request_pin_dir(Message *req, CDir *dir)
{
if (active_requests[req].request_dir_pins.count(dir) == 0) {
dir->request_pin_get();
active_requests[req].request_dir_pins.insert(dir);
}
}
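// request_cleanup: release everything a request accumulated -- finish any
// leftover local dentry xlocks (pruning clean null dentries), send
// LOCK_AC_UNXLOCK for foreign xlocks, unpin pinned paths, drop per-request
// inode/dir pins, and remove the request from active_requests.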
void MDCache::request_cleanup(Message *req)
{
assert(active_requests.count(req) == 1);
// leftover xlocks?
if (active_requests[req].xlocks.size()) {
set<CDentry*> dns = active_requests[req].xlocks;
for (set<CDentry*>::iterator it = dns.begin();
it != dns.end();
it++) {
CDentry *dn = *it;
dout(7) << "request_cleanup leftover xlock " << *dn << endl;
mds->locker->dentry_xlock_finish(dn);
// queue finishers
dn->dir->take_waiting(CDIR_WAIT_ANY, dn->name, mds->finished_queue);
// remove clean, null dentry? (from a failed rename or whatever)
if (dn->is_null() && dn->is_sync() && !dn->is_dirty()) {
dn->dir->remove_dentry(dn);
}
}
assert(active_requests[req].xlocks.empty()); // we just finished them
}
// foreign xlocks?
if (active_requests[req].foreign_xlocks.size()) {
set<CDentry*> dns = active_requests[req].foreign_xlocks;
active_requests[req].foreign_xlocks.clear();
for (set<CDentry*>::iterator it = dns.begin();
it != dns.end();
it++) {
CDentry *dn = *it;
dout(7) << "request_cleanup sending unxlock for foreign xlock on " << *dn << endl;
assert(dn->is_xlocked());
int dauth = dn->dir->dentry_authority(dn->name);
MLock *m = new MLock(LOCK_AC_UNXLOCK, mds->get_nodeid());
m->set_dn(dn->dir->ino(), dn->name);
mds->send_message_mds(m, dauth, MDS_PORT_CACHE);
}
}
// unpin paths
for (map< CDentry*, vector<CDentry*> >::iterator it = active_requests[req].traces.begin();
it != active_requests[req].traces.end();
it++) {
path_unpin(it->second, req);
}
// request pins
for (set<CInode*>::iterator it = active_requests[req].request_pins.begin();
it != active_requests[req].request_pins.end();
it++) {
(*it)->request_pin_put();
}
for (set<CDir*>::iterator it = active_requests[req].request_dir_pins.begin();
it != active_requests[req].request_dir_pins.end();
it++) {
(*it)->request_pin_put();
}
// remove from map
active_requests.erase(req);
// log some stats *****
if (mds->logger) {
mds->logger->set("c", lru.lru_get_size());
mds->logger->set("cpin", lru.lru_get_num_pinned());
mds->logger->set("ctop", lru.lru_get_top());
mds->logger->set("cbot", lru.lru_get_bot());
mds->logger->set("cptail", lru.lru_get_pintail());
//mds->logger->set("buf",buffer_total_alloc);
}
if (g_conf.log_pins) {
// pin
for (int i=0; i<CINODE_NUM_PINS; i++) {
if (mds->logger2) mds->logger2->set(cinode_pin_names[i],
cinode_pins[i]);
}
/*
for (map<int,int>::iterator it = cdir_pins.begin();
it != cdir_pins.end();
it++) {
//string s = "D";
//s += cdir_pin_names[it->first];
if (mds->logger2) mds->logger2->set(//s,
cdir_pin_names[it->first],
it->second);
}
*/
}
}
void MDCache::request_finish(Message *req)
{
dout(7) << "request_finish " << *req << endl;
request_cleanup(req);
delete req; // delete req
if (mds->logger) mds->logger->inc("reply");
//dump();
}
void MDCache::request_forward(Message *req, int who, int port)
{
if (!port) port = MDS_PORT_SERVER;
dout(7) << "request_forward to " << who << " req " << *req << endl;
request_cleanup(req);
mds->send_message_mds(req, who, port);
if (mds->logger) mds->logger->inc("fw");
}
// ANCHORS
class C_MDC_AnchorInode : public Context {
CInode *in;
public:
C_MDC_AnchorInode(CInode *in) {
this->in = in;
}
void finish(int r) {
if (r == 0) {
assert(in->inode.anchored == false);
in->inode.anchored = true;
in->state_clear(CINODE_STATE_ANCHORING);
in->put(CINODE_PIN_ANCHORING);
in->mark_dirty();
}
// trigger
in->finish_waiting(CINODE_WAIT_ANCHORED, r);
}
};
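// anchor_inode: create an anchor table entry for this inode.  The inode is
// pinned and marked CINODE_STATE_ANCHORING while the create is in flight;
// the caller's context waits on CINODE_WAIT_ANCHORED, which C_MDC_AnchorInode
// fires when the anchor client replies.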
void MDCache::anchor_inode(CInode *in, Context *onfinish)
{
assert(in->is_auth());
// already anchoring?
if (in->state_test(CINODE_STATE_ANCHORING)) {
dout(7) << "anchor_inode already anchoring " << *in << endl;
// wait
in->add_waiter(CINODE_WAIT_ANCHORED,
onfinish);
} else {
dout(7) << "anchor_inode anchoring " << *in << endl;
// auth: do it
in->state_set(CINODE_STATE_ANCHORING);
in->get(CINODE_PIN_ANCHORING);
// wait
in->add_waiter(CINODE_WAIT_ANCHORED,
onfinish);
// make trace
vector<Anchor*> trace;
in->make_anchor_trace(trace);
// do it
mds->anchorclient->create(in->ino(), trace,
new C_MDC_AnchorInode( in ));
}
}
void MDCache::handle_inode_link(MInodeLink *m)
{
CInode *in = get_inode(m->get_ino());
assert(in);
if (!in->is_auth()) {
assert(in->is_proxy());
dout(7) << "handle_inode_link not auth for " << *in << ", fw to auth" << endl;
mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE);
return;
}
dout(7) << "handle_inode_link on " << *in << endl;
if (!in->is_anchored()) {
assert(in->inode.nlink == 1);
dout(7) << "needs anchor, nlink=" << in->inode.nlink << ", creating anchor" << endl;
anchor_inode(in,
new C_MDS_RetryMessage(mds, m));
return;
}
in->inode.nlink++;
in->mark_dirty();
// reply
dout(7) << " nlink++, now " << in->inode.nlink++ << endl;
mds->send_message_mds(new MInodeLinkAck(m->get_ino(), true), m->get_from(), MDS_PORT_CACHE);
delete m;
}
void MDCache::handle_inode_link_ack(MInodeLinkAck *m)
{
CInode *in = get_inode(m->get_ino());
assert(in);
dout(7) << "handle_inode_link_ack success = " << m->is_success() << " on " << *in << endl;
in->finish_waiting(CINODE_WAIT_LINK,
m->is_success() ? 1:-1);
}
// REPLICAS
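/* handle_discover
* Build an MDiscoverReply by walking the requested path from the base inode
* (or root), adding a ([dir] dentry inode) tuple per component for as long as
* we stay authoritative.  We stop (or set an error flag) at a null/missing
* dentry or a foreign dir auth, fetch-and-retry on an incomplete dir, and
* forward or drop the discover if the reply would be empty and someone else
* is the authority.
*/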
void MDCache::handle_discover(MDiscover *dis)
{
int whoami = mds->get_nodeid();
// from me to me?
if (dis->get_asker() == whoami) {
dout(7) << "discover for " << dis->get_want().get_path() << " bounced back to me, dropping." << endl;
delete dis;
return;
}
CInode *cur = 0;
MDiscoverReply *reply = 0;
//filepath fullpath;
// get started.
if (dis->get_base_ino() == 0) {
// wants root
dout(7) << "discover from mds" << dis->get_asker() << " wants root + " << dis->get_want().get_path() << endl;
assert(mds->get_nodeid() == 0);
assert(root->is_auth());
//fullpath = dis->get_want();
// add root
reply = new MDiscoverReply(0);
reply->add_inode( root->replicate_to( dis->get_asker() ) );
dout(10) << "added root " << *root << endl;
cur = root;
} else {
// there's a base inode
cur = get_inode(dis->get_base_ino());
assert(cur);
if (dis->wants_base_dir()) {
dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur << " wants dir+" << dis->get_want().get_path() << endl;
} else {
dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur->dir << " wants " << dis->get_want().get_path() << endl;
}
assert(cur->is_dir());
// craziness?
if (!cur->dir && !cur->is_auth()) {
int iauth = cur->authority();
dout(7) << "no dir and not inode auth; fwd to auth " << iauth << endl;
mds->send_message_mds( dis, iauth, MDS_PORT_CACHE);
return;
}
// frozen_dir?
if (!cur->dir && cur->is_frozen_dir()) {
dout(7) << "is frozen_dir, waiting" << endl;
cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE,
new C_MDS_RetryMessage(mds, dis));
return;
}
if (!cur->dir)
cur->get_or_open_dir(mds);
assert(cur->dir);
dout(10) << "dir is " << *cur->dir << endl;
// create reply
reply = new MDiscoverReply(cur->ino());
}
assert(reply);
assert(cur);
/*
// first traverse and make sure we won't have to do any waiting
dout(10) << "traversing full discover path = " << fullpath << endl;
vector<CInode*> trav;
int r = path_traverse(fullpath, trav, dis, MDS_TRAVERSE_FAIL);
if (r > 0)
return; // fw or delay
dout(10) << "traverse finish w/o blocking, continuing" << endl;
// ok, now we know we won't block on dentry locks or readdir.
*/
// add content
// do some fidgeting to include a dir if they asked for the base dir, or just root.
for (unsigned i = 0; i < dis->get_want().depth() || dis->get_want().depth() == 0; i++) {
// add dir
if (reply->is_empty() && !dis->wants_base_dir()) {
dout(7) << "they don't want the base dir" << endl;
} else {
// is it actually a dir at all?
if (!cur->is_dir()) {
dout(7) << "not a dir " << *cur << endl;
reply->set_flag_error_dir();
break;
}
// add dir
if (!cur->dir_is_auth()) {
dout(7) << *cur << " dir auth is someone else, i'm done" << endl;
break;
}
// did we hit a frozen_dir?
if (!cur->dir && cur->is_frozen_dir()) {
dout(7) << *cur << " is frozen_dir, stopping" << endl;
break;
}
if (!cur->dir) cur->get_or_open_dir(mds);
reply->add_dir( new CDirDiscover( cur->dir,
cur->dir->open_by_add( dis->get_asker() ) ) );
dout(7) << "added dir " << *cur->dir << endl;
}
if (dis->get_want().depth() == 0) break;
// lookup dentry
int dentry_auth = cur->dir->dentry_authority( dis->get_dentry(i) );
if (dentry_auth != mds->get_nodeid()) {
dout(7) << *cur->dir << "dentry " << dis->get_dentry(i) << " auth " << dentry_auth << ", i'm done." << endl;
break; // that's it for us!
}
// get inode
CDentry *dn = cur->dir->lookup( dis->get_dentry(i) );
/*
if (dn && !dn->can_read()) { // xlocked?
dout(7) << "waiting on " << *dn << endl;
cur->dir->add_waiter(CDIR_WAIT_DNREAD,
dn->name,
new C_MDS_RetryMessage(mds, dis));
return;
}
*/
if (dn) {
if (!dn->inode && dn->is_sync()) {
dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " null in " << *cur->dir << ", returning error" << endl;
reply->set_flag_error_dn( dis->get_dentry(i) );
break; // don't replicate null but non-locked dentries.
}
reply->add_dentry( dis->get_dentry(i), !dn->can_read() );
dout(7) << "added dentry " << *dn << endl;
if (!dn->inode) break; // we're done.
}
if (dn && dn->inode) {
CInode *next = dn->inode;
assert(next->is_auth());
// add inode
//int nonce = next->cached_by_add(dis->get_asker());
reply->add_inode( next->replicate_to( dis->get_asker() ) );
dout(7) << "added inode " << *next << endl;// " nonce=" << nonce<< endl;
// descend
cur = next;
} else {
// don't have inode?
if (cur->dir->is_complete()) {
// set error flag in reply
dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " not found in " << *cur->dir << ", returning error" << endl;
reply->set_flag_error_dn( dis->get_dentry(i) );
break;
} else {
// readdir
dout(7) << "mds" << whoami << " incomplete dir contents for " << *cur->dir << ", fetching" << endl;
//mds->mdstore->fetch_dir(cur->dir, NULL); //new C_MDS_RetryMessage(mds, dis));
//break; // send what we have so far
mds->mdstore->fetch_dir(cur->dir, new C_MDS_RetryMessage(mds, dis));
return;
}
}
}
// how did we do.
if (reply->is_empty()) {
// discard empty reply
delete reply;
if ((cur->is_auth() || cur->is_proxy() || cur->dir->is_proxy()) &&
!cur->dir->is_auth()) {
// fwd to dir auth
int dirauth = cur->dir->authority();
if (dirauth == dis->get_asker()) {
dout(7) << "from (new?) dir auth, dropping (obsolete) discover on floor." << endl; // XXX FIXME is this right?
//assert(dis->get_asker() == dis->get_source()); //might be a weird other loop. either way, asker has it.
delete dis;
} else {
dout(7) << "fwd to dir auth " << dirauth << endl;
mds->send_message_mds( dis, dirauth, MDS_PORT_CACHE );
}
return;
}
dout(7) << "i'm not auth or proxy, dropping (this empty reply). i bet i just exported." << endl;
//assert(0);
} else {
// send back to asker
dout(7) << "sending result back to asker mds" << dis->get_asker() << endl;
mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE);
}
// done.
delete dis;
}
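/* handle_discover_reply
* Walk the ([dir] dentry inode) tuples in the reply, instantiating replica
* CDir/CDentry/CInode objects (or updating ones we already have) and waking
* waiters along the way.  Dentry/dir error flags finish the corresponding
* waiters with -ENOENT.
*/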
void MDCache::handle_discover_reply(MDiscoverReply *m)
{
// starting point
CInode *cur;
list<Context*> finished, error;
if (m->has_root()) {
// nowhere!
dout(7) << "discover_reply root + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << endl;
assert(!root);
assert(m->get_base_ino() == 0);
assert(!m->has_base_dentry());
assert(!m->has_base_dir());
// add in root
cur = new CInode(this, false);
m->get_inode(0).update_inode(cur);
// root
set_root( cur );
add_inode( cur );
dout(7) << " got root: " << *cur << endl;
// take waiters
finished.swap(waiting_for_root);
} else {
// grab inode
cur = get_inode(m->get_base_ino());
if (!cur) {
dout(7) << "discover_reply don't have base ino " << m->get_base_ino() << ", dropping" << endl;
delete m;
return;
}
dout(7) << "discover_reply " << *cur << " + " << m->get_path() << ", have " << m->get_num_inodes() << " inodes" << endl;
}
// fyi
if (m->is_flag_error_dir()) dout(7) << " flag error, dir" << endl;
if (m->is_flag_error_dn()) dout(7) << " flag error, dentry = " << m->get_error_dentry() << endl;
dout(10) << "depth is " << m->get_depth() << ", has_root = " << m->has_root() << endl;
// loop over discover results.
// indexes follow each ([[dir] dentry] inode)
// can start, end with any type.
for (int i=m->has_root(); i<m->get_depth(); i++) {
dout(10) << "discover_reply i=" << i << " cur " << *cur << endl;
// dir
if ((i > 0) ||
(i == 0 && m->has_base_dir())) {
if (cur->dir) {
// had it
/* this is strange, but it happens when
we discover multiple dentries under a dir:
there's no flag to indicate a dir discover is underway (as there is with a dentry one),
which is actually fine, since (dir aside) the requests are asking for different information.
*/
dout(7) << "had " << *cur->dir;
m->get_dir(i).update_dir(cur->dir);
dout2(7) << ", now " << *cur->dir << endl;
} else {
// add it (_replica_)
cur->set_dir( new CDir(cur, mds, false) );
m->get_dir(i).update_dir(cur->dir);
dout(7) << "added " << *cur->dir << " nonce " << cur->dir->replica_nonce << endl;
// get waiters
cur->take_waiting(CINODE_WAIT_DIR, finished);
}
}
// dentry error?
if (i == m->get_depth()-1 &&
m->is_flag_error_dn()) {
// error!
assert(cur->is_dir());
if (cur->dir) {
dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dentry?" << endl;
cur->dir->take_waiting(CDIR_WAIT_DENTRY,
m->get_error_dentry(),
error);
} else {
dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dir?" << endl;
cur->take_waiting(CINODE_WAIT_DIR, error);
}
break;
}
if (i >= m->get_num_dentries()) break;
// dentry
dout(7) << "i = " << i << " dentry is " << m->get_dentry(i) << endl;
CDentry *dn = 0;
if (i > 0 ||
m->has_base_dentry()) {
dn = cur->dir->lookup( m->get_dentry(i) );
if (dn) {
dout(7) << "had " << *dn << endl;
} else {
dn = cur->dir->add_dentry( m->get_dentry(i) );
if (m->get_dentry_xlock(i)) {
dout(7) << " new dentry is xlock " << *dn << endl;
dn->lockstate = DN_LOCK_XLOCK;
dn->xlockedby = 0;
}
dout(7) << "added " << *dn << endl;
}
cur->dir->take_waiting(CDIR_WAIT_DENTRY,
m->get_dentry(i),
finished);
}
if (i >= m->get_num_inodes()) break;
// inode
dout(7) << "i = " << i << " ino is " << m->get_ino(i) << endl;
CInode *in = get_inode( m->get_inode(i).get_ino() );
assert(dn);
if (in) {
dout(7) << "had " << *in << endl;
// fix nonce
dout(7) << " my nonce is " << in->replica_nonce << ", taking from discover, which has " << m->get_inode(i).get_replica_nonce() << endl;
in->replica_nonce = m->get_inode(i).get_replica_nonce();
if (dn && in != dn->inode) {
dout(7) << " but it's not linked via dentry " << *dn << endl;
// link
if (dn->inode) {
dout(7) << "dentry WAS linked to " << *dn->inode << endl;
assert(0); // WTF.
}
dn->dir->link_inode(dn, in);
}
}
else {
assert(dn->inode == 0); // better not be something else linked to this dentry...
// didn't have it.
in = new CInode(this, false);
m->get_inode(i).update_inode(in);
// link in
add_inode( in );
dn->dir->link_inode(dn, in);
dout(7) << "added " << *in << " nonce " << in->replica_nonce << endl;
}
// onward!
cur = in;
}
// dir error at the end there?
if (m->is_flag_error_dir()) {
dout(7) << " flag_error on dir " << *cur << endl;
assert(!cur->is_dir());
cur->take_waiting(CINODE_WAIT_DIR, error);
}
// finish errors directly
finish_contexts(error, -ENOENT);
mds->queue_finished(finished);
// done
delete m;
}
/*
int MDCache::send_inode_updates(CInode *in)
{
assert(in->is_auth());
for (set<int>::iterator it = in->cached_by_begin();
it != in->cached_by_end();
it++) {
dout(7) << "sending inode_update on " << *in << " to " << *it << endl;
assert(*it != mds->get_nodeid());
mds->send_message_mds(new MInodeUpdate(in, in->get_cached_by_nonce(*it)), *it, MDS_PORT_CACHE);
}
return 0;
}
void MDCache::handle_inode_update(MInodeUpdate *m)
{
inodeno_t ino = m->get_ino();
CInode *in = get_inode(m->get_ino());
if (!in) {
//dout(7) << "inode_update on " << m->get_ino() << ", don't have it, ignoring" << endl;
dout(7) << "inode_update on " << m->get_ino() << ", don't have it, sending expire" << endl;
MCacheExpire *expire = new MCacheExpire(mds->get_nodeid());
expire->add_inode(m->get_ino(), m->get_nonce());
mds->send_message_mds(expire, m->get_source().num(), MDS_PORT_CACHE);
goto out;
}
if (in->is_auth()) {
dout(7) << "inode_update on " << *in << ", but i'm the authority!" << endl;
assert(0); // this should never happen
}
dout(7) << "inode_update on " << *in << endl;
// update! NOTE dir_auth is unaffected by this.
in->decode_basic_state(m->get_payload());
out:
// done
delete m;
}
*/
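/* handle_cache_expire
* A peer is dropping its replicas.  If we are no longer the authority (we're
* a proxy mid-migration), batch the expire onward to the new authority.
* Otherwise drop the peer from cached_by/open_by, but only if the nonce
* matches; an old nonce means the expire crossed paths with a newer
* replication and is ignored.
*/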
void MDCache::handle_cache_expire(MCacheExpire *m)
{
int from = m->get_from();
int source = m->get_source().num();
map<int, MCacheExpire*> proxymap;
if (m->get_from() == source) {
dout(7) << "cache_expire from " << from << endl;
} else {
dout(7) << "cache_expire from " << from << " via " << source << endl;
}
// inodes
for (map<inodeno_t,int>::iterator it = m->get_inodes().begin();
it != m->get_inodes().end();
it++) {
CInode *in = get_inode(it->first);
int nonce = it->second;
if (!in) {
dout(0) << "inode_expire on " << it->first << " from " << from << ", don't have it" << endl;
assert(in); // i should be authority, or proxy .. and pinned
}
if (!in->is_auth()) {
int newauth = in->authority();
dout(7) << "proxy inode expire on " << *in << " to " << newauth << endl;
assert(newauth >= 0);
if (!in->state_test(CINODE_STATE_PROXY)) dout(0) << "missing proxy bit on " << *in << endl;
assert(in->state_test(CINODE_STATE_PROXY));
if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from);
proxymap[newauth]->add_inode(it->first, it->second);
continue;
}
// check nonce
if (from == mds->get_nodeid()) {
// my cache_expire, and the export_dir giving auth back to me crossed paths!
// we can ignore this. no danger of confusion since the two parties are both me.
dout(7) << "inode_expire on " << *in << " from mds" << from << " .. ME! ignoring." << endl;
}
else if (nonce == in->get_cached_by_nonce(from)) {
// remove from our cached_by
dout(7) << "inode_expire on " << *in << " from mds" << from << " cached_by was " << in->cached_by << endl;
in->cached_by_remove(from);
in->mds_caps_wanted.erase(from);
// note: this code calls _eval more often than it needs to!
// fix lock
if (in->hardlock.is_gathering(from)) {
in->hardlock.gather_set.erase(from);
if (in->hardlock.gather_set.size() == 0)
mds->locker->inode_hard_eval(in);
}
if (in->filelock.is_gathering(from)) {
in->filelock.gather_set.erase(from);
if (in->filelock.gather_set.size() == 0)
mds->locker->inode_file_eval(in);
}
// alone now?
if (!in->is_cached_by_anyone()) {
mds->locker->inode_hard_eval(in);
mds->locker->inode_file_eval(in);
}
}
else {
// this is an old nonce, ignore expire.
dout(7) << "inode_expire on " << *in << " from mds" << from << " with old nonce " << nonce << " (current " << in->get_cached_by_nonce(from) << "), dropping" << endl;
assert(in->get_cached_by_nonce(from) > nonce);
}
}
// dirs
for (map<inodeno_t,int>::iterator it = m->get_dirs().begin();
it != m->get_dirs().end();
it++) {
CInode *diri = get_inode(it->first);
CDir *dir = diri->dir;
int nonce = it->second;
if (!dir) {
dout(0) << "dir_expire on " << it->first << " from " << from << ", don't have it" << endl;
assert(dir); // i should be authority, or proxy ... and pinned
}
if (!dir->is_auth()) {
int newauth = dir->authority();
dout(7) << "proxy dir expire on " << *dir << " to " << newauth << endl;
if (!dir->is_proxy()) dout(0) << "nonproxy dir expire? " << *dir << " .. auth is " << newauth << " .. expire is from " << from << endl;
assert(dir->is_proxy());
assert(newauth >= 0);
assert(dir->state_test(CDIR_STATE_PROXY));
if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from);
proxymap[newauth]->add_dir(it->first, it->second);
continue;
}
// check nonce
if (from == mds->get_nodeid()) {
dout(7) << "dir_expire on " << *dir << " from mds" << from << " .. ME! ignoring" << endl;
}
else if (nonce == dir->get_open_by_nonce(from)) {
// remove from our cached_by
dout(7) << "dir_expire on " << *dir << " from mds" << from << " open_by was " << dir->open_by << endl;
dir->open_by_remove(from);
}
else {
// this is an old nonce, ignore expire.
dout(7) << "dir_expire on " << *dir << " from mds" << from << " with old nonce " << nonce << " (current " << dir->get_open_by_nonce(from) << "), dropping" << endl;
assert(dir->get_open_by_nonce(from) > nonce);
}
}
// send proxy forwards
for (map<int, MCacheExpire*>::iterator it = proxymap.begin();
it != proxymap.end();
it++) {
dout(7) << "sending proxy forward to " << it->first << endl;
mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE);
}
// done
delete m;
}
int MDCache::send_dir_updates(CDir *dir, bool bcast)
{
// this is an FYI, re: replication
set<int> who = dir->open_by;
if (bcast)
who = mds->get_mds_map()->get_mds();
dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << endl;
string path;
dir->inode->make_path(path);
int whoami = mds->get_nodeid();
for (set<int>::iterator it = who.begin();
it != who.end();
it++) {
if (*it == whoami) continue;
//if (*it == except) continue;
dout(7) << "sending dir_update on " << *dir << " to " << *it << endl;
mds->send_message_mds(new MDirUpdate(dir->ino(),
dir->dir_rep,
dir->dir_rep_by,
path,
bcast),
*it, MDS_PORT_CACHE);
}
return 0;
}
void MDCache::handle_dir_update(MDirUpdate *m)
{
CInode *in = get_inode(m->get_ino());
if (!in || !in->dir) {
dout(5) << "dir_update on " << m->get_ino() << ", don't have it" << endl;
// discover it?
if (m->should_discover()) {
m->tried_discover(); // only once!
vector<CDentry*> trace;
filepath path = m->get_path();
dout(5) << "trying discover on dir_update for " << path << endl;
int r = path_traverse(path, trace, true,
m, new C_MDS_RetryMessage(mds, m),
MDS_TRAVERSE_DISCOVER);
if (r > 0)
return;
if (r == 0) {
assert(in);
open_remote_dir(in, new C_MDS_RetryMessage(mds, m));
return;
}
assert(0);
}
goto out;
}
// update
dout(5) << "dir_update on " << *in->dir << endl;
in->dir->dir_rep = m->get_dir_rep();
in->dir->dir_rep_by = m->get_dir_rep_by();
// done
out:
delete m;
}
class C_MDC_DentryUnlink : public Context {
public:
MDCache *mdc;
CDentry *dn;
CDir *dir;
Context *c;
C_MDC_DentryUnlink(MDCache *mdc, CDentry *dn, CDir *dir, Context *c) {
this->mdc = mdc;
this->dn = dn;
this->dir = dir;
this->c = c;
}
void finish(int r) {
assert(r == 0);
mdc->dentry_unlink_finish(dn, dir, c);
}
};
// NAMESPACE FUN
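/* dentry_unlink
* Journal an EUnlink and notify dir replicas with MDentryUnlink, then fix up
* the link count: a primary dentry with remaining remote links goes dangling
* and waits for its anchor update; a remote dentry decrements nlink locally
* if we are the inode's auth, otherwise sends MInodeUnlink and waits for the
* ack.  The simple cases fall straight through to dentry_unlink_finish.
*/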
void MDCache::dentry_unlink(CDentry *dn, Context *c)
{
CDir *dir = dn->dir;
string dname = dn->name;
assert(dn->lockstate == DN_LOCK_XLOCK);
// i need the inode to do any of this properly
assert(dn->inode);
// log it
if (dn->inode) dn->inode->mark_unsafe(); // XXX ??? FIXME
mds->mdlog->submit_entry(new EUnlink(dir, dn, dn->inode),
NULL); // FIXME FIXME FIXME
// tell replicas
if (dir->is_open_by_anyone()) {
for (set<int>::iterator it = dir->open_by_begin();
it != dir->open_by_end();
it++) {
dout(7) << "inode_unlink sending DentryUnlink to " << *it << endl;
mds->send_message_mds(new MDentryUnlink(dir->ino(), dn->name), *it, MDS_PORT_CACHE);
}
// don't need ack.
}
// inode deleted?
if (dn->is_primary()) {
assert(dn->inode->is_auth());
dn->inode->inode.nlink--;
if (dn->inode->is_dir()) assert(dn->inode->inode.nlink == 0); // no hard links on dirs
// last link?
if (dn->inode->inode.nlink == 0) {
// truly dangling
if (dn->inode->dir) {
// mark dir clean too, since it now dne!
assert(dn->inode->dir->is_auth());
dn->inode->dir->state_set(CDIR_STATE_DELETED);
dn->inode->dir->remove_null_dentries();
dn->inode->dir->mark_clean();
}
// mark it clean, it's dead
if (dn->inode->is_dirty())
dn->inode->mark_clean();
} else {
// migrate to inode file
dout(7) << "removed primary, but there are remote links, moving to inode file: " << *dn->inode << endl;
// dangling but still linked.
assert(dn->inode->is_anchored());
// unlink locally
CInode *in = dn->inode;
dn->dir->unlink_inode( dn );
dn->mark_dirty();
// mark it dirty!
in->mark_dirty();
// update anchor to point to inode file+mds
vector<Anchor*> atrace;
in->make_anchor_trace(atrace);
assert(atrace.size() == 1); // it's dangling
mds->anchorclient->update(in->ino(), atrace,
new C_MDC_DentryUnlink(this, dn, dir, c));
return;
}
}
else if (dn->is_remote()) {
// need to dec nlink on primary
if (dn->inode->is_auth()) {
// awesome, i can do it
dout(7) << "remote target is local, nlink--" << endl;
dn->inode->inode.nlink--;
dn->inode->mark_dirty();
if (( dn->inode->state_test(CINODE_STATE_DANGLING) && dn->inode->inode.nlink == 0) ||
(!dn->inode->state_test(CINODE_STATE_DANGLING) && dn->inode->inode.nlink == 1)) {
dout(7) << "nlink=1+primary or 0+dangling, removing anchor" << endl;
// remove anchor (async)
mds->anchorclient->destroy(dn->inode->ino(), NULL);
}
} else {
int auth = dn->inode->authority();
dout(7) << "remote target is remote, sending unlink request to " << auth << endl;
mds->send_message_mds(new MInodeUnlink(dn->inode->ino(), mds->get_nodeid()),
auth, MDS_PORT_CACHE);
// unlink locally
CInode *in = dn->inode;
dn->dir->unlink_inode( dn );
dn->mark_dirty();
// add waiter
in->add_waiter(CINODE_WAIT_UNLINK, c);
return;
}
}
else
assert(0); // unlink on null dentry??
// unlink locally
dn->dir->unlink_inode( dn );
dn->mark_dirty();
// finish!
dentry_unlink_finish(dn, dir, c);
}
void MDCache::dentry_unlink_finish(CDentry *dn, CDir *dir, Context *c)
{
dout(7) << "dentry_unlink_finish on " << *dn << endl;
string dname = dn->name;
// unpin dir / unxlock
mds->locker->dentry_xlock_finish(dn, true); // quiet, no need to bother replicas since they're already unlinking
// did i empty out an imported dir?
if (dir->is_import() && !dir->inode->is_root() && dir->get_size() == 0)
migrator->export_empty_import(dir);
// wake up any waiters
dir->take_waiting(CDIR_WAIT_ANY, dname, mds->finished_queue);
c->finish(0);
}
void MDCache::handle_dentry_unlink(MDentryUnlink *m)
{
CInode *diri = get_inode(m->get_dirino());
CDir *dir = 0;
if (diri) dir = diri->dir;
if (!diri || !dir) {
dout(7) << "handle_dentry_unlink don't have dir " << m->get_dirino() << endl;
}
else {
CDentry *dn = dir->lookup(m->get_dn());
if (!dn) {
dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << endl;
} else {
dout(7) << "handle_dentry_unlink on " << *dn << endl;
// dir?
if (dn->inode) {
if (dn->inode->dir) {
dn->inode->dir->state_set(CDIR_STATE_DELETED);
dn->inode->dir->remove_null_dentries();
}
}
string dname = dn->name;
// unlink
dn->dir->remove_dentry(dn);
// wake up
//dir->finish_waiting(CDIR_WAIT_DNREAD, dname);
dir->take_waiting(CDIR_WAIT_DNREAD, dname, mds->finished_queue);
}
}
delete m;
return;
}
void MDCache::handle_inode_unlink(MInodeUnlink *m)
{
CInode *in = get_inode(m->get_ino());
assert(in);
// proxy?
if (in->is_proxy()) {
dout(7) << "handle_inode_unlink proxy on " << *in << endl;
mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE);
return;
}
assert(in->is_auth());
// do it.
dout(7) << "handle_inode_unlink nlink=" << in->inode.nlink << " on " << *in << endl;
assert(in->inode.nlink > 0);
in->inode.nlink--;
if (in->state_test(CINODE_STATE_DANGLING)) {
// already dangling.
// last link?
if (in->inode.nlink == 0) {
dout(7) << "last link, marking clean and removing anchor" << endl;
in->mark_clean(); // mark it clean.
// remove anchor (async)
mds->anchorclient->destroy(in->ino(), NULL);
}
else {
in->mark_dirty();
}
} else {
// has primary link still.
assert(in->inode.nlink >= 1);
in->mark_dirty();
if (in->inode.nlink == 1) {
dout(7) << "nlink=1, removing anchor" << endl;
// remove anchor (async)
mds->anchorclient->destroy(in->ino(), NULL);
}
}
// ack
mds->send_message_mds(new MInodeUnlinkAck(m->get_ino()), m->get_from(), MDS_PORT_CACHE);
}
void MDCache::handle_inode_unlink_ack(MInodeUnlinkAck *m)
{
CInode *in = get_inode(m->get_ino());
assert(in);
dout(7) << "handle_inode_unlink_ack on " << *in << endl;
in->finish_waiting(CINODE_WAIT_UNLINK, 0);
}
/*
* some import/export helpers
*/
/** con = get_auth_container(dir)
* Returns the directory in which authority is delegated for *dir.
* This may be because a directory is an import, or because it is hashed
* and we are nested underneath an inode in that dir (that hashes to us).
* Thus do not assume con->is_auth()!  It is either is_auth() or is_hashed().
*/
CDir *MDCache::get_auth_container(CDir *dir)
{
CDir *imp = dir; // might be *dir
// find the underlying import or hash that delegates dir
while (true) {
if (imp->is_import()) break; // import
imp = imp->get_parent_dir();
assert(imp);
if (imp->is_hashed()) break; // hash
}
return imp;
}
void MDCache::find_nested_exports(CDir *dir, set<CDir*>& s)
{
CDir *import = get_auth_container(dir);
find_nested_exports_under(import, dir, s);
}
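// find_nested_exports_under: of the exports nested beneath 'import', collect
// into 's' those that sit at or below 'dir' (walk each export's parents up
// toward the import and see whether we pass through 'dir' on the way).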
void MDCache::find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s)
{
dout(10) << "find_nested_exports for " << *dir << endl;
dout(10) << "find_nested_exports_under import " << *import << endl;
if (import == dir) {
// yay, my job is easy!
for (set<CDir*>::iterator p = nested_exports[import].begin();
p != nested_exports[import].end();
p++) {
CDir *nested = *p;
s.insert(nested);
dout(10) << "find_nested_exports " << *dir << " " << *nested << endl;
}
return;
}
// ok, my job is annoying.
for (set<CDir*>::iterator p = nested_exports[import].begin();
p != nested_exports[import].end();
p++) {
CDir *nested = *p;
dout(12) << "find_nested_exports checking " << *nested << endl;
// trace back to import, or dir
CDir *cur = nested->get_parent_dir();
while (!cur->is_import() || cur == dir) {
if (cur == dir) {
s.insert(nested);
dout(10) << "find_nested_exports " << *dir << " " << *nested << endl;
break;
} else {
cur = cur->get_parent_dir();
}
}
}
}
// ==============================================================
// debug crap
void MDCache::show_imports()
{
mds->balancer->show_imports();
}
void MDCache::show_cache()
{
dout(7) << "show_cache" << endl;
for (hash_map<inodeno_t,CInode*>::iterator it = inode_map.begin();
it != inode_map.end();
it++) {
dout(7) << *((*it).second) << endl;
CDentry *dn = (*it).second->get_parent_dn();
if (dn)
dout(7) << " dn " << *dn << endl;
if ((*it).second->dir)
dout(7) << " subdir " << *(*it).second->dir << endl;
}
}