// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "MDS.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "AnchorClient.h"
#include "IdAllocator.h"

#include "msg/Messenger.h"

#include "messages/MClientSession.h"
#include "messages/MClientRequest.h"
#include "messages/MClientReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientFileCaps.h"

#include "messages/MMDSSlaveRequest.h"

#include "messages/MLock.h"
#include "messages/MDentryUnlink.h"

#include "events/EString.h"
#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"

#include "include/filepath.h"
#include "common/Timer.h"
#include "common/Logger.h"
#include "common/LogType.h"

// NOTE(review): the next four #include directives lost their <header>
// names (angle brackets appear to have been stripped by extraction,
// like the container template arguments throughout this file) --
// restore the original system headers before compiling.
#include
#include
#include
#include

using namespace std;

#include "config.h"

// debug output macros: gated on the global/MDS debug levels, tagged
// with timestamp and this MDS's node id.
#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".server "
#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".server "

// (Re)create the per-server perf logger.  The LogType (set of counter
// names) is built once, process-wide; the Logger itself is replaced on
// every call (e.g. after the MDS changes identity).
void Server::reopen_logger(utime_t start, bool append)
{
  static LogType mdserver_logtype;
  static bool didit = false;
  if (!didit) {
    didit = true;
    mdserver_logtype.add_inc("hcreq"); // handle client req
    mdserver_logtype.add_inc("hsreq"); // slave
    mdserver_logtype.add_inc("hcsess");    // client session
    mdserver_logtype.add_inc("dcreq"); // dispatch client req
    mdserver_logtype.add_inc("dsreq"); // slave
  }

  if (logger)
    delete logger;

  // logger
  char name[80];
  sprintf(name, "mds%d.server", mds->get_nodeid());
  logger = new Logger(name, &mdserver_logtype, append);
  logger->set_start(start);
}

// Top-level message entry point for the Server subsystem.
// Reconnects are handled even before the MDS is active; everything
// else waits for active (or stopping) state.
void Server::dispatch(Message *m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect((MClientReconnect*)m);
    return;
  }

  // active?
  if (!mds->is_active() && !mds->is_stopping()) {
    dout(3) << "not active yet, waiting" << dendl;
    mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
    return;
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session((MClientSession*)m);
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request((MClientRequest*)m);
    return;
  case MSG_MDS_SLAVE_REQUEST:
    handle_slave_request((MMDSSlaveRequest*)m);
    return;
  }

  dout(1) << "server unknown message " << m->get_type() << dendl;
  assert(0);
}


// ----------------------------------------------------------
// SESSION management

// Completion context: fires after the ESession journal entry commits
// and applies the session open/close to the clientmap.
class C_MDS_session_finish : public Context {
  MDS *mds;
  entity_inst_t client_inst;
  bool open;            // true = session open, false = session close
  version_t cmapv;      // projected clientmap version this event carries
public:
  C_MDS_session_finish(MDS *m, entity_inst_t ci, bool s, version_t mv) :
    mds(m), client_inst(ci), open(s), cmapv(mv) { }
  void finish(int r) {
    assert(r == 0);
    mds->server->_session_logged(client_inst, open, cmapv);
  }
};

// Handle a client session open/close request.  Duplicate or stale
// requests are dropped; otherwise the state change is journaled and
// applied in _session_logged() once the journal entry commits.
void Server::handle_client_session(MClientSession *m)
{
  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  int from = m->get_source().num();
  bool open = m->op == MClientSession::OP_REQUEST_OPEN;

  if (open) {
    if (mds->clientmap.have_session(from)) {
      dout(10) << "already open, dropping this req" << dendl;
      delete m;
      return;
    }
    if (mds->clientmap.is_opening(from)) {
      dout(10) << "already opening, dropping this req" << dendl;
      delete m;
      return;
    }
    mds->clientmap.add_opening(from);
  } else {
    if (mds->clientmap.is_closing(from)) {
      dout(10) << "already closing, dropping this req" << dendl;
      delete m;
      return;
    }
    // a close must quote the current push seq, else it raced with
    // something we pushed and is stale.
    if (m->seq < mds->clientmap.get_push_seq(from)) {
      dout(10) << "old push seq " << m->seq << " < " << mds->clientmap.get_push_seq(from)
	       << ", dropping" << dendl;
      delete m;
      return;
    }
    assert(m->seq == mds->clientmap.get_push_seq(from));
    mds->clientmap.add_closing(from);
  }

  // journal it
  version_t cmapv = mds->clientmap.inc_projected();
  dout(10) << " clientmap v " << mds->clientmap.get_version()
	   << " pv " << cmapv << dendl;
  mdlog->submit_entry(new ESession(m->get_source_inst(), open, cmapv),
		      new C_MDS_session_finish(mds, m->get_source_inst(), open, cmapv));
  delete m;
  if (logger)
    logger->inc("hcsess");
}

// Journal-commit callback for a session state change: apply the
// open/close to the clientmap and ack the client.
void Server::_session_logged(entity_inst_t client_inst, bool open, version_t cmapv)
{
  dout(10) << "_session_logged " << client_inst << " " << (open ? "open":"close")
	   << " " << cmapv << dendl;

  // apply
  int from = client_inst.name.num();
  if (open) {
    assert(mds->clientmap.is_opening(from));
    mds->clientmap.open_session(client_inst);
  } else if (mds->clientmap.is_closing(from)) {
    mds->clientmap.close_session(from);
    // purge completed requests from clientmap
    mds->clientmap.trim_completed_requests(client_inst.name, 0);
  } else {
    // close must have been canceled (by an import?) ...
    assert(!open);
    mds->clientmap.noop();   // still consume the projected version
  }
  assert(cmapv == mds->clientmap.get_version());

  // reply
  if (open)
    mds->messenger->send_message(new MClientSession(MClientSession::OP_OPEN), client_inst);
  else
    mds->messenger->send_message(new MClientSession(MClientSession::OP_CLOSE), client_inst);
}

// Mark a batch of (imported) clients as session-opening and bump the
// projected clientmap version.  Paired with finish_force_open_sessions().
// NOTE(review): template args on `map` were stripped by extraction --
// presumably map<int,entity_inst_t>; confirm against Server.h.
void Server::prepare_force_open_sessions(map& cm)
{
  version_t cmapv = mds->clientmap.inc_projected();
  dout(10) << "prepare_force_open_sessions " << cmapv
	   << " on " << cm.size() << " clients" << dendl;
  for (map::iterator p = cm.begin(); p != cm.end(); ++p) {
    mds->clientmap.add_opening(p->first);
  }
}

// Actually open (or un-close) the sessions prepared above, notify the
// clients, and advance the clientmap version by one.
void Server::finish_force_open_sessions(map& cm)
{
  version_t v = mds->clientmap.get_version();
  dout(10) << "finish_force_open_sessions on " << cm.size() << " clients,"
	   << " v " << v << " -> " << (v+1) << dendl;
  for (map::iterator p = cm.begin(); p != cm.end(); ++p) {
    if (mds->clientmap.is_closing(p->first)) {
      dout(15) << "force_open_sessions canceling close on " << p->second << dendl;
      mds->clientmap.remove_closing(p->first);
      continue;
    }
    if (mds->clientmap.have_session(p->first)) {
      dout(15) << "force_open_sessions have session " << p->second << dendl;
      continue;
    }
    dout(10) << "force_open_sessions opening " << p->second << dendl;
    mds->clientmap.open_session(p->second);
    mds->messenger->send_message(new MClientSession(MClientSession::OP_OPEN), p->second);
  }
  mds->clientmap.set_version(v+1);
}

// Journal a close for every open session (shutdown path).
void Server::terminate_sessions()
{
  dout(2) << "terminate_sessions" << dendl;

  // kill them off.  clients will retry etc.
  for (set::const_iterator p = mds->clientmap.get_session_set().begin();
       p != mds->clientmap.get_session_set().end();
       ++p) {
    if (mds->clientmap.is_closing(*p))
      continue;
    mds->clientmap.add_closing(*p);
    version_t cmapv = mds->clientmap.inc_projected();
    mdlog->submit_entry(new ESession(mds->clientmap.get_inst(*p), false, cmapv),
			new C_MDS_session_finish(mds, mds->clientmap.get_inst(*p), false, cmapv));
  }
}

// Start the client reconnect phase after an MDS restart/takeover:
// broadcast the mdsmap to sessioned clients and build the gather set
// of clients we expect MClientReconnect messages from.
void Server::reconnect_clients()
{
  // reconnect with clients
  if (mds->clientmap.get_session_set().empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  dout(7) << "reconnect_clients -- sending mdsmap to clients with sessions" << dendl;
  mds->bcast_mds_map();  // send mdsmap to all client sessions

  // init gather list
  reconnect_start = g_clock.now();
  client_reconnect_gather = mds->clientmap.get_session_set();
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
}

// Process one client's reconnect message: either close a dead session,
// or re-establish the client's file capabilities (locally if we are
// auth, otherwise recorded for the rejoin cap-export machinery).
void Server::handle_client_reconnect(MClientReconnect *m)
{
  dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
  int from = m->get_source().num();

  if (m->closed) {
    dout(7) << " client had no session, removing from clientmap" << dendl;
    mds->clientmap.add_closing(from);
    version_t cmapv = mds->clientmap.inc_projected();
    mdlog->submit_entry(new ESession(mds->clientmap.get_inst(from), false, cmapv),
			new C_MDS_session_finish(mds, mds->clientmap.get_inst(from), false, cmapv));
  } else {
    // caps
    for (map::iterator p = m->inode_caps.begin();
	 p != m->inode_caps.end();
	 ++p) {
      CInode *in = mdcache->get_inode(p->first);
      if (in && in->is_auth()) {
	// we recovered it, and it's ours.  take note.
	dout(15) << "open caps on " << *in << dendl;
	in->reconnect_cap(from, p->second);
	reconnected_caps.insert(in);
	continue;
      }
      filepath path = m->inode_path[p->first];
      if ((in && !in->is_auth()) ||
	  !mds->mdcache->path_is_mine(path)) {
	// not mine.
dout(0) << "non-auth " << p->first << " " << m->inode_path[p->first] << ", will pass off to authority" << dendl; // mark client caps stale. inode_t fake_inode; fake_inode.ino = p->first; MClientFileCaps *stale = new MClientFileCaps(MClientFileCaps::OP_EXPORT, fake_inode, 0, 0, // doesn't matter. p->second.wanted); // doesn't matter. mds->send_message_client(stale, m->get_source_inst()); // add to cap export list. mdcache->rejoin_export_caps(p->first, m->inode_path[p->first], from, p->second); } else { // mine. fetch later. dout(0) << "missing " << p->first << " " << m->inode_path[p->first] << " (mine), will load later" << dendl; mdcache->rejoin_recovered_caps(p->first, m->inode_path[p->first], from, p->second, -1); // "from" me. } } } // remove from gather set client_reconnect_gather.erase(from); if (client_reconnect_gather.empty()) reconnect_gather_finish(); delete m; } /* * called by mdcache, late in rejoin (right before acks are sent) */ void Server::process_reconnected_caps() { dout(10) << "process_reconnected_caps" << dendl; // adjust filelock state appropriately for (set::iterator p = reconnected_caps.begin(); p != reconnected_caps.end(); ++p) { CInode *in = *p; int issued = in->get_caps_issued(); if (in->is_auth()) { // wr? if (issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) { if (issued & (CAP_FILE_RDCACHE|CAP_FILE_WRBUFFER)) { in->filelock.set_state(LOCK_LONER); } else { in->filelock.set_state(LOCK_MIXED); } } } else { // note that client should perform stale/reap cleanup during reconnect. assert(issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER) == 0); // ???? 
if (in->filelock.is_xlocked()) in->filelock.set_state(LOCK_LOCK); else in->filelock.set_state(LOCK_SYNC); // might have been lock, previously } dout(15) << " issued " << cap_string(issued) << " chose " << in->filelock << " on " << *in << dendl; } reconnected_caps.clear(); // clean up } void Server::client_reconnect_failure(int from) { dout(5) << "client_reconnect_failure on client" << from << dendl; if (mds->is_reconnect() && client_reconnect_gather.count(from)) { client_reconnect_gather.erase(from); if (client_reconnect_gather.empty()) reconnect_gather_finish(); } } void Server::reconnect_gather_finish() { dout(7) << "reconnect_gather_finish" << dendl; mds->reconnect_done(); } /******* * some generic stuff for finishing off requests */ /* * send generic response (just and error code) */ void Server::reply_request(MDRequest *mdr, int r, CInode *tracei) { reply_request(mdr, new MClientReply(mdr->client_request, r), tracei); } /* * send given reply * include a trace to tracei */ void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei) { MClientRequest *req = mdr->client_request; dout(10) << "reply_request " << reply->get_result() << " (" << strerror(-reply->get_result()) << ") " << *req << dendl; // note result code in clientmap? 
  // non-idempotent ops are remembered so a client retry after a
  // reply loss isn't replayed.
  if (!req->is_idempotent())
    mds->clientmap.add_completed_request(mdr->reqid);

  /*
  if (tracei && !tracei->hack_accessed) {
    tracei->hack_accessed = true;
    mds->logger->inc("newt");
    if (tracei->parent &&
	tracei->parent->dir->hack_num_accessed >= 0) {
      tracei->parent->dir->hack_num_accessed++;
      if (tracei->parent->dir->hack_num_accessed == 1)
	mds->logger->inc("dirt1");
      if (tracei->parent->dir->hack_num_accessed == 2)
	mds->logger->inc("dirt2");
      if (tracei->parent->dir->hack_num_accessed == 3)
	mds->logger->inc("dirt3");
      if (tracei->parent->dir->hack_num_accessed == 4)
	mds->logger->inc("dirt4");
      if (tracei->parent->dir->hack_num_accessed == 5)
	mds->logger->inc("dirt5");
    }
  }
  */

  // include trace
  if (tracei)
    reply->set_trace_dist( tracei, mds->get_nodeid() );

  reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());

  // send reply
  if (req->get_client_inst().name.is_mds())
    delete reply;   // mds doesn't need a reply
  else
    messenger->send_message(reply, req->get_client_inst());

  // finish request
  mdcache->request_finish(mdr);

  // a remote-linked trace dentry may now be re-evaluable
  if (tracei &&
      tracei->get_parent_dn() &&
      tracei->get_parent_dn()->is_remote())
    mdcache->eval_remote(tracei->get_parent_dn());
}


/***
 * process a client request
 *
 * Validates state (active, root open, session present), short-circuits
 * completed retries, resolves ino-addressed ops to an inode, then
 * registers an MDRequest and dispatches it.
 */
void Server::handle_client_request(MClientRequest *req)
{
  dout(4) << "handle_client_request " << *req << dendl;

  if (logger) logger->inc("hcreq");

  if (!mds->is_active() &&
      !(mds->is_stopping() && req->get_client_inst().name.is_mds())) {
    dout(5) << " not active (or stopping+mds), discarding request." << dendl;
    delete req;
    return;
  }

  if (!mdcache->get_root()) {
    dout(5) << "need to open root" << dendl;
    mdcache->open_root(new C_MDS_RetryMessage(mds, req));
    return;
  }

  // active session?
  if (req->get_client_inst().name.is_client() &&
      !mds->clientmap.have_session(req->get_client_inst().name.num())) {
    dout(5) << "no session for " << req->get_client_inst().name << ", dropping" << dendl;
    delete req;
    return;
  }

  // old mdsmap?
  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
    // send it?  hrm, this isn't ideal; they may get a lot of copies if
    // they have a high request rate.
  }

  // okay, i want
  CInode *ref = 0;

  // retry?
  if (req->get_retry_attempt()) {
    if (mds->clientmap.have_completed_request(req->get_reqid())) {
      dout(5) << "already completed " << req->get_reqid() << dendl;
      mds->messenger->send_message(new MClientReply(req, 0), req->get_client_inst());
      delete req;
      return;
    }
  }
  // trim completed_request list
  if (req->get_oldest_client_tid() > 0) {
    dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
    mds->clientmap.trim_completed_requests(req->get_client_inst().name,
					   req->get_oldest_client_tid());
  }

  // -----
  // some ops are on ino's
  switch (req->get_op()) {
  case CEPH_MDS_OP_FSTAT:
    ref = mdcache->get_inode(req->head.args.fstat.ino);
    assert(ref);
    break;

  case CEPH_MDS_OP_TRUNCATE:
    if (!req->head.args.truncate.ino)
      break;   // can be called w/ either fh OR path
    ref = mdcache->get_inode(req->head.args.truncate.ino);
    assert(ref);
    break;

  case CEPH_MDS_OP_FSYNC:
    ref = mdcache->get_inode(req->head.args.fsync.ino);   // fixme someday no ino needed?
    assert(ref);
    break;
  }

  // register + dispatch
  MDRequest *mdr = mdcache->request_start(req);
  if (!mdr)
    return;
  if (ref) {
    dout(10) << "inode op on ref " << *ref << dendl;
    mdr->ref = ref;
    mdr->pin(ref);
  }

  dispatch_client_request(mdr);
  return;
}

// Route a registered client request to the per-op handler.
void Server::dispatch_client_request(MDRequest *mdr)
{
  MClientRequest *req = mdr->client_request;

  if (logger) logger->inc("dcreq");

  if (mdr->ref) {
    dout(7) << "dispatch_client_request " << *req << " ref " << *mdr->ref << dendl;
  } else {
    dout(7) << "dispatch_client_request " << *req << dendl;
  }

  // we shouldn't be waiting on anyone.
  assert(mdr->more()->waiting_on_slave.empty());

  switch (req->get_op()) {
    // inodes ops.
  case CEPH_MDS_OP_STAT:
  case CEPH_MDS_OP_LSTAT:
    handle_client_stat(mdr);
    break;

  case CEPH_MDS_OP_UTIME:
    handle_client_utime(mdr);
    break;

  case CEPH_MDS_OP_CHMOD:
    handle_client_chmod(mdr);
    break;

  case CEPH_MDS_OP_CHOWN:
    handle_client_chown(mdr);
    break;

  case CEPH_MDS_OP_TRUNCATE:
    handle_client_truncate(mdr);
    break;

  case CEPH_MDS_OP_READDIR:
    handle_client_readdir(mdr);
    break;

  case CEPH_MDS_OP_FSYNC:
    //handle_client_fsync(req, ref);
    break;

    // funky.
  case CEPH_MDS_OP_OPEN:
    if (req->head.args.open.flags & O_CREAT)
      handle_client_openc(mdr);
    else
      handle_client_open(mdr);
    break;

    // namespace.
    // no prior locks.
  case CEPH_MDS_OP_MKNOD:
    handle_client_mknod(mdr);
    break;
  case CEPH_MDS_OP_LINK:
    handle_client_link(mdr);
    break;
  case CEPH_MDS_OP_UNLINK:
  case CEPH_MDS_OP_RMDIR:
    handle_client_unlink(mdr);
    break;
  case CEPH_MDS_OP_RENAME:
    handle_client_rename(mdr);
    break;
  case CEPH_MDS_OP_MKDIR:
    handle_client_mkdir(mdr);
    break;
  case CEPH_MDS_OP_SYMLINK:
    handle_client_symlink(mdr);
    break;

  default:
    dout(1) << " unknown client op " << req->get_op() << dendl;
    assert(0);
  }
}


// ---------------------------------------
// SLAVE REQUESTS

// Handle a slave-protocol message from another MDS: either an ack
// (reply) for a request we are mastering, or a new/continued slave
// request that this MDS services on the master's behalf.
void Server::handle_slave_request(MMDSSlaveRequest *m)
{
  dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
  int from = m->get_source().num();

  if (logger) logger->inc("hsreq");

  // reply?
  if (m->is_reply()) {
    switch (m->get_op()) {
    case MMDSSlaveRequest::OP_XLOCKACK:
      {
	// identify lock, master request
	SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
						 m->get_object_info());
	MDRequest *mdr = mdcache->request_get(m->get_reqid());
	mdr->more()->slaves.insert(from);
	dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
	mdr->xlocks.insert(lock);
	mdr->locks.insert(lock);
	lock->get_xlock(mdr);
	lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK);
      }
      break;

    case MMDSSlaveRequest::OP_AUTHPINACK:
      {
	MDRequest *mdr = mdcache->request_get(m->get_reqid());
	handle_slave_auth_pin_ack(mdr, m);
      }
      break;

    case MMDSSlaveRequest::OP_LINKPREPACK:
      {
	MDRequest *mdr = mdcache->request_get(m->get_reqid());
	handle_slave_link_prep_ack(mdr, m);
      }
      break;

    case MMDSSlaveRequest::OP_RENAMEPREPACK:
      {
	MDRequest *mdr = mdcache->request_get(m->get_reqid());
	handle_slave_rename_prep_ack(mdr, m);
      }
      break;

    default:
      assert(0);
    }

    // done with reply.
    delete m;
    return;

  } else {
    // am i a new slave?
    MDRequest *mdr;
    if (mdcache->have_request(m->get_reqid())) {
      // existing?
      mdr = mdcache->request_get(m->get_reqid());
      if (mdr->slave_to_mds != from) {   // may not even be a slave! (e.g. forward race)
	dout(10) << "local request " << *mdr << " not slave to mds" << from
		 << ", ignoring " << *m << dendl;
	delete m;
	return;
      }
    } else {
      // new?
      if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
	dout(10) << "missing slave request for " << m->get_reqid()
		 << " OP_FINISH, must have lost race with a forward" << dendl;
	delete m;
	return;
      }
      mdr = mdcache->request_start_slave(m->get_reqid(), m->get_source().num());
    }
    assert(mdr->slave_request == 0);     // only one at a time, please!
    mdr->slave_request = m;
    dispatch_slave_request(mdr);
  }
}

// Route a slave request to its handler (remote xlock/unxlock, auth
// pin, link/rename prep, finish).
void Server::dispatch_slave_request(MDRequest *mdr)
{
  dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;

  if (mdr->aborted) {
    dout(7) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
    return;
  }

  if (logger) logger->inc("dsreq");

  switch (mdr->slave_request->get_op()) {
  case MMDSSlaveRequest::OP_XLOCK:
    {
      // identify object
      SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
					       mdr->slave_request->get_object_info());

      if (lock && lock->get_parent()->is_auth()) {
	// xlock.
	// use acquire_locks so that we get auth_pinning.
	set rdlocks;
	set wrlocks;
	set xlocks = mdr->xlocks;
	xlocks.insert(lock);

	if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
	  return;

	// ack
	MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCKACK);
	r->set_lock_type(lock->get_type());
	lock->get_parent()->set_object_info(r->get_object_info());
	mds->send_message_mds(r, mdr->slave_request->get_source().num());
      } else {
	if (lock) {
	  dout(10) << "not auth for remote xlock attempt, dropping on "
		   << *lock << " on " << *lock->get_parent() << dendl;
	} else {
	  dout(10) << "don't have object, dropping" << dendl;
	  assert(0); // can this happen, if we auth pinned properly.
	}
      }

      // done.
      delete mdr->slave_request;
      mdr->slave_request = 0;
    }
    break;

  case MMDSSlaveRequest::OP_UNXLOCK:
    {
      SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
					       mdr->slave_request->get_object_info());
      assert(lock);
      mds->locker->xlock_finish(lock, mdr);

      // done.  no ack necessary.
      delete mdr->slave_request;
      mdr->slave_request = 0;
    }
    break;

  case MMDSSlaveRequest::OP_AUTHPIN:
    handle_slave_auth_pin(mdr);
    break;

  case MMDSSlaveRequest::OP_LINKPREP:
  case MMDSSlaveRequest::OP_UNLINKPREP:
    handle_slave_link_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_RENAMEPREP:
    handle_slave_rename_prep(mdr);
    break;

  case MMDSSlaveRequest::OP_FINISH:
    // finish off request.
    mdcache->request_finish(mdr);
    break;

  default:
    assert(0);
  }
}

// Auth-pin the objects a master MDS asked us to hold, or wait/fail,
// then ack with the set of pins we actually hold.
void Server::handle_slave_auth_pin(MDRequest *mdr)
{
  dout(10) << "handle_slave_auth_pin " << *mdr << dendl;

  // build list of objects
  list objects;
  bool fail = false;

  for (list::iterator p = mdr->slave_request->get_authpins().begin();
       p != mdr->slave_request->get_authpins().end();
       ++p) {
    MDSCacheObject *object = mdcache->get_object(*p);
    if (!object) {
      dout(10) << " don't have " << *p << dendl;
      fail = true;
      break;
    }

    objects.push_back(object);
  }

  // can we auth pin them?
  if (!fail) {
    for (list::iterator p = objects.begin();
	 p != objects.end();
	 ++p) {
      if (!(*p)->is_auth()) {
	dout(10) << " not auth for " << **p << dendl;
	fail = true;
	break;
      }
      if (!mdr->is_auth_pinned(*p) &&
	  !(*p)->can_auth_pin()) {
	// wait
	dout(10) << " waiting for authpinnable on " << **p << dendl;
	(*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
	mdr->drop_local_auth_pins();
	return;
      }
    }
  }

  // auth pin!
  if (fail) {
    mdr->drop_local_auth_pins();  // just in case
  } else {
    for (list::iterator p = objects.begin();
	 p != objects.end();
	 ++p) {
      dout(10) << "auth_pinning " << **p << dendl;
      mdr->auth_pin(*p);
    }
  }

  // ack!
  MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_AUTHPINACK);

  // return list of my auth_pins (if any)
  for (set::iterator p = mdr->auth_pins.begin();
       p != mdr->auth_pins.end();
       ++p) {
    MDSCacheObjectInfo info;
    (*p)->set_object_info(info);
    reply->get_authpins().push_back(info);
  }

  mds->send_message_mds(reply, mdr->slave_to_mds);

  // clean up this request
  delete mdr->slave_request;
  mdr->slave_request = 0;
  return;
}

// Master-side: record which objects the slave pinned/unpinned, then
// resume the request once no more slave acks are outstanding.
void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
{
  dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
  int from = ack->get_source().num();

  // added auth pins?
set pinned; for (list::iterator p = ack->get_authpins().begin(); p != ack->get_authpins().end(); ++p) { MDSCacheObject *object = mdcache->get_object(*p); assert(object); // we pinned it dout(10) << " remote has pinned " << *object << dendl; if (!mdr->is_auth_pinned(object)) mdr->remote_auth_pins.insert(object); pinned.insert(object); } // removed auth pins? set::iterator p = mdr->remote_auth_pins.begin(); while (p != mdr->remote_auth_pins.end()) { if ((*p)->authority().first == from && pinned.count(*p) == 0) { dout(10) << " remote has unpinned " << **p << dendl; set::iterator o = p; ++p; mdr->remote_auth_pins.erase(o); } else { ++p; } } // note slave mdr->more()->slaves.insert(from); // clear from waiting list assert(mdr->more()->waiting_on_slave.count(from)); mdr->more()->waiting_on_slave.erase(from); // go again? if (mdr->more()->waiting_on_slave.empty()) dispatch_client_request(mdr); else dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; } // --------------------------------------- // HELPERS /** validate_dentry_dir * * verify that the dir exists and would own the dname. * do not check if the dentry exists. */ CDir *Server::validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname) { // make sure parent is a dir? if (!diri->is_dir()) { dout(7) << "validate_dentry_dir: not a dir" << dendl; reply_request(mdr, -ENOTDIR); return false; } // which dirfrag? frag_t fg = diri->pick_dirfrag(dname); CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); if (!dir) return 0; // frozen? if (dir->is_frozen()) { dout(7) << "dir is frozen " << *dir << dendl; dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); return false; } return dir; } /** prepare_null_dentry * prepare a null (or existing) dentry in given dir. * wait for any dn lock. 
*/ CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist) { dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl; assert(dir->is_auth()); // does it already exist? CDentry *dn = dir->lookup(dname); if (dn) { if (dn->lock.is_xlocked_by_other(mdr)) { dout(10) << "waiting on xlocked dentry " << *dn << dendl; dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); return 0; } if (!dn->is_null()) { // name already exists dout(10) << "dentry " << dname << " exists in " << *dir << dendl; if (!okexist) { reply_request(mdr, -EEXIST); return 0; } } return dn; } // make sure dir is complete if (!dir->is_complete()) { dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl; dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); return 0; } // create dn = dir->add_null_dentry(dname); dn->mark_new(); dout(10) << "prepare_null_dentry added " << *dn << dendl; return dn; } /** prepare_new_inode * * create a new inode. set c/m/atime. hit dir pop. 
*/ CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir) { CInode *in = mdcache->create_inode(); in->inode.uid = mdr->client_request->get_caller_uid(); in->inode.gid = mdr->client_request->get_caller_gid(); in->inode.ctime = in->inode.mtime = in->inode.atime = mdr->now; // now dout(10) << "prepare_new_inode " << *in << dendl; return in; } CDir *Server::traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath) { // figure parent dir vs dname if (refpath.depth() == 0) { dout(7) << "can't do that to root" << dendl; reply_request(mdr, -EINVAL); return 0; } string dname = refpath.last_dentry(); refpath.pop_dentry(); dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl; // traverse to parent dir int r = mdcache->path_traverse(mdr, mdr->client_request, refpath, trace, true, MDS_TRAVERSE_FORWARD); if (r > 0) return 0; // delayed if (r < 0) { reply_request(mdr, r); return 0; } // open inode CInode *diri; if (trace.empty()) diri = mdcache->get_inode(refpath.get_ino()); else diri = mdcache->get_dentry_inode(trace[trace.size()-1], mdr); if (!diri) return 0; // opening inode. // is it an auth dir? CDir *dir = validate_dentry_dir(mdr, diri, dname); if (!dir) return 0; // forwarded or waiting for freeze dout(10) << "traverse_to_auth_dir " << *dir << dendl; return dir; } CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, bool want_auth) { // already got ref? if (mdr->ref) return mdr->ref; MClientRequest *req = mdr->client_request; // traverse filepath refpath = req->get_filepath(); vector trace; int r = mdcache->path_traverse(mdr, req, refpath, trace, req->follow_trailing_symlink(), MDS_TRAVERSE_FORWARD); if (r > 0) return false; // delayed if (r < 0) { // error reply_request(mdr, r); return 0; } // open ref inode CInode *ref = 0; if (trace.empty()) ref = mdcache->get_root(); else { CDentry *dn = trace[trace.size()-1]; // if no inode (null or unattached remote), fw to dentry auth? 
if (want_auth && !dn->is_auth() && (dn->is_null() || (dn->is_remote() && dn->inode))) { if (dn->is_ambiguous_auth()) { dout(10) << "waiting for single auth on " << *dn << dendl; dn->dir->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); } else { dout(10) << "fw to auth for " << *dn << dendl; mdcache->request_forward(mdr, dn->authority().first); return 0; } } // open ref inode ref = mdcache->get_dentry_inode(dn, mdr); if (!ref) return 0; } dout(10) << "ref is " << *ref << dendl; // fw to inode auth? if (want_auth && !ref->is_auth()) { if (ref->is_ambiguous_auth()) { dout(10) << "waiting for single auth on " << *ref << dendl; ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); } else { dout(10) << "fw to auth for " << *ref << dendl; mdcache->request_forward(mdr, ref->authority().first); } return 0; } // auth_pin? if (want_auth) { if (ref->is_frozen()) { dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl; ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); return 0; } mdr->auth_pin(ref); } // lock the path set rdlocks, empty; for (int i=0; i<(int)trace.size(); i++) rdlocks.insert(&trace[i]->lock); if (!mds->locker->acquire_locks(mdr, rdlocks, empty, empty)) return 0; // set and pin ref mdr->pin(ref); mdr->ref = ref; // save the locked trace. mdr->trace.swap(trace); return ref; } /** rdlock_path_xlock_dentry * traverse path to the directory that could/would contain dentry. * make sure i am auth for that dentry, forward as necessary. * create null dentry in place (or use existing if okexist). * get rdlocks on traversed dentries, xlock on new dentry. 
 */
CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist)
{
  MClientRequest *req = mdr->client_request;

  vector trace;
  CDir *dir = traverse_to_auth_dir(mdr, trace, req->get_filepath());
  if (!dir) return 0;
  dout(10) << "rdlock_path_xlock_dentry dir " << *dir << dendl;

  // make sure we can auth_pin (or have already authpinned) dir
  if (dir->is_frozen()) {
    dout(7) << "waiting for !frozen/authpinnable on " << *dir << dendl;
    dir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
    return 0;
  }

  // make a null dentry?
  const string &dname = req->get_filepath().last_dentry();
  CDentry *dn;
  if (mustexist) {
    dn = dir->lookup(dname);

    // make sure dir is complete
    if (!dn && !dir->is_complete()) {
      dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
      dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }

    // readable?
    if (dn && dn->lock.is_xlocked_by_other(mdr)) {
      dout(10) << "waiting on xlocked dentry " << *dn << dendl;
      dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
      return 0;
    }

    // exists?
    if (!dn || dn->is_null()) {
      dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
      reply_request(mdr, -ENOENT);
      return 0;
    }
  } else {
    dn = prepare_null_dentry(mdr, dir, dname, okexist);
    if (!dn)
      return 0;
  }

  // -- lock --
  set rdlocks, wrlocks, xlocks;

  for (int i=0; i<(int)trace.size(); i++)
    rdlocks.insert(&trace[i]->lock);
  if (dn->is_null()) {
    xlocks.insert(&dn->lock);                 // new dn, xlock
    wrlocks.insert(&dn->dir->inode->dirlock); // also, wrlock on dir mtime
  } else
    rdlocks.insert(&dn->lock);  // existing dn, rdlock

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return 0;

  // save the locked trace.
  mdr->trace.swap(trace);

  return dn;
}


/**
 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
 *
 * @diri base inode
 * @fg the exact frag we want
 * @mdr request
 */
CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr)
{
  CDir *dir = diri->get_dirfrag(fg);

  // not open and inode not mine?
  if (!dir && !diri->is_auth()) {
    int inauth = diri->authority().first;
    dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds" << inauth << dendl;
    mdcache->request_forward(mdr, inauth);
    return 0;
  }

  // not open and inode frozen?
  if (!dir && diri->is_frozen_dir()) {
    dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
    assert(diri->get_parent_dir());
    diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
    return 0;
  }

  // invent?
  if (!dir)
    dir = diri->get_or_open_dirfrag(mds->mdcache, fg);

  // am i auth for the dirfrag?
  if (!dir->is_auth()) {
    int auth = dir->authority().first;
    dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
	    << ", fw to mds" << auth << dendl;
    mdcache->request_forward(mdr, auth);
    return 0;
  }

  return dir;
}


/** predirty_dn_diri
 * predirty the directory inode for a new dentry, if it is auth (and not root)
 * BUG: root inode doesn't get dirtied properly, currently.  blech.
 */
version_t Server::predirty_dn_diri(MDRequest *mdr, CDentry *dn, EMetaBlob *blob)
{
  version_t dirpv = 0;
  CInode *diri = dn->dir->inode;
  if (diri->is_base()) return 0;

  if (diri->is_auth()) {
    // auth: project the ctime/mtime update and journal it with the blob.
    assert(mdr->wrlocks.count(&diri->dirlock));
    dirpv = diri->pre_dirty();
    dout(10) << "predirty_dn_diri ctime/mtime " << mdr->now << " pv " << dirpv << " on " << *diri << dendl;

    // predirty+journal
    inode_t *pi = diri->project_inode();
    if (dirpv) pi->version = dirpv;
    pi->ctime = pi->mtime = mdr->now;
    blob->add_dir_context(diri->get_parent_dn()->get_dir());
    blob->add_primary_dentry(diri->get_parent_dn(), true, 0, pi);
  } else {
    // journal the mtime change anyway.
inode_t *ji = blob->add_primary_dentry(diri->get_parent_dn(), true); ji->ctime = ji->mtime = mdr->now; dout(10) << "predirty_dn_diri (non-auth) ctime/mtime " << mdr->now << " on " << *diri << dendl; blob->add_dirtied_inode_mtime(diri->ino(), mdr->now); assert(mdr->ls); mdr->ls->dirty_inode_mtimes.push_back(&diri->xlist_dirty_inode_mtime); } return dirpv; } /** dirty_dn_diri * follow-up with actual dirty of inode after journal entry commits. */ void Server::dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv) { CInode *diri = dn->dir->inode; if (diri->is_root()) return; if (dirpv) { // we journaled and predirtied. assert(diri->is_auth() && !diri->is_root()); diri->pop_and_dirty_projected_inode(mdr->ls); dout(10) << "dirty_dn_diri ctime/mtime " << mdr->now << " v " << diri->inode.version << " on " << *diri << dendl; } else { // dirlock scatterlock will propagate the update. diri->inode.ctime = diri->inode.mtime = mdr->now; diri->dirlock.set_updated(); dout(10) << "dirty_dn_diri (non-dirty) ctime/mtime " << mdr->now << " on " << *diri << dendl; } } // =============================================================================== // STAT void Server::handle_client_stat(MDRequest *mdr) { MClientRequest *req = mdr->client_request; CInode *ref = rdlock_path_pin_ref(mdr, false); if (!ref) return; // which inode locks do I want? /* note: this works because we include existing locks in our lists, and because all new locks are on inodes and sort to the right of the dentry rdlocks previous acquired by rdlock_path_pin_ref(). 
*/ set rdlocks = mdr->rdlocks; set wrlocks = mdr->wrlocks; set xlocks = mdr->xlocks; int mask = req->head.args.stat.mask; if (mask & STAT_MASK_LINK) rdlocks.insert(&ref->linklock); if (mask & STAT_MASK_AUTH) rdlocks.insert(&ref->authlock); if (ref->is_file() && mask & STAT_MASK_FILE) rdlocks.insert(&ref->filelock); if (ref->is_dir() && mask & STAT_MASK_MTIME) rdlocks.insert(&ref->dirlock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; mds->balancer->hit_inode(g_clock.now(), ref, META_POP_IRD, mdr->client_request->get_client_inst().name.num()); // reply dout(10) << "reply to stat on " << *req << dendl; MClientReply *reply = new MClientReply(req); reply_request(mdr, reply, ref); } // =============================================================================== // INODE UPDATES /* * finisher for basic inode updates */ class C_MDS_inode_update_finish : public Context { MDS *mds; MDRequest *mdr; CInode *in; public: C_MDS_inode_update_finish(MDS *m, MDRequest *r, CInode *i) : mds(m), mdr(r), in(i) { } void finish(int r) { assert(r == 0); // apply in->pop_and_dirty_projected_inode(mdr->ls); mds->balancer->hit_inode(mdr->now, in, META_POP_IWR); // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_result(0); mds->server->reply_request(mdr, reply, in); } }; // utime void Server::handle_client_utime(MDRequest *mdr) { MClientRequest *req = mdr->client_request; CInode *cur = rdlock_path_pin_ref(mdr, true); if (!cur) return; if (cur->is_root()) { reply_request(mdr, -EINVAL); // for now return; } // xlock inode set rdlocks = mdr->rdlocks; set wrlocks = mdr->wrlocks; set xlocks = mdr->xlocks; xlocks.insert(&cur->filelock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; // project update inode_t *pi = cur->project_inode(); pi->mtime = req->head.args.utime.mtime; pi->atime = req->head.args.utime.atime; pi->version = cur->pre_dirty(); pi->ctime = g_clock.real_now(); // log + wait mdr->ls = 
mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "utime"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); le->metablob.add_primary_dentry(cur->parent, true, 0, pi); mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); } // chmod void Server::handle_client_chmod(MDRequest *mdr) { MClientRequest *req = mdr->client_request; CInode *cur = rdlock_path_pin_ref(mdr, true); if (!cur) return; if (cur->is_root()) { reply_request(mdr, -EINVAL); // for now return; } // write set rdlocks = mdr->rdlocks; set wrlocks = mdr->wrlocks; set xlocks = mdr->xlocks; xlocks.insert(&cur->authlock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; // project update inode_t *pi = cur->project_inode(); pi->mode = (pi->mode & ~04777) | (req->head.args.chmod.mode & 04777); pi->version = cur->pre_dirty(); pi->ctime = g_clock.real_now(); // log + wait mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "chmod"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); le->metablob.add_primary_dentry(cur->parent, true, 0, pi); mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); } // chown void Server::handle_client_chown(MDRequest *mdr) { MClientRequest *req = mdr->client_request; CInode *cur = rdlock_path_pin_ref(mdr, true); if (!cur) return; if (cur->is_root()) { reply_request(mdr, -EINVAL); // for now return; } // write set rdlocks = mdr->rdlocks; set wrlocks = mdr->wrlocks; set xlocks = mdr->xlocks; xlocks.insert(&cur->authlock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; // project update inode_t *pi = cur->project_inode(); pi->uid = MAX(req->head.args.chown.uid, 0); pi->gid = MAX(req->head.args.chown.gid, 0); pi->version = cur->pre_dirty(); pi->ctime = g_clock.real_now(); // log + wait mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "chown"); 
le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); le->metablob.add_primary_dentry(cur->parent, true, 0, pi); mdlog->submit_entry(le); mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur)); } // ================================================================= // DIRECTORY and NAMESPACE OPS // READDIR void Server::handle_client_readdir(MDRequest *mdr) { MClientRequest *req = mdr->client_request; CInode *diri = rdlock_path_pin_ref(mdr, false); if (!diri) return; // it's a directory, right? if (!diri->is_dir()) { // not a dir dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl; reply_request(mdr, -ENOTDIR, diri); return; } // which frag? frag_t fg = req->head.args.readdir.frag; // does the frag exist? if (diri->dirfragtree[fg] != fg) { dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << dendl; reply_request(mdr, -EAGAIN, diri); return; } CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); if (!dir) return; // ok! assert(dir->is_auth()); // check perm /* if (!mds->locker->inode_hard_rdlock_start(diri, mdr)) return; mds->locker->inode_hard_rdlock_finish(diri, mdr); */ if (!dir->is_complete()) { // fetch dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl; dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); return; } // build dir contents bufferlist dirbl, dnbl; DirStat::_encode(dirbl, dir, mds->get_nodeid()); __u32 numfiles = 0; for (CDir::map_t::iterator it = dir->begin(); it != dir->end(); it++) { CDentry *dn = it->second; if (dn->is_null()) continue; CInode *in = dn->inode; // remote link? // better for the MDS to do the work, if we think the client will stat any of these files. 
if (dn->is_remote() && !in) { in = mdcache->get_inode(dn->get_remote_ino()); if (in) { dn->link_remote(in); } else { mdcache->open_remote_ino(dn->get_remote_ino(), mdr, new C_MDS_RetryRequest(mdcache, mdr)); // touch everything i _do_ have for (it = dir->begin(); it != dir->end(); it++) if (!it->second->is_null()) mdcache->lru.lru_touch(it->second); return; } } assert(in); dout(12) << "including inode " << *in << dendl; // add this dentry + inodeinfo ::_encode(it->first, dnbl); InodeStat::_encode(dnbl, in); numfiles++; // touch it mdcache->lru.lru_touch(dn); } ::_encode_simple(numfiles, dirbl); dirbl.claim_append(dnbl); // yay, reply MClientReply *reply = new MClientReply(req); reply->take_dir_items(dirbl); dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << dendl; reply->set_result(0); // bump popularity. NOTE: this doesn't quite capture it. mds->balancer->hit_dir(g_clock.now(), dir, META_POP_IRD, -1, numfiles); // reply reply_request(mdr, reply, diri); } // ------------------------------------------------ // MKNOD class C_MDS_mknod_finish : public Context { MDS *mds; MDRequest *mdr; CDentry *dn; CInode *newi; version_t dirpv; version_t newdirpv; public: C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_, version_t newdirpv_=0) : mds(m), mdr(r), dn(d), newi(ni), dirpv(dirpv_), newdirpv(newdirpv_) {} void finish(int r) { assert(r == 0); // link the inode dn->get_dir()->link_primary_inode(dn, newi); // dirty inode, dn, dir newi->mark_dirty(newi->inode.version + 1, mdr->ls); // mkdir? 
if (newdirpv) { CDir *dir = newi->get_dirfrag(frag_t()); assert(dir); dir->mark_dirty(newdirpv, mdr->ls); } // dir inode's mtime mds->server->dirty_dn_diri(mdr, dn, dirpv); // hit pop mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR); //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_result(0); mds->server->reply_request(mdr, reply, newi); } }; void Server::handle_client_mknod(MDRequest *mdr) { MClientRequest *req = mdr->client_request; CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); if (!dn) return; mdr->now = g_clock.real_now(); CInode *newi = prepare_new_inode(mdr, dn->dir); assert(newi); // it's a file. newi->inode.rdev = req->head.args.mknod.rdev; newi->inode.mode = req->head.args.mknod.mode; newi->inode.mode &= ~S_IFMT; newi->inode.mode |= S_IFREG; newi->inode.version = dn->pre_dirty() - 1; // prepare finisher mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "mknod"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too le->metablob.add_dir_context(dn->dir); le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); // log + wait mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); } // MKDIR void Server::handle_client_mkdir(MDRequest *mdr) { MClientRequest *req = mdr->client_request; CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); if (!dn) return; // new inode mdr->now = g_clock.real_now(); CInode *newi = prepare_new_inode(mdr, dn->dir); assert(newi); // it's a directory. newi->inode.mode = req->head.args.mkdir.mode; newi->inode.mode &= ~S_IFMT; newi->inode.mode |= S_IFDIR; newi->inode.layout = g_OSD_MDDirLayout; newi->inode.version = dn->pre_dirty() - 1; // ...and that new dir is empty. 
CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); newdir->mark_complete(); version_t newdirpv = newdir->pre_dirty(); //if (mds->logger) mds->logger->inc("mkdir"); // prepare finisher mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "mkdir"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too le->metablob.add_dir_context(dn->dir); le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); le->metablob.add_dir(newdir, true, true); // dirty AND complete // log + wait mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv, newdirpv)); /* old export heuristic. pbly need to reimplement this at some point. if ( diri->dir->is_auth() && diri->dir->is_rep() && newdir->is_auth() && !newdir->is_hashing()) { int dest = rand() % mds->mdsmap->get_num_mds(); if (dest != whoami) { dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << dendl; mdcache->migrator->export_dir(newdir, dest); } } */ } // SYMLINK void Server::handle_client_symlink(MDRequest *mdr) { MClientRequest *req = mdr->client_request; CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); if (!dn) return; mdr->now = g_clock.real_now(); CInode *newi = prepare_new_inode(mdr, dn->dir); assert(newi); // it's a symlink newi->inode.mode &= ~S_IFMT; newi->inode.mode |= S_IFLNK; newi->symlink = req->get_path2(); newi->inode.version = dn->pre_dirty() - 1; // prepare finisher mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "symlink"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too le->metablob.add_dir_context(dn->dir); le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); // log + wait 
mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); } // LINK void Server::handle_client_link(MDRequest *mdr) { MClientRequest *req = mdr->client_request; dout(7) << "handle_client_link " << req->get_filepath() << " to " << req->get_filepath2() << dendl; // traverse to dest dir, make sure it's ours. const filepath &linkpath = req->get_filepath(); const string &dname = linkpath.last_dentry(); vector linktrace; CDir *dir = traverse_to_auth_dir(mdr, linktrace, linkpath); if (!dir) return; dout(7) << "handle_client_link link " << dname << " in " << *dir << dendl; // traverse to link target filepath targetpath = req->get_filepath2(); dout(7) << "handle_client_link discovering target " << targetpath << dendl; vector targettrace; int r = mdcache->path_traverse(mdr, req, targetpath, targettrace, false, MDS_TRAVERSE_DISCOVER); if (r > 0) return; // wait if (targettrace.empty()) r = -EINVAL; if (r < 0) { reply_request(mdr, r); return; } // identify target inode CInode *targeti = targettrace[targettrace.size()-1]->inode; assert(targeti); // dir? dout(7) << "target is " << *targeti << dendl; if (targeti->is_dir()) { dout(7) << "target is a dir, failing..." << dendl; reply_request(mdr, -EINVAL); return; } // get/make null link dentry CDentry *dn = prepare_null_dentry(mdr, dir, dname, false); if (!dn) return; // create lock lists set rdlocks, wrlocks, xlocks; for (int i=0; i<(int)linktrace.size(); i++) rdlocks.insert(&linktrace[i]->lock); xlocks.insert(&dn->lock); wrlocks.insert(&dn->dir->inode->dirlock); for (int i=0; i<(int)targettrace.size(); i++) rdlocks.insert(&targettrace[i]->lock); xlocks.insert(&targeti->linklock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; mdr->done_locking = true; // avoid wrlock moving target issues. // pick mtime if (mdr->now == utime_t()) mdr->now = g_clock.real_now(); // does the target need an anchor? 
if (targeti->is_auth()) { /*if (targeti->get_parent_dir() == dn->dir) { dout(7) << "target is in the same dirfrag, sweet" << dendl; } else */ if (targeti->is_anchored() && !targeti->is_unanchoring()) { dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << dendl; } else { dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << dendl; mdcache->anchor_create(mdr, targeti, new C_MDS_RetryRequest(mdcache, mdr)); return; } } // go! // local or remote? if (targeti->is_auth()) _link_local(mdr, dn, targeti); else _link_remote(mdr, dn, targeti); } class C_MDS_link_local_finish : public Context { MDS *mds; MDRequest *mdr; CDentry *dn; CInode *targeti; version_t dnpv; version_t tipv; version_t dirpv; public: C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, version_t dnpv_, version_t tipv_, version_t dirpv_) : mds(m), mdr(r), dn(d), targeti(ti), dnpv(dnpv_), tipv(tipv_), dirpv(dirpv_) { } void finish(int r) { assert(r == 0); mds->server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, dirpv); } }; void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) { dout(10) << "_link_local " << *dn << " to " << *targeti << dendl; mdr->ls = mdlog->get_current_segment(); // predirty NEW dentry version_t dnpv = dn->pre_dirty(); version_t tipv = targeti->pre_dirty(); // project inode update inode_t *pi = targeti->project_inode(); pi->nlink++; pi->ctime = mdr->now; pi->version = tipv; // log + wait EUpdate *le = new EUpdate(mdlog, "link_local"); le->metablob.add_client_req(mdr->reqid); version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime le->metablob.add_dir_context(dn->get_dir()); le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote le->metablob.add_dir_context(targeti->get_parent_dir()); le->metablob.add_primary_dentry(targeti->parent, true, targeti, pi); // update old primary mdlog->submit_entry(le, new C_MDS_link_local_finish(mds, mdr, dn, 
targeti, dnpv, tipv, dirpv)); } void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, version_t dnpv, version_t tipv, version_t dirpv) { dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl; // link and unlock the NEW dentry dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); dn->mark_dirty(dnpv, mdr->ls); // target inode targeti->pop_and_dirty_projected_inode(mdr->ls); // new dentry dir mtime dirty_dn_diri(mdr, dn, dirpv); // bump target popularity mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); reply_request(mdr, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref } // remote class C_MDS_link_remote_finish : public Context { MDS *mds; MDRequest *mdr; CDentry *dn; CInode *targeti; version_t dpv; version_t dirpv; public: C_MDS_link_remote_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, version_t dirpv_) : mds(m), mdr(r), dn(d), targeti(ti), dpv(d->get_projected_version()), dirpv(dirpv_) { } void finish(int r) { assert(r == 0); mds->server->_link_remote_finish(mdr, dn, targeti, dpv, dirpv); } }; void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) { dout(10) << "_link_remote " << *dn << " to " << *targeti << dendl; // 1. send LinkPrepare to dest (journal nlink++ prepare) int linkauth = targeti->authority().first; if (mdr->more()->witnessed.count(linkauth) == 0) { dout(10) << " targeti auth must prepare nlink++" << dendl; MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREP); targeti->set_object_info(req->get_object_info()); req->now = mdr->now; mds->send_message_mds(req, linkauth); assert(mdr->more()->waiting_on_slave.count(linkauth) == 0); mdr->more()->waiting_on_slave.insert(linkauth); return; } dout(10) << " targeti auth has prepared nlink++" << dendl; // go. 
// predirty dentry dn->pre_dirty(); // add to event mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "link_remote"); le->metablob.add_client_req(mdr->reqid); version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime le->metablob.add_dir_context(dn->get_dir()); le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote // mark committing (needed for proper recovery) mdr->committing = true; // log + wait mdlog->submit_entry(le, new C_MDS_link_remote_finish(mds, mdr, dn, targeti, dirpv)); } void Server::_link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, version_t dpv, version_t dirpv) { dout(10) << "_link_remote_finish " << *dn << " to " << *targeti << dendl; // link the new dentry dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); dn->mark_dirty(dpv, mdr->ls); // dir inode's mtime dirty_dn_diri(mdr, dn, dirpv); // bump target popularity mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); reply_request(mdr, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref } // remote linking/unlinking class C_MDS_SlaveLinkPrep : public Context { Server *server; MDRequest *mdr; CInode *targeti; utime_t old_ctime; bool inc; public: C_MDS_SlaveLinkPrep(Server *s, MDRequest *r, CInode *t, utime_t oct, bool in) : server(s), mdr(r), targeti(t), old_ctime(oct), inc(in) { } void finish(int r) { assert(r == 0); server->_logged_slave_link(mdr, targeti, old_ctime, inc); } }; void Server::handle_slave_link_prep(MDRequest *mdr) { dout(10) << "handle_slave_link_prep " << *mdr << " on " << mdr->slave_request->get_object_info() << dendl; CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino); assert(targeti); dout(10) << "targeti " << *targeti << dendl; CDentry *dn = targeti->get_parent_dn(); assert(dn->is_primary()); 
mdr->now = mdr->slave_request->now; // anchor? if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { if (targeti->is_anchored() && !targeti->is_unanchoring()) { dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << dendl; } else { dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << dendl; mdcache->anchor_create(mdr, targeti, new C_MDS_RetryRequest(mdcache, mdr)); return; } } // journal it mdr->ls = mdlog->get_current_segment(); ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); inode_t *pi = dn->inode->project_inode(); // rollback case le->rollback.add_dir_context(targeti->get_parent_dir()); le->rollback.add_primary_dentry(dn, true, targeti, pi); // update old primary // update journaled target inode bool inc; if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { inc = true; pi->nlink++; } else { inc = false; pi->nlink--; } utime_t old_ctime = pi->ctime; pi->ctime = mdr->now; pi->version = targeti->pre_dirty(); dout(10) << " projected inode " << pi << " v " << pi->version << dendl; // commit case le->commit.add_dir_context(targeti->get_parent_dir()); le->commit.add_primary_dentry(dn, true, targeti, pi); // update old primary mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc)); } class C_MDS_SlaveLinkCommit : public Context { Server *server; MDRequest *mdr; CInode *targeti; utime_t old_ctime; version_t old_version; bool inc; public: C_MDS_SlaveLinkCommit(Server *s, MDRequest *r, CInode *t, utime_t oct, version_t ov, bool in) : server(s), mdr(r), targeti(t), old_ctime(oct), old_version(ov), inc(in) { } void finish(int r) { server->_commit_slave_link(mdr, r, targeti, old_ctime, old_version, inc); } }; void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_ctime, bool inc) { dout(10) << "_logged_slave_link " << *mdr << " inc=" << inc << " " << 
*targeti << dendl; version_t old_version = targeti->inode.version; // update the target targeti->pop_and_dirty_projected_inode(mdr->ls); // hit pop mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); // ack MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREPACK); mds->send_message_mds(reply, mdr->slave_to_mds); // set up commit waiter mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti, old_ctime, old_version, inc); // done. delete mdr->slave_request; mdr->slave_request = 0; } void Server::_commit_slave_link(MDRequest *mdr, int r, CInode *targeti, utime_t old_ctime, version_t old_version, bool inc) { dout(10) << "_commit_slave_link " << *mdr << " r=" << r << " inc=" << inc << " " << *targeti << dendl; ESlaveUpdate *le; if (r == 0) { // write a commit to the journal le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); } else { le = new ESlaveUpdate(mdlog, "slave_link_rollback", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); // -- rollback in memory -- assert(targeti->inode.ctime == mdr->now); assert(targeti->projected_inode.empty()); // we're holding the version lock. targeti->inode.ctime = old_ctime; targeti->inode.version = old_version; if (inc) targeti->inode.nlink++; else targeti->inode.nlink--; } mdlog->submit_entry(le); } void Server::handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m) { dout(10) << "handle_slave_link_prep_ack " << *mdr << " " << *m << dendl; int from = m->get_source().num(); // note slave mdr->more()->slaves.insert(from); // witnessed! assert(mdr->more()->witnessed.count(from) == 0); mdr->more()->witnessed.insert(from); // remove from waiting list assert(mdr->more()->waiting_on_slave.count(from)); mdr->more()->waiting_on_slave.erase(from); assert(mdr->more()->waiting_on_slave.empty()); dispatch_client_request(mdr); // go again! 
} // UNLINK void Server::handle_client_unlink(MDRequest *mdr) { MClientRequest *req = mdr->client_request; // traverse to path vector trace; int r = mdcache->path_traverse(mdr, req, req->get_filepath(), trace, false, MDS_TRAVERSE_FORWARD); if (r > 0) return; if (trace.empty()) r = -EINVAL; // can't unlink root if (r < 0) { reply_request(mdr, r); return; } CDentry *dn = trace[trace.size()-1]; assert(dn); // is it my dentry? if (!dn->is_auth()) { // fw to auth mdcache->request_forward(mdr, dn->authority().first); return; } // rmdir or unlink? bool rmdir = false; if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true; if (rmdir) { dout(7) << "handle_client_rmdir on " << *dn << dendl; } else { dout(7) << "handle_client_unlink on " << *dn << dendl; } // readable? if (dn->lock.is_xlocked_by_other(mdr)) { dout(10) << "waiting on xlocked dentry " << *dn << dendl; dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); return; } // dn looks ok. // get/open inode. mdr->trace.swap(trace); CInode *in = mdcache->get_dentry_inode(dn, mdr); if (!in) return; dout(7) << "dn links to " << *in << dendl; // rmdir vs is_dir if (in->is_dir()) { if (rmdir) { // do empty directory checks if (!_verify_rmdir(mdr, in)) return; } else { dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl; reply_request(mdr, -EISDIR); return; } } else { if (rmdir) { // unlink dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl; reply_request(mdr, -ENOTDIR); return; } } // lock set rdlocks, wrlocks, xlocks; for (int i=0; i<(int)trace.size()-1; i++) rdlocks.insert(&trace[i]->lock); xlocks.insert(&dn->lock); wrlocks.insert(&dn->dir->inode->dirlock); xlocks.insert(&in->linklock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; // yay! mdr->done_locking = true; // avoid wrlock racing if (mdr->now == utime_t()) mdr->now = g_clock.real_now(); // get stray dn ready? 
CDentry *straydn = 0; if (dn->is_primary()) { straydn = mdcache->get_or_create_stray_dentry(dn->inode); mdr->pin(straydn); // pin it. dout(10) << " straydn is " << *straydn << dendl; assert(straydn->is_null()); if (!mdr->more()->dst_reanchor_atid && dn->inode->is_anchored()) { dout(10) << "reanchoring to stray " << *dn->inode << dendl; vector trace; straydn->make_anchor_trace(trace, dn->inode); mds->anchorclient->prepare_update(dn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid, new C_MDS_RetryRequest(mdcache, mdr)); return; } } // ok! if (dn->is_remote() && !dn->inode->is_auth()) _unlink_remote(mdr, dn); else _unlink_local(mdr, dn, straydn); } class C_MDS_unlink_local_finish : public Context { MDS *mds; MDRequest *mdr; CDentry *dn; CDentry *straydn; version_t dnpv; // deleted dentry version_t dirpv; public: C_MDS_unlink_local_finish(MDS *m, MDRequest *r, CDentry *d, CDentry *sd, version_t dirpv_) : mds(m), mdr(r), dn(d), straydn(sd), dnpv(d->get_projected_version()), dirpv(dirpv_) { } void finish(int r) { assert(r == 0); mds->server->_unlink_local_finish(mdr, dn, straydn, dnpv, dirpv); } }; void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) { dout(10) << "_unlink_local " << *dn << dendl; // ok, let's do it. mdr->ls = mdlog->get_current_segment(); // prepare log entry EUpdate *le = new EUpdate(mdlog, "unlink_local"); le->metablob.add_client_req(mdr->reqid); version_t ipv = 0; // dirty inode version inode_t *ji = 0; // journaled projected inode if (dn->is_primary()) { // primary link. add stray dentry. assert(straydn); ipv = straydn->pre_dirty(dn->inode->inode.version); le->metablob.add_dir_context(straydn->dir); ji = le->metablob.add_primary_dentry(straydn, true, dn->inode); } else { // remote link. update remote inode. 
// NOTE(review): template arguments in this chunk appear to have been
// stripped by angle-bracket mangling (e.g. "map::iterator" below is
// missing its <...> args) -- restore them before compiling.

    // tail of _unlink_local: primary link -- journal the inode under its
    // (primary) parent dentry so the nlink-- below lands in the log.
    ipv = dn->inode->pre_dirty();
    le->metablob.add_dir_context(dn->inode->get_parent_dir());
    ji = le->metablob.add_primary_dentry(dn->inode->parent, true, dn->inode);
  }

  // update journaled target inode: project nlink--/ctime into the
  // journal copy; the cache copy is updated later in _unlink_local_finish.
  inode_t *pi = dn->inode->project_inode();
  pi->nlink--;
  pi->ctime = mdr->now;
  pi->version = ipv;
  *ji = *pi;  // copy into journal

  // the unlinked dentry: journal it as a null dentry (the unlink itself).
  dn->pre_dirty();
  version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob);
  le->metablob.add_dir_context(dn->get_dir());
  le->metablob.add_null_dentry(dn, true);

  if (mdr->more()->dst_reanchor_atid)
    le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid);

  // log + wait
  journal_opens();  // journal pending opens, just in case
  mdlog->submit_entry(le, new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, dirpv));
}

/**
 * Apply a local unlink once its journal entry has committed:
 * remove the dentry, relink the inode into the stray dir if it was the
 * primary link, apply the projected nlink--, notify replicas, commit any
 * pending anchor transaction, and reply to the client.
 *
 * @param mdr      the client request
 * @param dn       the dentry that was unlinked
 * @param straydn  stray dentry to relink into, or 0 if dn was not primary
 * @param dnpv     projected version for marking dn dirty
 * @param dirpv    projected version for the containing dir inode's mtime
 */
void Server::_unlink_local_finish(MDRequest *mdr,
                                  CDentry *dn, CDentry *straydn,
                                  version_t dnpv, version_t dirpv)
{
  dout(10) << "_unlink_local_finish " << *dn << dendl;

  // unlink main dentry
  CInode *in = dn->inode;
  dn->dir->unlink_inode(dn);

  // relink as stray?  (i.e. was primary link?)
  if (straydn) {
    dout(20) << " straydn is " << *straydn << dendl;
    straydn->dir->link_primary_inode(straydn, in);
  }

  // nlink--, dirty old dentry
  in->pop_and_dirty_projected_inode(mdr->ls);
  dn->mark_dirty(dnpv, mdr->ls);

  // dir inode's mtime
  dirty_dn_diri(mdr, dn, dirpv);

  // share unlink news with replicas
  // NOTE(review): iterator type lost its template args here.
  for (map::iterator it = dn->replicas_begin();
       it != dn->replicas_end();
       it++) {
    dout(7) << "_unlink_local_finish sending MDentryUnlink to mds" << it->first << dendl;
    MDentryUnlink *unlink = new MDentryUnlink(dn->dir->dirfrag(), dn->name);
    if (straydn) {
      // include stray inode/dir/dentry replicas so the peer can relink too
      unlink->strayin = straydn->dir->inode->replicate_to(it->first);
      unlink->straydir = straydn->dir->replicate_to(it->first);
      unlink->straydn = straydn->replicate_to(it->first);
    }
    mds->send_message_mds(unlink, it->first);
  }

  // commit anchor update?
  if (mdr->more()->dst_reanchor_atid)
    mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls);

  // bump pop
  //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR);

  // reply
  MClientReply *reply = new MClientReply(mdr->client_request, 0);
  reply_request(mdr, reply, dn->dir->get_inode());  // FIXME: imprecise ref

  // clean up?
  if (straydn)
    mdcache->eval_stray(straydn);

  // removing a new dn?
  dn->dir->try_remove_unlinked_dn(dn);
}

/**
 * Completion context for _unlink_remote: fires once the EUpdate commits
 * and applies the unlink via _unlink_remote_finish.
 */
class C_MDS_unlink_remote_finish : public Context {
  MDS *mds;
  MDRequest *mdr;
  CDentry *dn;
  version_t dnpv;  // deleted dentry
  version_t dirpv;
public:
  C_MDS_unlink_remote_finish(MDS *m, MDRequest *r, CDentry *d, version_t dirpv_) :
    mds(m), mdr(r), dn(d),
    dnpv(d->get_projected_version()),  // capture projected version at construction
    dirpv(dirpv_) { }
  void finish(int r) {
    assert(r == 0);
    mds->server->_unlink_remote_finish(mdr, dn, dnpv, dirpv);
  }
};

/**
 * Unlink a remote dentry (the inode lives on another MDS).
 * First makes sure the inode's auth MDS has journaled the nlink--
 * prepare (OP_UNLINKPREP slave request); once witnessed, journals the
 * local null dentry and commits via C_MDS_unlink_remote_finish.
 */
void Server::_unlink_remote(MDRequest *mdr, CDentry *dn)
{
  dout(10) << "_unlink_remote " << *dn << " " << *dn->inode << dendl;

  // 1. send LinkPrepare to dest (journal nlink-- prepare)
  int inauth = dn->inode->authority().first;
  if (mdr->more()->witnessed.count(inauth) == 0) {
    dout(10) << " inode auth must prepare nlink--" << dendl;

    MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNLINKPREP);
    dn->inode->set_object_info(req->get_object_info());
    req->now = mdr->now;
    mds->send_message_mds(req, inauth);

    assert(mdr->more()->waiting_on_slave.count(inauth) == 0);
    mdr->more()->waiting_on_slave.insert(inauth);
    return;  // will be retried when the slave acks
  }
  dout(10) << " inode auth has prepared nlink--" << dendl;

  // ok, let's do it.
  // prepare log entry
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "unlink_remote");
  le->metablob.add_client_req(mdr->reqid);

  // the unlinked dentry
  dn->pre_dirty();
  version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob);
  le->metablob.add_dir_context(dn->get_dir());
  le->metablob.add_null_dentry(dn, true);

  if (mdr->more()->dst_reanchor_atid)
    le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid);

  // finisher
  C_MDS_unlink_remote_finish *fin = new C_MDS_unlink_remote_finish(mds, mdr, dn, dirpv);

  journal_opens();  // journal pending opens, just in case

  // mark committing (needed for proper recovery)
  mdr->committing = true;

  // log + wait
  mdlog->submit_entry(le, fin);
}

/**
 * Apply a remote unlink after the journal entry commits: remove the
 * dentry, dirty it, bump the dir mtime, notify replicas, commit the
 * anchor transaction, and reply.  (No stray handling: the inode's
 * primary link lives on another MDS.)
 */
void Server::_unlink_remote_finish(MDRequest *mdr,
                                   CDentry *dn,
                                   version_t dnpv, version_t dirpv)
{
  dout(10) << "_unlink_remote_finish " << *dn << dendl;

  // unlink main dentry
  dn->dir->unlink_inode(dn);
  dn->mark_dirty(dnpv, mdr->ls);  // dirty old dentry

  // dir inode's mtime
  dirty_dn_diri(mdr, dn, dirpv);

  // share unlink news with replicas
  // NOTE(review): iterator type lost its template args here.
  for (map::iterator it = dn->replicas_begin();
       it != dn->replicas_end();
       it++) {
    dout(7) << "_unlink_remote_finish sending MDentryUnlink to mds" << it->first << dendl;
    MDentryUnlink *unlink = new MDentryUnlink(dn->dir->dirfrag(), dn->name);
    mds->send_message_mds(unlink, it->first);
  }

  // commit anchor update?
  if (mdr->more()->dst_reanchor_atid)
    mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls);

  //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR);

  // reply
  MClientReply *reply = new MClientReply(mdr->client_request, 0);
  reply_request(mdr, reply, dn->dir->get_inode());  // FIXME: imprecise ref

  // removing a new dn?
  dn->dir->try_remove_unlinked_dn(dn);
}

/** _verify_rmdir
 *
 * verify that a directory is empty (i.e. we can rmdir it),
 * and make sure it is part of the same subtree (i.e. local)
 * so that rmdir will occur locally.
 *
 * @param in is the inode being rmdir'd.
*/ bool Server::_verify_rmdir(MDRequest *mdr, CInode *in) { dout(10) << "_verify_rmdir " << *in << dendl; assert(in->is_auth()); list frags; in->dirfragtree.get_leaves(frags); for (list::iterator p = frags.begin(); p != frags.end(); ++p) { CDir *dir = in->get_or_open_dirfrag(mdcache, *p); assert(dir); // dir looks empty but incomplete? if (dir->is_auth() && dir->get_size() == 0 && !dir->is_complete()) { dout(7) << "_verify_rmdir fetching incomplete dir " << *dir << dendl; dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); return false; } // does the frag _look_ empty? if (dir->get_size()) { dout(10) << "_verify_rmdir still " << dir->get_size() << " items in frag " << *dir << dendl; reply_request(mdr, -ENOTEMPTY); return false; } // not dir auth? if (!dir->is_auth()) { dout(10) << "_verify_rmdir not auth for " << *dir << ", FIXME BUG" << dendl; reply_request(mdr, -ENOTEMPTY); return false; } } return true; } /* // export sanity check if (!in->is_auth()) { // i should be exporting this now/soon, since the dir is empty. dout(7) << "handle_client_rmdir dir is auth, but not inode." << dendl; mdcache->migrator->export_empty_import(in->dir); in->dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mds, req, diri)); return; } */ // ====================================================== class C_MDS_rename_finish : public Context { MDS *mds; MDRequest *mdr; CDentry *srcdn; CDentry *destdn; CDentry *straydn; public: C_MDS_rename_finish(MDS *m, MDRequest *r, CDentry *sdn, CDentry *ddn, CDentry *stdn) : mds(m), mdr(r), srcdn(sdn), destdn(ddn), straydn(stdn) { } void finish(int r) { assert(r == 0); mds->server->_rename_finish(mdr, srcdn, destdn, straydn); } }; /** handle_client_rename * */ void Server::handle_client_rename(MDRequest *mdr) { MClientRequest *req = mdr->client_request; dout(7) << "handle_client_rename " << *req << dendl; // traverse to dest dir (not dest) // we do this FIRST, because the rename should occur on the // destdn's auth. 
  // NOTE(review): several container declarations in this function lost
  // their template arguments to angle-bracket mangling ("vector desttrace",
  // "set rdlocks", "set witnesses", "vector trace", "list frags") --
  // restore the <...> args before compiling.

  // resolve the destination directory (must end up on its auth MDS).
  const filepath &destpath = req->get_filepath2();
  const string &destname = destpath.last_dentry();
  vector desttrace;
  CDir *destdir = traverse_to_auth_dir(mdr, desttrace, destpath);
  if (!destdir) return;  // fw or error out
  dout(10) << "dest will be " << destname << " in " << *destdir << dendl;
  assert(destdir->is_auth());

  // traverse to src
  filepath srcpath = req->get_filepath();
  vector srctrace;
  int r = mdcache->path_traverse(mdr, req,
                                 srcpath, srctrace, false,
                                 MDS_TRAVERSE_DISCOVER);
  if (r > 0) return;  // traversal in progress; request will be retried
  if (srctrace.empty()) r = -EINVAL;  // can't rename root
  if (r < 0) {
    reply_request(mdr, r);
    return;
  }
  CDentry *srcdn = srctrace[srctrace.size()-1];
  dout(10) << " srcdn " << *srcdn << dendl;
  CInode *srci = mdcache->get_dentry_inode(srcdn, mdr);
  dout(10) << " srci " << *srci << dendl;
  mdr->pin(srci);

  // -- some sanity checks --

  // src == dest?
  if (srcdn->get_dir() == destdir &&
      srcdn->name == destname) {
    dout(7) << "rename src=dest, noop" << dendl;
    reply_request(mdr, 0);
    return;
  }

  // dest a child of src?
  // e.g. mv /usr /usr/foo
  // walk the dest ancestry looking for srcdn.
  CDentry *pdn = destdir->inode->parent;
  while (pdn) {
    if (pdn == srcdn) {
      dout(7) << "cannot rename item to be a child of itself" << dendl;
      reply_request(mdr, -EINVAL);
      return;
    }
    pdn = pdn->dir->inode->parent;
  }

  // identify/create dest dentry
  CDentry *destdn = destdir->lookup(destname);
  if (destdn && destdn->lock.is_xlocked_by_other(mdr)) {
    destdn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  CInode *oldin = 0;
  if (destdn && !destdn->is_null()) {
    //dout(10) << "dest dn exists " << *destdn << dendl;
    oldin = mdcache->get_dentry_inode(destdn, mdr);
    if (!oldin) return;
    dout(10) << " oldin " << *oldin << dendl;

    // mv /some/thing /to/some/existing_other_thing
    // POSIX type checks between source and existing target.
    if (oldin->is_dir() && !srci->is_dir()) {
      reply_request(mdr, -EISDIR);
      return;
    }
    if (!oldin->is_dir() && srci->is_dir()) {
      reply_request(mdr, -ENOTDIR);
      return;
    }

    // non-empty dir?  (target dir must be rmdir-able)
    if (oldin->is_dir() && !_verify_rmdir(mdr, oldin))
      return;
  }
  if (!destdn) {
    // mv /some/thing /to/some/non_existent_name
    destdn = prepare_null_dentry(mdr, destdir, destname);
    if (!destdn) return;
  }

  dout(10) << " destdn " << *destdn << dendl;

  // -- locks --
  set rdlocks, wrlocks, xlocks;

  // rdlock sourcedir path, xlock src dentry
  for (int i=0; i<(int)srctrace.size()-1; i++)
    rdlocks.insert(&srctrace[i]->lock);
  xlocks.insert(&srcdn->lock);
  wrlocks.insert(&srcdn->dir->inode->dirlock);

  /*
   * no, this causes problems if the dftlock is scattered...
   *  and what was i thinking anyway?
   * rdlocks.insert(&srcdn->dir->inode->dirfragtreelock);  // rd lock on srci dirfragtree.
   */

  // rdlock destdir path, xlock dest dentry
  for (int i=0; i<(int)desttrace.size(); i++)
    rdlocks.insert(&desttrace[i]->lock);
  xlocks.insert(&destdn->lock);
  wrlocks.insert(&destdn->dir->inode->dirlock);

  // xlock versionlock on srci if remote?
  //  this ensures it gets safely remotely auth_pinned, avoiding deadlock;
  //  strictly speaking, having the slave node freeze the inode is
  //  otherwise sufficient for avoiding conflicts with inode locks, etc.
  if (!srcdn->is_auth() && srcdn->is_primary())
    xlocks.insert(&srcdn->inode->versionlock);

  // xlock oldin (for nlink--)
  if (oldin) xlocks.insert(&oldin->linklock);

  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;  // locker will retry the request

  // set done_locking flag, to avoid problems with wrlock moving auth target
  mdr->done_locking = true;

  // -- open all srcdn inode frags, if any --
  // we need these open so that auth can properly delegate from inode to dirfrags
  // after the inode is _ours_.
  if (srcdn->is_primary() &&
      !srcdn->is_auth() &&
      srci->is_dir()) {
    dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
    mdr->set_stickydirs(srci);

    list frags;
    srci->dirfragtree.get_leaves(frags);
    for (list::iterator p = frags.begin();
         p != frags.end();
         ++p) {
      CDir *dir = srci->get_dirfrag(*p);
      if (!dir) {
        dout(10) << " opening " << *p << " under " << *srci << dendl;
        mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));
        return;
      }
    }
  }

  // -- declare now --
  if (mdr->now == utime_t())
    mdr->now = g_clock.real_now();

  // -- create stray dentry? --
  // needed if the rename will displace a primary link at the destination.
  CDentry *straydn = 0;
  if (destdn->is_primary()) {
    straydn = mdcache->get_or_create_stray_dentry(destdn->inode);
    mdr->pin(straydn);
    dout(10) << "straydn is " << *straydn << dendl;
  }

  // -- prepare witnesses --
  /*
   * NOTE: we use _all_ replicas as witnesses.
   * this probably isn't totally necessary (esp for file renames),
   * but if/when we change that, we have to make sure rejoin is
   * sufficiently robust to handle strong rejoins from survivors
   * with totally wrong dentry->inode linkage.
   * (currently, it can ignore rename effects, because the resolve
   * stage will sort them out.)
   */
  set witnesses = mdr->more()->extra_witnesses;
  if (srcdn->is_auth())
    srcdn->list_replicas(witnesses);
  else
    witnesses.insert(srcdn->authority().first);
  destdn->list_replicas(witnesses);
  dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;

  // do srcdn auth last
  int last = -1;
  if (!srcdn->is_auth())
    last = srcdn->authority().first;

  for (set::iterator p = witnesses.begin();
       p != witnesses.end();
       ++p) {
    if (*p == last) continue;  // do it last!
    if (mdr->more()->witnessed.count(*p)) {
      dout(10) << " already witnessed by mds" << *p << dendl;
    } else if (mdr->more()->waiting_on_slave.count(*p)) {
      dout(10) << " already waiting on witness mds" << *p << dendl;
    } else {
      _rename_prepare_witness(mdr, *p, srcdn, destdn, straydn);
    }
  }
  if (!mdr->more()->waiting_on_slave.empty())
    return;  // we're waiting for a witness.
  if (last >= 0 &&
      mdr->more()->witnessed.count(last) == 0 &&
      mdr->more()->waiting_on_slave.count(last) == 0) {
    dout(10) << " preparing last witness (srcdn auth)" << dendl;
    _rename_prepare_witness(mdr, last, srcdn, destdn, straydn);
    return;
  }

  // -- prepare anchor updates --
  // linkmerge: src and dest point at the same inode (primary+remote pair).
  bool linkmerge = (srcdn->inode == destdn->inode &&
                    (srcdn->is_primary() || destdn->is_primary()));

  if (!linkmerge) {
    C_Gather *anchorgather = 0;

    if (srcdn->is_primary() && srcdn->inode->is_anchored() &&
        srcdn->dir != destdn->dir &&
        !mdr->more()->src_reanchor_atid) {
      dout(10) << "reanchoring src->dst " << *srcdn->inode << dendl;
      vector trace;
      destdn->make_anchor_trace(trace, srcdn->inode);
      anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr));
      mds->anchorclient->prepare_update(srcdn->inode->ino(), trace,
                                        &mdr->more()->src_reanchor_atid,
                                        anchorgather->new_sub());
    }
    if (destdn->is_primary() &&
        destdn->inode->is_anchored() &&
        !mdr->more()->dst_reanchor_atid) {
      dout(10) << "reanchoring dst->stray " << *destdn->inode << dendl;
      assert(straydn);
      vector trace;
      straydn->make_anchor_trace(trace, destdn->inode);
      if (!anchorgather)
        anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr));
      mds->anchorclient->prepare_update(destdn->inode->ino(), trace,
                                        &mdr->more()->dst_reanchor_atid,
                                        anchorgather->new_sub());
    }

    if (anchorgather)
      return;  // waiting for anchor prepares
  }

  // -- prepare journal entry --
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rename");
  le->metablob.add_client_req(mdr->reqid);

  _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);

  if (!srcdn->is_auth() &&
      srcdn->is_primary()) {
    // importing inode; also journal imported client map
    // ** DER FIXME **
  }

  // -- commit locally --
  C_MDS_rename_finish *fin = new C_MDS_rename_finish(mds, mdr, srcdn, destdn, straydn);

  journal_opens();  // journal pending opens, just in case

  // mark committing (needed for proper recovery)
  mdr->committing = true;

  // log + wait
  mdlog->submit_entry(le, fin);
}

/**
 * Master-side completion of a rename: apply the prepared changes,
 * commit any anchor transactions, bump popularity, and reply.
 */
void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_finish " << *mdr << dendl;

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  // commit anchor updates?
  if (mdr->more()->src_reanchor_atid)
    mds->anchorclient->commit(mdr->more()->src_reanchor_atid, mdr->ls);
  if (mdr->more()->dst_reanchor_atid)
    mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls);

  // bump popularity
  //if (srcdn->is_auth())
  //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR);
  // mds->balancer->hit_dir(mdr->now, destdn->get_dir(), META_POP_DWR);
  if (destdn->is_remote() && destdn->inode->is_auth())
    mds->balancer->hit_inode(mdr->now, destdn->get_inode(), META_POP_IWR);

  // reply
  MClientReply *reply = new MClientReply(mdr->client_request, 0);
  reply_request(mdr, reply, destdn->get_inode());  // FIXME: imprecise ref

  // clean up?
  // tail of _rename_finish: evaluate the stray dentry we may have created.
  if (straydn)
    mdcache->eval_stray(straydn);
}

// helpers

/**
 * Ask mds 'who' to witness (and journal a prepare for) this rename.
 * Sends OP_RENAMEPREP with src/dest paths, plus replicated stray
 * inode/dir/dentry if the dest's primary link is being displaced.
 */
void Server::_rename_prepare_witness(MDRequest *mdr, int who,
                                     CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_prepare_witness mds" << who << dendl;
  MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP);
  srcdn->make_path(req->srcdnpath);
  destdn->make_path(req->destdnpath);
  req->now = mdr->now;

  if (straydn) {
    // replicate stray inode, dir, and dentry into the request payload
    CInodeDiscover *indis = straydn->dir->inode->replicate_to(who);
    CDirDiscover *dirdis = straydn->dir->replicate_to(who);
    CDentryDiscover *dndis = straydn->replicate_to(who);
    indis->_encode(req->stray);
    dirdis->_encode(req->stray);
    dndis->_encode(req->stray);
    delete indis;
    delete dirdis;
    delete dndis;
  }

  // srcdn auth will verify our current witness list is sufficient
  req->witnesses = mdr->more()->witnessed;

  mds->send_message_mds(req, who);

  assert(mdr->more()->waiting_on_slave.count(who) == 0);
  mdr->more()->waiting_on_slave.insert(who);
}

/**
 * Build the journal metablob for a rename (used by both master and
 * slave prepare paths).  Handles the link-merge case, displacing an
 * existing dest (primary -> stray, or remote nlink--), relinking the
 * source as primary/remote at the dest, nulling the src dentry, an
 * inode import when we are dest auth but not src auth, and anchor
 * transactions.  Projected versions are recorded in mdr->more()->pvmap
 * for _rename_apply.
 *
 * NOTE(review): container declarations below lost their template
 * arguments to angle-bracket mangling ("list updated_scatterlocks",
 * "list ls") -- restore the <...> args before compiling.
 */
void Server::_rename_prepare(MDRequest *mdr, EMetaBlob *metablob, bufferlist *client_map_bl,
                             CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;

  // primary+remote link merge?
  bool linkmerge = (srcdn->inode == destdn->inode &&
                    (srcdn->is_primary() || destdn->is_primary()));

  // master pre-dirties the src/dest dir inodes (mtime updates)
  if (mdr->is_master()) {
    mdr->more()->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob);
    if (destdn->dir != srcdn->dir)
      mdr->more()->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob);
  }

  inode_t *ji = 0;   // journaled inode getting nlink--
  version_t ipv = 0; // it's version

  if (linkmerge) {
    dout(10) << "will merge remote+primary links" << dendl;

    // destdn -> primary
    metablob->add_dir_context(destdn->dir);
    if (destdn->is_auth())
      ipv = mdr->more()->pvmap[destdn] = destdn->pre_dirty(destdn->inode->inode.version);
    ji = metablob->add_primary_dentry(destdn, true, destdn->inode);

    // do src dentry
    metablob->add_dir_context(srcdn->dir);
    if (srcdn->is_auth())
      mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
    metablob->add_null_dentry(srcdn, true);

  } else {
    // move to stray?
    if (destdn->is_primary()) {
      // primary.  we'll move inode to stray dir.
      assert(straydn);

      // link-- inode, move to stray dir.
      metablob->add_dir_context(straydn->dir);
      if (straydn->is_auth())
        ipv = mdr->more()->pvmap[straydn] = straydn->pre_dirty(destdn->inode->inode.version);
      ji = metablob->add_primary_dentry(straydn, true, destdn->inode);
    }
    else if (destdn->is_remote()) {
      // remote.
      // nlink-- targeti
      metablob->add_dir_context(destdn->inode->get_parent_dir());
      if (destdn->inode->is_auth())
        ipv = mdr->more()->pvmap[destdn->inode] = destdn->inode->pre_dirty();
      ji = metablob->add_primary_dentry(destdn->inode->parent, true, destdn->inode);  // update primary
      dout(10) << "remote targeti (nlink--) is " << *destdn->inode << dendl;
    }
    else
      assert(destdn->is_null());

    // add dest dentry
    metablob->add_dir_context(destdn->dir);
    if (srcdn->is_primary()) {
      dout(10) << "src is a primary dentry" << dendl;
      if (destdn->is_auth()) {
        version_t siv;
        if (srcdn->is_auth())
          siv = srcdn->inode->get_projected_version();
        else {
          siv = mdr->more()->inode_import_v;

          /* import node */
          bufferlist::iterator blp = mdr->more()->inode_import.begin();

          // imported caps
          ::_decode_simple(mdr->more()->imported_client_map, blp);
          ::_encode_simple(mdr->more()->imported_client_map, *client_map_bl);
          prepare_force_open_sessions(mdr->more()->imported_client_map);

          list updated_scatterlocks;  // we clear_updated explicitly below
          mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
                                                 mdr->more()->cap_imports, updated_scatterlocks);
          srcdn->inode->dirlock.clear_updated();

          // hack: force back to !auth and clean, temporarily
          // (real auth flip happens later, in _rename_apply)
          srcdn->inode->state_clear(CInode::STATE_AUTH);
          srcdn->inode->mark_clean();
        }
        mdr->more()->pvmap[destdn] = destdn->pre_dirty(siv+1);
      }
      metablob->add_primary_dentry(destdn, true, srcdn->inode);
    } else {
      assert(srcdn->is_remote());
      dout(10) << "src is a remote dentry" << dendl;
      if (destdn->is_auth())
        mdr->more()->pvmap[destdn] = destdn->pre_dirty();
      metablob->add_remote_dentry(destdn, true, srcdn->get_remote_ino());
    }

    // remove src dentry
    metablob->add_dir_context(srcdn->dir);
    if (srcdn->is_auth())
      mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
    metablob->add_null_dentry(srcdn, true);

    // new subtree?
    // a renamed primary dir makes its nested dirfrags subtree roots
    // pinned at the (old) src authority.
    if (srcdn->is_primary() &&
        srcdn->inode->is_dir()) {
      list ls;
      srcdn->inode->get_nested_dirfrags(ls);
      int auth = srcdn->authority().first;
      for (list::iterator p = ls.begin(); p != ls.end(); ++p)
        mdcache->adjust_subtree_auth(*p, auth, auth);
    }
  }

  if (ipv) {
    // update journaled target inode
    inode_t *pi = destdn->inode->project_inode();
    pi->nlink--;
    pi->ctime = mdr->now;
    pi->version = ipv;
    *ji = *pi;  // copy into journal
  }

  // anchor updates?
  if (mdr->more()->src_reanchor_atid)
    metablob->add_anchor_transaction(mdr->more()->src_reanchor_atid);
  if (mdr->more()->dst_reanchor_atid)
    metablob->add_anchor_transaction(mdr->more()->dst_reanchor_atid);
}

/**
 * Apply a prepared rename to the in-memory cache, mirroring the cases
 * journaled by _rename_prepare: link merge, displacing the old dest
 * (to stray or remote nlink--), relinking src at dest (including the
 * cap/auth fixups for an imported inode), dirtying with the projected
 * versions from mdr->more()->pvmap, and updating the subtree map.
 */
void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
  dout(10) << " pvs " << mdr->more()->pvmap << dendl;

  CInode *oldin = destdn->inode;

  // primary+remote link merge?
  bool linkmerge = (srcdn->inode == destdn->inode &&
                    (srcdn->is_primary() || destdn->is_primary()));

  // dir mtimes
  if (mdr->is_master()) {
    dirty_dn_diri(mdr, destdn, mdr->more()->pvmap[destdn->dir->inode]);
    if (destdn->dir != srcdn->dir)
      dirty_dn_diri(mdr, srcdn, mdr->more()->pvmap[srcdn->dir->inode]);
  }

  if (linkmerge) {
    if (destdn->is_primary()) {
      dout(10) << "merging remote onto primary link" << dendl;

      // nlink-- in place
      destdn->inode->inode.nlink--;
      destdn->inode->inode.ctime = mdr->now;
      if (destdn->inode->is_auth())
        destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);

      // unlink srcdn
      srcdn->dir->unlink_inode(srcdn);
      if (srcdn->is_auth())
        srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
    } else {
      dout(10) << "merging primary onto remote link" << dendl;
      assert(srcdn->is_primary());

      // move inode to dest
      srcdn->dir->unlink_inode(srcdn);
      destdn->dir->unlink_inode(destdn);
      destdn->dir->link_primary_inode(destdn, oldin);

      // nlink--
      destdn->inode->inode.nlink--;
      destdn->inode->inode.ctime = mdr->now;
      if (destdn->inode->is_auth())
        destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);

      // mark src dirty
      if (srcdn->is_auth())
        srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
    }
  }
  else {
    // unlink destdn?
    if (!destdn->is_null())
      destdn->dir->unlink_inode(destdn);

    if (straydn) {
      dout(10) << "straydn is " << *straydn << dendl;

      // relink oldin to stray dir.  destdn was primary.
      assert(oldin);
      straydn->dir->link_primary_inode(straydn, oldin);
      //assert(straypv == ipv);

      // nlink-- in stray dir.
      oldin->inode.nlink--;
      oldin->inode.ctime = mdr->now;
      if (oldin->is_auth())
        oldin->pop_and_dirty_projected_inode(mdr->ls);
    }
    else if (oldin) {
      // nlink-- remote.  destdn was remote.
      oldin->inode.nlink--;
      oldin->inode.ctime = mdr->now;
      if (oldin->is_auth())
        oldin->pop_and_dirty_projected_inode(mdr->ls);
    }

    CInode *in = srcdn->inode;
    assert(in);
    if (srcdn->is_remote()) {
      // srcdn was remote.
      srcdn->dir->unlink_inode(srcdn);
      destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode));
      destdn->link_remote(in);
      if (destdn->is_auth())
        destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
    } else {
      // srcdn was primary.
      srcdn->dir->unlink_inode(srcdn);
      destdn->dir->link_primary_inode(destdn, in);

      // srcdn inode import?
      if (!srcdn->is_auth() && destdn->is_auth()) {
        assert(mdr->more()->inode_import.length() > 0);

        // finish cap imports
        finish_force_open_sessions(mdr->more()->imported_client_map);
        if (mdr->more()->cap_imports.count(destdn->inode))
          mds->mdcache->migrator->finish_import_inode_caps(destdn->inode, srcdn->authority().first,
                                                           mdr->more()->cap_imports[destdn->inode]);

        // hack: fix auth bit (undoes the temporary clear in _rename_prepare)
        destdn->inode->state_set(CInode::STATE_AUTH);
      }
      if (destdn->inode->is_auth())
        destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
    }

    if (srcdn->is_auth())
      srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
  }

  // update subtree map?
  if (destdn->is_primary() && destdn->inode->is_dir())
    mdcache->adjust_subtree_after_rename(destdn->inode, srcdn->dir);

  // removing a new dn?
  // tail of _rename_apply: clean up src dentry if it was never committed.
  if (srcdn->is_auth())
    srcdn->dir->try_remove_unlinked_dn(srcdn);
}

// ------------
// SLAVE

/**
 * Completion for the slave's prepare journal entry; forwards to
 * _logged_slave_rename.
 */
class C_MDS_SlaveRenamePrep : public Context {
  Server *server;
  MDRequest *mdr;
  CDentry *srcdn, *destdn, *straydn;
public:
  C_MDS_SlaveRenamePrep(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) :
    server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
  void finish(int r) {
    server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
  }
};

/**
 * Commit/abort callback stored in mdr->more()->slave_commit; forwards
 * to _commit_slave_rename (r==0 commit, otherwise rollback).
 */
class C_MDS_SlaveRenameCommit : public Context {
  Server *server;
  MDRequest *mdr;
  CDentry *srcdn, *destdn, *straydn;
public:
  C_MDS_SlaveRenameCommit(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) :
    server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
  void finish(int r) {
    server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
  }
};

/**
 * Slave side of OP_RENAMEPREP: discover src/dest dentries, replicate the
 * stray if needed, freeze an exported srci, verify the master's witness
 * list covers all srcdn replicas, then journal (or skip) a prepare.
 *
 * NOTE(review): container declarations below lost their template
 * arguments to angle-bracket mangling ("vector trace", "set srcdnrep")
 * -- restore the <...> args before compiling.
 */
void Server::handle_slave_rename_prep(MDRequest *mdr)
{
  dout(10) << "handle_slave_rename_prep " << *mdr
           << " " << mdr->slave_request->srcdnpath
           << " to " << mdr->slave_request->destdnpath
           << dendl;

  // discover destdn
  filepath destpath(mdr->slave_request->destdnpath);
  dout(10) << " dest " << destpath << dendl;
  vector trace;
  int r = mdcache->path_traverse(mdr, mdr->slave_request,
                                 destpath, trace, false,
                                 MDS_TRAVERSE_DISCOVERXLOCK);
  if (r > 0) return;
  assert(r == 0);  // we shouldn't get an error here!

  CDentry *destdn = trace[trace.size()-1];
  dout(10) << " destdn " << *destdn << dendl;
  mdr->pin(destdn);

  // discover srcdn
  filepath srcpath(mdr->slave_request->srcdnpath);
  dout(10) << " src " << srcpath << dendl;
  r = mdcache->path_traverse(mdr, mdr->slave_request,
                             srcpath, trace, false,
                             MDS_TRAVERSE_DISCOVERXLOCK);
  if (r > 0) return;
  assert(r == 0);

  CDentry *srcdn = trace[trace.size()-1];
  dout(10) << " srcdn " << *srcdn << dendl;
  mdr->pin(srcdn);
  assert(srcdn->inode);
  mdr->pin(srcdn->inode);

  // stray?
  CDentry *straydn = 0;
  if (destdn->is_primary()) {
    // master included replicated stray inode/dir/dentry in the request
    assert(mdr->slave_request->stray.length() > 0);
    straydn = mdcache->add_replica_stray(mdr->slave_request->stray, destdn->inode, mdr->slave_to_mds);
    assert(straydn);
    mdr->pin(straydn);
  }

  mdr->now = mdr->slave_request->now;

  // set up commit waiter (early, to clean up any freezing etc we do)
  if (!mdr->more()->slave_commit)
    mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);

  // am i srcdn auth?
  if (srcdn->is_auth()) {
    if (srcdn->is_primary() &&
        !srcdn->inode->is_freezing_inode() &&
        !srcdn->inode->is_frozen_inode()) {
      // srci auth.
      // set ambiguous auth.
      srcdn->inode->state_set(CInode::STATE_AMBIGUOUSAUTH);

      // freeze?
      // we need this to
      //  - avoid conflicting lock state changes
      //  - avoid concurrent updates to the inode
      //     (this could also be accomplished with the versionlock)
      int allowance = 1; // for the versionlock and possible linklock xlock (both are tied to mdr)
      dout(10) << " freezing srci " << *srcdn->inode << " with allowance " << allowance << dendl;
      if (!srcdn->inode->freeze_inode(allowance)) {
        srcdn->inode->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
        return;
      }
    }

    // is witness list sufficient?
    set srcdnrep;
    srcdn->list_replicas(srcdnrep);
    for (set::iterator p = srcdnrep.begin();
         p != srcdnrep.end();
         ++p) {
      if (*p == mdr->slave_to_mds ||
          mdr->slave_request->witnesses.count(*p)) continue;
      dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
      // bounce back the full replica list; master will re-prepare
      MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK);
      reply->witnesses.swap(srcdnrep);
      mds->send_message_mds(reply, mdr->slave_to_mds);
      delete mdr->slave_request;
      mdr->slave_request = 0;
      return;
    }
    dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
  }

  // journal it?
  if (srcdn->is_auth() ||
      (destdn->inode && destdn->inode->is_auth()) ||
      srcdn->inode->is_any_caps()) {
    // journal.
    mdr->ls = mdlog->get_current_segment();
    ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE);

    // rollback case
    if (destdn->inode && destdn->inode->is_auth()) {
      assert(destdn->is_remote());
      le->rollback.add_dir_context(destdn->dir);
      le->rollback.add_dentry(destdn, true);
    }
    if (srcdn->is_auth() ||
        (srcdn->inode && srcdn->inode->is_auth())) {
      le->rollback.add_dir_context(srcdn->dir);
      le->rollback.add_dentry(srcdn, true);
    }

    // commit case
    bufferlist blah;
    _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);

    mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn));
  } else {
    // don't journal.
    dout(10) << "not journaling, i'm not auth for anything, and srci isn't open" << dendl;

    // prepare anyway; this may twiddle dir_auth
    EMetaBlob blob;
    bufferlist blah;
    _rename_prepare(mdr, &blob, &blah, srcdn, destdn, straydn);
    _logged_slave_rename(mdr, srcdn, destdn, straydn);
  }
}

/**
 * Runs after the slave's prepare is journaled (or skipped): exports
 * srci (inode + client/cap state) to the master if we are srcdn auth,
 * applies the rename locally, and acks with OP_RENAMEPREPACK.
 *
 * NOTE(review): "list finished" / "map exported_client_map" below are
 * missing their template arguments (angle-bracket mangling).
 */
void Server::_logged_slave_rename(MDRequest *mdr,
                                  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_logged_slave_rename " << *mdr << dendl;

  // prepare ack
  MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK);

  // export srci?
  if (srcdn->is_auth() && srcdn->is_primary()) {
    list finished;
    map exported_client_map;
    bufferlist inodebl;
    mdcache->migrator->encode_export_inode(srcdn->inode, inodebl, exported_client_map);
    mdcache->migrator->finish_export_inode(srcdn->inode, mdr->now, finished);
    mds->queue_waiters(finished);   // this includes SINGLEAUTH waiters.

    ::_encode(exported_client_map, reply->inode_export);
    reply->inode_export.claim_append(inodebl);
    reply->inode_export_v = srcdn->inode->inode.version;

    // remove mdr auth pin
    mdr->auth_unpin(srcdn->inode);
    assert(!srcdn->inode->is_auth_pinned());

    dout(10) << " exported srci " << *srcdn->inode << dendl;
  }

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  mds->send_message_mds(reply, mdr->slave_to_mds);

  // bump popularity
  //if (srcdn->is_auth())
  //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR);
  if (destdn->inode && destdn->inode->is_auth())
    mds->balancer->hit_inode(mdr->now, destdn->inode, META_POP_IWR);

  // done.
  delete mdr->slave_request;
  mdr->slave_request = 0;
}

/**
 * Slave commit/abort.  On commit (r==0) just journals OP_COMMIT.
 * On abort, journals OP_ROLLBACK and reverts the in-memory linkage
 * (link merge or normal case) -- but the rollback path is unfinished
 * and currently asserts (see "*** WRITE ME ***" below).
 * In both cases first unfreezes/de-ambiguates an exported srci.
 *
 * NOTE(review): "list finished" below is missing its template args.
 */
void Server::_commit_slave_rename(MDRequest *mdr, int r,
                                  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;

  // unfreeze+singleauth inode
  //  hmm, do i really need to delay this?
  if (srcdn->is_auth() && destdn->is_primary()) {
    dout(10) << " unfreezing exported inode " << *destdn->inode << dendl;
    list finished;

    // singleauth
    assert(destdn->inode->state_test(CInode::STATE_AMBIGUOUSAUTH));
    destdn->inode->state_clear(CInode::STATE_AMBIGUOUSAUTH);
    destdn->inode->take_waiting(CInode::WAIT_SINGLEAUTH, finished);

    // unfreeze
    assert(destdn->inode->is_frozen_inode() ||
           destdn->inode->is_freezing_inode());
    destdn->inode->unfreeze_inode(finished);

    mds->queue_waiters(finished);
  }

  ESlaveUpdate *le;
  if (r == 0) {
    // write a commit to the journal
    le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT);
  } else {
    // abort
    le = new ESlaveUpdate(mdlog, "slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK);

    // -- rollback in memory --
    if (mdr->more()->was_link_merge) {
      // link merge
      CInode *in = destdn->inode;
      in->inode.nlink++;
      if (mdr->more()->destdn_was_remote_inode) {
        destdn->dir->unlink_inode(destdn);
        srcdn->dir->link_primary_inode(srcdn, in);
        destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode));
      } else {
        srcdn->dir->link_remote_inode(srcdn, in->ino(), MODE_TO_DT(in->inode.mode));
      }
    } else {
      // normal

      // revert srcdn
      if (destdn->is_remote()) {
        srcdn->dir->link_remote_inode(srcdn, destdn->inode->ino(), MODE_TO_DT(destdn->inode->inode.mode));
        destdn->dir->unlink_inode(destdn);
      } else {
        // renamed a primary
        CInode *in = destdn->inode;
        destdn->dir->unlink_inode(destdn);
        srcdn->dir->link_primary_inode(srcdn, in);
      }

      // revert destdn
      if (mdr->more()->destdn_was_remote_inode) {
        destdn->dir->link_remote_inode(destdn, mdr->more()->destdn_was_remote_inode->ino(),
                                       MODE_TO_DT(mdr->more()->destdn_was_remote_inode->inode.mode));
        mdr->more()->destdn_was_remote_inode->inode.nlink++;
      } else if (straydn && straydn->inode) {
        CInode *in = straydn->inode;
        straydn->dir->unlink_inode(straydn);
        destdn->dir->link_primary_inode(destdn, in);
        straydn->dir->remove_dentry(straydn);
      }
    }

    // FIXME: reverse srci export?

    dout(-10) << " srcdn back to " << *srcdn << dendl;
    dout(-10) << " srci back to " << *srcdn->inode << dendl;
    dout(-10) << " destdn back to " << *destdn << dendl;
    if (destdn->inode) dout(-10) << " desti back to " << *destdn->inode << dendl;

    // *** WRITE ME ***
    assert(0);
  }

  mdlog->submit_entry(le);
}

/**
 * Master side: handle a slave's OP_RENAMEPREPACK.  Either records the
 * slave as a witness, or absorbs its extra witness (srcdn replica)
 * list for another prepare round; also receives the exported srci.
 */
void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
{
  dout(10) << "handle_slave_rename_prep_ack " << *mdr
           << " witnessed by " << ack->get_source()
           << " " << *ack << dendl;
  int from = ack->get_source().num();

  // note slave
  mdr->more()->slaves.insert(from);

  // witnessed?  or add extra witnesses?
  assert(mdr->more()->witnessed.count(from) == 0);
  if (ack->witnesses.empty()) {
    mdr->more()->witnessed.insert(from);
  } else {
    dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
    mdr->more()->extra_witnesses.swap(ack->witnesses);
    mdr->more()->extra_witnesses.erase(mds->get_nodeid());  // not me!
  }

  // srci import?
  // tail of handle_slave_rename_prep_ack: take the exported srci blob.
  if (ack->inode_export.length()) {
    dout(10) << " got srci import" << dendl;
    mdr->more()->inode_import.claim(ack->inode_export);
    mdr->more()->inode_import_v = ack->inode_export_v;
  }

  // remove from waiting list
  assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  if (mdr->more()->waiting_on_slave.empty())
    dispatch_client_request(mdr);  // go again!
  else
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}

// ===================================
// TRUNCATE, FSYNC

/**
 * Final stage of truncate: after the old data has been purged, apply
 * the new size/times to the cached inode and reply to the client.
 */
class C_MDS_truncate_purged : public Context {
  MDS *mds;
  MDRequest *mdr;
  CInode *in;
  version_t pv;
  off_t size;
  utime_t ctime;
public:
  C_MDS_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) :
    mds(m), mdr(r), in(i),
    pv(pdv), size(sz), ctime(ct) { }
  void finish(int r) {
    assert(r == 0);

    // apply to cache
    in->inode.size = size;
    in->inode.ctime = ctime;
    in->inode.mtime = ctime;
    in->mark_dirty(pv, mdr->ls);

    // reply
    mds->server->reply_request(mdr, 0);
  }
};

/**
 * Middle stage of truncate: after the EUpdate is journaled, purge the
 * truncated-away data range, then chain to C_MDS_truncate_purged.
 */
class C_MDS_truncate_logged : public Context {
  MDS *mds;
  MDRequest *mdr;
  CInode *in;
  version_t pv;
  off_t size;
  utime_t ctime;
public:
  C_MDS_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) :
    mds(m), mdr(r), in(i),
    pv(pdv), size(sz), ctime(ct) { }
  void finish(int r) {
    assert(r == 0);

    // purge
    mds->mdcache->purge_inode(in, size, in->inode.size, mdr->ls);
    mds->mdcache->wait_for_purge(in, size,
                                 new C_MDS_truncate_purged(mds, mdr, in, pv, size, ctime));
  }
};

/**
 * Handle a client truncate: xlock the file, journal the size change,
 * then purge + apply via the two contexts above.
 *
 * NOTE(review): the "set rdlocks/wrlocks/xlocks" declarations below are
 * missing their template arguments (angle-bracket mangling).
 */
void Server::handle_client_truncate(MDRequest *mdr)
{
  MClientRequest *req = mdr->client_request;
  CInode *cur = rdlock_path_pin_ref(mdr, true);
  if (!cur) return;

  // check permissions?

  // xlock inode
  set rdlocks = mdr->rdlocks;
  set wrlocks = mdr->wrlocks;
  set xlocks = mdr->xlocks;
  xlocks.insert(&cur->filelock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // already small enough?
  if (cur->inode.size <= req->head.args.truncate.length) {
    reply_request(mdr, 0);
    return;
  }

  // prepare
  version_t pdv = cur->pre_dirty();
  utime_t ctime = g_clock.real_now();
  Context *fin = new C_MDS_truncate_logged(mds, mdr, cur, pdv,
                                           req->head.args.truncate.length, ctime);

  // log + wait
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "truncate");
  le->metablob.add_client_req(mdr->reqid);
  le->metablob.add_dir_context(cur->get_parent_dir());
  le->metablob.add_inode_truncate(cur->ino(), req->head.args.truncate.length, cur->inode.size);
  inode_t *pi = le->metablob.add_dentry(cur->parent, true);
  pi->mtime = ctime;
  pi->ctime = ctime;
  pi->version = pdv;
  pi->size = req->head.args.truncate.length;

  mdlog->submit_entry(le, fin);
}

// ===========================
// open, openc, close

/**
 * Handle a client open: validate the target type and flags, take the
 * locks O_TRUNC needs, then issue caps via _do_open.
 */
void Server::handle_client_open(MDRequest *mdr)
{
  MClientRequest *req = mdr->client_request;

  int flags = req->head.args.open.flags;
  int cmode = req->get_open_file_mode();
  // writes (and O_TRUNC) must be handled by the auth MDS
  bool need_auth = ((cmode != FILE_MODE_R && cmode != FILE_MODE_LAZY) ||
                    (flags & O_TRUNC));
  dout(10) << "open flags = " << flags
           << ", filemode = " << cmode
           << ", need_auth = " << need_auth
           << dendl;

  CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
  if (!cur) return;

  // regular file?
  if (!cur->inode.is_file() && !cur->inode.is_dir()) {
    dout(7) << "not a file or dir " << *cur << dendl;
    reply_request(mdr, -EINVAL);  // FIXME what error do we want?
    return;
  }
  // can only open a dir rdonly, no flags.
  if (cur->inode.is_dir() && (cmode != FILE_MODE_R || flags != 0)) {
    reply_request(mdr, -EINVAL);
    return;
  }

  // hmm, check permissions or something.
// O_TRUNC if (flags & O_TRUNC) { assert(cur->is_auth()); // xlock file size set rdlocks = mdr->rdlocks; set wrlocks = mdr->wrlocks; set xlocks = mdr->xlocks; xlocks.insert(&cur->filelock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; if (cur->inode.size > 0) { handle_client_opent(mdr); return; } } // do it _do_open(mdr, cur); } void Server::_do_open(MDRequest *mdr, CInode *cur) { MClientRequest *req = mdr->client_request; int cmode = req->get_open_file_mode(); // can we issue the caps they want? //version_t fdv = mds->locker->issue_file_data_version(cur); Capability *cap = mds->locker->issue_new_caps(cur, cmode, req); if (!cap) return; // can't issue (yet), so wait! dout(12) << "_do_open issuing caps " << cap_string(cap->pending()) << " for " << req->get_source() << " on " << *cur << dendl; // hit pop mdr->now = g_clock.now(); if (cmode == FILE_MODE_RW || cmode == FILE_MODE_W) mds->balancer->hit_inode(mdr->now, cur, META_POP_IWR); else mds->balancer->hit_inode(mdr->now, cur, META_POP_IRD, mdr->client_request->get_client_inst().name.num()); // reply MClientReply *reply = new MClientReply(req, 0); reply->set_file_caps(cap->pending()); reply->set_file_caps_seq(cap->get_last_seq()); //reply->set_file_data_version(fdv); reply_request(mdr, reply, cur); // journal? 
if (cur->last_open_journaled == 0) { queue_journal_open(cur); maybe_journal_opens(); } } void Server::queue_journal_open(CInode *in) { dout(10) << "queue_journal_open on " << *in << dendl; if (journal_open_queue.count(in) == 0) { // pin so our pointer stays valid in->get(CInode::PIN_BATCHOPENJOURNAL); // queue it up for a bit journal_open_queue.insert(in); } } void Server::journal_opens() { dout(10) << "journal_opens " << journal_open_queue.size() << " inodes" << dendl; if (journal_open_queue.empty()) return; EOpen *le = 0; // check queued inodes LogSegment *ls = mdlog->get_current_segment(); for (set::iterator p = journal_open_queue.begin(); p != journal_open_queue.end(); ++p) { CInode *in = *p; in->put(CInode::PIN_BATCHOPENJOURNAL); if (in->is_any_caps()) { if (!le) le = new EOpen(mdlog); le->add_inode(in); in->last_open_journaled = mds->mdlog->get_write_pos(); ls->open_files.push_back(&in->xlist_open_file); } } journal_open_queue.clear(); if (le) { // journal mdlog->submit_entry(le); // add waiters to journal entry for (list::iterator p = journal_open_waiters.begin(); p != journal_open_waiters.end(); ++p) mds->mdlog->wait_for_sync(*p); journal_open_waiters.clear(); } else { // nothing worth journaling here, just kick the waiters. 
mds->queue_waiters(journal_open_waiters); } } class C_MDS_open_truncate_purged : public Context { MDS *mds; MDRequest *mdr; CInode *in; version_t pv; utime_t ctime; public: C_MDS_open_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) : mds(m), mdr(r), in(i), pv(pdv), ctime(ct) { } void finish(int r) { assert(r == 0); // apply to cache in->inode.size = 0; in->inode.ctime = ctime; in->inode.mtime = ctime; in->mark_dirty(pv, mdr->ls); // do the open mds->server->_do_open(mdr, in); } }; class C_MDS_open_truncate_logged : public Context { MDS *mds; MDRequest *mdr; CInode *in; version_t pv; utime_t ctime; public: C_MDS_open_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) : mds(m), mdr(r), in(i), pv(pdv), ctime(ct) { } void finish(int r) { assert(r == 0); // hit pop mds->balancer->hit_inode(mdr->now, in, META_POP_IWR); // purge also... mds->mdcache->purge_inode(in, 0, in->inode.size, mdr->ls); mds->mdcache->wait_for_purge(in, 0, new C_MDS_open_truncate_purged(mds, mdr, in, pv, ctime)); } }; void Server::handle_client_opent(MDRequest *mdr) { CInode *cur = mdr->ref; assert(cur); // prepare version_t pdv = cur->pre_dirty(); utime_t ctime = g_clock.real_now(); Context *fin = new C_MDS_open_truncate_logged(mds, mdr, cur, pdv, ctime); // log + wait mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "open_truncate"); le->metablob.add_client_req(mdr->reqid); le->metablob.add_dir_context(cur->get_parent_dir()); le->metablob.add_inode_truncate(cur->ino(), 0, cur->inode.size); inode_t *pi = le->metablob.add_dentry(cur->parent, true); pi->mtime = ctime; pi->ctime = ctime; pi->version = pdv; pi->size = 0; mdlog->submit_entry(le, fin); } class C_MDS_openc_finish : public Context { MDS *mds; MDRequest *mdr; CDentry *dn; CInode *newi; version_t pv; public: C_MDS_openc_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni) : mds(m), mdr(r), dn(d), newi(ni), pv(d->get_projected_version()) {} void finish(int r) { 
assert(r == 0); // link the inode dn->get_dir()->link_primary_inode(dn, newi); // dirty inode, dn, dir newi->mark_dirty(pv, mdr->ls); // downgrade xlock to rdlock //mds->locker->dentry_xlock_downgrade_to_rdlock(dn, mdr); // set/pin ref inode for open() mdr->ref = newi; mdr->pin(newi); // ok, do the open. mds->server->handle_client_open(mdr); } }; void Server::handle_client_openc(MDRequest *mdr) { MClientRequest *req = mdr->client_request; dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl; bool excl = (req->head.args.open.flags & O_EXCL); CDentry *dn = rdlock_path_xlock_dentry(mdr, !excl, false); if (!dn) return; if (!dn->is_null()) { // it existed. if (req->head.args.open.flags & O_EXCL) { dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl; reply_request(mdr, -EEXIST, dn->get_dir()->get_inode()); return; } // pass to regular open handler. handle_client_open(mdr); return; } // created null dn. // create inode. mdr->now = g_clock.real_now(); CInode *in = prepare_new_inode(mdr, dn->dir); assert(in); // it's a file. in->inode.mode = req->head.args.open.mode; in->inode.mode |= S_IFREG; in->inode.version = dn->pre_dirty() - 1; // prepare finisher C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in); mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "openc"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(in->ino(), mds->idalloc->get_version()); le->metablob.add_dir_context(dn->dir); le->metablob.add_primary_dentry(dn, true, in, &in->inode); // log + wait mdlog->submit_entry(le, fin); /* FIXME. this needs to be rewritten when the write capability stuff starts getting journaled. */ }