// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "MDS.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "AnchorClient.h"
#include "IdAllocator.h"

#include "msg/Messenger.h"

#include "messages/MClientSession.h"
#include "messages/MClientRequest.h"
#include "messages/MClientReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientFileCaps.h"

#include "messages/MMDSSlaveRequest.h"

#include "messages/MLock.h"

#include "messages/MDentryUnlink.h"

#include "events/EString.h"
#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"

#include "include/filepath.h"
#include "common/Timer.h"
#include "common/Logger.h"
#include "common/LogType.h"

#include <errno.h>
#include <fcntl.h>

#include <list>
#include <iostream>
using namespace std;

#include "config.h"

#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".server "
#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".server "
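
// The Server subsystem covers the client-facing side of the MDS:
//  - client session open/close (handle_client_session)
//  - client requests: stat, readdir, mknod, mkdir, symlink, link,
//    unlink/rmdir, rename, open, etc. (handle_client_request)
//  - client reconnect after an MDS restart (handle_client_reconnect)
//  - slave requests sent by other MDSes on behalf of a master request
//    (handle_slave_request)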


void Server::reopen_logger(utime_t start, bool append)
{
  static LogType mdserver_logtype;
  static bool didit = false;
  if (!didit) {
    didit = true;
    mdserver_logtype.add_inc("hcreq");  // handle client req
    mdserver_logtype.add_inc("hsreq");  // slave
    mdserver_logtype.add_inc("hcsess"); // client session
    mdserver_logtype.add_inc("dcreq");  // dispatch client req
    mdserver_logtype.add_inc("dsreq");  // slave
  }

  if (logger)
    delete logger;

  // logger
  char name[80];
  sprintf(name, "mds%d.server", mds->get_nodeid());
  logger = new Logger(name, &mdserver_logtype, append);
  logger->set_start(start);
}
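
// Message entry point.  Note the ordering below: client reconnects are
// handled even before the MDS is active (they arrive during the reconnect
// phase of recovery); everything else waits until the MDS is active or
// stopping.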


void Server::dispatch(Message *m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect((MClientReconnect*)m);
    return;
  }

  // active?
  if (!mds->is_active() && !mds->is_stopping()) {
    dout(3) << "not active yet, waiting" << dendl;
    mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
    return;
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session((MClientSession*)m);
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request((MClientRequest*)m);
    return;
  case MSG_MDS_SLAVE_REQUEST:
    handle_slave_request((MMDSSlaveRequest*)m);
    return;
  }

  dout(1) << "server unknown message " << m->get_type() << dendl;
  assert(0);
}



// ----------------------------------------------------------
// SESSION management


class C_MDS_session_finish : public Context {
  MDS *mds;
  entity_inst_t client_inst;
  bool open;
  version_t cmapv;
public:
  C_MDS_session_finish(MDS *m, entity_inst_t ci, bool s, version_t mv) :
    mds(m), client_inst(ci), open(s), cmapv(mv) { }
  void finish(int r) {
    assert(r == 0);
    mds->server->_session_logged(client_inst, open, cmapv);
  }
};


void Server::handle_client_session(MClientSession *m)
{
  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  int from = m->get_source().num();
  bool open = m->op == MClientSession::OP_REQUEST_OPEN;

  if (open) {
    if (mds->clientmap.have_session(from)) {
      dout(10) << "already open, dropping this req" << dendl;
      delete m;
      return;
    }
    if (mds->clientmap.is_opening(from)) {
      dout(10) << "already opening, dropping this req" << dendl;
      delete m;
      return;
    }
    mds->clientmap.add_opening(from);
  } else {
    if (mds->clientmap.is_closing(from)) {
      dout(10) << "already closing, dropping this req" << dendl;
      delete m;
      return;
    }
    if (m->seq < mds->clientmap.get_push_seq(from)) {
      dout(10) << "old push seq " << m->seq << " < " << mds->clientmap.get_push_seq(from)
               << ", dropping" << dendl;
      delete m;
      return;
    }
    assert(m->seq == mds->clientmap.get_push_seq(from));

    mds->clientmap.add_closing(from);
  }

  // journal it
  version_t cmapv = mds->clientmap.inc_projected();
  dout(10) << " clientmap v " << mds->clientmap.get_version() << " pv " << cmapv << dendl;
  mdlog->submit_entry(new ESession(m->get_source_inst(), open, cmapv),
                      new C_MDS_session_finish(mds, m->get_source_inst(), open, cmapv));
  delete m;

  if (logger) logger->inc("hcsess");
}
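
// Session state changes are journaled before they are applied: the ESession
// event above carries the projected clientmap version, and
// C_MDS_session_finish calls _session_logged() once the event is safely on
// disk.  That is where the clientmap is actually updated and where the
// client receives its OP_OPEN/OP_CLOSE reply.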

void Server::_session_logged(entity_inst_t client_inst, bool open, version_t cmapv)
{
  dout(10) << "_session_logged " << client_inst << " " << (open ? "open":"close")
           << " " << cmapv
           << dendl;

  // apply
  int from = client_inst.name.num();
  if (open) {
    assert(mds->clientmap.is_opening(from));
    mds->clientmap.open_session(client_inst);
  } else if (mds->clientmap.is_closing(from)) {
    mds->clientmap.close_session(from);

    // purge completed requests from clientmap
    mds->clientmap.trim_completed_requests(client_inst.name, 0);
  } else {
    // close must have been canceled (by an import?) ...
    assert(!open);
    mds->clientmap.noop();
  }

  assert(cmapv == mds->clientmap.get_version());

  // reply
  if (open)
    mds->messenger->send_message(new MClientSession(MClientSession::OP_OPEN), client_inst);
  else
    mds->messenger->send_message(new MClientSession(MClientSession::OP_CLOSE), client_inst);
}

void Server::prepare_force_open_sessions(map<int,entity_inst_t>& cm)
{
  version_t cmapv = mds->clientmap.inc_projected();
  dout(10) << "prepare_force_open_sessions " << cmapv
           << " on " << cm.size() << " clients"
           << dendl;
  for (map<int,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    mds->clientmap.add_opening(p->first);
  }
}

void Server::finish_force_open_sessions(map<int,entity_inst_t>& cm)
{
  version_t v = mds->clientmap.get_version();
  dout(10) << "finish_force_open_sessions on " << cm.size() << " clients, v " << v << " -> " << (v+1) << dendl;
  for (map<int,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    if (mds->clientmap.is_closing(p->first)) {
      dout(15) << "force_open_sessions canceling close on " << p->second << dendl;
      mds->clientmap.remove_closing(p->first);
      continue;
    }
    if (mds->clientmap.have_session(p->first)) {
      dout(15) << "force_open_sessions have session " << p->second << dendl;
      continue;
    }

    dout(10) << "force_open_sessions opening " << p->second << dendl;
    mds->clientmap.open_session(p->second);
    mds->messenger->send_message(new MClientSession(MClientSession::OP_OPEN), p->second);
  }
  mds->clientmap.set_version(v+1);
}


void Server::terminate_sessions()
{
  dout(2) << "terminate_sessions" << dendl;

  // kill them off.  clients will retry etc.
  for (set<int>::const_iterator p = mds->clientmap.get_session_set().begin();
       p != mds->clientmap.get_session_set().end();
       ++p) {
    if (mds->clientmap.is_closing(*p))
      continue;
    mds->clientmap.add_closing(*p);
    version_t cmapv = mds->clientmap.inc_projected();
    mdlog->submit_entry(new ESession(mds->clientmap.get_inst(*p), false, cmapv),
                        new C_MDS_session_finish(mds, mds->clientmap.get_inst(*p), false, cmapv));
  }
}
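
// Reconnect phase: after a restart the MDS broadcasts its mdsmap and waits
// for every previously known client session to send an MClientReconnect
// describing the caps it still holds; the client_reconnect_gather set below
// tracks which clients we are still waiting for.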


void Server::reconnect_clients()
{
  // reconnect with clients
  if (mds->clientmap.get_session_set().empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  dout(7) << "reconnect_clients -- sending mdsmap to clients with sessions" << dendl;

  mds->bcast_mds_map();  // send mdsmap to all client sessions

  // init gather list
  reconnect_start = g_clock.now();
  client_reconnect_gather = mds->clientmap.get_session_set();
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
}
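
// Each MClientReconnect lists the caps the client still holds.  There are
// three cases per inode: (1) we have the inode and are auth -- record the
// cap and remember the inode in reconnected_caps; (2) we are not auth (or
// the path isn't ours) -- tell the client its caps are stale/exported and
// queue the cap for export to the authority during rejoin; (3) we are auth
// but haven't loaded the inode yet -- queue it to be recovered later.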

void Server::handle_client_reconnect(MClientReconnect *m)
{
  dout(7) << "handle_client_reconnect " << m->get_source() << dendl;
  int from = m->get_source().num();

  if (m->closed) {
    dout(7) << " client had no session, removing from clientmap" << dendl;

    mds->clientmap.add_closing(from);
    version_t cmapv = mds->clientmap.inc_projected();
    mdlog->submit_entry(new ESession(mds->clientmap.get_inst(from), false, cmapv),
                        new C_MDS_session_finish(mds, mds->clientmap.get_inst(from), false, cmapv));

  } else {

    // caps
    for (map<inodeno_t, inode_caps_reconnect_t>::iterator p = m->inode_caps.begin();
         p != m->inode_caps.end();
         ++p) {
      CInode *in = mdcache->get_inode(p->first);
      if (in && in->is_auth()) {
        // we recovered it, and it's ours.  take note.
        dout(15) << "open caps on " << *in << dendl;
        in->reconnect_cap(from, p->second);
        reconnected_caps.insert(in);
        continue;
      }

      filepath path = m->inode_path[p->first];
      if ((in && !in->is_auth()) ||
          !mds->mdcache->path_is_mine(path)) {
        // not mine.
        dout(0) << "non-auth " << p->first << " " << m->inode_path[p->first]
                << ", will pass off to authority" << dendl;

        // mark client caps stale.
        inode_t fake_inode;
        fake_inode.ino = p->first;
        MClientFileCaps *stale = new MClientFileCaps(MClientFileCaps::OP_EXPORT,
                                                     fake_inode,
                                                     0,
                                                     0,                 // doesn't matter.
                                                     p->second.wanted); // doesn't matter.
        mds->send_message_client(stale, m->get_source_inst());

        // add to cap export list.
        mdcache->rejoin_export_caps(p->first, m->inode_path[p->first], from, p->second);
      } else {
        // mine.  fetch later.
        dout(0) << "missing " << p->first << " " << m->inode_path[p->first]
                << " (mine), will load later" << dendl;
        mdcache->rejoin_recovered_caps(p->first, m->inode_path[p->first], from, p->second,
                                       -1);  // "from" me.
      }
    }
  }

  // remove from gather set
  client_reconnect_gather.erase(from);
  if (client_reconnect_gather.empty()) reconnect_gather_finish();

  delete m;
}

/*
 * called by mdcache, late in rejoin (right before acks are sent)
 */
void Server::process_reconnected_caps()
{
  dout(10) << "process_reconnected_caps" << dendl;

  // adjust filelock state appropriately
  for (set<CInode*>::iterator p = reconnected_caps.begin();
       p != reconnected_caps.end();
       ++p) {
    CInode *in = *p;
    int issued = in->get_caps_issued();
    if (in->is_auth()) {
      // wr?
      if (issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) {
        if (issued & (CAP_FILE_RDCACHE|CAP_FILE_WRBUFFER)) {
          in->filelock.set_state(LOCK_LONER);
        } else {
          in->filelock.set_state(LOCK_MIXED);
        }
      }
    } else {
      // note that client should perform stale/reap cleanup during reconnect.
      assert((issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) == 0);  // ????
      if (in->filelock.is_xlocked())
        in->filelock.set_state(LOCK_LOCK);
      else
        in->filelock.set_state(LOCK_SYNC);  // might have been lock, previously
    }
    dout(15) << " issued " << cap_string(issued)
             << " chose " << in->filelock
             << " on " << *in << dendl;
  }
  reconnected_caps.clear();  // clean up
}
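
// The filelock state chosen above mirrors what the surviving caps imply: a
// writer that is also caching/buffering gets LONER, a plain writer gets
// MIXED, and non-auth replicas fall back to LOCK or SYNC so the normal lock
// protocol can take over after rejoin.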


void Server::client_reconnect_failure(int from)
{
  dout(5) << "client_reconnect_failure on client" << from << dendl;
  if (mds->is_reconnect() &&
      client_reconnect_gather.count(from)) {
    client_reconnect_gather.erase(from);
    if (client_reconnect_gather.empty())
      reconnect_gather_finish();
  }
}

void Server::reconnect_gather_finish()
{
  dout(7) << "reconnect_gather_finish" << dendl;
  mds->reconnect_done();
}



/*******
 * some generic stuff for finishing off requests
 */


/*
 * send generic response (just an error code)
 */
void Server::reply_request(MDRequest *mdr, int r, CInode *tracei)
{
  reply_request(mdr, new MClientReply(mdr->client_request, r), tracei);
}


/*
 * send given reply
 * include a trace to tracei
 */
void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei)
{
  MClientRequest *req = mdr->client_request;

  dout(10) << "reply_request " << reply->get_result()
           << " (" << strerror(-reply->get_result())
           << ") " << *req << dendl;

  // note result code in clientmap?
  if (!req->is_idempotent())
    mds->clientmap.add_completed_request(mdr->reqid);

  /*
  if (tracei && !tracei->hack_accessed) {
    tracei->hack_accessed = true;
    mds->logger->inc("newt");
    if (tracei->parent &&
        tracei->parent->dir->hack_num_accessed >= 0) {
      tracei->parent->dir->hack_num_accessed++;
      if (tracei->parent->dir->hack_num_accessed == 1)
        mds->logger->inc("dirt1");
      if (tracei->parent->dir->hack_num_accessed == 2)
        mds->logger->inc("dirt2");
      if (tracei->parent->dir->hack_num_accessed == 3)
        mds->logger->inc("dirt3");
      if (tracei->parent->dir->hack_num_accessed == 4)
        mds->logger->inc("dirt4");
      if (tracei->parent->dir->hack_num_accessed == 5)
        mds->logger->inc("dirt5");
    }
  }
  */

  // include trace
  if (tracei)
    reply->set_trace_dist( tracei, mds->get_nodeid() );

  reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());

  // send reply
  if (req->get_client_inst().name.is_mds())
    delete reply;   // mds doesn't need a reply
  else
    messenger->send_message(reply, req->get_client_inst());

  // finish request
  mdcache->request_finish(mdr);

  if (tracei &&
      tracei->get_parent_dn() &&
      tracei->get_parent_dn()->is_remote())
    mdcache->eval_remote(tracei->get_parent_dn());
}
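
// Non-idempotent requests are recorded in the clientmap via
// add_completed_request() so that if the client retries after losing the
// reply, handle_client_request() can detect the replay with
// have_completed_request() and answer without re-executing it; the client
// later trims that list with its oldest_client_tid.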
|
|
|
|
|
|
|
|
|
|
|
|
/***
|
|
* process a client request
|
|
*/
|
|
void Server::handle_client_request(MClientRequest *req)
|
|
{
|
|
dout(4) << "handle_client_request " << *req << dendl;
|
|
|
|
if (logger) logger->inc("hcreq");
|
|
|
|
if (!mds->is_active() &&
|
|
!(mds->is_stopping() && req->get_client_inst().name.is_mds())) {
|
|
dout(5) << " not active (or stopping+mds), discarding request." << dendl;
|
|
delete req;
|
|
return;
|
|
}
|
|
|
|
if (!mdcache->get_root()) {
|
|
dout(5) << "need to open root" << dendl;
|
|
mdcache->open_root(new C_MDS_RetryMessage(mds, req));
|
|
return;
|
|
}
|
|
|
|
// active session?
|
|
if (req->get_client_inst().name.is_client() &&
|
|
!mds->clientmap.have_session(req->get_client_inst().name.num())) {
|
|
dout(5) << "no session for " << req->get_client_inst().name << ", dropping" << dendl;
|
|
delete req;
|
|
return;
|
|
}
|
|
|
|
// old mdsmap?
|
|
if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
|
|
// send it? hrm, this isn't ideal; they may get a lot of copies if
|
|
// they have a high request rate.
|
|
}
|
|
|
|
// okay, i want
|
|
CInode *ref = 0;
|
|
|
|
// retry?
|
|
if (req->get_retry_attempt()) {
|
|
if (mds->clientmap.have_completed_request(req->get_reqid())) {
|
|
dout(5) << "already completed " << req->get_reqid() << dendl;
|
|
mds->messenger->send_message(new MClientReply(req, 0), req->get_client_inst());
|
|
delete req;
|
|
return;
|
|
}
|
|
}
|
|
// trim completed_request list
|
|
if (req->get_oldest_client_tid() > 0) {
|
|
dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
|
|
mds->clientmap.trim_completed_requests(req->get_client_inst().name,
|
|
req->get_oldest_client_tid());
|
|
}
|
|
|
|
|
|
// -----
|
|
// some ops are on ino's
|
|
switch (req->get_op()) {
|
|
case CEPH_MDS_OP_FSTAT:
|
|
ref = mdcache->get_inode(req->head.args.fstat.ino);
|
|
assert(ref);
|
|
break;
|
|
|
|
case CEPH_MDS_OP_TRUNCATE:
|
|
if (!req->head.args.truncate.ino)
|
|
break; // can be called w/ either fh OR path
|
|
ref = mdcache->get_inode(req->head.args.truncate.ino);
|
|
assert(ref);
|
|
break;
|
|
|
|
case CEPH_MDS_OP_FSYNC:
|
|
ref = mdcache->get_inode(req->head.args.fsync.ino); // fixme someday no ino needed?
|
|
assert(ref);
|
|
break;
|
|
}
|
|
|
|
// register + dispatch
|
|
MDRequest *mdr = mdcache->request_start(req);
|
|
if (!mdr) return;
|
|
|
|
if (ref) {
|
|
dout(10) << "inode op on ref " << *ref << dendl;
|
|
mdr->ref = ref;
|
|
mdr->pin(ref);
|
|
}
|
|
|
|
dispatch_client_request(mdr);
|
|
return;
|
|
}
|
|
|
|
|
|
void Server::dispatch_client_request(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
|
|
if (logger) logger->inc("dcreq");
|
|
|
|
if (mdr->ref) {
|
|
dout(7) << "dispatch_client_request " << *req << " ref " << *mdr->ref << dendl;
|
|
} else {
|
|
dout(7) << "dispatch_client_request " << *req << dendl;
|
|
}
|
|
|
|
// we shouldn't be waiting on anyone.
|
|
assert(mdr->more()->waiting_on_slave.empty());
|
|
|
|
switch (req->get_op()) {
|
|
|
|
// inodes ops.
|
|
case CEPH_MDS_OP_STAT:
|
|
case CEPH_MDS_OP_LSTAT:
|
|
handle_client_stat(mdr);
|
|
break;
|
|
case CEPH_MDS_OP_UTIME:
|
|
handle_client_utime(mdr);
|
|
break;
|
|
case CEPH_MDS_OP_CHMOD:
|
|
handle_client_chmod(mdr);
|
|
break;
|
|
case CEPH_MDS_OP_CHOWN:
|
|
handle_client_chown(mdr);
|
|
break;
|
|
case CEPH_MDS_OP_TRUNCATE:
|
|
handle_client_truncate(mdr);
|
|
break;
|
|
case CEPH_MDS_OP_READDIR:
|
|
handle_client_readdir(mdr);
|
|
break;
|
|
case CEPH_MDS_OP_FSYNC:
|
|
//handle_client_fsync(req, ref);
|
|
break;
|
|
|
|
// funky.
|
|
case CEPH_MDS_OP_OPEN:
|
|
if (req->head.args.open.flags & O_CREAT)
|
|
handle_client_openc(mdr);
|
|
else
|
|
handle_client_open(mdr);
|
|
break;
|
|
|
|
// namespace.
|
|
// no prior locks.
|
|
case CEPH_MDS_OP_MKNOD:
|
|
handle_client_mknod(mdr);
|
|
break;
|
|
case CEPH_MDS_OP_LINK:
|
|
handle_client_link(mdr);
|
|
break;
|
|
case CEPH_MDS_OP_UNLINK:
|
|
case CEPH_MDS_OP_RMDIR:
|
|
handle_client_unlink(mdr);
|
|
break;
|
|
case CEPH_MDS_OP_RENAME:
|
|
handle_client_rename(mdr);
|
|
break;
|
|
case CEPH_MDS_OP_MKDIR:
|
|
handle_client_mkdir(mdr);
|
|
break;
|
|
case CEPH_MDS_OP_SYMLINK:
|
|
handle_client_symlink(mdr);
|
|
break;
|
|
|
|
|
|
default:
|
|
dout(1) << " unknown client op " << req->get_op() << dendl;
|
|
assert(0);
|
|
}
|
|
}
|
|
|
|
|
|
// ---------------------------------------
|
|
// SLAVE REQUESTS
|
|
|
|
void Server::handle_slave_request(MMDSSlaveRequest *m)
|
|
{
|
|
dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
|
|
int from = m->get_source().num();
|
|
|
|
if (logger) logger->inc("hsreq");
|
|
|
|
// reply?
|
|
if (m->is_reply()) {
|
|
|
|
switch (m->get_op()) {
|
|
case MMDSSlaveRequest::OP_XLOCKACK:
|
|
{
|
|
// identify lock, master request
|
|
SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
|
|
m->get_object_info());
|
|
MDRequest *mdr = mdcache->request_get(m->get_reqid());
|
|
mdr->more()->slaves.insert(from);
|
|
dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
|
|
mdr->xlocks.insert(lock);
|
|
mdr->locks.insert(lock);
|
|
lock->get_xlock(mdr);
|
|
lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK);
|
|
}
|
|
break;
|
|
|
|
case MMDSSlaveRequest::OP_AUTHPINACK:
|
|
{
|
|
MDRequest *mdr = mdcache->request_get(m->get_reqid());
|
|
handle_slave_auth_pin_ack(mdr, m);
|
|
}
|
|
break;
|
|
|
|
case MMDSSlaveRequest::OP_LINKPREPACK:
|
|
{
|
|
MDRequest *mdr = mdcache->request_get(m->get_reqid());
|
|
handle_slave_link_prep_ack(mdr, m);
|
|
}
|
|
break;
|
|
|
|
case MMDSSlaveRequest::OP_RENAMEPREPACK:
|
|
{
|
|
MDRequest *mdr = mdcache->request_get(m->get_reqid());
|
|
handle_slave_rename_prep_ack(mdr, m);
|
|
}
|
|
break;
|
|
|
|
default:
|
|
assert(0);
|
|
}
|
|
|
|
// done with reply.
|
|
delete m;
|
|
return;
|
|
|
|
} else {
|
|
// am i a new slave?
|
|
MDRequest *mdr;
|
|
if (mdcache->have_request(m->get_reqid())) {
|
|
// existing?
|
|
mdr = mdcache->request_get(m->get_reqid());
|
|
if (mdr->slave_to_mds != from) { // may not even be a slave! (e.g. forward race)
|
|
dout(10) << "local request " << *mdr << " not slave to mds" << from
|
|
<< ", ignoring " << *m << dendl;
|
|
delete m;
|
|
return;
|
|
}
|
|
} else {
|
|
// new?
|
|
if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
|
|
dout(10) << "missing slave request for " << m->get_reqid()
|
|
<< " OP_FINISH, must have lost race with a forward" << dendl;
|
|
delete m;
|
|
return;
|
|
}
|
|
mdr = mdcache->request_start_slave(m->get_reqid(), m->get_source().num());
|
|
}
|
|
assert(mdr->slave_request == 0); // only one at a time, please!
|
|
mdr->slave_request = m;
|
|
|
|
dispatch_slave_request(mdr);
|
|
}
|
|
}
|
|
|
|
void Server::dispatch_slave_request(MDRequest *mdr)
|
|
{
|
|
dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
|
|
|
|
if (mdr->aborted) {
|
|
dout(7) << " abort flag set, finishing" << dendl;
|
|
mdcache->request_finish(mdr);
|
|
return;
|
|
}
|
|
|
|
if (logger) logger->inc("dsreq");
|
|
|
|
switch (mdr->slave_request->get_op()) {
|
|
case MMDSSlaveRequest::OP_XLOCK:
|
|
{
|
|
// identify object
|
|
SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
|
|
mdr->slave_request->get_object_info());
|
|
|
|
if (lock && lock->get_parent()->is_auth()) {
|
|
// xlock.
|
|
// use acquire_locks so that we get auth_pinning.
|
|
set<SimpleLock*> rdlocks;
|
|
set<SimpleLock*> wrlocks;
|
|
set<SimpleLock*> xlocks = mdr->xlocks;
|
|
xlocks.insert(lock);
|
|
|
|
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
|
|
return;
|
|
|
|
// ack
|
|
MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCKACK);
|
|
r->set_lock_type(lock->get_type());
|
|
lock->get_parent()->set_object_info(r->get_object_info());
|
|
mds->send_message_mds(r, mdr->slave_request->get_source().num());
|
|
} else {
|
|
if (lock) {
|
|
dout(10) << "not auth for remote xlock attempt, dropping on "
|
|
<< *lock << " on " << *lock->get_parent() << dendl;
|
|
} else {
|
|
dout(10) << "don't have object, dropping" << dendl;
|
|
assert(0); // can this happen, if we auth pinned properly.
|
|
}
|
|
}
|
|
|
|
// done.
|
|
delete mdr->slave_request;
|
|
mdr->slave_request = 0;
|
|
}
|
|
break;
|
|
|
|
case MMDSSlaveRequest::OP_UNXLOCK:
|
|
{
|
|
SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
|
|
mdr->slave_request->get_object_info());
|
|
assert(lock);
|
|
mds->locker->xlock_finish(lock, mdr);
|
|
|
|
// done. no ack necessary.
|
|
delete mdr->slave_request;
|
|
mdr->slave_request = 0;
|
|
}
|
|
break;
|
|
|
|
case MMDSSlaveRequest::OP_AUTHPIN:
|
|
handle_slave_auth_pin(mdr);
|
|
break;
|
|
|
|
case MMDSSlaveRequest::OP_LINKPREP:
|
|
case MMDSSlaveRequest::OP_UNLINKPREP:
|
|
handle_slave_link_prep(mdr);
|
|
break;
|
|
|
|
case MMDSSlaveRequest::OP_RENAMEPREP:
|
|
handle_slave_rename_prep(mdr);
|
|
break;
|
|
|
|
case MMDSSlaveRequest::OP_FINISH:
|
|
// finish off request.
|
|
mdcache->request_finish(mdr);
|
|
break;
|
|
|
|
default:
|
|
assert(0);
|
|
}
|
|
}
|
|
|
|
|
|
void Server::handle_slave_auth_pin(MDRequest *mdr)
|
|
{
|
|
dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
|
|
|
|
// build list of objects
|
|
list<MDSCacheObject*> objects;
|
|
bool fail = false;
|
|
|
|
for (list<MDSCacheObjectInfo>::iterator p = mdr->slave_request->get_authpins().begin();
|
|
p != mdr->slave_request->get_authpins().end();
|
|
++p) {
|
|
MDSCacheObject *object = mdcache->get_object(*p);
|
|
if (!object) {
|
|
dout(10) << " don't have " << *p << dendl;
|
|
fail = true;
|
|
break;
|
|
}
|
|
|
|
objects.push_back(object);
|
|
}
|
|
|
|
// can we auth pin them?
|
|
if (!fail) {
|
|
for (list<MDSCacheObject*>::iterator p = objects.begin();
|
|
p != objects.end();
|
|
++p) {
|
|
if (!(*p)->is_auth()) {
|
|
dout(10) << " not auth for " << **p << dendl;
|
|
fail = true;
|
|
break;
|
|
}
|
|
if (!mdr->is_auth_pinned(*p) &&
|
|
!(*p)->can_auth_pin()) {
|
|
// wait
|
|
dout(10) << " waiting for authpinnable on " << **p << dendl;
|
|
(*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
|
|
mdr->drop_local_auth_pins();
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
// auth pin!
|
|
if (fail) {
|
|
mdr->drop_local_auth_pins(); // just in case
|
|
} else {
|
|
for (list<MDSCacheObject*>::iterator p = objects.begin();
|
|
p != objects.end();
|
|
++p) {
|
|
dout(10) << "auth_pinning " << **p << dendl;
|
|
mdr->auth_pin(*p);
|
|
}
|
|
}
|
|
|
|
// ack!
|
|
MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_AUTHPINACK);
|
|
|
|
// return list of my auth_pins (if any)
|
|
for (set<MDSCacheObject*>::iterator p = mdr->auth_pins.begin();
|
|
p != mdr->auth_pins.end();
|
|
++p) {
|
|
MDSCacheObjectInfo info;
|
|
(*p)->set_object_info(info);
|
|
reply->get_authpins().push_back(info);
|
|
}
|
|
|
|
mds->send_message_mds(reply, mdr->slave_to_mds);
|
|
|
|
// clean up this request
|
|
delete mdr->slave_request;
|
|
mdr->slave_request = 0;
|
|
return;
|
|
}
|
|
|
|
void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
|
|
{
|
|
dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
|
|
int from = ack->get_source().num();
|
|
|
|
// added auth pins?
|
|
set<MDSCacheObject*> pinned;
|
|
for (list<MDSCacheObjectInfo>::iterator p = ack->get_authpins().begin();
|
|
p != ack->get_authpins().end();
|
|
++p) {
|
|
MDSCacheObject *object = mdcache->get_object(*p);
|
|
assert(object); // we pinned it
|
|
dout(10) << " remote has pinned " << *object << dendl;
|
|
if (!mdr->is_auth_pinned(object))
|
|
mdr->remote_auth_pins.insert(object);
|
|
pinned.insert(object);
|
|
}
|
|
|
|
// removed auth pins?
|
|
set<MDSCacheObject*>::iterator p = mdr->remote_auth_pins.begin();
|
|
while (p != mdr->remote_auth_pins.end()) {
|
|
if ((*p)->authority().first == from &&
|
|
pinned.count(*p) == 0) {
|
|
dout(10) << " remote has unpinned " << **p << dendl;
|
|
set<MDSCacheObject*>::iterator o = p;
|
|
++p;
|
|
mdr->remote_auth_pins.erase(o);
|
|
} else {
|
|
++p;
|
|
}
|
|
}
|
|
|
|
// note slave
|
|
mdr->more()->slaves.insert(from);
|
|
|
|
// clear from waiting list
|
|
assert(mdr->more()->waiting_on_slave.count(from));
|
|
mdr->more()->waiting_on_slave.erase(from);
|
|
|
|
// go again?
|
|
if (mdr->more()->waiting_on_slave.empty())
|
|
dispatch_client_request(mdr);
|
|
else
|
|
dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
|
|
}
|
|
|
|
|
|
// ---------------------------------------
|
|
// HELPERS
|
|
|
|
|
|
/** validate_dentry_dir
|
|
*
|
|
* verify that the dir exists and would own the dname.
|
|
* do not check if the dentry exists.
|
|
*/
|
|
CDir *Server::validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname)
|
|
{
|
|
// make sure parent is a dir?
|
|
if (!diri->is_dir()) {
|
|
dout(7) << "validate_dentry_dir: not a dir" << dendl;
|
|
reply_request(mdr, -ENOTDIR);
|
|
return 0;
|
|
}
|
|
|
|
// which dirfrag?
|
|
frag_t fg = diri->pick_dirfrag(dname);
|
|
CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
|
|
if (!dir)
|
|
return 0;
|
|
|
|
// frozen?
|
|
if (dir->is_frozen()) {
|
|
dout(7) << "dir is frozen " << *dir << dendl;
|
|
dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
|
|
return 0;
|
|
}
|
|
|
|
return dir;
|
|
}
|
|
|
|
|
|
/** prepare_null_dentry
|
|
* prepare a null (or existing) dentry in given dir.
|
|
* wait for any dn lock.
|
|
*/
|
|
CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist)
|
|
{
|
|
dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl;
|
|
assert(dir->is_auth());
|
|
|
|
// does it already exist?
|
|
CDentry *dn = dir->lookup(dname);
|
|
if (dn) {
|
|
if (dn->lock.is_xlocked_by_other(mdr)) {
|
|
dout(10) << "waiting on xlocked dentry " << *dn << dendl;
|
|
dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
|
|
return 0;
|
|
}
|
|
|
|
if (!dn->is_null()) {
|
|
// name already exists
|
|
dout(10) << "dentry " << dname << " exists in " << *dir << dendl;
|
|
if (!okexist) {
|
|
reply_request(mdr, -EEXIST);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
return dn;
|
|
}
|
|
|
|
// make sure dir is complete
|
|
if (!dir->is_complete()) {
|
|
dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
|
|
dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
|
|
return 0;
|
|
}
|
|
|
|
// create
|
|
dn = dir->add_null_dentry(dname);
|
|
dn->mark_new();
|
|
dout(10) << "prepare_null_dentry added " << *dn << dendl;
|
|
|
|
return dn;
|
|
}
|
|
|
|
|
|
/** prepare_new_inode
|
|
*
|
|
* create a new inode. set c/m/atime. hit dir pop.
|
|
*/
|
|
CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir)
|
|
{
|
|
CInode *in = mdcache->create_inode();
|
|
in->inode.uid = mdr->client_request->get_caller_uid();
|
|
in->inode.gid = mdr->client_request->get_caller_gid();
|
|
in->inode.ctime = in->inode.mtime = in->inode.atime = mdr->now; // now
|
|
dout(10) << "prepare_new_inode " << *in << dendl;
|
|
|
|
return in;
|
|
}
|
|
|
|
|
|
|
|
CDir *Server::traverse_to_auth_dir(MDRequest *mdr, vector<CDentry*> &trace, filepath refpath)
|
|
{
|
|
// figure parent dir vs dname
|
|
if (refpath.depth() == 0) {
|
|
dout(7) << "can't do that to root" << dendl;
|
|
reply_request(mdr, -EINVAL);
|
|
return 0;
|
|
}
|
|
string dname = refpath.last_dentry();
|
|
refpath.pop_dentry();
|
|
|
|
dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl;
|
|
|
|
// traverse to parent dir
|
|
int r = mdcache->path_traverse(mdr, mdr->client_request,
|
|
refpath, trace, true,
|
|
MDS_TRAVERSE_FORWARD);
|
|
if (r > 0) return 0; // delayed
|
|
if (r < 0) {
|
|
reply_request(mdr, r);
|
|
return 0;
|
|
}
|
|
|
|
// open inode
|
|
CInode *diri;
|
|
if (trace.empty())
|
|
diri = mdcache->get_inode(refpath.get_ino());
|
|
else
|
|
diri = mdcache->get_dentry_inode(trace[trace.size()-1], mdr);
|
|
if (!diri)
|
|
return 0; // opening inode.
|
|
|
|
// is it an auth dir?
|
|
CDir *dir = validate_dentry_dir(mdr, diri, dname);
|
|
if (!dir)
|
|
return 0; // forwarded or waiting for freeze
|
|
|
|
dout(10) << "traverse_to_auth_dir " << *dir << dendl;
|
|
return dir;
|
|
}
|
|
|
|
|
|
|
|
CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, bool want_auth)
|
|
{
|
|
// already got ref?
|
|
if (mdr->ref)
|
|
return mdr->ref;
|
|
|
|
MClientRequest *req = mdr->client_request;
|
|
|
|
// traverse
|
|
filepath refpath = req->get_filepath();
|
|
vector<CDentry*> trace;
|
|
int r = mdcache->path_traverse(mdr, req,
|
|
refpath,
|
|
trace, req->follow_trailing_symlink(),
|
|
MDS_TRAVERSE_FORWARD);
|
|
if (r > 0) return 0;  // delayed
|
|
if (r < 0) { // error
|
|
reply_request(mdr, r);
|
|
return 0;
|
|
}
|
|
|
|
// open ref inode
|
|
CInode *ref = 0;
|
|
if (trace.empty())
|
|
ref = mdcache->get_root();
|
|
else {
|
|
CDentry *dn = trace[trace.size()-1];
|
|
|
|
// if no inode (null or unattached remote), fw to dentry auth?
|
|
if (want_auth && !dn->is_auth() &&
|
|
(dn->is_null() ||
|
|
(dn->is_remote() && dn->inode))) {
|
|
if (dn->is_ambiguous_auth()) {
|
|
dout(10) << "waiting for single auth on " << *dn << dendl;
|
|
dn->dir->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
|
|
} else {
|
|
dout(10) << "fw to auth for " << *dn << dendl;
|
|
mdcache->request_forward(mdr, dn->authority().first);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
// open ref inode
|
|
ref = mdcache->get_dentry_inode(dn, mdr);
|
|
if (!ref) return 0;
|
|
}
|
|
dout(10) << "ref is " << *ref << dendl;
|
|
|
|
// fw to inode auth?
|
|
if (want_auth && !ref->is_auth()) {
|
|
if (ref->is_ambiguous_auth()) {
|
|
dout(10) << "waiting for single auth on " << *ref << dendl;
|
|
ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
|
|
} else {
|
|
dout(10) << "fw to auth for " << *ref << dendl;
|
|
mdcache->request_forward(mdr, ref->authority().first);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// auth_pin?
|
|
if (want_auth) {
|
|
if (ref->is_frozen()) {
|
|
dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
|
|
ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
|
|
return 0;
|
|
}
|
|
mdr->auth_pin(ref);
|
|
}
|
|
|
|
// lock the path
|
|
set<SimpleLock*> rdlocks, empty;
|
|
|
|
for (int i=0; i<(int)trace.size(); i++)
|
|
rdlocks.insert(&trace[i]->lock);
|
|
|
|
if (!mds->locker->acquire_locks(mdr, rdlocks, empty, empty))
|
|
return 0;
|
|
|
|
// set and pin ref
|
|
mdr->pin(ref);
|
|
mdr->ref = ref;
|
|
|
|
// save the locked trace.
|
|
mdr->trace.swap(trace);
|
|
|
|
return ref;
|
|
}
|
|
|
|
|
|
/** rdlock_path_xlock_dentry
|
|
* traverse path to the directory that could/would contain dentry.
|
|
* make sure i am auth for that dentry, forward as necessary.
|
|
* create null dentry in place (or use existing if okexist).
|
|
* get rdlocks on traversed dentries, xlock on new dentry.
|
|
*/
|
|
CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
|
|
vector<CDentry*> trace;
|
|
CDir *dir = traverse_to_auth_dir(mdr, trace, req->get_filepath());
|
|
if (!dir) return 0;
|
|
dout(10) << "rdlock_path_xlock_dentry dir " << *dir << dendl;
|
|
|
|
// make sure we can auth_pin (or have already authpinned) dir
|
|
if (dir->is_frozen()) {
|
|
dout(7) << "waiting for !frozen/authpinnable on " << *dir << dendl;
|
|
dir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
|
|
return 0;
|
|
}
|
|
|
|
// make a null dentry?
|
|
const string &dname = req->get_filepath().last_dentry();
|
|
CDentry *dn;
|
|
if (mustexist) {
|
|
dn = dir->lookup(dname);
|
|
|
|
// make sure dir is complete
|
|
if (!dn && !dir->is_complete()) {
|
|
dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl;
|
|
dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
|
|
return 0;
|
|
}
|
|
|
|
// readable?
|
|
if (dn && dn->lock.is_xlocked_by_other(mdr)) {
|
|
dout(10) << "waiting on xlocked dentry " << *dn << dendl;
|
|
dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
|
|
return 0;
|
|
}
|
|
|
|
// exists?
|
|
if (!dn || dn->is_null()) {
|
|
dout(7) << "dentry " << dname << " dne in " << *dir << dendl;
|
|
reply_request(mdr, -ENOENT);
|
|
return 0;
|
|
}
|
|
} else {
|
|
dn = prepare_null_dentry(mdr, dir, dname, okexist);
|
|
if (!dn)
|
|
return 0;
|
|
}
|
|
|
|
// -- lock --
|
|
set<SimpleLock*> rdlocks, wrlocks, xlocks;
|
|
|
|
for (int i=0; i<(int)trace.size(); i++)
|
|
rdlocks.insert(&trace[i]->lock);
|
|
if (dn->is_null()) {
|
|
xlocks.insert(&dn->lock); // new dn, xlock
|
|
wrlocks.insert(&dn->dir->inode->dirlock); // also, wrlock on dir mtime
|
|
} else
|
|
rdlocks.insert(&dn->lock); // existing dn, rdlock
|
|
|
|
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
|
|
return 0;
|
|
|
|
// save the locked trace.
|
|
mdr->trace.swap(trace);
|
|
|
|
return dn;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
* try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
|
|
*
|
|
* @diri base inode
|
|
* @fg the exact frag we want
|
|
* @mdr request
|
|
*/
|
|
CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr)
|
|
{
|
|
CDir *dir = diri->get_dirfrag(fg);
|
|
|
|
// not open and inode not mine?
|
|
if (!dir && !diri->is_auth()) {
|
|
int inauth = diri->authority().first;
|
|
dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds" << inauth << dendl;
|
|
mdcache->request_forward(mdr, inauth);
|
|
return 0;
|
|
}
|
|
|
|
// not open and inode frozen?
|
|
if (!dir && diri->is_frozen_dir()) {
|
|
dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
|
|
assert(diri->get_parent_dir());
|
|
diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
|
|
return 0;
|
|
}
|
|
|
|
// invent?
|
|
if (!dir)
|
|
dir = diri->get_or_open_dirfrag(mds->mdcache, fg);
|
|
|
|
// am i auth for the dirfrag?
|
|
if (!dir->is_auth()) {
|
|
int auth = dir->authority().first;
|
|
dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
|
|
<< ", fw to mds" << auth << dendl;
|
|
mdcache->request_forward(mdr, auth);
|
|
return 0;
|
|
}
|
|
|
|
return dir;
|
|
}
|
|
|
|
|
|
|
|
/** predirty_dn_diri
|
|
* predirty the directory inode for a new dentry, if it is auth (and not root)
|
|
* BUG: root inode doesn't get dirtied properly, currently. blech.
|
|
*/
|
|
version_t Server::predirty_dn_diri(MDRequest *mdr, CDentry *dn, EMetaBlob *blob)
|
|
{
|
|
version_t dirpv = 0;
|
|
CInode *diri = dn->dir->inode;
|
|
|
|
if (diri->is_base()) return 0;
|
|
|
|
if (diri->is_auth()) {
|
|
assert(mdr->wrlocks.count(&diri->dirlock));
|
|
|
|
dirpv = diri->pre_dirty();
|
|
dout(10) << "predirty_dn_diri ctime/mtime " << mdr->now << " pv " << dirpv << " on " << *diri << dendl;
|
|
|
|
// predirty+journal
|
|
inode_t *pi = diri->project_inode();
|
|
if (dirpv) pi->version = dirpv;
|
|
pi->ctime = pi->mtime = mdr->now;
|
|
blob->add_dir_context(diri->get_parent_dn()->get_dir());
|
|
blob->add_primary_dentry(diri->get_parent_dn(), true, 0, pi);
|
|
} else {
|
|
// journal the mtime change anyway.
|
|
inode_t *ji = blob->add_primary_dentry(diri->get_parent_dn(), true);
|
|
ji->ctime = ji->mtime = mdr->now;
|
|
|
|
dout(10) << "predirty_dn_diri (non-auth) ctime/mtime " << mdr->now << " on " << *diri << dendl;
|
|
|
|
blob->add_dirtied_inode_mtime(diri->ino(), mdr->now);
|
|
assert(mdr->ls);
|
|
mdr->ls->dirty_inode_mtimes.push_back(&diri->xlist_dirty_inode_mtime);
|
|
}
|
|
|
|
return dirpv;
|
|
}
|
|
|
|
/** dirty_dn_diri
|
|
* follow-up with actual dirty of inode after journal entry commits.
|
|
*/
|
|
void Server::dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv)
|
|
{
|
|
CInode *diri = dn->dir->inode;
|
|
|
|
if (diri->is_root()) return;
|
|
|
|
if (dirpv) {
|
|
// we journaled and predirtied.
|
|
assert(diri->is_auth() && !diri->is_root());
|
|
diri->pop_and_dirty_projected_inode(mdr->ls);
|
|
dout(10) << "dirty_dn_diri ctime/mtime " << mdr->now << " v " << diri->inode.version << " on " << *diri << dendl;
|
|
} else {
|
|
// dirlock scatterlock will propagate the update.
|
|
diri->inode.ctime = diri->inode.mtime = mdr->now;
|
|
diri->dirlock.set_updated();
|
|
dout(10) << "dirty_dn_diri (non-dirty) ctime/mtime " << mdr->now << " on " << *diri << dendl;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// ===============================================================================
|
|
// STAT
|
|
|
|
void Server::handle_client_stat(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
CInode *ref = rdlock_path_pin_ref(mdr, false);
|
|
if (!ref) return;
|
|
|
|
// which inode locks do I want?
|
|
/* note: this works because we include existing locks in our lists,
|
|
and because all new locks are on inodes and sort to the right of
|
|
the dentry rdlocks previously acquired by rdlock_path_pin_ref(). */
|
|
set<SimpleLock*> rdlocks = mdr->rdlocks;
|
|
set<SimpleLock*> wrlocks = mdr->wrlocks;
|
|
set<SimpleLock*> xlocks = mdr->xlocks;
|
|
|
|
int mask = req->head.args.stat.mask;
|
|
if (mask & STAT_MASK_LINK) rdlocks.insert(&ref->linklock);
|
|
if (mask & STAT_MASK_AUTH) rdlocks.insert(&ref->authlock);
|
|
if (ref->is_file() &&
|
|
mask & STAT_MASK_FILE) rdlocks.insert(&ref->filelock);
|
|
if (ref->is_dir() &&
|
|
mask & STAT_MASK_MTIME) rdlocks.insert(&ref->dirlock);
|
|
|
|
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
|
|
return;
|
|
|
|
mds->balancer->hit_inode(g_clock.now(), ref, META_POP_IRD,
|
|
mdr->client_request->get_client_inst().name.num());
|
|
|
|
// reply
|
|
dout(10) << "reply to stat on " << *req << dendl;
|
|
MClientReply *reply = new MClientReply(req);
|
|
reply_request(mdr, reply, ref);
|
|
}
|
|
|
|
|
|
|
|
|
|
// ===============================================================================
|
|
// INODE UPDATES
|
|
|
|
|
|
/*
|
|
* finisher for basic inode updates
|
|
*/
|
|
class C_MDS_inode_update_finish : public Context {
|
|
MDS *mds;
|
|
MDRequest *mdr;
|
|
CInode *in;
|
|
public:
|
|
C_MDS_inode_update_finish(MDS *m, MDRequest *r, CInode *i) :
|
|
mds(m), mdr(r), in(i) { }
|
|
void finish(int r) {
|
|
assert(r == 0);
|
|
|
|
// apply
|
|
in->pop_and_dirty_projected_inode(mdr->ls);
|
|
|
|
mds->balancer->hit_inode(mdr->now, in, META_POP_IWR);
|
|
|
|
// reply
|
|
MClientReply *reply = new MClientReply(mdr->client_request, 0);
|
|
reply->set_result(0);
|
|
mds->server->reply_request(mdr, reply, in);
|
|
}
|
|
};
|
|
|
|
|
|
// utime
|
|
|
|
void Server::handle_client_utime(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
CInode *cur = rdlock_path_pin_ref(mdr, true);
|
|
if (!cur) return;
|
|
|
|
if (cur->is_root()) {
|
|
reply_request(mdr, -EINVAL); // for now
|
|
return;
|
|
}
|
|
|
|
// xlock inode
|
|
set<SimpleLock*> rdlocks = mdr->rdlocks;
|
|
set<SimpleLock*> wrlocks = mdr->wrlocks;
|
|
set<SimpleLock*> xlocks = mdr->xlocks;
|
|
xlocks.insert(&cur->filelock);
|
|
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
|
|
return;
|
|
|
|
// project update
|
|
inode_t *pi = cur->project_inode();
|
|
pi->mtime = req->head.args.utime.mtime;
|
|
pi->atime = req->head.args.utime.atime;
|
|
pi->version = cur->pre_dirty();
|
|
pi->ctime = g_clock.real_now();
|
|
|
|
// log + wait
|
|
mdr->ls = mdlog->get_current_segment();
|
|
EUpdate *le = new EUpdate(mdlog, "utime");
|
|
le->metablob.add_client_req(req->get_reqid());
|
|
le->metablob.add_dir_context(cur->get_parent_dir());
|
|
le->metablob.add_primary_dentry(cur->parent, true, 0, pi);
|
|
|
|
mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur));
|
|
}
|
|
|
|
|
|
// chmod
|
|
|
|
void Server::handle_client_chmod(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
CInode *cur = rdlock_path_pin_ref(mdr, true);
|
|
if (!cur) return;
|
|
|
|
if (cur->is_root()) {
|
|
reply_request(mdr, -EINVAL); // for now
|
|
return;
|
|
}
|
|
|
|
// write
|
|
set<SimpleLock*> rdlocks = mdr->rdlocks;
|
|
set<SimpleLock*> wrlocks = mdr->wrlocks;
|
|
set<SimpleLock*> xlocks = mdr->xlocks;
|
|
xlocks.insert(&cur->authlock);
|
|
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
|
|
return;
|
|
|
|
// project update
|
|
inode_t *pi = cur->project_inode();
|
|
pi->mode =
|
|
(pi->mode & ~04777) |
|
|
(req->head.args.chmod.mode & 04777);
|
|
pi->version = cur->pre_dirty();
|
|
pi->ctime = g_clock.real_now();
|
|
|
|
// log + wait
|
|
mdr->ls = mdlog->get_current_segment();
|
|
EUpdate *le = new EUpdate(mdlog, "chmod");
|
|
le->metablob.add_client_req(req->get_reqid());
|
|
le->metablob.add_dir_context(cur->get_parent_dir());
|
|
le->metablob.add_primary_dentry(cur->parent, true, 0, pi);
|
|
|
|
mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur));
|
|
}
|
|
|
|
|
|
// chown
|
|
|
|
void Server::handle_client_chown(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
CInode *cur = rdlock_path_pin_ref(mdr, true);
|
|
if (!cur) return;
|
|
|
|
if (cur->is_root()) {
|
|
reply_request(mdr, -EINVAL); // for now
|
|
return;
|
|
}
|
|
|
|
// write
|
|
set<SimpleLock*> rdlocks = mdr->rdlocks;
|
|
set<SimpleLock*> wrlocks = mdr->wrlocks;
|
|
set<SimpleLock*> xlocks = mdr->xlocks;
|
|
xlocks.insert(&cur->authlock);
|
|
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
|
|
return;
|
|
|
|
// project update
|
|
inode_t *pi = cur->project_inode();
|
|
pi->uid = MAX(req->head.args.chown.uid, 0);
|
|
pi->gid = MAX(req->head.args.chown.gid, 0);
|
|
pi->version = cur->pre_dirty();
|
|
pi->ctime = g_clock.real_now();
|
|
|
|
// log + wait
|
|
mdr->ls = mdlog->get_current_segment();
|
|
EUpdate *le = new EUpdate(mdlog, "chown");
|
|
le->metablob.add_client_req(req->get_reqid());
|
|
le->metablob.add_dir_context(cur->get_parent_dir());
|
|
le->metablob.add_primary_dentry(cur->parent, true, 0, pi);
|
|
|
|
mdlog->submit_entry(le);
|
|
mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur));
|
|
}
|
|
|
|
|
|
|
|
|
|
// =================================================================
|
|
// DIRECTORY and NAMESPACE OPS
|
|
|
|
// READDIR
|
|
|
|
void Server::handle_client_readdir(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
CInode *diri = rdlock_path_pin_ref(mdr, false);
|
|
if (!diri) return;
|
|
|
|
// it's a directory, right?
|
|
if (!diri->is_dir()) {
|
|
// not a dir
|
|
dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
|
|
reply_request(mdr, -ENOTDIR, diri);
|
|
return;
|
|
}
|
|
|
|
// which frag?
|
|
frag_t fg = req->head.args.readdir.frag;
|
|
|
|
// does the frag exist?
|
|
if (diri->dirfragtree[fg] != fg) {
|
|
dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << dendl;
|
|
reply_request(mdr, -EAGAIN, diri);
|
|
return;
|
|
}
|
|
|
|
CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
|
|
if (!dir) return;
|
|
|
|
// ok!
|
|
assert(dir->is_auth());
|
|
|
|
// check perm
|
|
/*
|
|
if (!mds->locker->inode_hard_rdlock_start(diri, mdr))
|
|
return;
|
|
mds->locker->inode_hard_rdlock_finish(diri, mdr);
|
|
*/
|
|
|
|
if (!dir->is_complete()) {
|
|
// fetch
|
|
dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
|
|
dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
|
|
return;
|
|
}
|
|
|
|
// build dir contents
|
|
bufferlist dirbl, dnbl;
|
|
DirStat::_encode(dirbl, dir, mds->get_nodeid());
|
|
|
|
__u32 numfiles = 0;
|
|
for (CDir::map_t::iterator it = dir->begin();
|
|
it != dir->end();
|
|
it++) {
|
|
CDentry *dn = it->second;
|
|
if (dn->is_null()) continue;
|
|
|
|
CInode *in = dn->inode;
|
|
|
|
// remote link?
|
|
// better for the MDS to do the work, if we think the client will stat any of these files.
|
|
if (dn->is_remote() && !in) {
|
|
in = mdcache->get_inode(dn->get_remote_ino());
|
|
if (in) {
|
|
dn->link_remote(in);
|
|
} else {
|
|
mdcache->open_remote_ino(dn->get_remote_ino(),
|
|
mdr,
|
|
new C_MDS_RetryRequest(mdcache, mdr));
|
|
|
|
// touch everything i _do_ have
|
|
for (it = dir->begin();
|
|
it != dir->end();
|
|
it++)
|
|
if (!it->second->is_null())
|
|
mdcache->lru.lru_touch(it->second);
|
|
return;
|
|
}
|
|
}
|
|
assert(in);
|
|
|
|
dout(12) << "including inode " << *in << dendl;
|
|
|
|
// add this dentry + inodeinfo
|
|
::_encode(it->first, dnbl);
|
|
InodeStat::_encode(dnbl, in);
|
|
numfiles++;
|
|
|
|
// touch it
|
|
mdcache->lru.lru_touch(dn);
|
|
}
|
|
::_encode_simple(numfiles, dirbl);
|
|
dirbl.claim_append(dnbl);
|
|
|
|
// yay, reply
|
|
MClientReply *reply = new MClientReply(req);
|
|
reply->take_dir_items(dirbl);
|
|
|
|
dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << dendl;
|
|
reply->set_result(0);
|
|
|
|
// bump popularity. NOTE: this doesn't quite capture it.
|
|
mds->balancer->hit_dir(g_clock.now(), dir, META_POP_IRD, -1, numfiles);
|
|
|
|
// reply
|
|
reply_request(mdr, reply, diri);
|
|
}
|
|
|
|
|
|
|
|
// ------------------------------------------------
|
|
|
|
// MKNOD
|
|
|
|
class C_MDS_mknod_finish : public Context {
|
|
MDS *mds;
|
|
MDRequest *mdr;
|
|
CDentry *dn;
|
|
CInode *newi;
|
|
version_t dirpv;
|
|
version_t newdirpv;
|
|
public:
|
|
C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_, version_t newdirpv_=0) :
|
|
mds(m), mdr(r), dn(d), newi(ni),
|
|
dirpv(dirpv_), newdirpv(newdirpv_) {}
|
|
void finish(int r) {
|
|
assert(r == 0);
|
|
|
|
// link the inode
|
|
dn->get_dir()->link_primary_inode(dn, newi);
|
|
|
|
// dirty inode, dn, dir
|
|
newi->mark_dirty(newi->inode.version + 1, mdr->ls);
|
|
|
|
// mkdir?
|
|
if (newdirpv) {
|
|
CDir *dir = newi->get_dirfrag(frag_t());
|
|
assert(dir);
|
|
dir->mark_dirty(newdirpv, mdr->ls);
|
|
}
|
|
|
|
// dir inode's mtime
|
|
mds->server->dirty_dn_diri(mdr, dn, dirpv);
|
|
|
|
// hit pop
|
|
mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR);
|
|
//mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR);
|
|
|
|
// reply
|
|
MClientReply *reply = new MClientReply(mdr->client_request, 0);
|
|
reply->set_result(0);
|
|
mds->server->reply_request(mdr, reply, newi);
|
|
}
|
|
};
|
|
|
|
|
|
void Server::handle_client_mknod(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
|
|
CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false);
|
|
if (!dn) return;
|
|
|
|
mdr->now = g_clock.real_now();
|
|
CInode *newi = prepare_new_inode(mdr, dn->dir);
|
|
assert(newi);
|
|
|
|
// it's a file.
|
|
newi->inode.rdev = req->head.args.mknod.rdev;
|
|
newi->inode.mode = req->head.args.mknod.mode;
|
|
newi->inode.mode &= ~S_IFMT;
|
|
newi->inode.mode |= S_IFREG;
|
|
newi->inode.version = dn->pre_dirty() - 1;
|
|
|
|
// prepare finisher
|
|
mdr->ls = mdlog->get_current_segment();
|
|
EUpdate *le = new EUpdate(mdlog, "mknod");
|
|
le->metablob.add_client_req(req->get_reqid());
|
|
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
|
|
version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too
|
|
le->metablob.add_dir_context(dn->dir);
|
|
le->metablob.add_primary_dentry(dn, true, newi, &newi->inode);
|
|
|
|
// log + wait
|
|
mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv));
|
|
}
|
|
|
|
|
|
|
|
// MKDIR
|
|
|
|
void Server::handle_client_mkdir(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
|
|
CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false);
|
|
if (!dn) return;
|
|
|
|
// new inode
|
|
mdr->now = g_clock.real_now();
|
|
CInode *newi = prepare_new_inode(mdr, dn->dir);
|
|
assert(newi);
|
|
|
|
// it's a directory.
|
|
newi->inode.mode = req->head.args.mkdir.mode;
|
|
newi->inode.mode &= ~S_IFMT;
|
|
newi->inode.mode |= S_IFDIR;
|
|
newi->inode.layout = g_OSD_MDDirLayout;
|
|
newi->inode.version = dn->pre_dirty() - 1;
|
|
|
|
// ...and that new dir is empty.
|
|
CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t());
|
|
newdir->mark_complete();
|
|
version_t newdirpv = newdir->pre_dirty();
|
|
|
|
//if (mds->logger) mds->logger->inc("mkdir");
|
|
|
|
// prepare finisher
|
|
mdr->ls = mdlog->get_current_segment();
|
|
EUpdate *le = new EUpdate(mdlog, "mkdir");
|
|
le->metablob.add_client_req(req->get_reqid());
|
|
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
|
|
version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too
|
|
le->metablob.add_dir_context(dn->dir);
|
|
le->metablob.add_primary_dentry(dn, true, newi, &newi->inode);
|
|
le->metablob.add_dir(newdir, true, true); // dirty AND complete
|
|
|
|
// log + wait
|
|
mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv, newdirpv));
|
|
|
|
/* old export heuristic.  probably need to reimplement this at some point.
|
|
if (
|
|
diri->dir->is_auth() &&
|
|
diri->dir->is_rep() &&
|
|
newdir->is_auth() &&
|
|
!newdir->is_hashing()) {
|
|
int dest = rand() % mds->mdsmap->get_num_mds();
|
|
if (dest != whoami) {
|
|
dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << dendl;
|
|
mdcache->migrator->export_dir(newdir, dest);
|
|
}
|
|
}
|
|
*/
|
|
}
|
|
|
|
|
|
// SYMLINK
|
|
|
|
void Server::handle_client_symlink(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
|
|
CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false);
|
|
if (!dn) return;
|
|
|
|
mdr->now = g_clock.real_now();
|
|
|
|
CInode *newi = prepare_new_inode(mdr, dn->dir);
|
|
assert(newi);
|
|
|
|
// it's a symlink
|
|
newi->inode.mode &= ~S_IFMT;
|
|
newi->inode.mode |= S_IFLNK;
|
|
newi->symlink = req->get_path2();
|
|
newi->inode.version = dn->pre_dirty() - 1;
|
|
|
|
// prepare finisher
|
|
mdr->ls = mdlog->get_current_segment();
|
|
EUpdate *le = new EUpdate(mdlog, "symlink");
|
|
le->metablob.add_client_req(req->get_reqid());
|
|
le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version());
|
|
version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too
|
|
le->metablob.add_dir_context(dn->dir);
|
|
le->metablob.add_primary_dentry(dn, true, newi, &newi->inode);
|
|
|
|
// log + wait
|
|
mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv));
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// LINK
|
|
|
|
void Server::handle_client_link(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
|
|
dout(7) << "handle_client_link " << req->get_filepath()
|
|
<< " to " << req->get_filepath2()
|
|
<< dendl;
|
|
|
|
// traverse to dest dir, make sure it's ours.
|
|
const filepath &linkpath = req->get_filepath();
|
|
const string &dname = linkpath.last_dentry();
|
|
vector<CDentry*> linktrace;
|
|
CDir *dir = traverse_to_auth_dir(mdr, linktrace, linkpath);
|
|
if (!dir) return;
|
|
dout(7) << "handle_client_link link " << dname << " in " << *dir << dendl;
|
|
|
|
// traverse to link target
|
|
filepath targetpath = req->get_filepath2();
|
|
dout(7) << "handle_client_link discovering target " << targetpath << dendl;
|
|
vector<CDentry*> targettrace;
|
|
int r = mdcache->path_traverse(mdr, req,
|
|
targetpath, targettrace, false,
|
|
MDS_TRAVERSE_DISCOVER);
|
|
if (r > 0) return; // wait
|
|
if (targettrace.empty()) r = -EINVAL;
|
|
if (r < 0) {
|
|
reply_request(mdr, r);
|
|
return;
|
|
}
|
|
|
|
// identify target inode
|
|
CInode *targeti = targettrace[targettrace.size()-1]->inode;
|
|
assert(targeti);
|
|
|
|
// dir?
|
|
dout(7) << "target is " << *targeti << dendl;
|
|
if (targeti->is_dir()) {
|
|
dout(7) << "target is a dir, failing..." << dendl;
|
|
reply_request(mdr, -EINVAL);
|
|
return;
|
|
}
|
|
|
|
// get/make null link dentry
|
|
CDentry *dn = prepare_null_dentry(mdr, dir, dname, false);
|
|
if (!dn) return;
|
|
|
|
// create lock lists
|
|
set<SimpleLock*> rdlocks, wrlocks, xlocks;
|
|
|
|
for (int i=0; i<(int)linktrace.size(); i++)
|
|
rdlocks.insert(&linktrace[i]->lock);
|
|
xlocks.insert(&dn->lock);
|
|
wrlocks.insert(&dn->dir->inode->dirlock);
|
|
for (int i=0; i<(int)targettrace.size(); i++)
|
|
rdlocks.insert(&targettrace[i]->lock);
|
|
xlocks.insert(&targeti->linklock);
|
|
|
|
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
|
|
return;
|
|
|
|
mdr->done_locking = true; // avoid wrlock moving target issues.
|
|
|
|
// pick mtime
|
|
if (mdr->now == utime_t())
|
|
mdr->now = g_clock.real_now();
|
|
|
|
// does the target need an anchor?
|
|
if (targeti->is_auth()) {
|
|
/*if (targeti->get_parent_dir() == dn->dir) {
|
|
dout(7) << "target is in the same dirfrag, sweet" << dendl;
|
|
}
|
|
else
|
|
*/
|
|
if (targeti->is_anchored() && !targeti->is_unanchoring()) {
|
|
dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << dendl;
|
|
}
|
|
else {
|
|
dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << dendl;
|
|
|
|
mdcache->anchor_create(mdr, targeti,
|
|
new C_MDS_RetryRequest(mdcache, mdr));
|
|
return;
|
|
}
|
|
}
|
|
|
|
// go!
|
|
|
|
// local or remote?
|
|
if (targeti->is_auth())
|
|
_link_local(mdr, dn, targeti);
|
|
else
|
|
_link_remote(mdr, dn, targeti);
|
|
}

class C_MDS_link_local_finish : public Context {
  MDS *mds;
  MDRequest *mdr;
  CDentry *dn;
  CInode *targeti;
  version_t dnpv;
  version_t tipv;
  version_t dirpv;
public:
  C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti,
                          version_t dnpv_, version_t tipv_, version_t dirpv_) :
    mds(m), mdr(r), dn(d), targeti(ti),
    dnpv(dnpv_), tipv(tipv_), dirpv(dirpv_) { }
  void finish(int r) {
    assert(r == 0);
    mds->server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, dirpv);
  }
};


void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti)
{
  dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;

  mdr->ls = mdlog->get_current_segment();

  // predirty NEW dentry
  version_t dnpv = dn->pre_dirty();
  version_t tipv = targeti->pre_dirty();

  // project inode update
  inode_t *pi = targeti->project_inode();
  pi->nlink++;
  pi->ctime = mdr->now;
  pi->version = tipv;

  // log + wait
  EUpdate *le = new EUpdate(mdlog, "link_local");
  le->metablob.add_client_req(mdr->reqid);
  version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob);  // dir inode's mtime
  le->metablob.add_dir_context(dn->get_dir());
  le->metablob.add_remote_dentry(dn, true, targeti->ino());  // new remote
  le->metablob.add_dir_context(targeti->get_parent_dir());
  le->metablob.add_primary_dentry(targeti->parent, true, targeti, pi);  // update old primary

  mdlog->submit_entry(le, new C_MDS_link_local_finish(mds, mdr, dn, targeti, dnpv, tipv, dirpv));
}

void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti,
                                version_t dnpv, version_t tipv, version_t dirpv)
{
  dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;

  // link and unlock the NEW dentry
  dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode));
  dn->mark_dirty(dnpv, mdr->ls);

  // target inode
  targeti->pop_and_dirty_projected_inode(mdr->ls);

  // new dentry dir mtime
  dirty_dn_diri(mdr, dn, dirpv);

  // bump target popularity
  mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR);
  //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR);

  // reply
  MClientReply *reply = new MClientReply(mdr->client_request, 0);
  reply_request(mdr, reply, dn->get_dir()->get_inode());  // FIXME: imprecise ref
}
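
// Remote-link path: the link target lives on another MDS, so the target's auth
// must first journal an nlink++ prepare (OP_LINKPREP slave request) and ack it
// before this MDS journals the new remote dentry and replies; the slave-side
// prepare/commit/rollback handlers follow further down.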

// remote

class C_MDS_link_remote_finish : public Context {
  MDS *mds;
  MDRequest *mdr;
  CDentry *dn;
  CInode *targeti;
  version_t dpv;
  version_t dirpv;
public:
  C_MDS_link_remote_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, version_t dirpv_) :
    mds(m), mdr(r), dn(d), targeti(ti),
    dpv(d->get_projected_version()),
    dirpv(dirpv_) { }
  void finish(int r) {
    assert(r == 0);
    mds->server->_link_remote_finish(mdr, dn, targeti, dpv, dirpv);
  }
};

void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti)
{
  dout(10) << "_link_remote " << *dn << " to " << *targeti << dendl;

  // 1. send LinkPrepare to dest (journal nlink++ prepare)
  int linkauth = targeti->authority().first;
  if (mdr->more()->witnessed.count(linkauth) == 0) {
    dout(10) << " targeti auth must prepare nlink++" << dendl;

    MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREP);
    targeti->set_object_info(req->get_object_info());
    req->now = mdr->now;
    mds->send_message_mds(req, linkauth);

    assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
    mdr->more()->waiting_on_slave.insert(linkauth);
    return;
  }
  dout(10) << " targeti auth has prepared nlink++" << dendl;

  // go.
  // predirty dentry
  dn->pre_dirty();

  // add to event
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "link_remote");
  le->metablob.add_client_req(mdr->reqid);
  version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob);  // dir inode's mtime
  le->metablob.add_dir_context(dn->get_dir());
  le->metablob.add_remote_dentry(dn, true, targeti->ino());  // new remote

  // mark committing (needed for proper recovery)
  mdr->committing = true;

  // log + wait
  mdlog->submit_entry(le, new C_MDS_link_remote_finish(mds, mdr, dn, targeti, dirpv));
}

void Server::_link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti,
                                 version_t dpv, version_t dirpv)
{
  dout(10) << "_link_remote_finish " << *dn << " to " << *targeti << dendl;

  // link the new dentry
  dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode));
  dn->mark_dirty(dpv, mdr->ls);

  // dir inode's mtime
  dirty_dn_diri(mdr, dn, dirpv);

  // bump target popularity
  mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR);
  //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR);

  // reply
  MClientReply *reply = new MClientReply(mdr->client_request, 0);
  reply_request(mdr, reply, dn->get_dir()->get_inode());  // FIXME: imprecise ref
}
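
// Slave-side handlers: the target's auth MDS journals an ESlaveUpdate prepare
// (carrying a rollback blob), acks with OP_LINKPREPACK, and later either
// commits or rolls back the nlink/ctime change in _commit_slave_link once the
// master's outcome is known.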
|
|
|
|
|
|
// remote linking/unlinking
|
|
|
|
class C_MDS_SlaveLinkPrep : public Context {
|
|
Server *server;
|
|
MDRequest *mdr;
|
|
CInode *targeti;
|
|
utime_t old_ctime;
|
|
bool inc;
|
|
public:
|
|
C_MDS_SlaveLinkPrep(Server *s, MDRequest *r, CInode *t, utime_t oct, bool in) :
|
|
server(s), mdr(r), targeti(t), old_ctime(oct), inc(in) { }
|
|
void finish(int r) {
|
|
assert(r == 0);
|
|
server->_logged_slave_link(mdr, targeti, old_ctime, inc);
|
|
}
|
|
};
|
|
|
|
void Server::handle_slave_link_prep(MDRequest *mdr)
|
|
{
|
|
dout(10) << "handle_slave_link_prep " << *mdr
|
|
<< " on " << mdr->slave_request->get_object_info()
|
|
<< dendl;
|
|
|
|
CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
|
|
assert(targeti);
|
|
dout(10) << "targeti " << *targeti << dendl;
|
|
CDentry *dn = targeti->get_parent_dn();
|
|
assert(dn->is_primary());
|
|
|
|
mdr->now = mdr->slave_request->now;
|
|
|
|
// anchor?
|
|
if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
|
|
if (targeti->is_anchored() && !targeti->is_unanchoring()) {
|
|
dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << dendl;
|
|
}
|
|
else {
|
|
dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << dendl;
|
|
mdcache->anchor_create(mdr, targeti,
|
|
new C_MDS_RetryRequest(mdcache, mdr));
|
|
return;
|
|
}
|
|
}
|
|
|
|
// journal it
|
|
mdr->ls = mdlog->get_current_segment();
|
|
ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE);
|
|
|
|
inode_t *pi = dn->inode->project_inode();
|
|
|
|
// rollback case
|
|
le->rollback.add_dir_context(targeti->get_parent_dir());
|
|
le->rollback.add_primary_dentry(dn, true, targeti, pi); // update old primary
|
|
|
|
// update journaled target inode
|
|
bool inc;
|
|
if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
|
|
inc = true;
|
|
pi->nlink++;
|
|
} else {
|
|
inc = false;
|
|
pi->nlink--;
|
|
}
|
|
utime_t old_ctime = pi->ctime;
|
|
pi->ctime = mdr->now;
|
|
pi->version = targeti->pre_dirty();
|
|
|
|
dout(10) << " projected inode " << pi << " v " << pi->version << dendl;
|
|
|
|
// commit case
|
|
le->commit.add_dir_context(targeti->get_parent_dir());
|
|
le->commit.add_primary_dentry(dn, true, targeti, pi); // update old primary
|
|
|
|
mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc));
|
|
}
|
|
|
|
class C_MDS_SlaveLinkCommit : public Context {
|
|
Server *server;
|
|
MDRequest *mdr;
|
|
CInode *targeti;
|
|
utime_t old_ctime;
|
|
version_t old_version;
|
|
bool inc;
|
|
public:
|
|
C_MDS_SlaveLinkCommit(Server *s, MDRequest *r, CInode *t, utime_t oct, version_t ov, bool in) :
|
|
server(s), mdr(r), targeti(t), old_ctime(oct), old_version(ov), inc(in) { }
|
|
void finish(int r) {
|
|
server->_commit_slave_link(mdr, r, targeti,
|
|
old_ctime, old_version, inc);
|
|
}
|
|
};
|
|
|
|
void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_ctime, bool inc)
|
|
{
|
|
dout(10) << "_logged_slave_link " << *mdr
|
|
<< " inc=" << inc
|
|
<< " " << *targeti << dendl;
|
|
|
|
version_t old_version = targeti->inode.version;
|
|
|
|
// update the target
|
|
targeti->pop_and_dirty_projected_inode(mdr->ls);
|
|
|
|
// hit pop
|
|
mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR);
|
|
|
|
// ack
|
|
MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREPACK);
|
|
mds->send_message_mds(reply, mdr->slave_to_mds);
|
|
|
|
// set up commit waiter
|
|
mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti, old_ctime, old_version, inc);
|
|
|
|
// done.
|
|
delete mdr->slave_request;
|
|
mdr->slave_request = 0;
|
|
}
|
|
|
|
|
|
void Server::_commit_slave_link(MDRequest *mdr, int r, CInode *targeti,
|
|
utime_t old_ctime, version_t old_version, bool inc)
|
|
{
|
|
dout(10) << "_commit_slave_link " << *mdr
|
|
<< " r=" << r
|
|
<< " inc=" << inc
|
|
<< " " << *targeti << dendl;
|
|
|
|
ESlaveUpdate *le;
|
|
if (r == 0) {
|
|
// write a commit to the journal
|
|
le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT);
|
|
} else {
|
|
le = new ESlaveUpdate(mdlog, "slave_link_rollback", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK);
|
|
|
|
// -- rollback in memory --
|
|
assert(targeti->inode.ctime == mdr->now);
|
|
assert(targeti->projected_inode.empty()); // we're holding the version lock.
|
|
|
|
targeti->inode.ctime = old_ctime;
|
|
targeti->inode.version = old_version;
|
|
if (inc)
|
|
targeti->inode.nlink++;
|
|
else
|
|
targeti->inode.nlink--;
|
|
}
|
|
|
|
mdlog->submit_entry(le);
|
|
}
|
|
|
|
|
|
|
|
void Server::handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m)
|
|
{
|
|
dout(10) << "handle_slave_link_prep_ack " << *mdr
|
|
<< " " << *m << dendl;
|
|
int from = m->get_source().num();
|
|
|
|
// note slave
|
|
mdr->more()->slaves.insert(from);
|
|
|
|
// witnessed!
|
|
assert(mdr->more()->witnessed.count(from) == 0);
|
|
mdr->more()->witnessed.insert(from);
|
|
|
|
// remove from waiting list
|
|
assert(mdr->more()->waiting_on_slave.count(from));
|
|
mdr->more()->waiting_on_slave.erase(from);
|
|
|
|
assert(mdr->more()->waiting_on_slave.empty());
|
|
|
|
dispatch_client_request(mdr); // go again!
|
|
}
// UNLINK
|
|
|
|
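// Unlink/rmdir entry point: traverse to the dentry, forward to its auth MDS if
// necessary, distinguish rmdir from unlink, verify emptiness for directories,
// and then take the local (_unlink_local) or remote (_unlink_remote) path.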
void Server::handle_client_unlink(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
|
|
// traverse to path
|
|
vector<CDentry*> trace;
|
|
int r = mdcache->path_traverse(mdr, req,
|
|
req->get_filepath(), trace, false,
|
|
MDS_TRAVERSE_FORWARD);
|
|
if (r > 0) return;
|
|
if (trace.empty()) r = -EINVAL; // can't unlink root
|
|
if (r < 0) {
|
|
reply_request(mdr, r);
|
|
return;
|
|
}
|
|
|
|
CDentry *dn = trace[trace.size()-1];
|
|
assert(dn);
|
|
|
|
// is it my dentry?
|
|
if (!dn->is_auth()) {
|
|
// fw to auth
|
|
mdcache->request_forward(mdr, dn->authority().first);
|
|
return;
|
|
}
|
|
|
|
// rmdir or unlink?
|
|
bool rmdir = false;
|
|
if (req->get_op() == CEPH_MDS_OP_RMDIR) rmdir = true;
|
|
|
|
if (rmdir) {
|
|
dout(7) << "handle_client_rmdir on " << *dn << dendl;
|
|
} else {
|
|
dout(7) << "handle_client_unlink on " << *dn << dendl;
|
|
}
|
|
|
|
// readable?
|
|
if (dn->lock.is_xlocked_by_other(mdr)) {
|
|
dout(10) << "waiting on xlocked dentry " << *dn << dendl;
|
|
dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
|
|
return;
|
|
}
|
|
|
|
// dn looks ok.
|
|
|
|
// get/open inode.
|
|
mdr->trace.swap(trace);
|
|
CInode *in = mdcache->get_dentry_inode(dn, mdr);
|
|
if (!in) return;
|
|
dout(7) << "dn links to " << *in << dendl;
|
|
|
|
// rmdir vs is_dir
|
|
if (in->is_dir()) {
|
|
if (rmdir) {
|
|
// do empty directory checks
|
|
if (!_verify_rmdir(mdr, in))
|
|
return;
|
|
} else {
|
|
dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
|
|
reply_request(mdr, -EISDIR);
|
|
return;
|
|
}
|
|
} else {
|
|
if (rmdir) {
|
|
// unlink
|
|
dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
|
|
reply_request(mdr, -ENOTDIR);
|
|
return;
|
|
}
|
|
}
|
|
|
|
// lock
|
|
set<SimpleLock*> rdlocks, wrlocks, xlocks;
|
|
|
|
for (int i=0; i<(int)trace.size()-1; i++)
|
|
rdlocks.insert(&trace[i]->lock);
|
|
xlocks.insert(&dn->lock);
|
|
wrlocks.insert(&dn->dir->inode->dirlock);
|
|
xlocks.insert(&in->linklock);
|
|
|
|
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
|
|
return;
|
|
|
|
// yay!
|
|
mdr->done_locking = true; // avoid wrlock racing
|
|
if (mdr->now == utime_t())
|
|
mdr->now = g_clock.real_now();
|
|
|
|
// get stray dn ready?
|
|
CDentry *straydn = 0;
|
|
if (dn->is_primary()) {
|
|
straydn = mdcache->get_or_create_stray_dentry(dn->inode);
|
|
mdr->pin(straydn); // pin it.
|
|
dout(10) << " straydn is " << *straydn << dendl;
|
|
assert(straydn->is_null());
|
|
|
|
if (!mdr->more()->dst_reanchor_atid &&
|
|
dn->inode->is_anchored()) {
|
|
dout(10) << "reanchoring to stray " << *dn->inode << dendl;
|
|
vector<Anchor> trace;
|
|
straydn->make_anchor_trace(trace, dn->inode);
|
|
mds->anchorclient->prepare_update(dn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid,
|
|
new C_MDS_RetryRequest(mdcache, mdr));
|
|
return;
|
|
}
|
|
}
|
|
|
|
// ok!
|
|
if (dn->is_remote() && !dn->inode->is_auth())
|
|
_unlink_remote(mdr, dn);
|
|
else
|
|
_unlink_local(mdr, dn, straydn);
|
|
}
|
|
|
|
|
|
|
|
class C_MDS_unlink_local_finish : public Context {
|
|
MDS *mds;
|
|
MDRequest *mdr;
|
|
CDentry *dn;
|
|
CDentry *straydn;
|
|
version_t dnpv; // deleted dentry
|
|
version_t dirpv;
|
|
public:
|
|
C_MDS_unlink_local_finish(MDS *m, MDRequest *r, CDentry *d, CDentry *sd,
|
|
version_t dirpv_) :
|
|
mds(m), mdr(r), dn(d), straydn(sd),
|
|
dnpv(d->get_projected_version()), dirpv(dirpv_) { }
|
|
void finish(int r) {
|
|
assert(r == 0);
|
|
mds->server->_unlink_local_finish(mdr, dn, straydn, dnpv, dirpv);
|
|
}
|
|
};
|
|
|
|
|
|
void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn)
|
|
{
|
|
dout(10) << "_unlink_local " << *dn << dendl;
|
|
|
|
// ok, let's do it.
|
|
mdr->ls = mdlog->get_current_segment();
|
|
|
|
// prepare log entry
|
|
EUpdate *le = new EUpdate(mdlog, "unlink_local");
|
|
le->metablob.add_client_req(mdr->reqid);
|
|
|
|
version_t ipv = 0; // dirty inode version
|
|
inode_t *ji = 0; // journaled projected inode
|
|
if (dn->is_primary()) {
|
|
// primary link. add stray dentry.
|
|
assert(straydn);
|
|
ipv = straydn->pre_dirty(dn->inode->inode.version);
|
|
le->metablob.add_dir_context(straydn->dir);
|
|
ji = le->metablob.add_primary_dentry(straydn, true, dn->inode);
|
|
} else {
|
|
// remote link. update remote inode.
|
|
ipv = dn->inode->pre_dirty();
|
|
le->metablob.add_dir_context(dn->inode->get_parent_dir());
|
|
ji = le->metablob.add_primary_dentry(dn->inode->parent, true, dn->inode);
|
|
}
|
|
|
|
// update journaled target inode
|
|
inode_t *pi = dn->inode->project_inode();
|
|
pi->nlink--;
|
|
pi->ctime = mdr->now;
|
|
pi->version = ipv;
|
|
*ji = *pi; // copy into journal
|
|
|
|
// the unlinked dentry
|
|
dn->pre_dirty();
|
|
version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob);
|
|
le->metablob.add_dir_context(dn->get_dir());
|
|
le->metablob.add_null_dentry(dn, true);
|
|
|
|
if (mdr->more()->dst_reanchor_atid)
|
|
le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid);
|
|
|
|
// log + wait
|
|
journal_opens(); // journal pending opens, just in case
|
|
mdlog->submit_entry(le, new C_MDS_unlink_local_finish(mds, mdr, dn, straydn,
|
|
dirpv));
|
|
}
|
|
|
|
void Server::_unlink_local_finish(MDRequest *mdr,
|
|
CDentry *dn, CDentry *straydn,
|
|
version_t dnpv, version_t dirpv)
|
|
{
|
|
dout(10) << "_unlink_local_finish " << *dn << dendl;
|
|
|
|
// unlink main dentry
|
|
CInode *in = dn->inode;
|
|
dn->dir->unlink_inode(dn);
|
|
|
|
// relink as stray? (i.e. was primary link?)
|
|
if (straydn) {
|
|
dout(20) << " straydn is " << *straydn << dendl;
|
|
straydn->dir->link_primary_inode(straydn, in);
|
|
}
|
|
|
|
// nlink--, dirty old dentry
|
|
in->pop_and_dirty_projected_inode(mdr->ls);
|
|
dn->mark_dirty(dnpv, mdr->ls);
|
|
|
|
// dir inode's mtime
|
|
dirty_dn_diri(mdr, dn, dirpv);
|
|
|
|
// share unlink news with replicas
|
|
for (map<int,int>::iterator it = dn->replicas_begin();
|
|
it != dn->replicas_end();
|
|
it++) {
|
|
dout(7) << "_unlink_local_finish sending MDentryUnlink to mds" << it->first << dendl;
|
|
MDentryUnlink *unlink = new MDentryUnlink(dn->dir->dirfrag(), dn->name);
|
|
if (straydn) {
|
|
unlink->strayin = straydn->dir->inode->replicate_to(it->first);
|
|
unlink->straydir = straydn->dir->replicate_to(it->first);
|
|
unlink->straydn = straydn->replicate_to(it->first);
|
|
}
|
|
mds->send_message_mds(unlink, it->first);
|
|
}
|
|
|
|
// commit anchor update?
|
|
if (mdr->more()->dst_reanchor_atid)
|
|
mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls);
|
|
|
|
// bump pop
|
|
//mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR);
|
|
|
|
// reply
|
|
MClientReply *reply = new MClientReply(mdr->client_request, 0);
|
|
reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref
|
|
|
|
// clean up?
|
|
if (straydn)
|
|
mdcache->eval_stray(straydn);
|
|
|
|
// removing a new dn?
|
|
dn->dir->try_remove_unlinked_dn(dn);
|
|
}
|
|
|
|
|
|
class C_MDS_unlink_remote_finish : public Context {
|
|
MDS *mds;
|
|
MDRequest *mdr;
|
|
CDentry *dn;
|
|
version_t dnpv; // deleted dentry
|
|
version_t dirpv;
|
|
public:
|
|
C_MDS_unlink_remote_finish(MDS *m, MDRequest *r, CDentry *d,
|
|
version_t dirpv_) :
|
|
mds(m), mdr(r), dn(d),
|
|
dnpv(d->get_projected_version()), dirpv(dirpv_) { }
|
|
void finish(int r) {
|
|
assert(r == 0);
|
|
mds->server->_unlink_remote_finish(mdr, dn, dnpv, dirpv);
|
|
}
|
|
};
|
|
|
|
void Server::_unlink_remote(MDRequest *mdr, CDentry *dn)
|
|
{
|
|
dout(10) << "_unlink_remote " << *dn << " " << *dn->inode << dendl;
|
|
|
|
// 1. send LinkPrepare to dest (journal nlink-- prepare)
|
|
int inauth = dn->inode->authority().first;
|
|
if (mdr->more()->witnessed.count(inauth) == 0) {
|
|
dout(10) << " inode auth must prepare nlink--" << dendl;
|
|
|
|
MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNLINKPREP);
|
|
dn->inode->set_object_info(req->get_object_info());
|
|
req->now = mdr->now;
|
|
mds->send_message_mds(req, inauth);
|
|
|
|
assert(mdr->more()->waiting_on_slave.count(inauth) == 0);
|
|
mdr->more()->waiting_on_slave.insert(inauth);
|
|
return;
|
|
}
|
|
dout(10) << " inode auth has prepared nlink--" << dendl;
|
|
|
|
// ok, let's do it.
|
|
// prepare log entry
|
|
mdr->ls = mdlog->get_current_segment();
|
|
EUpdate *le = new EUpdate(mdlog, "unlink_remote");
|
|
le->metablob.add_client_req(mdr->reqid);
|
|
|
|
// the unlinked dentry
|
|
dn->pre_dirty();
|
|
version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob);
|
|
le->metablob.add_dir_context(dn->get_dir());
|
|
le->metablob.add_null_dentry(dn, true);
|
|
|
|
if (mdr->more()->dst_reanchor_atid)
|
|
le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid);
|
|
|
|
// finisher
|
|
C_MDS_unlink_remote_finish *fin = new C_MDS_unlink_remote_finish(mds, mdr, dn, dirpv);
|
|
|
|
journal_opens(); // journal pending opens, just in case
|
|
|
|
// mark committing (needed for proper recovery)
|
|
mdr->committing = true;
|
|
|
|
// log + wait
|
|
mdlog->submit_entry(le, fin);
|
|
}
|
|
|
|
void Server::_unlink_remote_finish(MDRequest *mdr,
|
|
CDentry *dn,
|
|
version_t dnpv, version_t dirpv)
|
|
{
|
|
dout(10) << "_unlink_remote_finish " << *dn << dendl;
|
|
|
|
// unlink main dentry
|
|
dn->dir->unlink_inode(dn);
|
|
dn->mark_dirty(dnpv, mdr->ls); // dirty old dentry
|
|
|
|
// dir inode's mtime
|
|
dirty_dn_diri(mdr, dn, dirpv);
|
|
|
|
// share unlink news with replicas
|
|
for (map<int,int>::iterator it = dn->replicas_begin();
|
|
it != dn->replicas_end();
|
|
it++) {
|
|
dout(7) << "_unlink_remote_finish sending MDentryUnlink to mds" << it->first << dendl;
|
|
MDentryUnlink *unlink = new MDentryUnlink(dn->dir->dirfrag(), dn->name);
|
|
mds->send_message_mds(unlink, it->first);
|
|
}
|
|
|
|
// commit anchor update?
|
|
if (mdr->more()->dst_reanchor_atid)
|
|
mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls);
|
|
|
|
//mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR);
|
|
|
|
// reply
|
|
MClientReply *reply = new MClientReply(mdr->client_request, 0);
|
|
reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref
|
|
|
|
// removing a new dn?
|
|
dn->dir->try_remove_unlinked_dn(dn);
|
|
}
|
|
|
|
|
|
|
|
|
|
/** _verify_rmdir
 *
 * verify that a directory is empty (i.e. we can rmdir it),
 * and make sure it is part of the same subtree (i.e. local)
 * so that rmdir will occur locally.
 *
 * @param in is the inode being rmdir'd.
 */
bool Server::_verify_rmdir(MDRequest *mdr, CInode *in)
{
  dout(10) << "_verify_rmdir " << *in << dendl;
  assert(in->is_auth());

  list<frag_t> frags;
  in->dirfragtree.get_leaves(frags);

  for (list<frag_t>::iterator p = frags.begin();
       p != frags.end();
       ++p) {
    CDir *dir = in->get_or_open_dirfrag(mdcache, *p);
    assert(dir);

    // dir looks empty but incomplete?
    if (dir->is_auth() &&
        dir->get_size() == 0 &&
        !dir->is_complete()) {
      dout(7) << "_verify_rmdir fetching incomplete dir " << *dir << dendl;
      dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
      return false;
    }

    // does the frag _look_ empty?
    if (dir->get_size()) {
      dout(10) << "_verify_rmdir still " << dir->get_size() << " items in frag " << *dir << dendl;
      reply_request(mdr, -ENOTEMPTY);
      return false;
    }

    // not dir auth?
    if (!dir->is_auth()) {
      dout(10) << "_verify_rmdir not auth for " << *dir << ", FIXME BUG" << dendl;
      reply_request(mdr, -ENOTEMPTY);
      return false;
    }
  }

  return true;
}
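
// Note: emptiness can only be verified for dirfrags this MDS is auth for; a
// non-auth frag currently fails the rmdir with -ENOTEMPTY (see the FIXME in
// the loop above).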

/*
  // export sanity check
  if (!in->is_auth()) {
    // i should be exporting this now/soon, since the dir is empty.
    dout(7) << "handle_client_rmdir dir is auth, but not inode." << dendl;
    mdcache->migrator->export_empty_import(in->dir);
    in->dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mds, req, diri));
    return;
  }
*/
|
|
|
|
|
|
|
|
|
|
// ======================================================
|
|
|
|
|
|
class C_MDS_rename_finish : public Context {
|
|
MDS *mds;
|
|
MDRequest *mdr;
|
|
CDentry *srcdn;
|
|
CDentry *destdn;
|
|
CDentry *straydn;
|
|
public:
|
|
C_MDS_rename_finish(MDS *m, MDRequest *r,
|
|
CDentry *sdn, CDentry *ddn, CDentry *stdn) :
|
|
mds(m), mdr(r),
|
|
srcdn(sdn), destdn(ddn), straydn(stdn) { }
|
|
void finish(int r) {
|
|
assert(r == 0);
|
|
mds->server->_rename_finish(mdr, srcdn, destdn, straydn);
|
|
}
|
|
};
|
|
|
|
|
|
/** handle_client_rename
|
|
*
|
|
*/
|
|
void Server::handle_client_rename(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
dout(7) << "handle_client_rename " << *req << dendl;
|
|
|
|
// traverse to dest dir (not dest)
|
|
// we do this FIRST, because the rename should occur on the
|
|
// destdn's auth.
|
|
const filepath &destpath = req->get_filepath2();
|
|
const string &destname = destpath.last_dentry();
|
|
vector<CDentry*> desttrace;
|
|
CDir *destdir = traverse_to_auth_dir(mdr, desttrace, destpath);
|
|
if (!destdir) return; // fw or error out
|
|
dout(10) << "dest will be " << destname << " in " << *destdir << dendl;
|
|
assert(destdir->is_auth());
|
|
|
|
// traverse to src
|
|
filepath srcpath = req->get_filepath();
|
|
vector<CDentry*> srctrace;
|
|
int r = mdcache->path_traverse(mdr, req,
|
|
srcpath, srctrace, false,
|
|
MDS_TRAVERSE_DISCOVER);
|
|
if (r > 0) return;
|
|
if (srctrace.empty()) r = -EINVAL; // can't rename root
|
|
if (r < 0) {
|
|
reply_request(mdr, r);
|
|
return;
|
|
}
|
|
CDentry *srcdn = srctrace[srctrace.size()-1];
|
|
dout(10) << " srcdn " << *srcdn << dendl;
|
|
CInode *srci = mdcache->get_dentry_inode(srcdn, mdr);
|
|
dout(10) << " srci " << *srci << dendl;
|
|
mdr->pin(srci);
|
|
|
|
// -- some sanity checks --
|
|
// src == dest?
|
|
if (srcdn->get_dir() == destdir && srcdn->name == destname) {
|
|
dout(7) << "rename src=dest, noop" << dendl;
|
|
reply_request(mdr, 0);
|
|
return;
|
|
}
|
|
|
|
// dest a child of src?
|
|
// e.g. mv /usr /usr/foo
|
|
CDentry *pdn = destdir->inode->parent;
|
|
while (pdn) {
|
|
if (pdn == srcdn) {
|
|
dout(7) << "cannot rename item to be a child of itself" << dendl;
|
|
reply_request(mdr, -EINVAL);
|
|
return;
|
|
}
|
|
pdn = pdn->dir->inode->parent;
|
|
}
|
|
|
|
|
|
// identify/create dest dentry
|
|
CDentry *destdn = destdir->lookup(destname);
|
|
if (destdn && destdn->lock.is_xlocked_by_other(mdr)) {
|
|
destdn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
|
|
return;
|
|
}
|
|
|
|
CInode *oldin = 0;
|
|
if (destdn && !destdn->is_null()) {
|
|
//dout(10) << "dest dn exists " << *destdn << dendl;
|
|
oldin = mdcache->get_dentry_inode(destdn, mdr);
|
|
if (!oldin) return;
|
|
dout(10) << " oldin " << *oldin << dendl;
|
|
|
|
// mv /some/thing /to/some/existing_other_thing
|
|
if (oldin->is_dir() && !srci->is_dir()) {
|
|
reply_request(mdr, -EISDIR);
|
|
return;
|
|
}
|
|
if (!oldin->is_dir() && srci->is_dir()) {
|
|
reply_request(mdr, -ENOTDIR);
|
|
return;
|
|
}
|
|
|
|
// non-empty dir?
|
|
if (oldin->is_dir() && !_verify_rmdir(mdr, oldin))
|
|
return;
|
|
}
|
|
if (!destdn) {
|
|
// mv /some/thing /to/some/non_existent_name
|
|
destdn = prepare_null_dentry(mdr, destdir, destname);
|
|
if (!destdn) return;
|
|
}
|
|
|
|
dout(10) << " destdn " << *destdn << dendl;
|
|
|
|
// -- locks --
|
|
set<SimpleLock*> rdlocks, wrlocks, xlocks;
|
|
|
|
// rdlock sourcedir path, xlock src dentry
|
|
for (int i=0; i<(int)srctrace.size()-1; i++)
|
|
rdlocks.insert(&srctrace[i]->lock);
|
|
xlocks.insert(&srcdn->lock);
|
|
wrlocks.insert(&srcdn->dir->inode->dirlock);
|
|
/*
|
|
* no, this causes problems if the dftlock is scattered...
|
|
* and what was i thinking anyway?
|
|
* rdlocks.insert(&srcdn->dir->inode->dirfragtreelock); // rd lock on srci dirfragtree.
|
|
*/
|
|
|
|
// rdlock destdir path, xlock dest dentry
|
|
for (int i=0; i<(int)desttrace.size(); i++)
|
|
rdlocks.insert(&desttrace[i]->lock);
|
|
xlocks.insert(&destdn->lock);
|
|
wrlocks.insert(&destdn->dir->inode->dirlock);
|
|
|
|
// xlock versionlock on srci if remote?
|
|
// this ensures it gets safely remotely auth_pinned, avoiding deadlock;
|
|
// strictly speaking, having the slave node freeze the inode is
|
|
// otherwise sufficient for avoiding conflicts with inode locks, etc.
|
|
if (!srcdn->is_auth() && srcdn->is_primary())
|
|
xlocks.insert(&srcdn->inode->versionlock);
|
|
|
|
// xlock oldin (for nlink--)
|
|
if (oldin) xlocks.insert(&oldin->linklock);
|
|
|
|
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
|
|
return;
|
|
|
|
// set done_locking flag, to avoid problems with wrlock moving auth target
|
|
mdr->done_locking = true;
|
|
|
|
// -- open all srcdn inode frags, if any --
|
|
// we need these open so that auth can properly delegate from inode to dirfrags
|
|
// after the inode is _ours_.
|
|
if (srcdn->is_primary() &&
|
|
!srcdn->is_auth() &&
|
|
srci->is_dir()) {
|
|
dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
|
|
mdr->set_stickydirs(srci);
|
|
|
|
list<frag_t> frags;
|
|
srci->dirfragtree.get_leaves(frags);
|
|
for (list<frag_t>::iterator p = frags.begin();
|
|
p != frags.end();
|
|
++p) {
|
|
CDir *dir = srci->get_dirfrag(*p);
|
|
if (!dir) {
|
|
dout(10) << " opening " << *p << " under " << *srci << dendl;
|
|
mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr));
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
// -- declare now --
|
|
if (mdr->now == utime_t())
|
|
mdr->now = g_clock.real_now();
|
|
|
|
// -- create stray dentry? --
|
|
CDentry *straydn = 0;
|
|
if (destdn->is_primary()) {
|
|
straydn = mdcache->get_or_create_stray_dentry(destdn->inode);
|
|
mdr->pin(straydn);
|
|
dout(10) << "straydn is " << *straydn << dendl;
|
|
}
|
|
|
|
// -- prepare witnesses --
|
|
/*
|
|
* NOTE: we use _all_ replicas as witnesses.
|
|
* this probably isn't totally necessary (esp for file renames),
|
|
* but if/when we change that, we have to make sure rejoin is
|
|
* sufficiently robust to handle strong rejoins from survivors
|
|
* with totally wrong dentry->inode linkage.
|
|
* (currently, it can ignore rename effects, because the resolve
|
|
* stage will sort them out.)
|
|
*/
|
|
set<int> witnesses = mdr->more()->extra_witnesses;
|
|
if (srcdn->is_auth())
|
|
srcdn->list_replicas(witnesses);
|
|
else
|
|
witnesses.insert(srcdn->authority().first);
|
|
destdn->list_replicas(witnesses);
|
|
dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
|
|
|
|
// do srcdn auth last
|
|
int last = -1;
|
|
if (!srcdn->is_auth())
|
|
last = srcdn->authority().first;
|
|
|
|
for (set<int>::iterator p = witnesses.begin();
|
|
p != witnesses.end();
|
|
++p) {
|
|
if (*p == last) continue; // do it last!
|
|
if (mdr->more()->witnessed.count(*p)) {
|
|
dout(10) << " already witnessed by mds" << *p << dendl;
|
|
} else if (mdr->more()->waiting_on_slave.count(*p)) {
|
|
dout(10) << " already waiting on witness mds" << *p << dendl;
|
|
} else {
|
|
_rename_prepare_witness(mdr, *p, srcdn, destdn, straydn);
|
|
}
|
|
}
|
|
if (!mdr->more()->waiting_on_slave.empty())
|
|
return; // we're waiting for a witness.
|
|
|
|
if (last >= 0 &&
|
|
mdr->more()->witnessed.count(last) == 0 &&
|
|
mdr->more()->waiting_on_slave.count(last) == 0) {
|
|
dout(10) << " preparing last witness (srcdn auth)" << dendl;
|
|
_rename_prepare_witness(mdr, last, srcdn, destdn, straydn);
|
|
return;
|
|
}
|
|
|
|
// -- prepare anchor updates --
|
|
bool linkmerge = (srcdn->inode == destdn->inode &&
|
|
(srcdn->is_primary() || destdn->is_primary()));
|
|
|
|
if (!linkmerge) {
|
|
C_Gather *anchorgather = 0;
|
|
|
|
if (srcdn->is_primary() && srcdn->inode->is_anchored() &&
|
|
srcdn->dir != destdn->dir &&
|
|
!mdr->more()->src_reanchor_atid) {
|
|
dout(10) << "reanchoring src->dst " << *srcdn->inode << dendl;
|
|
vector<Anchor> trace;
|
|
destdn->make_anchor_trace(trace, srcdn->inode);
|
|
|
|
anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr));
|
|
mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &mdr->more()->src_reanchor_atid,
|
|
anchorgather->new_sub());
|
|
}
|
|
if (destdn->is_primary() &&
|
|
destdn->inode->is_anchored() &&
|
|
!mdr->more()->dst_reanchor_atid) {
|
|
dout(10) << "reanchoring dst->stray " << *destdn->inode << dendl;
|
|
|
|
assert(straydn);
|
|
vector<Anchor> trace;
|
|
straydn->make_anchor_trace(trace, destdn->inode);
|
|
|
|
if (!anchorgather)
|
|
anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr));
|
|
mds->anchorclient->prepare_update(destdn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid,
|
|
anchorgather->new_sub());
|
|
}
|
|
|
|
if (anchorgather)
|
|
return; // waiting for anchor prepares
|
|
}
|
|
|
|
// -- prepare journal entry --
|
|
mdr->ls = mdlog->get_current_segment();
|
|
EUpdate *le = new EUpdate(mdlog, "rename");
|
|
le->metablob.add_client_req(mdr->reqid);
|
|
|
|
_rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
|
|
|
|
if (!srcdn->is_auth() && srcdn->is_primary()) {
|
|
// importing inode; also journal imported client map
|
|
|
|
// ** DER FIXME **
|
|
}
|
|
|
|
// -- commit locally --
|
|
C_MDS_rename_finish *fin = new C_MDS_rename_finish(mds, mdr, srcdn, destdn, straydn);
|
|
|
|
journal_opens(); // journal pending opens, just in case
|
|
|
|
// mark committing (needed for proper recovery)
|
|
mdr->committing = true;
|
|
|
|
// log + wait
|
|
mdlog->submit_entry(le, fin);
|
|
}
|
|
|
|
|
|
void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
|
|
{
|
|
dout(10) << "_rename_finish " << *mdr << dendl;
|
|
|
|
// apply
|
|
_rename_apply(mdr, srcdn, destdn, straydn);
|
|
|
|
// commit anchor updates?
|
|
if (mdr->more()->src_reanchor_atid)
|
|
mds->anchorclient->commit(mdr->more()->src_reanchor_atid, mdr->ls);
|
|
if (mdr->more()->dst_reanchor_atid)
|
|
mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls);
|
|
|
|
// bump popularity
|
|
//if (srcdn->is_auth())
|
|
//mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR);
|
|
// mds->balancer->hit_dir(mdr->now, destdn->get_dir(), META_POP_DWR);
|
|
if (destdn->is_remote() &&
|
|
destdn->inode->is_auth())
|
|
mds->balancer->hit_inode(mdr->now, destdn->get_inode(), META_POP_IWR);
|
|
|
|
// reply
|
|
MClientReply *reply = new MClientReply(mdr->client_request, 0);
|
|
reply_request(mdr, reply, destdn->get_inode()); // FIXME: imprecise ref
|
|
|
|
// clean up?
|
|
if (straydn)
|
|
mdcache->eval_stray(straydn);
|
|
}
|
|
|
|
|
|
|
|
// helpers
|
|
|
|
void Server::_rename_prepare_witness(MDRequest *mdr, int who, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
|
|
{
|
|
dout(10) << "_rename_prepare_witness mds" << who << dendl;
|
|
MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP);
|
|
srcdn->make_path(req->srcdnpath);
|
|
destdn->make_path(req->destdnpath);
|
|
req->now = mdr->now;
|
|
|
|
if (straydn) {
|
|
CInodeDiscover *indis = straydn->dir->inode->replicate_to(who);
|
|
CDirDiscover *dirdis = straydn->dir->replicate_to(who);
|
|
CDentryDiscover *dndis = straydn->replicate_to(who);
|
|
indis->_encode(req->stray);
|
|
dirdis->_encode(req->stray);
|
|
dndis->_encode(req->stray);
|
|
delete indis;
|
|
delete dirdis;
|
|
delete dndis;
|
|
}
|
|
|
|
// srcdn auth will verify our current witness list is sufficient
|
|
req->witnesses = mdr->more()->witnessed;
|
|
|
|
mds->send_message_mds(req, who);
|
|
|
|
assert(mdr->more()->waiting_on_slave.count(who) == 0);
|
|
mdr->more()->waiting_on_slave.insert(who);
|
|
}
|
|
|
|
|
|
void Server::_rename_prepare(MDRequest *mdr,
|
|
EMetaBlob *metablob, bufferlist *client_map_bl,
|
|
CDentry *srcdn, CDentry *destdn, CDentry *straydn)
|
|
{
|
|
dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
|
|
|
|
// primary+remote link merge?
|
|
bool linkmerge = (srcdn->inode == destdn->inode &&
|
|
(srcdn->is_primary() || destdn->is_primary()));
|
|
|
|
if (mdr->is_master()) {
|
|
mdr->more()->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob);
|
|
if (destdn->dir != srcdn->dir)
|
|
mdr->more()->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob);
|
|
}
|
|
|
|
inode_t *ji = 0; // journaled inode getting nlink--
|
|
version_t ipv = 0; // its version
|
|
|
|
if (linkmerge) {
|
|
dout(10) << "will merge remote+primary links" << dendl;
|
|
|
|
// destdn -> primary
|
|
metablob->add_dir_context(destdn->dir);
|
|
if (destdn->is_auth())
|
|
ipv = mdr->more()->pvmap[destdn] = destdn->pre_dirty(destdn->inode->inode.version);
|
|
ji = metablob->add_primary_dentry(destdn, true, destdn->inode);
|
|
|
|
// do src dentry
|
|
metablob->add_dir_context(srcdn->dir);
|
|
if (srcdn->is_auth())
|
|
mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
|
|
metablob->add_null_dentry(srcdn, true);
|
|
|
|
} else {
|
|
// move to stray?
|
|
if (destdn->is_primary()) {
|
|
// primary. we'll move inode to stray dir.
|
|
assert(straydn);
|
|
|
|
// link-- inode, move to stray dir.
|
|
metablob->add_dir_context(straydn->dir);
|
|
if (straydn->is_auth())
|
|
ipv = mdr->more()->pvmap[straydn] = straydn->pre_dirty(destdn->inode->inode.version);
|
|
ji = metablob->add_primary_dentry(straydn, true, destdn->inode);
|
|
}
|
|
else if (destdn->is_remote()) {
|
|
// remote.
|
|
// nlink-- targeti
|
|
metablob->add_dir_context(destdn->inode->get_parent_dir());
|
|
if (destdn->inode->is_auth())
|
|
ipv = mdr->more()->pvmap[destdn->inode] = destdn->inode->pre_dirty();
|
|
ji = metablob->add_primary_dentry(destdn->inode->parent, true, destdn->inode); // update primary
|
|
dout(10) << "remote targeti (nlink--) is " << *destdn->inode << dendl;
|
|
}
|
|
else {
|
|
assert(destdn->is_null());
|
|
}
|
|
|
|
// add dest dentry
|
|
metablob->add_dir_context(destdn->dir);
|
|
if (srcdn->is_primary()) {
|
|
dout(10) << "src is a primary dentry" << dendl;
|
|
if (destdn->is_auth()) {
|
|
version_t siv;
|
|
if (srcdn->is_auth())
|
|
siv = srcdn->inode->get_projected_version();
|
|
else {
|
|
siv = mdr->more()->inode_import_v;
|
|
|
|
/* import node */
|
|
bufferlist::iterator blp = mdr->more()->inode_import.begin();
|
|
|
|
// imported caps
|
|
::_decode_simple(mdr->more()->imported_client_map, blp);
|
|
::_encode_simple(mdr->more()->imported_client_map, *client_map_bl);
|
|
prepare_force_open_sessions(mdr->more()->imported_client_map);
|
|
|
|
list<ScatterLock*> updated_scatterlocks; // we clear_updated explicitly below
|
|
|
|
mdcache->migrator->decode_import_inode(srcdn, blp,
|
|
srcdn->authority().first,
|
|
mdr->ls,
|
|
mdr->more()->cap_imports, updated_scatterlocks);
|
|
srcdn->inode->dirlock.clear_updated();
|
|
|
|
|
|
// hack: force back to !auth and clean, temporarily
|
|
srcdn->inode->state_clear(CInode::STATE_AUTH);
|
|
srcdn->inode->mark_clean();
|
|
}
|
|
mdr->more()->pvmap[destdn] = destdn->pre_dirty(siv+1);
|
|
}
|
|
metablob->add_primary_dentry(destdn, true, srcdn->inode);
|
|
|
|
} else {
|
|
assert(srcdn->is_remote());
|
|
dout(10) << "src is a remote dentry" << dendl;
|
|
if (destdn->is_auth())
|
|
mdr->more()->pvmap[destdn] = destdn->pre_dirty();
|
|
metablob->add_remote_dentry(destdn, true, srcdn->get_remote_ino());
|
|
}
|
|
|
|
// remove src dentry
|
|
metablob->add_dir_context(srcdn->dir);
|
|
if (srcdn->is_auth())
|
|
mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
|
|
metablob->add_null_dentry(srcdn, true);
|
|
|
|
// new subtree?
|
|
if (srcdn->is_primary() &&
|
|
srcdn->inode->is_dir()) {
|
|
list<CDir*> ls;
|
|
srcdn->inode->get_nested_dirfrags(ls);
|
|
int auth = srcdn->authority().first;
|
|
for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p)
|
|
mdcache->adjust_subtree_auth(*p, auth, auth);
|
|
}
|
|
}
|
|
|
|
if (ipv) {
|
|
// update journaled target inode
|
|
inode_t *pi = destdn->inode->project_inode();
|
|
pi->nlink--;
|
|
pi->ctime = mdr->now;
|
|
pi->version = ipv;
|
|
*ji = *pi; // copy into journal
|
|
}
|
|
|
|
// anchor updates?
|
|
if (mdr->more()->src_reanchor_atid)
|
|
metablob->add_anchor_transaction(mdr->more()->src_reanchor_atid);
|
|
if (mdr->more()->dst_reanchor_atid)
|
|
metablob->add_anchor_transaction(mdr->more()->dst_reanchor_atid);
|
|
}
|
|
|
|
|
|
void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
|
|
{
|
|
dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
|
|
dout(10) << " pvs " << mdr->more()->pvmap << dendl;
|
|
|
|
CInode *oldin = destdn->inode;
|
|
|
|
// primary+remote link merge?
|
|
bool linkmerge = (srcdn->inode == destdn->inode &&
|
|
(srcdn->is_primary() || destdn->is_primary()));
|
|
|
|
// dir mtimes
|
|
if (mdr->is_master()) {
|
|
dirty_dn_diri(mdr, destdn, mdr->more()->pvmap[destdn->dir->inode]);
|
|
if (destdn->dir != srcdn->dir)
|
|
dirty_dn_diri(mdr, srcdn, mdr->more()->pvmap[srcdn->dir->inode]);
|
|
}
|
|
|
|
if (linkmerge) {
|
|
if (destdn->is_primary()) {
|
|
dout(10) << "merging remote onto primary link" << dendl;
|
|
|
|
// nlink-- in place
|
|
destdn->inode->inode.nlink--;
|
|
destdn->inode->inode.ctime = mdr->now;
|
|
if (destdn->inode->is_auth())
|
|
destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
|
|
|
|
// unlink srcdn
|
|
srcdn->dir->unlink_inode(srcdn);
|
|
if (srcdn->is_auth())
|
|
srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
|
|
} else {
|
|
dout(10) << "merging primary onto remote link" << dendl;
|
|
assert(srcdn->is_primary());
|
|
|
|
// move inode to dest
|
|
srcdn->dir->unlink_inode(srcdn);
|
|
destdn->dir->unlink_inode(destdn);
|
|
destdn->dir->link_primary_inode(destdn, oldin);
|
|
|
|
// nlink--
|
|
destdn->inode->inode.nlink--;
|
|
destdn->inode->inode.ctime = mdr->now;
|
|
if (destdn->inode->is_auth())
|
|
destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
|
|
|
|
// mark src dirty
|
|
if (srcdn->is_auth())
|
|
srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
|
|
}
|
|
}
|
|
else {
|
|
// unlink destdn?
|
|
if (!destdn->is_null())
|
|
destdn->dir->unlink_inode(destdn);
|
|
|
|
if (straydn) {
|
|
dout(10) << "straydn is " << *straydn << dendl;
|
|
|
|
// relink oldin to stray dir. destdn was primary.
|
|
assert(oldin);
|
|
straydn->dir->link_primary_inode(straydn, oldin);
|
|
//assert(straypv == ipv);
|
|
|
|
// nlink-- in stray dir.
|
|
oldin->inode.nlink--;
|
|
oldin->inode.ctime = mdr->now;
|
|
if (oldin->is_auth())
|
|
oldin->pop_and_dirty_projected_inode(mdr->ls);
|
|
}
|
|
else if (oldin) {
|
|
// nlink-- remote. destdn was remote.
|
|
oldin->inode.nlink--;
|
|
oldin->inode.ctime = mdr->now;
|
|
if (oldin->is_auth())
|
|
oldin->pop_and_dirty_projected_inode(mdr->ls);
|
|
}
|
|
|
|
CInode *in = srcdn->inode;
|
|
assert(in);
|
|
if (srcdn->is_remote()) {
|
|
// srcdn was remote.
|
|
srcdn->dir->unlink_inode(srcdn);
|
|
destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode));
|
|
destdn->link_remote(in);
|
|
if (destdn->is_auth())
|
|
destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
|
|
} else {
|
|
// srcdn was primary.
|
|
srcdn->dir->unlink_inode(srcdn);
|
|
destdn->dir->link_primary_inode(destdn, in);
|
|
|
|
// srcdn inode import?
|
|
if (!srcdn->is_auth() && destdn->is_auth()) {
|
|
assert(mdr->more()->inode_import.length() > 0);
|
|
|
|
// finish cap imports
|
|
finish_force_open_sessions(mdr->more()->imported_client_map);
|
|
if (mdr->more()->cap_imports.count(destdn->inode))
|
|
mds->mdcache->migrator->finish_import_inode_caps(destdn->inode, srcdn->authority().first,
|
|
mdr->more()->cap_imports[destdn->inode]);
|
|
|
|
// hack: fix auth bit
|
|
destdn->inode->state_set(CInode::STATE_AUTH);
|
|
}
|
|
if (destdn->inode->is_auth())
|
|
destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
|
|
}
|
|
|
|
if (srcdn->is_auth())
|
|
srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
|
|
}
|
|
|
|
// update subtree map?
|
|
if (destdn->is_primary() && destdn->inode->is_dir())
|
|
mdcache->adjust_subtree_after_rename(destdn->inode, srcdn->dir);
|
|
|
|
// removing a new dn?
|
|
if (srcdn->is_auth())
|
|
srcdn->dir->try_remove_unlinked_dn(srcdn);
|
|
}
// ------------
|
|
// SLAVE
|
|
|
|
class C_MDS_SlaveRenamePrep : public Context {
|
|
Server *server;
|
|
MDRequest *mdr;
|
|
CDentry *srcdn, *destdn, *straydn;
|
|
public:
|
|
C_MDS_SlaveRenamePrep(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) :
|
|
server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
|
|
void finish(int r) {
|
|
server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
|
|
}
|
|
};
|
|
|
|
class C_MDS_SlaveRenameCommit : public Context {
|
|
Server *server;
|
|
MDRequest *mdr;
|
|
CDentry *srcdn, *destdn, *straydn;
|
|
public:
|
|
C_MDS_SlaveRenameCommit(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) :
|
|
server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
|
|
void finish(int r) {
|
|
server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
|
|
}
|
|
};
|
|
|
|
void Server::handle_slave_rename_prep(MDRequest *mdr)
|
|
{
|
|
dout(10) << "handle_slave_rename_prep " << *mdr
|
|
<< " " << mdr->slave_request->srcdnpath
|
|
<< " to " << mdr->slave_request->destdnpath
|
|
<< dendl;
|
|
|
|
// discover destdn
|
|
filepath destpath(mdr->slave_request->destdnpath);
|
|
dout(10) << " dest " << destpath << dendl;
|
|
vector<CDentry*> trace;
|
|
int r = mdcache->path_traverse(mdr, mdr->slave_request,
|
|
destpath, trace, false,
|
|
MDS_TRAVERSE_DISCOVERXLOCK);
|
|
if (r > 0) return;
|
|
assert(r == 0); // we shouldn't get an error here!
|
|
|
|
CDentry *destdn = trace[trace.size()-1];
|
|
dout(10) << " destdn " << *destdn << dendl;
|
|
mdr->pin(destdn);
|
|
|
|
// discover srcdn
|
|
filepath srcpath(mdr->slave_request->srcdnpath);
|
|
dout(10) << " src " << srcpath << dendl;
|
|
r = mdcache->path_traverse(mdr, mdr->slave_request,
|
|
srcpath, trace, false,
|
|
MDS_TRAVERSE_DISCOVERXLOCK);
|
|
if (r > 0) return;
|
|
assert(r == 0);
|
|
|
|
CDentry *srcdn = trace[trace.size()-1];
|
|
dout(10) << " srcdn " << *srcdn << dendl;
|
|
mdr->pin(srcdn);
|
|
assert(srcdn->inode);
|
|
mdr->pin(srcdn->inode);
|
|
|
|
// stray?
|
|
CDentry *straydn = 0;
|
|
if (destdn->is_primary()) {
|
|
assert(mdr->slave_request->stray.length() > 0);
|
|
straydn = mdcache->add_replica_stray(mdr->slave_request->stray,
|
|
destdn->inode, mdr->slave_to_mds);
|
|
assert(straydn);
|
|
mdr->pin(straydn);
|
|
}
|
|
|
|
mdr->now = mdr->slave_request->now;
|
|
|
|
// set up commit waiter (early, to clean up any freezing etc we do)
|
|
if (!mdr->more()->slave_commit)
|
|
mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
|
|
|
|
// am i srcdn auth?
|
|
if (srcdn->is_auth()) {
|
|
if (srcdn->is_primary() &&
|
|
!srcdn->inode->is_freezing_inode() &&
|
|
!srcdn->inode->is_frozen_inode()) {
|
|
// srci auth.
|
|
// set ambiguous auth.
|
|
srcdn->inode->state_set(CInode::STATE_AMBIGUOUSAUTH);
|
|
|
|
// freeze?
|
|
// we need this to
|
|
// - avoid conflicting lock state changes
|
|
// - avoid concurrent updates to the inode
|
|
// (this could also be accomplished with the versionlock)
|
|
int allowance = 1; // for the versionlock and possible linklock xlock (both are tied to mdr)
|
|
dout(10) << " freezing srci " << *srcdn->inode << " with allowance " << allowance << dendl;
|
|
if (!srcdn->inode->freeze_inode(allowance)) {
|
|
srcdn->inode->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
|
|
return;
|
|
}
|
|
}
|
|
|
|
// is witness list sufficient?
|
|
set<int> srcdnrep;
|
|
srcdn->list_replicas(srcdnrep);
|
|
for (set<int>::iterator p = srcdnrep.begin();
|
|
p != srcdnrep.end();
|
|
++p) {
|
|
if (*p == mdr->slave_to_mds ||
|
|
mdr->slave_request->witnesses.count(*p)) continue;
|
|
dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
|
|
MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK);
|
|
reply->witnesses.swap(srcdnrep);
|
|
mds->send_message_mds(reply, mdr->slave_to_mds);
|
|
delete mdr->slave_request;
|
|
mdr->slave_request = 0;
|
|
return;
|
|
}
|
|
dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
|
|
}
|
|
|
|
// journal it?
|
|
if (srcdn->is_auth() ||
|
|
(destdn->inode && destdn->inode->is_auth()) ||
|
|
srcdn->inode->is_any_caps()) {
|
|
// journal.
|
|
mdr->ls = mdlog->get_current_segment();
|
|
ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE);
|
|
|
|
// rollback case
|
|
if (destdn->inode && destdn->inode->is_auth()) {
|
|
assert(destdn->is_remote());
|
|
le->rollback.add_dir_context(destdn->dir);
|
|
le->rollback.add_dentry(destdn, true);
|
|
}
|
|
if (srcdn->is_auth() ||
|
|
(srcdn->inode && srcdn->inode->is_auth())) {
|
|
le->rollback.add_dir_context(srcdn->dir);
|
|
le->rollback.add_dentry(srcdn, true);
|
|
}
|
|
|
|
// commit case
|
|
bufferlist blah;
|
|
_rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
|
|
|
|
mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn));
|
|
} else {
|
|
// don't journal.
|
|
dout(10) << "not journaling, i'm not auth for anything, and srci isn't open" << dendl;
|
|
|
|
// prepare anyway; this may twiddle dir_auth
|
|
EMetaBlob blob;
|
|
bufferlist blah;
|
|
_rename_prepare(mdr, &blob, &blah, srcdn, destdn, straydn);
|
|
_logged_slave_rename(mdr, srcdn, destdn, straydn);
|
|
}
|
|
}
|
|
|
|
void Server::_logged_slave_rename(MDRequest *mdr,
|
|
CDentry *srcdn, CDentry *destdn, CDentry *straydn)
|
|
{
|
|
dout(10) << "_logged_slave_rename " << *mdr << dendl;
|
|
|
|
// prepare ack
|
|
MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK);
|
|
|
|
// export srci?
|
|
if (srcdn->is_auth() && srcdn->is_primary()) {
|
|
list<Context*> finished;
|
|
map<int,entity_inst_t> exported_client_map;
|
|
bufferlist inodebl;
|
|
mdcache->migrator->encode_export_inode(srcdn->inode, inodebl,
|
|
exported_client_map);
|
|
mdcache->migrator->finish_export_inode(srcdn->inode, mdr->now, finished);
|
|
mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
|
|
::_encode(exported_client_map, reply->inode_export);
|
|
reply->inode_export.claim_append(inodebl);
|
|
reply->inode_export_v = srcdn->inode->inode.version;
|
|
|
|
// remove mdr auth pin
|
|
mdr->auth_unpin(srcdn->inode);
|
|
assert(!srcdn->inode->is_auth_pinned());
|
|
|
|
dout(10) << " exported srci " << *srcdn->inode << dendl;
|
|
}
|
|
|
|
// apply
|
|
_rename_apply(mdr, srcdn, destdn, straydn);
|
|
|
|
mds->send_message_mds(reply, mdr->slave_to_mds);
|
|
|
|
// bump popularity
|
|
//if (srcdn->is_auth())
|
|
//mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR);
|
|
if (destdn->inode && destdn->inode->is_auth())
|
|
mds->balancer->hit_inode(mdr->now, destdn->inode, META_POP_IWR);
|
|
|
|
// done.
|
|
delete mdr->slave_request;
|
|
mdr->slave_request = 0;
|
|
}
|
|
|
|
void Server::_commit_slave_rename(MDRequest *mdr, int r,
|
|
CDentry *srcdn, CDentry *destdn, CDentry *straydn)
|
|
{
|
|
dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
|
|
|
|
// unfreeze+singleauth inode
|
|
// hmm, do i really need to delay this?
|
|
if (srcdn->is_auth() && destdn->is_primary()) {
|
|
dout(10) << " unfreezing exported inode " << *destdn->inode << dendl;
|
|
list<Context*> finished;
|
|
|
|
// singleauth
|
|
assert(destdn->inode->state_test(CInode::STATE_AMBIGUOUSAUTH));
|
|
destdn->inode->state_clear(CInode::STATE_AMBIGUOUSAUTH);
|
|
destdn->inode->take_waiting(CInode::WAIT_SINGLEAUTH, finished);
|
|
|
|
// unfreeze
|
|
assert(destdn->inode->is_frozen_inode() ||
|
|
destdn->inode->is_freezing_inode());
|
|
destdn->inode->unfreeze_inode(finished);
|
|
|
|
mds->queue_waiters(finished);
|
|
}
|
|
|
|
|
|
ESlaveUpdate *le;
|
|
if (r == 0) {
|
|
// write a commit to the journal
|
|
le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT);
|
|
|
|
} else {
|
|
// abort
|
|
le = new ESlaveUpdate(mdlog, "slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK);
|
|
|
|
// -- rollback in memory --
|
|
|
|
if (mdr->more()->was_link_merge) {
|
|
// link merge
|
|
CInode *in = destdn->inode;
|
|
in->inode.nlink++;
|
|
if (mdr->more()->destdn_was_remote_inode) {
|
|
destdn->dir->unlink_inode(destdn);
|
|
srcdn->dir->link_primary_inode(srcdn, in);
|
|
destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode));
|
|
} else {
|
|
srcdn->dir->link_remote_inode(srcdn, in->ino(), MODE_TO_DT(in->inode.mode));
|
|
}
|
|
} else {
|
|
// normal
|
|
|
|
// revert srcdn
|
|
if (destdn->is_remote()) {
|
|
srcdn->dir->link_remote_inode(srcdn, destdn->inode->ino(), MODE_TO_DT(destdn->inode->inode.mode));
|
|
destdn->dir->unlink_inode(destdn);
|
|
} else {
|
|
// renamed a primary
|
|
CInode *in = destdn->inode;
|
|
destdn->dir->unlink_inode(destdn);
|
|
srcdn->dir->link_primary_inode(srcdn, in);
|
|
}
|
|
|
|
// revert destdn
|
|
if (mdr->more()->destdn_was_remote_inode) {
|
|
destdn->dir->link_remote_inode(destdn,
|
|
mdr->more()->destdn_was_remote_inode->ino(),
|
|
MODE_TO_DT(mdr->more()->destdn_was_remote_inode->inode.mode));
|
|
mdr->more()->destdn_was_remote_inode->inode.nlink++;
|
|
} else if (straydn && straydn->inode) {
|
|
CInode *in = straydn->inode;
|
|
straydn->dir->unlink_inode(straydn);
|
|
destdn->dir->link_primary_inode(destdn, in);
|
|
straydn->dir->remove_dentry(straydn);
|
|
}
|
|
}
|
|
|
|
// FIXME: reverse srci export?
|
|
|
|
dout(-10) << " srcdn back to " << *srcdn << dendl;
|
|
dout(-10) << " srci back to " << *srcdn->inode << dendl;
|
|
dout(-10) << " destdn back to " << *destdn << dendl;
|
|
if (destdn->inode) dout(-10) << " desti back to " << *destdn->inode << dendl;
|
|
|
|
// *** WRITE ME ***
|
|
assert(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
mdlog->submit_entry(le);
|
|
}
|
|
|
|
void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
|
|
{
|
|
dout(10) << "handle_slave_rename_prep_ack " << *mdr
|
|
<< " witnessed by " << ack->get_source()
|
|
<< " " << *ack << dendl;
|
|
int from = ack->get_source().num();
|
|
|
|
// note slave
|
|
mdr->more()->slaves.insert(from);
|
|
|
|
// witnessed? or add extra witnesses?
|
|
assert(mdr->more()->witnessed.count(from) == 0);
|
|
if (ack->witnesses.empty()) {
|
|
mdr->more()->witnessed.insert(from);
|
|
} else {
|
|
dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
|
|
mdr->more()->extra_witnesses.swap(ack->witnesses);
|
|
mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
|
|
}
|
|
|
|
// srci import?
|
|
if (ack->inode_export.length()) {
|
|
dout(10) << " got srci import" << dendl;
|
|
mdr->more()->inode_import.claim(ack->inode_export);
|
|
mdr->more()->inode_import_v = ack->inode_export_v;
|
|
}
|
|
|
|
// remove from waiting list
|
|
assert(mdr->more()->waiting_on_slave.count(from));
|
|
mdr->more()->waiting_on_slave.erase(from);
|
|
|
|
if (mdr->more()->waiting_on_slave.empty())
|
|
dispatch_client_request(mdr); // go again!
|
|
else
|
|
dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ===================================
|
|
// TRUNCATE, FSYNC
|
|
|
|
class C_MDS_truncate_purged : public Context {
|
|
MDS *mds;
|
|
MDRequest *mdr;
|
|
CInode *in;
|
|
version_t pv;
|
|
off_t size;
|
|
utime_t ctime;
|
|
public:
|
|
C_MDS_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) :
|
|
mds(m), mdr(r), in(i),
|
|
pv(pdv),
|
|
size(sz), ctime(ct) { }
|
|
void finish(int r) {
|
|
assert(r == 0);
|
|
|
|
// apply to cache
|
|
in->inode.size = size;
|
|
in->inode.ctime = ctime;
|
|
in->inode.mtime = ctime;
|
|
in->mark_dirty(pv, mdr->ls);
|
|
|
|
// reply
|
|
mds->server->reply_request(mdr, 0);
|
|
}
|
|
};
|
|
|
|
class C_MDS_truncate_logged : public Context {
|
|
MDS *mds;
|
|
MDRequest *mdr;
|
|
CInode *in;
|
|
version_t pv;
|
|
off_t size;
|
|
utime_t ctime;
|
|
public:
|
|
C_MDS_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) :
|
|
mds(m), mdr(r), in(i),
|
|
pv(pdv),
|
|
size(sz), ctime(ct) { }
|
|
void finish(int r) {
|
|
assert(r == 0);
|
|
|
|
// purge
|
|
mds->mdcache->purge_inode(in, size, in->inode.size, mdr->ls);
|
|
mds->mdcache->wait_for_purge(in, size,
|
|
new C_MDS_truncate_purged(mds, mdr, in, pv, size, ctime));
|
|
}
|
|
};
|
|
|
|
void Server::handle_client_truncate(MDRequest *mdr)
|
|
{
|
|
MClientRequest *req = mdr->client_request;
|
|
CInode *cur = rdlock_path_pin_ref(mdr, true);
|
|
if (!cur) return;
|
|
|
|
// check permissions?
|
|
|
|
// xlock inode
|
|
set<SimpleLock*> rdlocks = mdr->rdlocks;
|
|
set<SimpleLock*> wrlocks = mdr->wrlocks;
|
|
set<SimpleLock*> xlocks = mdr->xlocks;
|
|
xlocks.insert(&cur->filelock);
|
|
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
|
|
return;
|
|
|
|
// already small enough?
|
|
if (cur->inode.size <= req->head.args.truncate.length) {
|
|
reply_request(mdr, 0);
|
|
return;
|
|
}
|
|
|
|
// prepare
|
|
version_t pdv = cur->pre_dirty();
|
|
utime_t ctime = g_clock.real_now();
|
|
Context *fin = new C_MDS_truncate_logged(mds, mdr, cur,
|
|
pdv, req->head.args.truncate.length, ctime);
|
|
|
|
// log + wait
|
|
mdr->ls = mdlog->get_current_segment();
|
|
EUpdate *le = new EUpdate(mdlog, "truncate");
|
|
le->metablob.add_client_req(mdr->reqid);
|
|
le->metablob.add_dir_context(cur->get_parent_dir());
|
|
le->metablob.add_inode_truncate(cur->ino(), req->head.args.truncate.length, cur->inode.size);
|
|
inode_t *pi = le->metablob.add_dentry(cur->parent, true);
|
|
pi->mtime = ctime;
|
|
pi->ctime = ctime;
|
|
pi->version = pdv;
|
|
pi->size = req->head.args.truncate.length;
|
|
|
|
|
|
mdlog->submit_entry(le, fin);
|
|
}
|
|
|
|
|
|
// ===========================
|
|
// open, openc, close
|
|
|
|
void Server::handle_client_open(MDRequest *mdr)
{
  MClientRequest *req = mdr->client_request;

  int flags = req->head.args.open.flags;
  int cmode = req->get_open_file_mode();
  bool need_auth = ((cmode != FILE_MODE_R && cmode != FILE_MODE_LAZY) ||
                    (flags & O_TRUNC));
  dout(10) << "open flags = " << flags
           << ", filemode = " << cmode
           << ", need_auth = " << need_auth
           << dendl;

  CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
  if (!cur) return;

  // regular file?
  if (!cur->inode.is_file() && !cur->inode.is_dir()) {
    dout(7) << "not a file or dir " << *cur << dendl;
    reply_request(mdr, -EINVAL);  // FIXME what error do we want?
    return;
  }
  // can only open a dir rdonly, no flags.
  if (cur->inode.is_dir() && (cmode != FILE_MODE_R || flags != 0)) {
    reply_request(mdr, -EINVAL);
    return;
  }

  // hmm, check permissions or something.


  // O_TRUNC
  if (flags & O_TRUNC) {
    assert(cur->is_auth());

    // xlock file size
    set<SimpleLock*> rdlocks = mdr->rdlocks;
    set<SimpleLock*> wrlocks = mdr->wrlocks;
    set<SimpleLock*> xlocks = mdr->xlocks;
    xlocks.insert(&cur->filelock);
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;

    if (cur->inode.size > 0) {
      handle_client_opent(mdr);
      return;
    }
  }

  // do it
  _do_open(mdr, cur);
}

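// issue caps for an open that needs no on-disk change, send the reply,
// and queue the inode for a batched EOpen journal entry if its open has
// not been journaled yet.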
void Server::_do_open(MDRequest *mdr, CInode *cur)
{
  MClientRequest *req = mdr->client_request;
  int cmode = req->get_open_file_mode();

  // can we issue the caps they want?
  //version_t fdv = mds->locker->issue_file_data_version(cur);
  Capability *cap = mds->locker->issue_new_caps(cur, cmode, req);
  if (!cap) return; // can't issue (yet), so wait!

  dout(12) << "_do_open issuing caps " << cap_string(cap->pending())
           << " for " << req->get_source()
           << " on " << *cur << dendl;

  // hit pop
  mdr->now = g_clock.now();
  if (cmode == FILE_MODE_RW ||
      cmode == FILE_MODE_W)
    mds->balancer->hit_inode(mdr->now, cur, META_POP_IWR);
  else
    mds->balancer->hit_inode(mdr->now, cur, META_POP_IRD,
                             mdr->client_request->get_client_inst().name.num());

  // reply
  MClientReply *reply = new MClientReply(req, 0);
  reply->set_file_caps(cap->pending());
  reply->set_file_caps_seq(cap->get_last_seq());
  //reply->set_file_data_version(fdv);
  reply_request(mdr, reply, cur);

  // journal?
  if (cur->last_open_journaled == 0) {
    queue_journal_open(cur);
    maybe_journal_opens();
  }
}

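// remember an inode whose open should be journaled; pin it so the pointer
// stays valid until journal_opens() drains the queue.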
void Server::queue_journal_open(CInode *in)
{
  dout(10) << "queue_journal_open on " << *in << dendl;

  if (journal_open_queue.count(in) == 0) {
    // pin so our pointer stays valid
    in->get(CInode::PIN_BATCHOPENJOURNAL);

    // queue it up for a bit
    journal_open_queue.insert(in);
  }
}

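// flush the queued opens as a single EOpen event; if nothing in the queue
// still holds caps, skip the journal entry and just kick any waiters.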
void Server::journal_opens()
{
  dout(10) << "journal_opens " << journal_open_queue.size() << " inodes" << dendl;
  if (journal_open_queue.empty()) return;

  EOpen *le = 0;

  // check queued inodes
  LogSegment *ls = mdlog->get_current_segment();
  for (set<CInode*>::iterator p = journal_open_queue.begin();
       p != journal_open_queue.end();
       ++p) {
    CInode *in = *p;
    in->put(CInode::PIN_BATCHOPENJOURNAL);
    if (in->is_any_caps()) {
      if (!le) le = new EOpen(mdlog);
      le->add_inode(in);
      in->last_open_journaled = mds->mdlog->get_write_pos();
      ls->open_files.push_back(&in->xlist_open_file);
    }
  }
  journal_open_queue.clear();

  if (le) {
    // journal
    mdlog->submit_entry(le);

    // add waiters to journal entry
    for (list<Context*>::iterator p = journal_open_waiters.begin();
         p != journal_open_waiters.end();
         ++p)
      mds->mdlog->wait_for_sync(*p);
    journal_open_waiters.clear();
  } else {
    // nothing worth journaling here, just kick the waiters.
    mds->queue_waiters(journal_open_waiters);
  }
}

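// final stage of open with O_TRUNC: the purge has finished, so apply the
// zero size to the cache and carry on with the open itself.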
class C_MDS_open_truncate_purged : public Context {
  MDS *mds;
  MDRequest *mdr;
  CInode *in;
  version_t pv;
  utime_t ctime;
public:
  C_MDS_open_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) :
    mds(m), mdr(r), in(i),
    pv(pdv),
    ctime(ct) { }
  void finish(int r) {
    assert(r == 0);

    // apply to cache
    in->inode.size = 0;
    in->inode.ctime = ctime;
    in->inode.mtime = ctime;
    in->mark_dirty(pv, mdr->ls);

    // do the open
    mds->server->_do_open(mdr, in);
  }
};

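// the open_truncate EUpdate has been journaled: purge the old file data and
// wait for the purge before finishing the open (see C_MDS_open_truncate_purged).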
class C_MDS_open_truncate_logged : public Context {
  MDS *mds;
  MDRequest *mdr;
  CInode *in;
  version_t pv;
  utime_t ctime;
public:
  C_MDS_open_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) :
    mds(m), mdr(r), in(i),
    pv(pdv),
    ctime(ct) { }
  void finish(int r) {
    assert(r == 0);

    // hit pop
    mds->balancer->hit_inode(mdr->now, in, META_POP_IWR);

    // purge also...
    mds->mdcache->purge_inode(in, 0, in->inode.size, mdr->ls);
    mds->mdcache->wait_for_purge(in, 0,
                                 new C_MDS_open_truncate_purged(mds, mdr, in, pv, ctime));
  }
};

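// open with O_TRUNC on a non-empty file: journal an open_truncate update,
// then purge and complete the open via the finishers above.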
void Server::handle_client_opent(MDRequest *mdr)
{
  CInode *cur = mdr->ref;
  assert(cur);

  // prepare
  version_t pdv = cur->pre_dirty();
  utime_t ctime = g_clock.real_now();
  Context *fin = new C_MDS_open_truncate_logged(mds, mdr, cur,
                                                pdv, ctime);

  // log + wait
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "open_truncate");
  le->metablob.add_client_req(mdr->reqid);
  le->metablob.add_dir_context(cur->get_parent_dir());
  le->metablob.add_inode_truncate(cur->ino(), 0, cur->inode.size);
  inode_t *pi = le->metablob.add_dentry(cur->parent, true);
  pi->mtime = ctime;
  pi->ctime = ctime;
  pi->version = pdv;
  pi->size = 0;

  mdlog->submit_entry(le, fin);
}



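// the openc EUpdate has been journaled: link the new inode into its dentry,
// mark it dirty, and re-dispatch the request as a plain open on the new inode.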
class C_MDS_openc_finish : public Context {
  MDS *mds;
  MDRequest *mdr;
  CDentry *dn;
  CInode *newi;
  version_t pv;
public:
  C_MDS_openc_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni) :
    mds(m), mdr(r), dn(d), newi(ni),
    pv(d->get_projected_version()) {}
  void finish(int r) {
    assert(r == 0);

    // link the inode
    dn->get_dir()->link_primary_inode(dn, newi);

    // dirty inode, dn, dir
    newi->mark_dirty(pv, mdr->ls);

    // downgrade xlock to rdlock
    //mds->locker->dentry_xlock_downgrade_to_rdlock(dn, mdr);

    // set/pin ref inode for open()
    mdr->ref = newi;
    mdr->pin(newi);

    // ok, do the open.
    mds->server->handle_client_open(mdr);
  }
};

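// open with O_CREAT: if the dentry already exists, fail with -EEXIST for
// O_EXCL or fall through to the plain open handler; otherwise allocate a
// new inode and journal its creation before opening it.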
void Server::handle_client_openc(MDRequest *mdr)
{
  MClientRequest *req = mdr->client_request;

  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;

  bool excl = (req->head.args.open.flags & O_EXCL);
  CDentry *dn = rdlock_path_xlock_dentry(mdr, !excl, false);
  if (!dn) return;

  if (!dn->is_null()) {
    // it existed.
    if (req->head.args.open.flags & O_EXCL) {
      dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl;
      reply_request(mdr, -EEXIST, dn->get_dir()->get_inode());
      return;
    }

    // pass to regular open handler.
    handle_client_open(mdr);
    return;
  }

  // created null dn.

  // create inode.
  mdr->now = g_clock.real_now();
  CInode *in = prepare_new_inode(mdr, dn->dir);
  assert(in);

  // it's a file.
  in->inode.mode = req->head.args.open.mode;
  in->inode.mode |= S_IFREG;
  in->inode.version = dn->pre_dirty() - 1;

  // prepare finisher
  C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in);
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "openc");
  le->metablob.add_client_req(req->get_reqid());
  le->metablob.add_allocated_ino(in->ino(), mds->idalloc->get_version());
  le->metablob.add_dir_context(dn->dir);
  le->metablob.add_primary_dentry(dn, true, in, &in->inode);

  // log + wait
  mdlog->submit_entry(le, fin);

  /*
    FIXME.  this needs to be rewritten when the write capability stuff starts
    getting journaled.
  */
}

