ceph/branches/sage/pgs/client/Client.cc
sageweil 9213a23f14 eek
git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1138 29311d96-e01e-0410-9327-a35deaab8ce9
2007-02-28 18:42:55 +00:00

2648 lines
66 KiB
C++

// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
// unix-ey fs stuff
#include <unistd.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/statvfs.h>
#include <errno.h>
#include <iostream>
using namespace std;
// ceph stuff
#include "Client.h"
#include "messages/MClientBoot.h"
#include "messages/MClientMount.h"
#include "messages/MClientMountAck.h"
#include "messages/MClientFileCaps.h"
#include "messages/MGenericMessage.h"
#include "messages/MMDSGetMap.h"
#include "messages/MMDSMap.h"
#include "osdc/Filer.h"
#include "osdc/Objecter.h"
#include "osdc/ObjectCacher.h"
#include "common/Cond.h"
#include "common/Mutex.h"
#include "common/Logger.h"
#include "config.h"
// Drop any dout definition inherited from the headers above and install a
// client-specific one.
#undef dout
// dout(l): stream to cout when level l is within g_conf.debug or
// g_conf.debug_client.  Each line is prefixed with the current clock time,
// "client<whoami>" and the calling pthread id.
// NOTE: this expands to a bare `if`, so an `else` written directly after a
// dout statement binds to the macro's `if`; brace the surrounding branch.
// It also reads `whoami`, so it only works where that name is in scope.
#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) cout << g_clock.now() << " client" << whoami << "." << pthread_self() << " "
// tout: stream to cout, prefixed with "trace: ", when g_conf.client_trace is set.
#define tout if (g_conf.client_trace) cout << "trace: "
// static logger
LogType client_logtype;
// Starts out null; make_request() only records latency stats when it is non-null.
Logger *client_logger = 0;
class C_Client_CloseRelease : public Context {
Client *cl;
Inode *in;
public:
C_Client_CloseRelease(Client *c, Inode *i) : cl(c), in(i) {}
void finish(int) {
cl->close_release(in);
}
};
class C_Client_CloseSafe : public Context {
Client *cl;
Inode *in;
public:
C_Client_CloseSafe(Client *c, Inode *i) : cl(c), in(i) {}
void finish(int) {
cl->close_safe(in);
}
};
// cons/des
// Construct a client on top of messenger `m`, using monitor map `mm`.
// Resets all mount/cache state, registers this object as the messenger's
// dispatcher, and creates the OSD-side helpers (OSDMap, Objecter,
// ObjectCacher, Filer).  The objects created here with `new` are released
// in ~Client().
Client::Client(Messenger *m, MonMap *mm)
{
// which client am i?
whoami = m->get_myname().num();
monmap = mm;
mounted = false;
unmounting = false;
last_tid = 0;
unsafe_sync_write = 0;
// no mds map yet; one is allocated when an MDS map or mount ack arrives
mdsmap = 0;
//
root = 0;
set_cache_size(g_conf.client_cache_size);
// file handles
// NOTE(review): presumably seeds the pool of free fh numbers starting at 10 --
// confirm the (start, length) meaning of insert() against free_fh_set's type.
free_fh_set.insert(10, 1<<30);
// set up messengers
messenger = m;
messenger->set_dispatcher(this);
// osd interfaces
osdmap = new OSDMap(); // initially blank.. see mount()
objecter = new Objecter(messenger, monmap, osdmap);
// the object cacher is handed client_lock
objectcacher = new ObjectCacher(objecter, client_lock);
filer = new Filer(objecter);
}
// Destroy the client: drop the metadata cache, then release everything the
// constructor (and the map handlers) allocated.
//
// Order matters:
//  - the cache goes first, because every Inode is constructed with a pointer
//    to objectcacher (see insert_inode/insert_trace); tearing the cache down
//    after the cacher has been deleted would leave inodes holding a dangling
//    pointer while they are destroyed.
//  - the osd helpers are released top-down (filer, objectcacher, objecter,
//    osdmap), each before the object it was constructed from.
//  - the messenger goes last, since objecter was constructed on top of it.
// mdsmap is allocated in handle_mds_map()/handle_mount_ack() and owned by the
// client, so it is released here as well.
Client::~Client()
{
  tear_down_cache();

  if (filer) { delete filer; filer = 0; }
  if (objectcacher) { delete objectcacher; objectcacher = 0; }
  if (objecter) { delete objecter; objecter = 0; }
  if (osdmap) { delete osdmap; osdmap = 0; }
  if (mdsmap) { delete mdsmap; mdsmap = 0; }

  if (messenger) { delete messenger; messenger = 0; }
}
void Client::tear_down_cache()
{
// fh's
for (hash_map<fh_t, Fh*>::iterator it = fh_map.begin();
it != fh_map.end();
it++) {
Fh *fh = it->second;
dout(1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->inode.ino << endl;
put_inode(fh->inode);
delete fh;
}
fh_map.clear();
// caps!
// *** FIXME ***
// empty lru
lru.lru_set_max(0);
trim_cache();
assert(lru.lru_get_size() == 0);
// close root ino
assert(inode_map.size() <= 1);
if (root && inode_map.size() == 1) {
delete root;
root = 0;
inode_map.clear();
}
assert(inode_map.empty());
}
// debug crapola
// Debug helper: log `in` at level 1 and recurse into every dentry of its
// open directory (if any).
//
// `did` collects the inodes that have already been dumped.  Each visited
// inode is recorded so that
//  - dump_cache() can skip inodes already reached from the root, and
//  - an inode reachable more than once is neither printed nor descended twice.
void Client::dump_inode(Inode *in, set<Inode*>& did)
{
  if (!in) return;                        // defensive: nothing to dump
  if (!did.insert(in).second) return;     // already dumped

  dout(1) << "dump_inode: inode " << in->ino() << " ref " << in->ref << " dir " << in->dir << endl;

  if (in->dir) {
    dout(1) << " dir size " << in->dir->dentries.size() << endl;
    //for (hash_map<const char*, Dentry*, hash<const char*>, eqstr>::iterator it = in->dir->dentries.begin();
    for (hash_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         it++) {
      dout(1) << " dn " << it->first << " ref " << it->second->ref << endl;
      dump_inode(it->second->inode, did);
    }
  }
}
void Client::dump_cache()
{
set<Inode*> did;
if (root) dump_inode(root, did);
for (hash_map<inodeno_t, Inode*>::iterator it = inode_map.begin();
it != inode_map.end();
it++) {
if (did.count(it->second)) continue;
dout(1) << "dump_cache: inode " << it->first
<< " ref " << it->second->ref
<< " dir " << it->second->dir << endl;
if (it->second->dir) {
dout(1) << " dir size " << it->second->dir->dentries.size() << endl;
}
}
}
// Initialization hook.  Intentionally empty here: all setup visible in this
// file happens in the constructor and in mount().
void Client::init() {
}
// Shut the client down by shutting down its messenger.  Logs at level 1.
// Does not unmount or touch the caches; see unmount() for that.
void Client::shutdown() {
dout(1) << "shutdown" << endl;
messenger->shutdown();
}
// ===================
// metadata cache stuff
void Client::trim_cache()
{
unsigned last = 0;
while (lru.lru_get_size() != last) {
last = lru.lru_get_size();
if (lru.lru_get_size() <= lru.lru_get_max()) break;
// trim!
Dentry *dn = (Dentry*)lru.lru_expire();
if (!dn) break; // done
//dout(10) << "trim_cache unlinking dn " << dn->name << " in dir " << hex << dn->dir->inode->inode.ino << endl;
unlink(dn);
}
// hose root?
if (lru.lru_get_size() == 0 && root && inode_map.size() == 1) {
delete root;
root = 0;
inode_map.clear();
}
}
/** insert_inode
 *
 * Link one (dentry name, inode stat) pair into directory `dir` of the
 * metadata cache and return the cached Inode.
 *
 * An existing dentry is reused when it already points at the right ino and
 * is unlinked otherwise.  An inode that is already cached is relinked (or
 * linked, if it has no dentry) under the new name; if it is not cached a new
 * Inode is created.  Locally written size/mtime that is newer than what the
 * stat carries is preserved.
 */
Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname)
{
  // is there already a dentry by this name?
  Dentry *dn = NULL;
  hash_map<string, Dentry*>::iterator existing = dir->dentries.find(dname);
  if (existing != dir->dentries.end())
    dn = existing->second;

  dout(12) << "insert_inode " << dname << " ino " << st->inode.ino
           << " size " << st->inode.size
           << " mtime " << st->inode.mtime
           << " hashed " << st->hashed
           << endl;

  if (dn) {
    const bool same_ino = (dn->inode->inode.ino == st->inode.ino);
    if (same_ino) {
      touch_dn(dn);
      dout(12) << " had dentry " << dname
               << " with correct ino " << dn->inode->inode.ino
               << endl;
    } else {
      dout(12) << " had dentry " << dname
               << " with WRONG ino " << dn->inode->inode.ino
               << endl;
      unlink(dn);
      dn = NULL;
    }
  }

  if (!dn) {
    // have inode linked elsewhere? -> unlink and relink!
    hash_map<inodeno_t, Inode*>::iterator known = inode_map.find(st->inode.ino);
    if (known != inode_map.end()) {
      Inode *in = known->second;
      assert(in);

      if (in->dn) {
        dout(12) << " had ino " << in->inode.ino
                 << " linked at wrong position, unlinking"
                 << endl;
        dn = relink(in->dn, dir, dname);
      } else {
        dout(12) << " had ino " << in->inode.ino
                 << " unlinked, linking" << endl;
        dn = link(dir, dname, in);
      }
    }
  }

  if (dn) {
    // known inode: refresh it from the stat
    dout(12) << " stat inode mask is " << st->inode.mask << endl;
    Inode *cached = dn->inode;
    cached->inode = st->inode;

    // ...but don't clobber our mtime, size!
    if ((cached->inode.mask & INODE_MASK_SIZE) == 0 &&
        cached->file_wr_size > cached->inode.size)
      cached->inode.size = cached->file_wr_size;
    if ((cached->inode.mask & INODE_MASK_MTIME) == 0 &&
        cached->file_wr_mtime > cached->inode.mtime)
      cached->inode.mtime = cached->file_wr_mtime;
  } else {
    // never seen before: create inode + dentry
    Inode *fresh = new Inode(st->inode, objectcacher);
    inode_map[st->inode.ino] = fresh;
    dn = link(dir, dname, fresh);
    dout(12) << " new dentry+node with ino " << st->inode.ino << endl;
  }

  // OK, we found it!
  assert(dn && dn->inode);
  Inode *result = dn->inode;

  // or do we have newer size/mtime from writing?
  if (result->file_caps() & CAP_FILE_WR) {
    if (result->file_wr_size > result->inode.size)
      result->inode.size = result->file_wr_size;
    if (result->file_wr_mtime > result->inode.mtime)
      result->inode.mtime = result->file_wr_mtime;
  }

  // symlink?  remember its target
  if ((result->inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) {
    if (!result->symlink)
      result->symlink = new string;
    *(result->symlink) = st->symlink;
  }

  return result;
}
/** update_inode_dist
 *
 * Refresh the cached MDS location info of one inode from a stat:
 * authority, hashed and replicated flags always; the contact set only
 * when the stat says its dist spec is defined.
 */
void Client::update_inode_dist(Inode *in, InodeStat *st)
{
  // dir info
  in->dir_auth = st->dir_auth;
  in->dir_hashed = st->hashed;
  in->dir_replicated = st->replicated;

  // dir replication
  if (!st->spec_defined)
    return;

  const bool had_contacts = !in->dir_contacts.empty();
  const bool has_dist = !st->dist.empty();

  if (!has_dist && had_contacts) {
    dout(9) << "lost dist spec for " << in->inode.ino
            << " " << st->dist << endl;
  }
  if (has_dist && !had_contacts) {
    dout(9) << "got dist spec for " << in->inode.ino
            << " " << st->dist << endl;
  }

  in->dir_contacts = st->dist;
}
/** insert_trace
 *
 * Insert the inode trace carried by an MDS reply into the cache.  The first
 * inode of the trace is the root (created on first sight); each following
 * inode is linked under its predecessor using the next name from the dentry
 * trace.  Every inode on the path gets its dist info refreshed and, if a
 * stat ttl is configured, its valid_until pushed forward.
 *
 * Returns the last inode of the trace, or the current root if the trace is
 * empty.
 */
Inode* Client::insert_trace(MClientReply *reply)
{
  Inode *cur = root;
  time_t now = time(NULL);

  dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << endl;

  list<string>::const_iterator pdn = reply->get_trace_dn().begin();
  bool at_root = true;

  for (list<InodeStat*>::const_iterator pin = reply->get_trace_in().begin();
       pin != reply->get_trace_in().end();
       ++pin) {
    InodeStat *st = *pin;

    if (at_root) {
      at_root = false;
      dout(10) << "insert_trace root" << endl;
      if (!root) {
        // first time we see the root: create it
        cur = root = new Inode(st->inode, objectcacher);
        inode_map[root->inode.ino] = root;
      }
    } else {
      // not root: link under the previous inode
      dout(10) << "insert_trace dn " << *pdn << " ino " << st->inode.ino << endl;
      Dir *dir = cur->open_dir();
      cur = this->insert_inode(dir, st, *pdn);
      ++pdn;

      // move to top of lru!
      if (cur->dn)
        lru.lru_touch(cur->dn);
    }

    // update dist info
    update_inode_dist(cur, st);

    // set cache ttl
    if (g_conf.client_cache_stat_ttl)
      cur->valid_until = now + g_conf.client_cache_stat_ttl;
  }

  return cur;
}
// Resolve `path` purely from the local cache, starting at the root.
// Returns the dentry of the last path segment, or NULL when there is no
// root, a segment is not cached, or an intermediate inode is not a directory
// with an open Dir.  A zero-depth path also yields NULL.
Dentry *Client::lookup(filepath& path)
{
  dout(14) << "lookup " << path << endl;

  if (!root)
    return NULL;

  Inode *cur = root;
  Dentry *dn = 0;

  for (unsigned i=0; i<path.depth(); i++) {
    dout(14) << " seg " << i << " = " << path[i] << endl;

    // we can only descend through a directory whose contents are open
    if (!(cur->inode.mode & INODE_MODE_DIR) || !cur->dir)
      return NULL;  // not a dir

    Dir *dir = cur->dir;
    hash_map<string, Dentry*>::iterator hit = dir->dentries.find(path[i]);
    if (hit == dir->dentries.end()) {
      dout(14) << " dentry " << path[i] << " dne" << endl;
      return NULL;
    }

    dn = hit->second;
    dout(14) << " hit dentry " << path[i] << " inode is " << dn->inode << " valid_until " << dn->inode->valid_until << endl;

    cur = dn->inode;
    assert(cur);
  }

  if (dn) {
    dout(11) << "lookup '" << path << "' found " << dn->name << " inode " << dn->inode->inode.ino << " valid_until " << dn->inode->valid_until<< endl;
  }

  return dn;
}
// -------
// Issue one metadata request and return its reply.
//
//  req       - request to send; it is given a fresh tid here.
//  auth_best - if true, aim for the MDS believed to be authoritative for the
//              target; otherwise spread load over replicas.
//  use_mds   - if >= 0, overrides the chosen MDS (debug hack).
//
// The target MDS is picked by walking the request path through the local
// cache as far as possible.  With no root cached (or when
// g_conf.client_use_random_mds is set) a random MDS is used.  If a logger is
// installed, the round-trip latency is recorded.
MClientReply *Client::make_request(MClientRequest *req,
                                   bool auth_best,
                                   int use_mds) // this param is purely for debug hacking
{
  // assign a unique tid
  req->set_tid(++last_tid);

  // walk the cache to find the deepest known prefix of the path
  Inode *diri = root;        // the deepest known containing dir
  Inode *item = 0;           // the actual item... if we know it
  int missing_dn = -1;       // which dn we miss on (if we miss)

  const unsigned depth = req->get_filepath().depth();
  for (unsigned i=0; i<depth; i++) {
    // can we descend through diri?
    const bool can_descend = diri && (diri->inode.mode & INODE_MODE_DIR) && diri->dir;
    if (!can_descend) {
      missing_dn = i;
      break;
    }

    // do we have the next dentry?
    Dir *dir = diri->dir;
    hash_map<string, Dentry*>::iterator next = dir->dentries.find( req->get_filepath()[i] );
    if (next == dir->dentries.end()) {
      missing_dn = i;  // no.
      break;
    }

    dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << diri->inode.ino << " " << req->get_filepath()[i] << endl;

    Inode *child = next->second->inode;
    if (i == depth-1) {  // last one!
      item = child;
      break;
    }

    // continue..
    diri = child;
    assert(diri);
  }

  // choose an mds
  int mds = 0;
  if (!diri || g_conf.client_use_random_mds) {
    // no root info, pick a random MDS
    mds = rand() % mdsmap->get_num_mds();
  } else if (auth_best) {
    // pick the actual auth (as best we can)
    if (item) {
      mds = item->authority(mdsmap);
    } else if (diri->dir_hashed && missing_dn >= 0) {
      mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(),
                                   mdsmap);
    } else {
      mds = diri->authority(mdsmap);
    }
  } else {
    // balance our traffic!
    if (diri->dir_hashed && missing_dn >= 0) {
      mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(),
                                   mdsmap);
    } else {
      mds = diri->pick_replica(mdsmap);
    }
  }
  dout(20) << "mds is " << mds << endl;

  // force use of a particular mds?
  if (use_mds >= 0)
    mds = use_mds;

  // time the call
  utime_t start = g_clock.now();

  // these ops are counted under the "lr*" latency stats, the rest under "lw*"
  const int op = req->get_op();
  const bool nojournal = (op == MDS_OP_STAT ||
                          op == MDS_OP_LSTAT ||
                          op == MDS_OP_READDIR ||
                          op == MDS_OP_OPEN ||
                          op == MDS_OP_RELEASE);

  MClientReply *reply = sendrecv(req, mds);

  if (client_logger) {
    utime_t lat = g_clock.now();
    lat -= start;
    dout(20) << "lat " << lat << endl;

    // overall
    client_logger->finc("lsum",(double)lat);
    client_logger->inc("lnum");

    // read-ish vs write-ish
    if (nojournal) {
      client_logger->finc("lrsum",(double)lat);
      client_logger->inc("lrnum");
    } else {
      client_logger->finc("lwsum",(double)lat);
      client_logger->inc("lwnum");
    }

    // per-op breakdown for the two most interesting ops
    if (op == MDS_OP_STAT) {
      client_logger->finc("lstatsum",(double)lat);
      client_logger->inc("lstatnum");
    } else if (op == MDS_OP_READDIR) {
      client_logger->finc("ldirsum",(double)lat);
      client_logger->inc("ldirnum");
    }
  }

  return reply;
}
// Send `req` to MDS `mds` and block until the reply with the same tid has
// been delivered by handle_client_reply().  Returns that reply; the callers
// in this file delete it when done.
//
// Waits on client_lock through a stack-local Cond, so the caller is expected
// to hold client_lock (the fs ops in this file lock it before calling
// make_request).  Handshake with the dispatch thread:
//   1. register &cond under the tid in mds_rpc_cond, then send;
//   2. handle_client_reply() stores the reply in mds_rpc_reply[tid], signals
//      our cond, registers its own cond in mds_rpc_dispatch_cond[tid] and
//      waits for our mds_rpc_cond entry to go away;
//   3. we pick up the reply, signal the dispatcher's cond, and erase our
//      mds_rpc_cond / mds_rpc_reply entries.
// Both Conds live on their owners' stacks, which is why each side waits for
// the other before returning.
MClientReply* Client::sendrecv(MClientRequest *req, int mds)
{
// NEW way.
Cond cond;
tid_t tid = req->get_tid();
// publish our cond so the dispatcher can find and wake us
mds_rpc_cond[tid] = &cond;
messenger->send_message(req, mdsmap->get_inst(mds), MDS_PORT_SERVER);
// wait
// loop until a reply has actually been stored for this tid
while (mds_rpc_reply.count(tid) == 0) {
dout(20) << "sendrecv awaiting reply kick on " << &cond << endl;
cond.Wait(client_lock);
}
// got it!
MClientReply *reply = mds_rpc_reply[tid];
// kick dispatcher (we've got it!)
// the dispatcher must have registered its cond by the time we run again
assert(mds_rpc_dispatch_cond.count(tid));
mds_rpc_dispatch_cond[tid]->Signal();
dout(20) << "sendrecv kickback on tid " << tid << " " << mds_rpc_dispatch_cond[tid] << endl;
// clean up.
// erasing mds_rpc_cond[tid] is what lets handle_client_reply() leave its wait loop
mds_rpc_cond.erase(tid);
mds_rpc_reply.erase(tid);
return reply;
}
// Dispatch-side half of the request/reply handshake (see sendrecv()).
// Stores `reply` under its tid, wakes the thread blocked in sendrecv(), and
// then blocks until that thread has picked the reply up and removed its
// mds_rpc_cond entry.  Called from dispatch() with client_lock held; the
// wait releases the lock through Cond::Wait(client_lock).
// Ownership of `reply` passes to the waiting caller.  Asserts that a waiter
// is registered for the tid.
void Client::handle_client_reply(MClientReply *reply)
{
tid_t tid = reply->get_tid();
// store reply
mds_rpc_reply[tid] = reply;
// wake up waiter
assert(mds_rpc_cond.count(tid));
dout(20) << "handle_client_reply kicking caller on " << mds_rpc_cond[tid] << endl;
mds_rpc_cond[tid]->Signal();
// wake for kick back
// register our own (stack-local) cond so sendrecv() can signal us back
assert(mds_rpc_dispatch_cond.count(tid) == 0);
Cond cond;
mds_rpc_dispatch_cond[tid] = &cond;
// sendrecv() erases mds_rpc_cond[tid] once it has taken the reply
while (mds_rpc_cond.count(tid)) {
dout(20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << endl;
cond.Wait(client_lock);
}
// ok, clean up!
mds_rpc_dispatch_cond.erase(tid);
}
// ------------------------
// incoming messages
// Entry point for every incoming message.  Takes client_lock, routes the
// message to its handler by type (unknown types abort), and, while an
// unmount is in progress, runs a trim pass and wakes unmount() once the
// caches have drained.
void Client::dispatch(Message *m)
{
  client_lock.Lock();

  switch (m->get_type()) {
    // osd
  case MSG_OSD_OPREPLY:
    objecter->handle_osd_op_reply((MOSDOpReply*)m);
    break;

  case MSG_OSD_MAP:
    objecter->handle_osd_map((class MOSDMap*)m);
    break;

    // client
  case MSG_MDS_MAP:
    handle_mds_map((MMDSMap*)m);
    break;

  case MSG_CLIENT_REPLY:
    handle_client_reply((MClientReply*)m);
    break;

  case MSG_CLIENT_FILECAPS:
    handle_file_caps((MClientFileCaps*)m);
    break;

  case MSG_CLIENT_MOUNTACK:
    handle_mount_ack((MClientMountAck*)m);
    break;

  case MSG_CLIENT_UNMOUNT:
    handle_unmount_ack(m);
    break;

  default:
    cout << "dispatch doesn't recognize message type " << m->get_type() << endl;
    assert(0);  // fail loudly
    break;
  }

  // unmounting?  every message is a chance to shrink the cache further
  if (unmounting) {
    dout(10) << "unmounting: trim pass, size was " << lru.lru_get_size()
             << "+" << inode_map.size() << endl;
    trim_cache();

    const bool drained = (lru.lru_get_size() == 0 && inode_map.empty());
    if (!drained) {
      dout(10) << "unmounting: trim pass, size still " << lru.lru_get_size()
               << "+" << inode_map.size() << endl;
      dump_cache();
    } else {
      dout(10) << "unmounting: trim pass, cache now empty, waking unmount()" << endl;
      mount_cond.Signal();
    }
  }

  client_lock.Unlock();
}
// Handle an MDS map message: allocate the map on first use, adopt the
// client name carried by the message if we do not have one yet (whoami < 0),
// decode the new map, and wake anyone waiting in mount().  Frees `m`.
void Client::handle_mds_map(MMDSMap* m)
{
  if (!mdsmap)
    mdsmap = new MDSMap;

  // first map we see also tells us who we are
  if (whoami < 0) {
    whoami = m->get_dest().num();
    dout(1) << "handle_mds_map i am now " << m->get_dest() << endl;
    messenger->reset_myname(m->get_dest());
  }

  dout(1) << "handle_mds_map epoch " << m->get_epoch() << endl;
  mdsmap->decode(m->get_encoded());
  delete m;

  // note our inc #
  objecter->set_client_incarnation(0);  // fixme

  // mount might be waiting for this.
  mount_cond.Signal();
}
/****
* caps
*/
class C_Client_ImplementedCaps : public Context {
Client *client;
MClientFileCaps *msg;
Inode *in;
public:
C_Client_ImplementedCaps(Client *c, MClientFileCaps *m, Inode *i) : client(c), msg(m), in(i) {}
void finish(int r) {
client->implemented_caps(msg,in);
}
};
/** handle_file_caps
 * handle caps update from mds.  including mds to mds caps transitions.
 * do not block.
 *
 * Takes ownership of m.  On every path the message is either queued
 * (premature reap), sent back to the mds as the ack/release, handed to a
 * completion context that will ack later, or deleted here.
 *
 * Special messages:
 *  - REAP:    caps migrated from mds `m->get_mds()` to the sender.  If the
 *             old caps are not yet marked stale, the message is parked in
 *             cap_reap_queue until the matching STALE arrives.
 *  - STALE:   the sender's caps move to the stale list; a parked reap for
 *             them is then processed.
 *  - RELEASE: the sender's caps are dropped.
 * Anything else is a regular caps update for the sender.
 */
void Client::handle_file_caps(MClientFileCaps *m)
{
  int mds = m->get_source().num();
  Inode *in = 0;
  if (inode_map.count(m->get_ino())) in = inode_map[ m->get_ino() ];

  m->clear_payload();  // for if/when we send back to MDS

  // reap?
  if (m->get_special() == MClientFileCaps::FILECAP_REAP) {
    int other = m->get_mds();

    if (in && in->stale_caps.count(other)) {
      dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " reap on mds" << other << endl;

      // fresh from new mds?
      if (!in->caps.count(mds)) {
        if (in->caps.empty()) in->get();
        in->caps[mds].seq = m->get_seq();
        in->caps[mds].caps = m->get_caps();
      }

      assert(in->stale_caps.count(other));
      in->stale_caps.erase(other);
      if (in->stale_caps.empty()) put_inode(in);  // note: this will never delete *in

      // fall-thru!
    } else {
      dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " premature (!!) reap on mds" << other << endl;
      // delay!
      // key by the ino in the message: `in` may be null on this path, and
      // for a cached inode this is the same ino that the stale branch looks
      // the entry up with.
      cap_reap_queue[m->get_ino()][other] = m;
      return;
    }
  }

  assert(in);

  // stale?
  if (m->get_special() == MClientFileCaps::FILECAP_STALE) {
    dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " from mds" << mds << " now stale" << endl;

    // move to stale list
    assert(in->caps.count(mds));
    if (in->stale_caps.empty()) in->get();
    in->stale_caps[mds] = in->caps[mds];

    assert(in->caps.count(mds));
    in->caps.erase(mds);
    if (in->caps.empty()) in->put();

    // delayed reap?
    if (cap_reap_queue.count(in->ino()) &&
        cap_reap_queue[in->ino()].count(mds)) {
      dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " delayed reap on mds" << m->get_mds() << endl;

      // process delayed reap (the recursive call takes over the queued message)
      handle_file_caps( cap_reap_queue[in->ino()][mds] );

      cap_reap_queue[in->ino()].erase(mds);
      if (cap_reap_queue[in->ino()].empty())
        cap_reap_queue.erase(in->ino());
    }

    // nothing is sent back for a stale notice; we are done with the message
    delete m;
    return;
  }

  // release?
  if (m->get_special() == MClientFileCaps::FILECAP_RELEASE) {
    dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " release" << endl;

    assert(in->caps.count(mds));
    in->caps.erase(mds);

    for (map<int,InodeCap>::iterator p = in->caps.begin();
         p != in->caps.end();
         p++) {
      dout(20) << " left cap " << p->first << " "
               << cap_string(p->second.caps) << " "
               << p->second.seq << endl;
    }
    for (map<int,InodeCap>::iterator p = in->stale_caps.begin();
         p != in->stale_caps.end();
         p++) {
      dout(20) << " left stale cap " << p->first << " "
               << cap_string(p->second.caps) << " "
               << p->second.seq << endl;
    }

    // last cap gone -> drop the reference the caps were holding
    if (in->caps.empty())
      put_inode(in);

    // nothing is sent back for a release; we are done with the message
    delete m;
    return;
  }

  // don't want?
  if (in->file_caps_wanted() == 0) {
    dout(5) << "handle_file_caps on ino " << m->get_ino()
            << " seq " << m->get_seq()
            << " " << cap_string(m->get_caps())
            << ", which we don't want caps for, releasing." << endl;
    m->set_caps(0);
    m->set_wanted(0);
    messenger->send_message(m, m->get_source_inst(), m->get_source_port());
    return;
  }

  assert(in->caps.count(mds));

  // update per-mds caps
  const int old_caps = in->caps[mds].caps;
  const int new_caps = m->get_caps();
  in->caps[mds].caps = new_caps;
  in->caps[mds].seq = m->get_seq();
  dout(5) << "handle_file_caps on in " << m->get_ino()
          << " mds" << mds << " seq " << m->get_seq()
          << " caps now " << cap_string(new_caps)
          << " was " << cap_string(old_caps) << endl;

  // did file size decrease?
  if ((old_caps & new_caps & CAP_FILE_RDCACHE) &&
      in->inode.size > m->get_inode().size) {
    dout(10) << "**** file size decreased from " << in->inode.size << " to " << m->get_inode().size << " FIXME" << endl;
    // must have been a truncate() by someone.
    // trim the buffer cache
    // ***** fixme write me ****

    in->file_wr_size = m->get_inode().size; //??
  }

  // update inode
  in->inode = m->get_inode();   // might have updated size... FIXME this is overkill!

  // preserve our (possibly newer) file size, mtime
  if (in->file_wr_size > in->inode.size)
    m->get_inode().size = in->inode.size = in->file_wr_size;
  if (in->file_wr_mtime > in->inode.mtime)
    m->get_inode().mtime = in->inode.mtime = in->file_wr_mtime;

  const bool revoking = (old_caps & ~new_caps);   // this mds is revoking caps

  if (g_conf.client_oc) {
    // caching on, use FileCache.
    Context *onimplement = 0;
    bool consumed = false;     // has m been handed off (sent or attached)?
    if (revoking) {
      if (in->fc.get_caps() & ~(in->file_caps())) {  // net revocation
        onimplement = new C_Client_ImplementedCaps(this, m, in);
      } else {
        implemented_caps(m, in);        // ack now.
      }
      consumed = true;
    }
    in->fc.set_caps(new_caps, onimplement);

    // no ack needed: discard, as the caching-off branch does
    if (!consumed)
      delete m;
  } else {
    // caching off.

    // wake up waiters?
    if (new_caps & CAP_FILE_RD) {
      for (list<Cond*>::iterator it = in->waitfor_read.begin();
           it != in->waitfor_read.end();
           it++) {
        dout(5) << "signaling read waiter " << *it << endl;
        (*it)->Signal();
      }
      in->waitfor_read.clear();
    }
    if (new_caps & CAP_FILE_WR) {
      for (list<Cond*>::iterator it = in->waitfor_write.begin();
           it != in->waitfor_write.end();
           it++) {
        dout(5) << "signaling write waiter " << *it << endl;
        (*it)->Signal();
      }
      in->waitfor_write.clear();
    }
    if (new_caps & CAP_FILE_LAZYIO) {
      for (list<Cond*>::iterator it = in->waitfor_lazy.begin();
           it != in->waitfor_lazy.end();
           it++) {
        dout(5) << "signaling lazy waiter " << *it << endl;
        (*it)->Signal();
      }
      in->waitfor_lazy.clear();
    }

    // ack?
    if (revoking) {
      if (in->sync_writes) {
        // wait for sync writes to finish
        dout(5) << "sync writes in progress, will ack on finish" << endl;
        in->waitfor_no_write.push_back(new C_Client_ImplementedCaps(this, m, in));
      } else {
        // ok now
        implemented_caps(m, in);
      }
    } else {
      // discard
      delete m;
    }
  }
}
// Acknowledge a caps change to the MDS by sending `m` back to its source.
// If the inode holds no file caps at all any more, the locally tracked
// write size/mtime are reset first.
void Client::implemented_caps(MClientFileCaps *m, Inode *in)
{
  dout(5) << "implemented_caps " << cap_string(m->get_caps())
          << ", acking to " << m->get_source() << endl;

  // no caps left -> our own size/mtime bookkeeping is moot
  if (!in->file_caps()) {
    in->file_wr_mtime = 0;
    in->file_wr_size = 0;
  }

  messenger->send_message(m, m->get_source_inst(), m->get_source_port());
}
// Give up file caps on `in`, keeping only the bits in `retain`.  Every MDS
// we hold caps from is sent a message with the reduced caps and our full
// wanted set.  If no file caps remain afterwards, the locally tracked write
// size/mtime are reset.
void Client::release_caps(Inode *in,
                          int retain)
{
  dout(5) << "releasing caps on ino " << in->inode.ino << dec
          << " had " << cap_string(in->file_caps())
          << " retaining " << cap_string(retain)
          << " want " << cap_string(in->file_caps_wanted())
          << endl;

  map<int,InodeCap>::iterator p = in->caps.begin();
  while (p != in->caps.end()) {
    // note: every mds is told, whether or not its caps actually shrink
    //if (it->second.caps & ~retain) {

    // release (some of?) these caps
    p->second.caps = retain & p->second.caps;

    // note: tell mds _full_ wanted; it'll filter/behave based on what it is allowed to do
    MClientFileCaps *msg = new MClientFileCaps(in->inode,
                                               p->second.seq,
                                               p->second.caps,
                                               in->file_caps_wanted());
    messenger->send_message(msg, mdsmap->get_inst(p->first), MDS_PORT_LOCKER);

    p++;
  }

  if (!in->file_caps()) {
    in->file_wr_mtime = 0;
    in->file_wr_size = 0;
  }
}
// Tell every MDS we hold caps from what caps we currently want on `in`.
// The caps we hold are reported unchanged.
void Client::update_caps_wanted(Inode *in)
{
  dout(5) << "updating caps wanted on ino " << in->inode.ino
          << " to " << cap_string(in->file_caps_wanted())
          << endl;

  // FIXME: pick a single mds and let the others off the hook..
  map<int,InodeCap>::iterator p;
  for (p = in->caps.begin(); p != in->caps.end(); p++) {
    MClientFileCaps *msg = new MClientFileCaps(in->inode,
                                               p->second.seq,
                                               p->second.caps,
                                               in->file_caps_wanted());
    messenger->send_message(msg,
                            mdsmap->get_inst(p->first), MDS_PORT_LOCKER);
  }
}
// -------------------
// fs ops
// Mount the file system.  Blocks until done; always returns 0.
//
//  1. Send a boot message to a monitor and wait (on mount_cond) until
//     handle_mds_map() has installed an MDS map.
//  2. Send a mount request to mds 0 and wait until handle_mount_ack() has
//     set `mounted`.
//
// Asserts that the client is not already mounted.
int Client::mount()
{
  client_lock.Lock();

  assert(!mounted);  // caller is confused?

  // FIXME mds map update race with mount.

  dout(2) << "sending boot msg to monitor" << endl;

  // Throw away any old map.  The pointer must be cleared as well: the wait
  // loop below keys off it, and handle_mds_map() only allocates a new map
  // when it is null.  Leaving it dangling would skip the wait and use freed
  // memory.
  if (mdsmap) {
    delete mdsmap;
    mdsmap = 0;
  }

  int mon = monmap->pick_mon();
  messenger->send_message(new MClientBoot(),
                          monmap->get_inst(mon));

  while (!mdsmap)
    mount_cond.Wait(client_lock);

  dout(2) << "mounting" << endl;
  MClientMount *m = new MClientMount();

  int who = 0; // mdsmap->get_root();  // mount at root, for now
  messenger->send_message(m,
                          mdsmap->get_inst(who),
                          MDS_PORT_SERVER);

  while (!mounted)
    mount_cond.Wait(client_lock);

  client_lock.Unlock();

  /*
  dout(3) << "op: // client trace data structs" << endl;
  dout(3) << "op: struct stat st;" << endl;
  dout(3) << "op: struct utimbuf utim;" << endl;
  dout(3) << "op: int readlinkbuf_len = 1000;" << endl;
  dout(3) << "op: char readlinkbuf[readlinkbuf_len];" << endl;
  dout(3) << "op: map<string, inode_t*> dir_contents;" << endl;
  dout(3) << "op: map<fh_t, fh_t> open_files;" << endl;
  dout(3) << "op: fh_t fh;" << endl;
  */
  return 0;
}
// Handle the mount acknowledgement: decode the MDS map and OSD map it
// carries, mark the client mounted, and wake mount().  Frees `m`.
void Client::handle_mount_ack(MClientMountAck *m)
{
  // mdsmap!  (allocate one if none has arrived yet)
  if (mdsmap == 0)
    mdsmap = new MDSMap;
  mdsmap->decode(m->get_mds_map_state());

  // we got osdmap!
  osdmap->decode(m->get_osd_map_state());

  dout(2) << "mounted" << endl;
  mounted = true;
  mount_cond.Signal();

  delete m;
}
int Client::unmount()
{
client_lock.Lock();
assert(mounted); // caller is confused?
dout(2) << "unmounting" << endl;
unmounting = true;
// NOTE: i'm assuming all caches are already flushing (because all files are closed).
assert(fh_map.empty());
// empty lru cache
lru.lru_set_max(0);
trim_cache();
if (g_conf.client_oc) {
// release any/all caps
for (hash_map<inodeno_t, Inode*>::iterator p = inode_map.begin();
p != inode_map.end();
p++) {
Inode *in = p->second;
if (!in->caps.empty()) {
in->fc.release_clean();
if (in->fc.is_dirty()) {
dout(10) << "unmount residual caps on " << in->ino() << ", flushing" << endl;
in->fc.empty(new C_Client_CloseRelease(this, in));
} else {
dout(10) << "unmount residual caps on " << in->ino() << ", releasing" << endl;
release_caps(in);
}
}
}
}
while (lru.lru_get_size() > 0 ||
!inode_map.empty()) {
dout(2) << "cache still has " << lru.lru_get_size()
<< "+" << inode_map.size() << " items"
<< ", waiting (presumably for safe or for caps to be released?)"
<< endl;
dump_cache();
mount_cond.Wait(client_lock);
}
assert(lru.lru_get_size() == 0);
assert(inode_map.empty());
// unsafe writes
if (!g_conf.client_oc) {
while (unsafe_sync_write > 0) {
dout(0) << unsafe_sync_write << " unsafe_sync_writes, waiting"
<< endl;
mount_cond.Wait(client_lock);
}
}
// send unmount!
Message *req = new MGenericMessage(MSG_CLIENT_UNMOUNT);
messenger->send_message(req, mdsmap->get_inst(0), MDS_PORT_SERVER);
while (mounted)
mount_cond.Wait(client_lock);
dout(2) << "unmounted" << endl;
client_lock.Unlock();
return 0;
}
// Handle the unmount acknowledgement: clear `mounted` and wake unmount().
// Frees `m`.
void Client::handle_unmount_ack(Message* m)
{
  dout(1) << "got unmount ack" << endl;

  mounted = false;
  mount_cond.Signal();

  delete m;
}
// namespace ops
int Client::link(const char *existing, const char *newname)
{
client_lock.Lock();
dout(3) << "op: client->link(\"" << existing << "\", \"" << newname << "\");" << endl;
tout << "link" << endl;
tout << existing << endl;
tout << newname << endl;
// main path arg is new link name
// sarg is target (existing file)
MClientRequest *req = new MClientRequest(MDS_OP_LINK, whoami);
req->set_path(newname);
req->set_sarg(existing);
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
MClientReply *reply = make_request(req, true);
int res = reply->get_result();
insert_trace(reply);
delete reply;
dout(10) << "link result is " << res << endl;
trim_cache();
client_lock.Unlock();
return res;
}
// Remove the file at `relpath`.
// The path is made absolute, an MDS_OP_UNLINK request is sent with the
// caller's uid/gid, and on success the dentry is also dropped from the local
// cache.  The reply trace is inserted into the cache afterwards.
// Returns the MDS result code.
int Client::unlink(const char *relpath)
{
  client_lock.Lock();

  string abspath;
  mkabspath(relpath, abspath);
  const char *path = abspath.c_str();

  // (the literal previously contained the invalid escape "\("; the emitted
  // text is unchanged)
  dout(3) << "op: client->unlink(\"" << path << "\");" << endl;
  tout << "unlink" << endl;
  tout << path << endl;

  MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, whoami);
  req->set_path(path);

  // FIXME where does FUSE maintain user information
  req->set_caller_uid(getuid());
  req->set_caller_gid(getgid());

  //FIXME enforce caller uid rights?

  MClientReply *reply = make_request(req, true);
  int res = reply->get_result();
  if (res == 0) {
    // remove from local cache
    filepath fp(path);
    Dentry *dn = lookup(fp);
    if (dn) {
      assert(dn->inode);
      unlink(dn);
    }
  }
  insert_trace(reply);
  delete reply;
  dout(10) << "unlink result is " << res << endl;

  trim_cache();
  client_lock.Unlock();
  return res;
}
int Client::rename(const char *relfrom, const char *relto)
{
client_lock.Lock();
string absfrom;
mkabspath(relfrom, absfrom);
const char *from = absfrom.c_str();
string absto;
mkabspath(relto, absto);
const char *to = absto.c_str();
dout(3) << "op: client->rename(\"" << from << "\", \"" << to << "\");" << endl;
tout << "rename" << endl;
tout << from << endl;
tout << to << endl;
MClientRequest *req = new MClientRequest(MDS_OP_RENAME, whoami);
req->set_path(from);
req->set_sarg(to);
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
//FIXME enforce caller uid rights?
MClientReply *reply = make_request(req, true);
int res = reply->get_result();
insert_trace(reply);
delete reply;
dout(10) << "rename result is " << res << endl;
trim_cache();
client_lock.Unlock();
return res;
}
// dirs
int Client::mkdir(const char *relpath, mode_t mode)
{
client_lock.Lock();
string abspath;
mkabspath(relpath, abspath);
const char *path = abspath.c_str();
dout(3) << "op: client->mkdir(\"" << path << "\", " << mode << ");" << endl;
tout << "mkdir" << endl;
tout << path << endl;
tout << mode << endl;
MClientRequest *req = new MClientRequest(MDS_OP_MKDIR, whoami);
req->set_path(path);
req->set_iarg( (int)mode );
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
//FIXME enforce caller uid rights?
MClientReply *reply = make_request(req, true);
int res = reply->get_result();
insert_trace(reply);
delete reply;
dout(10) << "mkdir result is " << res << endl;
trim_cache();
client_lock.Unlock();
return res;
}
int Client::rmdir(const char *relpath)
{
client_lock.Lock();
string abspath;
mkabspath(relpath, abspath);
const char *path = abspath.c_str();
dout(3) << "op: client->rmdir(\"" << path << "\");" << endl;
tout << "rmdir" << endl;
tout << path << endl;
MClientRequest *req = new MClientRequest(MDS_OP_RMDIR, whoami);
req->set_path(path);
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
//FIXME enforce caller uid rights?
MClientReply *reply = make_request(req, true);
int res = reply->get_result();
if (res == 0) {
// remove from local cache
filepath fp(path);
Dentry *dn = lookup(fp);
if (dn) {
if (dn->inode->dir && dn->inode->dir->is_empty())
close_dir(dn->inode->dir); // FIXME: maybe i shoudl proactively hose the whole subtree from cache?
unlink(dn);
}
}
insert_trace(reply);
delete reply;
dout(10) << "rmdir result is " << res << endl;
trim_cache();
client_lock.Unlock();
return res;
}
// symlinks
int Client::symlink(const char *reltarget, const char *rellink)
{
client_lock.Lock();
string abstarget;
mkabspath(reltarget, abstarget);
const char *target = abstarget.c_str();
string abslink;
mkabspath(rellink, abslink);
const char *link = abslink.c_str();
dout(3) << "op: client->symlink(\"" << target << "\", \"" << link << "\");" << endl;
tout << "symlink" << endl;
tout << target << endl;
tout << link << endl;
MClientRequest *req = new MClientRequest(MDS_OP_SYMLINK, whoami);
req->set_path(link);
req->set_sarg(target);
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
//FIXME enforce caller uid rights?
MClientReply *reply = make_request(req, true);
int res = reply->get_result();
insert_trace(reply); //FIXME assuming trace of link, not of target
delete reply;
dout(10) << "symlink result is " << res << endl;
trim_cache();
client_lock.Unlock();
return res;
}
int Client::readlink(const char *relpath, char *buf, off_t size)
{
client_lock.Lock();
string abspath;
mkabspath(relpath, abspath);
const char *path = abspath.c_str();
dout(3) << "op: client->readlink(\"" << path << "\", readlinkbuf, readlinkbuf_len);" << endl;
tout << "readlink" << endl;
tout << path << endl;
client_lock.Unlock();
// stat first (FIXME, PERF access cache directly) ****
struct stat stbuf;
int r = this->lstat(path, &stbuf);
if (r != 0) return r;
client_lock.Lock();
// pull symlink content from cache
Inode *in = inode_map[stbuf.st_ino];
assert(in); // i just did a stat
// copy into buf (at most size bytes)
unsigned res = in->symlink->length();
if (res > size) res = size;
memcpy(buf, in->symlink->c_str(), res);
trim_cache();
client_lock.Unlock();
return res; // return length in bytes (to mimic the system call)
}
// inode stuff
// Stat `path` without following a final symlink, using the cache when it is
// fresh enough.
//
//  path - path to stat.
//  mask - inode mask forwarded to the MDS on a cache miss.
//  in   - out: the cached Inode on success, 0 on failure.
//
// A cached dentry is used when its inode is still within valid_until and its
// mask covers INODE_MASK_ALL_STAT.  Otherwise an MDS_OP_LSTAT request is
// issued and the reply trace is inserted into the cache.
// Returns 0 on success or the MDS result code.
//
// This function does not take client_lock itself.
int Client::_lstat(const char *path, int mask, Inode **in)
{
  filepath fpath(path);

  // check whether cache content is fresh enough
  Dentry *dn = lookup(fpath);
  time_t now = time(NULL);

  if (dn &&
      now <= dn->inode->valid_until &&
      ((dn->inode->inode.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT)) {
    dout(10) << "lstat cache hit w/ sufficient inode.mask, valid until " << dn->inode->valid_until << endl;

    if (g_conf.client_cache_stat_ttl == 0)
      dn->inode->valid_until = 0;  // only one stat allowed after each readdir

    *in = dn->inode;
    return 0;
  }

  // cache miss (or stale): ask the mds
  // FIXME where does FUSE maintain user information
  //struct fuse_context *fc = fuse_get_context();
  //req->set_caller_uid(fc->uid);
  //req->set_caller_gid(fc->gid);
  MClientRequest *req = new MClientRequest(MDS_OP_LSTAT, whoami);
  req->set_iarg(mask);
  req->set_path(fpath);

  MClientReply *reply = make_request(req);
  int res = reply->get_result();
  dout(10) << "lstat res is " << res << endl;

  if (res == 0)
    *in = insert_trace(reply);  // update metadata cache
  else
    *in = 0;                    // not a success.

  delete reply;
  return res;
}
// Populate a struct stat from an inode_t.  The whole struct is zeroed
// first, so any field not assigned below reads as 0.  Block count is
// computed in units of the 4096-byte block size that is also reported
// in st_blksize.
void Client::fill_stat(inode_t& inode, struct stat *st)
{
  const int block_bytes = 4096;

  memset(st, 0, sizeof(*st));

  // identity and ownership
  st->st_ino = inode.ino;
  st->st_mode = inode.mode;
  st->st_nlink = inode.nlink;
  st->st_uid = inode.uid;
  st->st_gid = inode.gid;

  // timestamps
  st->st_ctime = inode.ctime;
  st->st_atime = inode.atime;
  st->st_mtime = inode.mtime;

  // size, rounded up to whole blocks
  st->st_size = inode.size;
  if (inode.size)
    st->st_blocks = (inode.size - 1) / block_bytes + 1;
  else
    st->st_blocks = 0;
  st->st_blksize = block_bytes;
}
// Populate a struct statlite from an inode_t.
//
// The struct is zeroed first.  The size passed to memset is taken from the
// pointee (*st); the previous code used sizeof(struct stat), which is a
// different type and therefore cleared the wrong number of bytes.
//
// Note that zeroing the struct also clears st_litemask, and nothing below
// sets it again (the code that would is commented out).
void Client::fill_statlite(inode_t& inode, struct statlite *st)
{
  memset(st, 0, sizeof(*st));
  st->st_ino = inode.ino;
  st->st_mode = inode.mode;
  st->st_nlink = inode.nlink;
  st->st_uid = inode.uid;
  st->st_gid = inode.gid;
#ifndef DARWIN
  // FIXME what's going on here with darwin?
  st->st_ctime = inode.ctime;
  st->st_atime = inode.atime;
  st->st_mtime = inode.mtime;
#endif
  st->st_size = inode.size;
  // number of 4096-byte blocks, rounded up
  st->st_blocks = inode.size ? ((inode.size - 1) / 4096 + 1):0;
  st->st_blksize = 4096;

  /*
  S_REQUIREBLKSIZE(st->st_litemask);
  if (inode.mask & INODE_MASK_BASE) S_REQUIRECTIME(st->st_litemask);
  if (inode.mask & INODE_MASK_SIZE) {
  S_REQUIRESIZE(st->st_litemask);
  S_REQUIREBLOCKS(st->st_litemask);
  }
  if (inode.mask & INODE_MASK_MTIME) S_REQUIREMTIME(st->st_litemask);
  if (inode.mask & INODE_MASK_ATIME) S_REQUIREATIME(st->st_litemask);
  */
}
int Client::lstat(const char *relpath, struct stat *stbuf)
{
client_lock.Lock();
string abspath;
mkabspath(relpath, abspath);
const char *path = abspath.c_str();
dout(3) << "op: client->lstat(\"" << path << "\", &st);" << endl;
tout << "lstat" << endl;
tout << path << endl;
Inode *in = 0;
int res = _lstat(path, INODE_MASK_ALL_STAT, &in);
if (res == 0) {
assert(in);
fill_stat(in->inode,stbuf);
dout(10) << "stat sez size = " << in->inode.size << " ino = " << stbuf->st_ino << endl;
}
trim_cache();
client_lock.Unlock();
return res;
}
// "Lite" variant of lstat.  The caller sets stl->st_litemask on input to
// say which optional fields it wants; that is translated into the inode
// mask passed to _lstat().  On success (result 0) *stl is filled by
// fill_statlite(); otherwise it is left untouched.
// Returns the result of _lstat().
int Client::lstatlite(const char *relpath, struct statlite *stl)
{
  client_lock.Lock();

  string abspath;
  mkabspath(relpath, abspath);
  const char *path = abspath.c_str();

  dout(3) << "op: client->lstatlite(\"" << path << "\", &st);" << endl;
  tout << "lstatlite" << endl;
  tout << path << endl;

  // make mask
  int mask = INODE_MASK_BASE | INODE_MASK_PERM;
  if (S_ISVALIDSIZE(stl->st_litemask) ||
      S_ISVALIDBLOCKS(stl->st_litemask)) mask |= INODE_MASK_SIZE;
  if (S_ISVALIDMTIME(stl->st_litemask)) mask |= INODE_MASK_MTIME;
  if (S_ISVALIDATIME(stl->st_litemask)) mask |= INODE_MASK_ATIME;

  Inode *in = 0;
  int res = _lstat(path, mask, &in);
  if (res == 0) {
    // same sanity check lstat() performs before dereferencing the inode
    assert(in);
    fill_statlite(in->inode,stl);
    dout(10) << "stat sez size = " << in->inode.size << " ino = " << in->inode.ino << endl;
  }

  trim_cache();
  client_lock.Unlock();
  return res;
}
int Client::chmod(const char *relpath, mode_t mode)
{
client_lock.Lock();
string abspath;
mkabspath(relpath, abspath);
const char *path = abspath.c_str();
dout(3) << "op: client->chmod(\"" << path << "\", " << mode << ");" << endl;
tout << "chmod" << endl;
tout << path << endl;
tout << mode << endl;
MClientRequest *req = new MClientRequest(MDS_OP_CHMOD, whoami);
req->set_path(path);
req->set_iarg( (int)mode );
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
MClientReply *reply = make_request(req, true);
int res = reply->get_result();
insert_trace(reply);
delete reply;
dout(10) << "chmod result is " << res << endl;
trim_cache();
client_lock.Unlock();
return res;
}
int Client::chown(const char *relpath, uid_t uid, gid_t gid)
{
client_lock.Lock();
string abspath;
mkabspath(relpath, abspath);
const char *path = abspath.c_str();
dout(3) << "op: client->chown(\"" << path << "\", " << uid << ", " << gid << ");" << endl;
tout << "chown" << endl;
tout << path << endl;
tout << uid << endl;
tout << gid << endl;
MClientRequest *req = new MClientRequest(MDS_OP_CHOWN, whoami);
req->set_path(path);
req->set_iarg( (int)uid );
req->set_iarg2( (int)gid );
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
//FIXME enforce caller uid rights?
MClientReply *reply = make_request(req, true);
int res = reply->get_result();
insert_trace(reply);
delete reply;
dout(10) << "chown result is " << res << endl;
trim_cache();
client_lock.Unlock();
return res;
}
int Client::utime(const char *relpath, struct utimbuf *buf)
{
client_lock.Lock();
string abspath;
mkabspath(relpath, abspath);
const char *path = abspath.c_str();
dout(3) << "op: utim.actime = " << buf->actime << "; utim.modtime = " << buf->modtime << ";" << endl;
dout(3) << "op: client->utime(\"" << path << "\", &utim);" << endl;
tout << "utime" << endl;
tout << path << endl;
tout << buf->actime << endl;
tout << buf->modtime << endl;
MClientRequest *req = new MClientRequest(MDS_OP_UTIME, whoami);
req->set_path(path);
req->set_targ( buf->modtime );
req->set_targ2( buf->actime );
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
//FIXME enforce caller uid rights?
MClientReply *reply = make_request(req, true);
int res = reply->get_result();
insert_trace(reply);
delete reply;
dout(10) << "utime result is " << res << endl;
trim_cache();
client_lock.Unlock();
return res;
}
int Client::mknod(const char *relpath, mode_t mode)
{
client_lock.Lock();
string abspath;
mkabspath(relpath, abspath);
const char *path = abspath.c_str();
dout(3) << "op: client->mknod(\"" << path << "\", " << mode << ");" << endl;
tout << "mknod" << endl;
tout << path << endl;
tout << mode << endl;
MClientRequest *req = new MClientRequest(MDS_OP_MKNOD, whoami);
req->set_path(path);
req->set_iarg( mode );
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
//FIXME enforce caller uid rights?
MClientReply *reply = make_request(req, true);
int res = reply->get_result();
insert_trace(reply);
dout(10) << "mknod result is " << res << endl;
delete reply;
trim_cache();
client_lock.Unlock();
return res;
}
// readdir usually includes inode info for each entry, except for locked entries
//
// getdir
// fyi: typedef int (*dirfillerfunc_t) (void *handle, const char *name, int type, inodeno_t ino);
int Client::getdir(const char *relpath, map<string,inode_t>& contents)
{
client_lock.Lock();
string abspath;
mkabspath(relpath, abspath);
const char *path = abspath.c_str();
dout(3) << "op: client->getdir(\"" << path << "\", dir_contents);" << endl;
tout << "getdir" << endl;
tout << path << endl;
MClientRequest *req = new MClientRequest(MDS_OP_READDIR, whoami);
req->set_path(path);
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
//FIXME enforce caller uid rights?
MClientReply *reply = make_request(req, true);
int res = reply->get_result();
insert_trace(reply);
if (res == 0) {
// dir contents to cache!
inodeno_t ino = reply->get_ino();
Inode *diri = inode_map[ ino ];
assert(diri);
assert(diri->inode.mode & INODE_MODE_DIR);
if (!reply->get_dir_in().empty()) {
// only open dir if we're actually adding stuff to it!
Dir *dir = diri->open_dir();
assert(dir);
time_t now = time(NULL);
list<string>::const_iterator pdn = reply->get_dir_dn().begin();
for (list<InodeStat*>::const_iterator pin = reply->get_dir_in().begin();
pin != reply->get_dir_in().end();
++pin, ++pdn) {
// count entries
res++;
// put in cache
Inode *in = this->insert_inode(dir, *pin, *pdn);
if (g_conf.client_cache_stat_ttl)
in->valid_until = now + g_conf.client_cache_stat_ttl;
else if (g_conf.client_cache_readdir_ttl)
in->valid_until = now + g_conf.client_cache_readdir_ttl;
// contents to caller too!
contents[*pdn] = in->inode;
}
}
// add .. too?
if (diri != root && diri->dn && diri->dn->dir) {
Inode *parent = diri->dn->dir->parent_inode;
contents[".."] = parent->inode;
}
// FIXME: remove items in cache that weren't in my readdir?
// ***
}
delete reply; //fix thing above first
client_lock.Unlock();
return res;
}
/** POSIX stubs **/
// Open a directory stream on 'name'.
//
// The whole directory is fetched up front with getdir(); the returned DIR*
// is really a heap-allocated DirResult that must be released with
// closedir().  The value returned by getdir() (entry count, or an error
// code) is stored in d->size; a stream is returned even when getdir()
// failed, in which case it simply has whatever getdir() left in contents.
//
// The directory name is recorded in d->path because readdirplus() builds
// the path it stats from d->path; previously nothing ever assigned it.
// NOTE(review): this assumes DirResult::path is a std::string (it is copied
// into a string in readdirplus()) -- confirm against the declaration.
DIR *Client::opendir(const char *name)
{
  DirResult *d = new DirResult;
  d->path = name;
  d->size = getdir(name, d->contents);
  d->p = d->contents.begin();
  d->off = 0;
  return (DIR*)d;
}
int Client::closedir(DIR *dir)
{
DirResult *d = (DirResult*)dir;
delete d;
return 0;
}
//struct dirent {
// ino_t d_ino; /* inode number */
// off_t d_off; /* offset to the next dirent */
// unsigned short d_reclen; /* length of this record */
// unsigned char d_type; /* type of file */
// char d_name[256]; /* filename */
//};
// Return the next entry of the directory stream, or 0 at end of directory.
//
// The returned pointer refers to storage inside the DirResult and is
// overwritten by the next readdir()/readdirplus() call on the same stream.
// Names that do not fit in d_name are truncated; d_name is always
// NUL-terminated.
struct dirent *Client::readdir(DIR *dirp)
{
  DirResult *d = (DirResult*)dirp;

  // end of dir?
  if (d->p == d->contents.end())
    return 0;

  // fill the dirent
  d->dp.d_dirent.d_ino = d->p->second.ino;
#ifndef __CYGWIN__
#ifndef DARWIN
  if (d->p->second.is_symlink())
    d->dp.d_dirent.d_type = DT_LNK;
  else if (d->p->second.is_dir())
    d->dp.d_dirent.d_type = DT_DIR;
  else if (d->p->second.is_file())
    d->dp.d_dirent.d_type = DT_REG;
  else
    d->dp.d_dirent.d_type = DT_UNKNOWN;

  d->dp.d_dirent.d_off = d->off;
  d->dp.d_dirent.d_reclen = 1;  // all records are length 1 (wrt offset, seekdir, telldir, etc.)
#endif // DARWIN
#endif

  // strncpy() does not terminate the destination when the source fills it,
  // so copy at most capacity-1 bytes and terminate explicitly.  The capacity
  // comes from the array itself rather than a hard-coded 256.
  const size_t name_cap = sizeof(d->dp.d_dirent.d_name);
  strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), name_cap - 1);
  d->dp.d_dirent.d_name[name_cap - 1] = '\0';

  // move up
  ++d->off;
  ++d->p;

  return &d->dp.d_dirent;
}
void Client::rewinddir(DIR *dirp)
{
DirResult *d = (DirResult*)dirp;
d->p = d->contents.begin();
d->off = 0;
}
off_t Client::telldir(DIR *dirp)
{
DirResult *d = (DirResult*)dirp;
return d->off;
}
// Position a directory stream at 'offset' entries from the start, where
// 'offset' is a value previously returned by telldir().
//
// The stream is rewound and advanced one entry at a time, stopping early if
// the end of the cached contents is reached.  An offset at or beyond the
// number of entries therefore leaves the stream at end-of-directory, so
// seekdir(telldir()) restores the position even after the last entry has
// been read.  (The previous version clamped to size-1, which made the next
// readdir() return the last entry again, and advanced the iterator without
// checking for the end.)  Offsets <= 0 rewind to the first entry.
void Client::seekdir(DIR *dirp, off_t offset)
{
  DirResult *d = (DirResult*)dirp;

  d->p = d->contents.begin();
  d->off = 0;

  while (offset > 0 && d->p != d->contents.end()) {
    ++d->p;
    ++d->off;
    --offset;
  }
}
// Like readdir(), but also returns stat information for the entry.
//
// Returns 0 at end of directory.  Otherwise returns a pointer to storage
// inside the DirResult (overwritten by the next call on the same stream)
// holding the dirent, a struct stat, and d_stat_err: 0 when the stat data
// came from the cached readdir result, or the return value of lstat() when
// it had to be fetched separately.
// Names that do not fit in d_name are truncated; d_name is always
// NUL-terminated.
struct dirent_plus *Client::readdirplus(DIR *dirp)
{
  DirResult *d = (DirResult*)dirp;

  // end of dir?
  if (d->p == d->contents.end())
    return 0;

  // fill the dirent
  d->dp.d_dirent.d_ino = d->p->second.ino;
#ifndef __CYGWIN__
#ifndef DARWIN
  if (d->p->second.is_symlink())
    d->dp.d_dirent.d_type = DT_LNK;
  else if (d->p->second.is_dir())
    d->dp.d_dirent.d_type = DT_DIR;
  else if (d->p->second.is_file())
    d->dp.d_dirent.d_type = DT_REG;
  else
    d->dp.d_dirent.d_type = DT_UNKNOWN;

  d->dp.d_dirent.d_off = d->off;
  d->dp.d_dirent.d_reclen = 1;  // all records are length 1 (wrt offset, seekdir, telldir, etc.)
#endif // DARWIN
#endif

  // strncpy() does not terminate the destination when the source fills it,
  // so copy at most capacity-1 bytes and terminate explicitly.
  const size_t name_cap = sizeof(d->dp.d_dirent.d_name);
  strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), name_cap - 1);
  d->dp.d_dirent.d_name[name_cap - 1] = '\0';

  // plus
  if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) {
    // have it
    fill_stat(d->p->second, &d->dp.d_stat);
    d->dp.d_stat_err = 0;
  } else {
    // don't have it, stat it.  the path is <stream path>/<entry name>.
    string path = d->path;
    path += "/";
    path += d->p->first;
    d->dp.d_stat_err = lstat(path.c_str(), &d->dp.d_stat);
  }

  // move up
  ++d->off;
  ++d->p;

  return &d->dp;
}
/*
struct dirent_lite *Client::readdirlite(DIR *dirp)
{
DirResult *d = (DirResult*)dirp;
// end of dir?
if (d->p == d->contents.end())
return 0;
// fill the dirent
d->dp.d_dirent.d_ino = d->p->second.ino;
if (d->p->second.is_symlink())
d->dp.d_dirent.d_type = DT_LNK;
else if (d->p->second.is_dir())
d->dp.d_dirent.d_type = DT_DIR;
else if (d->p->second.is_file())
d->dp.d_dirent.d_type = DT_REG;
else
d->dp.d_dirent.d_type = DT_UNKNOWN;
strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256);
d->dp.d_dirent.d_off = d->off;
d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.)
// plus
if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) {
// have it
fill_statlite(d->p->second,d->dp.d_stat);
d->dp.d_stat_err = 0;
} else {
// don't have it, stat it
string path = p->path;
path += "/";
path += p->first;
d->dp.d_statlite
d->dp.d_stat_err = lstatlite(path.c_str(), &d->dp.d_statlite);
}
// move up
++d->off;
++d->p;
return &d->dp;
}
*/
/****** file i/o **********/
// Open the file named by relpath (resolved with mkabspath()) using
// open(2)-style flags.
//
// An MDS_OP_OPEN request is sent to the MDS.  On a non-negative result a new
// Fh is created, the inode's open counters and per-mds capability record are
// updated, the Fh is registered in fh_map, and the new file handle is
// returned.  On a negative result that value is returned unchanged and no
// handle is created.
int Client::open(const char *relpath, int flags)
{
client_lock.Lock();
string abspath;
mkabspath(relpath, abspath);
const char *path = abspath.c_str();
dout(3) << "op: fh = client->open(\"" << path << "\", " << flags << ");" << endl;
tout << "open" << endl;
tout << path << endl;
tout << flags << endl;
// Translate flags into a file mode.  The tests are ordered: O_LAZY wins,
// then O_WRONLY, O_RDWR, O_APPEND; anything else is treated as read-only.
// The three write-ish cases also set tryauth, which is handed to
// make_request() below.
int cmode = 0;
bool tryauth = false;
if (flags & O_LAZY)
cmode = FILE_MODE_LAZY;
else if (flags & O_WRONLY) {
cmode = FILE_MODE_W;
tryauth = true;
} else if (flags & O_RDWR) {
cmode = FILE_MODE_RW;
tryauth = true;
} else if (flags & O_APPEND) {
cmode = FILE_MODE_W;
tryauth = true;
} else
cmode = FILE_MODE_R;
// go
MClientRequest *req = new MClientRequest(MDS_OP_OPEN, whoami);
req->set_path(path);
req->set_iarg(flags);
req->set_iarg2(cmode);
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
MClientReply *reply = make_request(req, tryauth); // try auth if writer
assert(reply);
// note: what is logged and traced here is the result carried by the reply,
// not the local fh allocated further down.
dout(3) << "op: open_files[" << reply->get_result() << "] = fh; // fh = " << reply->get_result() << endl;
tout << reply->get_result() << endl;
insert_trace(reply);
int result = reply->get_result();
// success?
fh_t fh = 0;
if (result >= 0) {
// yay
Fh *f = new Fh;
f->mode = cmode;
// inode
f->inode = inode_map[reply->get_ino()];
assert(f->inode);
// the Fh holds a reference on the inode; close() drops it with put_inode()
f->inode->get();
// per-mode open counters (close() decrements the rd and wr counters)
if (cmode & FILE_MODE_R) f->inode->num_open_rd++;
if (cmode & FILE_MODE_W) f->inode->num_open_wr++;
if (cmode & FILE_MODE_LAZY) f->inode->num_open_lazy++;
// caps included?
int mds = reply->get_source().num();
// take one extra inode reference when this is the first capability
// recorded for the inode (checked before caps[mds] is touched below)
if (f->inode->caps.empty()) {// first caps?
dout(7) << " first caps on " << f->inode->inode.ino << endl;
f->inode->get();
}
int new_caps = reply->get_file_caps();
// the reply's cap sequence number must never be older than what we hold;
// only a strictly newer sequence replaces the stored caps
assert(reply->get_file_caps_seq() >= f->inode->caps[mds].seq);
if (reply->get_file_caps_seq() > f->inode->caps[mds].seq) {
dout(7) << "open got caps " << cap_string(new_caps)
<< " for " << f->inode->ino()
<< " seq " << reply->get_file_caps_seq()
<< " from mds" << mds << endl;
int old_caps = f->inode->caps[mds].caps;
f->inode->caps[mds].caps = new_caps;
f->inode->caps[mds].seq = reply->get_file_caps_seq();
// we shouldn't ever lose caps at this point.
// actually, we might...?
assert((old_caps & ~f->inode->caps[mds].caps) == 0);
// with the object cache enabled, the inode's file cache is told the new caps
if (g_conf.client_oc)
f->inode->fc.set_caps(new_caps);
} else {
dout(7) << "open got SAME caps " << cap_string(new_caps)
<< " for " << f->inode->ino()
<< " seq " << reply->get_file_caps_seq()
<< " from mds" << mds << endl;
}
// put in map
// the value returned to the caller becomes the locally allocated handle
result = fh = get_fh();
assert(fh_map.count(fh) == 0);
fh_map[fh] = f;
dout(3) << "open success, fh is " << fh << " combined caps " << cap_string(f->inode->file_caps()) << endl;
} else {
dout(0) << "open failure result " << result << endl;
}
delete reply;
trim_cache();
client_lock.Unlock();
return result;
}
// Release whatever file capabilities are no longer needed on 'in'.
//
// Clean cached data is dropped first when there are no readers.  Write caps
// are retained while there are open writers or dirty data; read caps are
// retained while there are open readers or cached data.  Everything else is
// handed to release_caps().
void Client::close_release(Inode *in)
{
  dout(10) << "close_release on " << in->ino() << endl;
  dout(10) << " wr " << in->num_open_wr << " rd " << in->num_open_rd
	   << " dirty " << in->fc.is_dirty() << " cached " << in->fc.is_cached() << endl;

  if (!in->num_open_rd) {
    in->fc.release_clean();
  }

  const bool keep_write = in->num_open_wr || in->fc.is_dirty();
  const bool keep_read = in->num_open_rd || in->fc.is_cached();

  int retain = 0;
  if (keep_write) {
    retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER | CAP_FILE_WREXTEND;
  }
  if (keep_read) {
    retain |= CAP_FILE_RD | CAP_FILE_RDCACHE;
  }

  release_caps(in, retain);  // release caps now.
}
// Drop the inode reference that was being held until the inode's data was
// safe, and wake up a pending unmount if one is in progress.
void Client::close_safe(Inode *in)
{
  dout(10) << "close_safe on " << in->ino() << endl;
  put_inode(in);

  if (!unmounting)
    return;
  mount_cond.Signal();
}
// Close file handle 'fh'.
//
// Undoes the bookkeeping done by open(): the per-mode open counters on the
// inode are decremented, the Fh is removed from fh_map and deleted, caps are
// released or flushed depending on what is still open and whether the object
// cache is enabled, and the Fh's inode reference is dropped.
// Always returns 0.  'fh' must be a currently open handle (asserted).
int Client::close(fh_t fh)
{
  client_lock.Lock();
  dout(3) << "op: client->close(open_files[ " << fh << " ]);" << endl;
  dout(3) << "op: open_files.erase( " << fh << " );" << endl;
  tout << "close" << endl;
  tout << fh << endl;

  // get Fh, Inode
  assert(fh_map.count(fh));
  Fh *f = fh_map[fh];
  Inode *in = f->inode;

  // update inode rd/wr/lazy counts
  int before = in->file_caps_wanted();
  if (f->mode & FILE_MODE_R)
    in->num_open_rd--;
  if (f->mode & FILE_MODE_W)
    in->num_open_wr--;
  // open() bumps num_open_lazy under exactly this condition; mirror it here
  // so the counter does not grow forever.
  // NOTE(review): no other decrement of num_open_lazy is visible in this
  // part of the file -- confirm nothing else adjusts it.
  if (f->mode & FILE_MODE_LAZY)
    in->num_open_lazy--;
  int after = in->file_caps_wanted();

  // does this change what caps we want?
  if (before != after && after)
    update_caps_wanted(in);

  // hose fh
  fh_map.erase(fh);
  delete f;

  // release caps right away?
  dout(10) << "num_open_rd " << in->num_open_rd << "  num_open_wr " << in->num_open_wr << endl;

  if (g_conf.client_oc) {
    // caching on.
    if (in->num_open_rd == 0 && in->num_open_wr == 0) {
      // nobody has it open: empty the cache, then release caps
      in->fc.empty(new C_Client_CloseRelease(this, in));
    }
    else if (in->num_open_rd == 0) {
      // writers only: clean data is of no use
      in->fc.release_clean();
      close_release(in);
    }
    else if (in->num_open_wr == 0) {
      // readers only: flush dirty data, then release caps
      in->fc.flush_dirty(new C_Client_CloseRelease(this,in));
    }

    // pin until safe?
    if (in->num_open_wr == 0 && !in->fc.all_safe()) {
      dout(10) << "pinning ino " << in->ino() << " until safe" << endl;
      in->get();  // dropped again by close_safe()
      in->fc.add_safe_waiter(new C_Client_CloseSafe(this, in));
    }
  } else {
    // caching off.
    if (in->num_open_rd == 0 && in->num_open_wr == 0) {
      dout(10) << "  releasing caps on " << in->ino() << endl;
      release_caps(in);  // release caps now.
    }
  }

  // drop the reference the Fh held
  put_inode( in );
  int result = 0;

  client_lock.Unlock();
  return result;
}
// ------------
// read, write
// blocking osd interface
// Read up to 'size' bytes at 'offset' from open file handle 'fh' into 'buf'.
//
// Blocks (releasing client_lock while waiting) until the read capability --
// or, for a handle opened in lazy mode, the lazy-io capability -- is held.
// When buffering/caching caps are held the request is clipped to the file
// size known locally; otherwise bounds are left to the OSDs.
//
// Returns the value produced by the read path (the file cache's return
// value, or the value delivered to the completion callback), or 0 when the
// clipped request is empty.  The handle's position is advanced past the
// data actually returned.
//
// The clip against the file size is done in off_t.  The previous code cast
// the inode size to 'unsigned', which truncates it wherever unsigned is
// narrower than the size type; for a large enough file that could turn
// 'size' negative.
int Client::read(fh_t fh, char *buf, off_t size, off_t offset)
{
  client_lock.Lock();

  dout(3) << "op: client->read(" << fh << ", buf, " << size << ", " << offset << "); // that's " << offset << "~" << size << endl;
  tout << "read" << endl;
  tout << fh << endl;
  tout << size << endl;
  tout << offset << endl;

  assert(offset >= 0);
  assert(fh_map.count(fh));
  Fh *f = fh_map[fh];
  Inode *in = f->inode;

  // a negative offset would mean "use the handle's position"; with the
  // assert above compiled in this cannot be reached.
  if (offset < 0)
    offset = f->pos;

  bool lazy = f->mode == FILE_MODE_LAZY;

  // do we have read file cap?
  while (!lazy && (in->file_caps() & CAP_FILE_RD) == 0) {
    dout(7) << " don't have read cap, waiting" << endl;
    Cond cond;
    in->waitfor_read.push_back(&cond);
    cond.Wait(client_lock);
  }
  // lazy cap?
  while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) {
    dout(7) << " don't have lazy cap, waiting" << endl;
    Cond cond;
    in->waitfor_lazy.push_back(&cond);
    cond.Wait(client_lock);
  }

  // determine whether read range overlaps with file
  // ...ONLY if we're doing async io
  if (!lazy && (in->file_caps() & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE))) {
    // we're doing buffered i/o.  make sure we're inside the file.
    // we can trust size info bc we get accurate info when buffering/caching caps are issued.
    dout(10) << "file size: " << in->inode.size << endl;

    const off_t file_size = static_cast<off_t>(in->inode.size);

    if (offset > 0 && offset >= file_size) {
      client_lock.Unlock();
      return 0;
    }
    if (offset + size > file_size)
      size = file_size - offset;

    if (size == 0) {
      dout(10) << "read is size=0, returning 0" << endl;
      client_lock.Unlock();
      return 0;
    }
  } else {
    // unbuffered, synchronous file i/o.
    // or lazy.
    // defer to OSDs for file bounds.
  }

  bufferlist blist;   // data will go here
  int rvalue = 0;
  int r = 0;

  if (g_conf.client_oc) {
    // object cache ON
    rvalue = r = in->fc.read(offset, size, blist, client_lock);  // may block.
  } else {
    // object cache OFF -- legacy inconsistent way.
    Cond cond;
    bool done = false;
    // the callback sets 'done', stores its result in rvalue, and signals cond
    C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue);

    r = filer->read(in->inode, offset, size, &blist, onfinish);
    assert(r >= 0);

    // wait!
    while (!done)
      cond.Wait(client_lock);
  }

  // adjust fd pos
  f->pos = offset+blist.length();

  // copy data into caller's char* buf
  blist.copy(0, blist.length(), buf);

  //dout(10) << "i read '" << blist.c_str() << "'" << endl;
  dout(10) << "read rvalue " << rvalue << ", r " << r << endl;

  // done!
  client_lock.Unlock();
  return rvalue;
}
/*
* hack --
* until we properly implement synchronous writes wrt buffer cache,
* make sure we delay shutdown until they're all safe on disk!
*/
class C_Client_HackUnsafe : public Context {
Client *cl;
public:
C_Client_HackUnsafe(Client *c) : cl(c) {}
void finish(int) {
cl->hack_sync_write_safe();
}
};
// Account for one synchronous write having become safe.  Takes client_lock
// itself.  When the last outstanding unsafe write completes while an
// unmount is in progress, the unmounting thread is signalled.
void Client::hack_sync_write_safe()
{
  client_lock.Lock();

  assert(unsafe_sync_write > 0);
  --unsafe_sync_write;

  const bool none_left = (unsafe_sync_write == 0);
  if (none_left && unmounting) {
    dout(10) << "hack_sync_write_safe -- no more unsafe writes, unmount can proceed" << endl;
    mount_cond.Signal();
  }

  client_lock.Unlock();
}
// Write 'size' bytes from 'buf' at 'offset' through open file handle 'fh'.
//
// Blocks (releasing client_lock while waiting) until the write capability --
// or, for a handle opened in lazy mode, the lazy-io capability -- is held.
// The data is copied into a fresh buffer and then either handed to the
// inode's file cache (object cache on) or written synchronously through the
// filer (object cache off).  Afterwards the locally known file size and
// mtime are updated.
//
// Returns 'size': the code assumes the whole write succeeded (see the FIXME
// near the end).
int Client::write(fh_t fh, const char *buf, off_t size, off_t offset)
{
client_lock.Lock();
//dout(7) << "write fh " << fh << " size " << size << " offset " << offset << endl;
dout(3) << "op: client->write(" << fh << ", buf, " << size << ", " << offset << ");" << endl;
tout << "write" << endl;
tout << fh << endl;
tout << size << endl;
tout << offset << endl;
assert(offset >= 0);
assert(fh_map.count(fh));
Fh *f = fh_map[fh];
Inode *in = f->inode;
// a negative offset would mean "use the handle's position"; with the
// assert above compiled in this branch cannot be taken.
if (offset < 0)
offset = f->pos;
bool lazy = f->mode == FILE_MODE_LAZY;
dout(10) << "cur file size is " << in->inode.size << " wr size " << in->file_wr_size << endl;
// do we have write file cap?
while (!lazy && (in->file_caps() & CAP_FILE_WR) == 0) {
dout(7) << " don't have write cap, waiting" << endl;
Cond cond;
in->waitfor_write.push_back(&cond);
cond.Wait(client_lock);
}
// lazy handles wait for the lazy-io cap instead
while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) {
dout(7) << " don't have lazy cap, waiting" << endl;
Cond cond;
in->waitfor_lazy.push_back(&cond);
cond.Wait(client_lock);
}
// adjust fd pos
// (done before the write is issued, using the requested size)
f->pos = offset+size;
// time it.
utime_t start = g_clock.now();
// copy into fresh buffer (since our write may be resub, async)
bufferptr bp = buffer::copy(buf, size);
bufferlist blist;
blist.push_back( bp );
if (g_conf.client_oc) { // buffer cache ON?
assert(objectcacher);
// write (this may block!)
in->fc.write(offset, size, blist, client_lock);
} else {
// legacy, inconsistent synchronous write.
dout(7) << "synchronous write" << endl;
// prepare write
// onfinish sets 'done' and wakes us; onsafe calls hack_sync_write_safe(),
// which decrements the unsafe_sync_write counter incremented just below.
Cond cond;
bool done = false;
C_Cond *onfinish = new C_Cond(&cond, &done);
C_Client_HackUnsafe *onsafe = new C_Client_HackUnsafe(this);
unsafe_sync_write++;
in->sync_writes++;
dout(20) << " sync write start " << onfinish << endl;
filer->write(in->inode, offset, size, blist, 0,
onfinish, onsafe
//, 1+((int)g_clock.now()) / 10 //f->pos // hack hack test osd revision snapshots
);
// wait for the onfinish callback; the "safe" callback is not waited for here
while (!done) {
cond.Wait(client_lock);
dout(20) << " sync write bump " << onfinish << endl;
}
in->sync_writes--;
// when the last in-flight sync write on this inode finishes, run everyone
// waiting for "no writes" and empty the waiter list.
// NOTE(review): the contexts are finish()ed but not deleted here -- confirm
// who owns them.
if (in->sync_writes == 0 &&
!in->waitfor_no_write.empty()) {
for (list<Context*>::iterator i = in->waitfor_no_write.begin();
i != in->waitfor_no_write.end();
i++)
(*i)->finish(0);
in->waitfor_no_write.clear();
}
dout(20) << " sync write done " << onfinish << endl;
}
// time
// accumulate write latency in the client logger, if one is configured
utime_t lat = g_clock.now();
lat -= start;
if (client_logger) {
client_logger->finc("wrlsum",(double)lat);
client_logger->inc("wrlnum");
}
// assume success for now. FIXME.
off_t totalwritten = size;
// extend file?
if (totalwritten + offset > in->inode.size) {
in->inode.size = in->file_wr_size = totalwritten + offset;
dout(7) << "wrote to " << totalwritten+offset << ", extending file size" << endl;
} else {
dout(7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->inode.size << endl;
}
// mtime
in->file_wr_mtime = in->inode.mtime = g_clock.gettime();
// ok!
client_lock.Unlock();
return totalwritten;
}
int Client::truncate(const char *file, off_t size)
{
client_lock.Lock();
dout(3) << "op: client->truncate(\"" << file << "\", " << size << ");" << endl;
tout << "truncate" << endl;
tout << file << endl;
tout << size << endl;
MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, whoami);
req->set_path(file);
req->set_sizearg( size );
// FIXME where does FUSE maintain user information
req->set_caller_uid(getuid());
req->set_caller_gid(getgid());
MClientReply *reply = make_request(req, true);
int res = reply->get_result();
insert_trace(reply);
delete reply;
dout(10) << " truncate result is " << res << endl;
client_lock.Unlock();
return res;
}
// Flush data for open file handle 'fh'.
//
// Metadata sync is not implemented: when 'syncdataonly' is false a message
// is logged and nothing else happens for metadata.  For data, the object
// cacher is asked to commit the inode's object set; if it reports false the
// call blocks until the completion callback fires.  Always returns 0.
int Client::fsync(fh_t fh, bool syncdataonly)
{
  client_lock.Lock();

  dout(3) << "op: client->fsync(open_files[ " << fh << " ], " << syncdataonly << ");" << endl;
  tout << "fsync" << endl;
  tout << fh << endl;
  tout << syncdataonly << endl;

  int r = 0;

  assert(fh_map.count(fh));
  Fh *handle = fh_map[fh];
  Inode *in = handle->inode;

  dout(3) << "fsync fh " << fh << " ino " << in->inode.ino << " syncdataonly " << syncdataonly << endl;

  // metadata?
  if (!syncdataonly) {
    dout(0) << "fsync - not syncing metadata yet.. implement me" << endl;
  }

  // data?
  Cond cond;
  bool done = false;
  const bool skip_wait = objectcacher->commit_set(in->ino(),
						  new C_Cond(&cond, &done));
  if (!skip_wait) {
    // wait for callback
    while (!done) {
      cond.Wait(client_lock);
    }
  }

  client_lock.Unlock();
  return r;
}
// not written yet, but i want to link!
int Client::chdir(const char *path)
{
// fake it for now!
string abs;
mkabspath(path, abs);
dout(3) << "chdir " << path << " -> cwd now " << abs << endl;
cwd = abs;
return 0;
}
// Report filesystem statistics.  The numbers are fixed placeholders: the
// 'path' argument is not looked at and nothing is asked of the cluster.
// Always returns 0.
int Client::statfs(const char *path, struct statvfs *stbuf)
{
  const int unit = 1024;
  const int lots = 1024 * 1024;

  memset(stbuf, 0, sizeof (struct statvfs));

  // FIXME
  // block sizes and name length limit
  stbuf->f_bsize = unit;
  stbuf->f_frsize = unit;
  stbuf->f_namemax = unit;

  // block counts
  stbuf->f_blocks = lots;
  stbuf->f_bfree = lots;
  stbuf->f_bavail = lots;

  // inode counts
  stbuf->f_files = lots;
  stbuf->f_ffree = lots;
  stbuf->f_favail = lots;

  return 0;
}
int Client::lazyio_propogate(int fd, off_t offset, size_t count)
{
client_lock.Lock();
dout(3) << "op: client->lazyio_propogate(" << fd
<< ", " << offset << ", " << count << ")" << endl;
assert(fh_map.count(fd));
Fh *f = fh_map[fd];
Inode *in = f->inode;
if (f->mode & FILE_MODE_LAZY) {
// wait for lazy cap
while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) {
dout(7) << " don't have lazy cap, waiting" << endl;
Cond cond;
in->waitfor_lazy.push_back(&cond);
cond.Wait(client_lock);
}
if (g_conf.client_oc) {
Cond cond;
bool done = false;
in->fc.flush_dirty(new C_SafeCond(&client_lock, &cond, &done));
while (!done)
cond.Wait(client_lock);
} else {
// mmm, nothin to do.
}
}
client_lock.Unlock();
return 0;
}
int Client::lazyio_synchronize(int fd, off_t offset, size_t count)
{
client_lock.Lock();
dout(3) << "op: client->lazyio_synchronize(" << fd
<< ", " << offset << ", " << count << ")" << endl;
assert(fh_map.count(fd));
Fh *f = fh_map[fd];
Inode *in = f->inode;
if (f->mode & FILE_MODE_LAZY) {
// wait for lazy cap
while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) {
dout(7) << " don't have lazy cap, waiting" << endl;
Cond cond;
in->waitfor_lazy.push_back(&cond);
cond.Wait(client_lock);
}
if (g_conf.client_oc) {
in->fc.flush_dirty(0); // flush to invalidate.
in->fc.release_clean();
} else {
// mm, nothin to do.
}
}
client_lock.Unlock();
return 0;
}
// Messenger callback: message 'm' could not be delivered to 'inst'.
//
// What happens depends on the kind of destination:
//   monitor -> the message is re-sent to a monitor chosen by the monmap
//   osd     -> handed to the objecter
//   mds     -> not handled (asserts)
//   other   -> the message is dropped and deleted
void Client::ms_handle_failure(Message *m, const entity_inst_t& inst)
{
  entity_name_t dest = inst.name;

  if (dest.is_mon()) {
    // resend to a different monitor.
    int mon = monmap->pick_mon(true);
    dout(0) << "ms_handle_failure " << dest << " inst " << inst
	    << ", resending to mon" << mon
	    << endl;
    messenger->send_message(m, monmap->get_inst(mon));
    return;
  }

  if (dest.is_osd()) {
    objecter->ms_handle_failure(m, dest, inst);
    return;
  }

  if (dest.is_mds()) {
    dout(0) << "ms_handle_failure " << dest << " inst " << inst << endl;
    // help!
    assert(0);
    return;
  }

  // client?
  dout(0) << "ms_handle_failure " << dest << " inst " << inst
	  << ", dropping" << endl;
  delete m;
}