* scatterlock. untested.

git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1346 29311d96-e01e-0410-9327-a35deaab8ce9
This commit is contained in:
sageweil 2007-04-24 21:45:32 +00:00
parent bd8c9409fd
commit 53abf9474d
12 changed files with 586 additions and 124 deletions

View File

@ -42,6 +42,9 @@ mds
- discover
- open_remote_ino needs major work...
- scatterlock
- unlink, link, rename need to pre_dirty and update dir inode's mtime
- FIXME how to journal root and stray inode content?
- in particular, i care about dirfragtree.. get it on rejoin?
- and dir sizes, if i add that... also on rejoin?

View File

@ -64,6 +64,7 @@ ostream& operator<<(ostream& out, CInode& in)
out << " link=" << in.linklock;
out << " dft=" << in.dirfragtreelock;
out << " file=" << in.filelock;
out << " dir=" << in.dirlock;
if (in.get_num_ref()) {
out << " |";
@ -332,12 +333,14 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
{
switch (type) {
case LOCK_OTYPE_IAUTH:
::_encode(inode.ctime, bl);
::_encode(inode.mode, bl);
::_encode(inode.uid, bl);
::_encode(inode.gid, bl);
break;
case LOCK_OTYPE_ILINK:
::_encode(inode.ctime, bl);
::_encode(inode.nlink, bl);
::_encode(inode.anchored, bl);
break;
@ -351,6 +354,19 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
::_encode(inode.mtime, bl);
::_encode(inode.atime, bl);
break;
case LOCK_OTYPE_IDIR:
::_encode(inode.mtime, bl);
{
map<frag_t,int> dfsz;
for (map<frag_t,CDir*>::iterator p = dirfrags.begin();
p != dirfrags.end();
++p)
if (p->second->is_auth())
dfsz[p->first] = p->second->get_nitems();
::_encode(dfsz, bl);
}
break;
default:
assert(0);
@ -360,14 +376,20 @@ void CInode::encode_lock_state(int type, bufferlist& bl)
void CInode::decode_lock_state(int type, bufferlist& bl)
{
int off = 0;
utime_t tm;
switch (type) {
case LOCK_OTYPE_IAUTH:
::_decode(tm, bl, off);
if (inode.ctime < tm) inode.ctime = tm;
::_decode(inode.mode, bl, off);
::_decode(inode.uid, bl, off);
::_decode(inode.gid, bl, off);
break;
case LOCK_OTYPE_ILINK:
::_decode(tm, bl, off);
if (inode.ctime < tm) inode.ctime = tm;
::_decode(inode.nlink, bl, off);
::_decode(inode.anchored, bl, off);
break;
@ -382,6 +404,17 @@ void CInode::decode_lock_state(int type, bufferlist& bl)
::_decode(inode.atime, bl, off);
break;
case LOCK_OTYPE_IDIR:
//::_decode(inode.size, bl, off);
::_decode(tm, bl, off);
if (inode.mtime < tm) inode.mtime = tm;
{
map<frag_t,int> dfsz;
::_decode(dfsz, bl, off);
// hmm which to keep?
}
break;
default:
assert(0);
}

View File

@ -25,6 +25,7 @@
#include "CDentry.h"
#include "SimpleLock.h"
#include "FileLock.h"
#include "ScatterLock.h"
#include "Capability.h"
@ -97,17 +98,16 @@ class CInode : public MDSCacheObject {
static const int WAIT_SLAVEAGREE = (1<<0);
static const int WAIT_AUTHPINNABLE = (1<<1);
//static const int WAIT_SINGLEAUTH = (1<<2);
static const int WAIT_DIR = (1<<3);
static const int WAIT_LINK = (1<<4); // as in remotely nlink++
static const int WAIT_ANCHORED = (1<<5);
static const int WAIT_UNANCHORED = (1<<6);
static const int WAIT_UNLINK = (1<<7); // as in remotely nlink--
static const int WAIT_CAPS = (1<<8);
static const int WAIT_DIR = (1<<2);
static const int WAIT_ANCHORED = (1<<3);
static const int WAIT_UNANCHORED = (1<<4);
static const int WAIT_CAPS = (1<<5);
static const int WAIT_AUTHLOCK_OFFSET = 9;
static const int WAIT_LINKLOCK_OFFSET = 9 + SimpleLock::WAIT_BITS;
static const int WAIT_DIRFRAGTREELOCK_OFFSET = 9 + 2*SimpleLock::WAIT_BITS;;
static const int WAIT_FILELOCK_OFFSET = 9 + 3*SimpleLock::WAIT_BITS;;
static const int WAIT_AUTHLOCK_OFFSET = 6;
static const int WAIT_LINKLOCK_OFFSET = 6 + SimpleLock::WAIT_BITS;
static const int WAIT_DIRFRAGTREELOCK_OFFSET = 6 + 2*SimpleLock::WAIT_BITS;
static const int WAIT_FILELOCK_OFFSET = 6 + 3*SimpleLock::WAIT_BITS;
static const int WAIT_DIRLOCK_OFFSET = 6 + 4*SimpleLock::WAIT_BITS;
static const int WAIT_ANY = 0xffffffff;
@ -123,6 +123,7 @@ class CInode : public MDSCacheObject {
inode_t inode; // the inode itself
string symlink; // symlink dest, if symlink
fragtree_t dirfragtree; // dir frag tree, if any
map<frag_t,int> dirfrag_size; // size of each dirfrag
off_t last_open_journaled; // log offset for the last journaled EOpen
@ -187,7 +188,8 @@ protected:
authlock(this, LOCK_OTYPE_IAUTH, WAIT_AUTHLOCK_OFFSET),
linklock(this, LOCK_OTYPE_ILINK, WAIT_LINKLOCK_OFFSET),
dirfragtreelock(this, LOCK_OTYPE_IDIRFRAGTREE, WAIT_DIRFRAGTREELOCK_OFFSET),
filelock(this, LOCK_OTYPE_IFILE, WAIT_FILELOCK_OFFSET)
filelock(this, LOCK_OTYPE_IFILE, WAIT_FILELOCK_OFFSET),
dirlock(this, LOCK_OTYPE_IDIR, WAIT_DIRLOCK_OFFSET)
{
state = 0;
if (auth) state_set(STATE_AUTH);
@ -253,6 +255,7 @@ public:
SimpleLock linklock;
SimpleLock dirfragtreelock;
FileLock filelock;
ScatterLock dirlock;
SimpleLock* get_lock(int type) {
switch (type) {
@ -260,6 +263,7 @@ public:
case LOCK_OTYPE_IAUTH: return &authlock;
case LOCK_OTYPE_ILINK: return &linklock;
case LOCK_OTYPE_IDIRFRAGTREE: return &dirfragtreelock;
case LOCK_OTYPE_IDIR: return &dirlock;
default: assert(0);
}
}
@ -359,6 +363,8 @@ public:
if (get_caps_issued() & (CAP_FILE_WR|CAP_FILE_WRBUFFER) == 0)
filelock.replicate_relax();
dirlock.replicate_relax();
}

View File

@ -214,8 +214,8 @@ inline ostream& operator<<(ostream& out, FileLock& l)
//out << get_lock_type_name(l.get_type()) << " ";
out << get_filelock_state_name(l.get_state());
if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set();
if (l.get_num_rdlock())
out << " r=" << l.get_num_rdlock();
if (l.is_rdlocked())
out << " r=" << l.get_num_rdlocks();
if (l.is_xlocked())
out << " x=" << l.get_xlocked_by();
out << ")";

View File

@ -116,6 +116,7 @@ void Locker::send_lock_message(SimpleLock *lock, int msg, bufferlist &data)
bool Locker::acquire_locks(MDRequest *mdr,
set<SimpleLock*> &rdlocks,
set<SimpleLock*> &wrlocks,
set<SimpleLock*> &xlocks)
{
dout(10) << "acquire_locks " << *mdr << endl;
@ -125,13 +126,20 @@ bool Locker::acquire_locks(MDRequest *mdr,
// (local) AUTH PINS
// can i auth_pin everything?
for (set<SimpleLock*>::iterator p = xlocks.begin();
p != xlocks.end();
// make list of items to authpin
set<SimpleLock*> mustpin = xlocks;
for (set<SimpleLock*>::iterator p = wrlocks.begin(); p != wrlocks.end(); ++p)
mustpin.insert(*p);
// can i auth pin them all now?
for (set<SimpleLock*>::iterator p = mustpin.begin();
p != mustpin.end();
++p) {
dout(10) << "will xlock " << **p << " " << *(*p)->get_parent() << endl;
dout(10) << "must authpin " << **p << " " << *(*p)->get_parent() << endl;
// sort in
sorted.insert(*p);
if ((*p)->get_type() == LOCK_OTYPE_DN) {
CDir *dir = ((CDentry*)(*p)->get_parent())->dir;
dout(10) << "might auth_pin " << *dir << endl;
@ -159,8 +167,8 @@ bool Locker::acquire_locks(MDRequest *mdr,
}
// ok, grab the auth pins
for (set<SimpleLock*>::iterator p = xlocks.begin();
p != xlocks.end();
for (set<SimpleLock*>::iterator p = mustpin.begin();
p != mustpin.end();
++p) {
if ((*p)->get_type() == LOCK_OTYPE_DN) {
CDir *dir = ((CDentry*)(*p)->get_parent())->dir;
@ -175,6 +183,7 @@ bool Locker::acquire_locks(MDRequest *mdr,
}
}
// sort in rdlocks too
for (set<SimpleLock*>::iterator p = rdlocks.begin();
p != rdlocks.end();
++p) {
@ -193,7 +202,9 @@ bool Locker::acquire_locks(MDRequest *mdr,
if (existing != mdr->locks.end() && *existing == *p) {
// right kind?
SimpleLock *had = *existing;
if (xlocks.count(*p) == (had->get_xlocked_by() == mdr)) {
if (xlocks.count(*p) == mdr->xlocks.count(*p) &&
wrlocks.count(*p) == mdr->wrlocks.count(*p) &&
rdlocks.count(*p) == mdr->rdlocks.count(*p)) {
dout(10) << "acquire_locks already locked " << *had << " " << *had->get_parent() << endl;
existing++;
continue;
@ -206,19 +217,25 @@ bool Locker::acquire_locks(MDRequest *mdr,
existing++;
dout(10) << "acquire_locks unlocking out-of-order " << **existing
<< " " << *(*existing)->get_parent() << endl;
if (had->get_xlocked_by() == mdr)
if (mdr->xlocks.count(had))
xlock_finish(had, mdr);
else if (mdr->wrlocks.count(had))
wrlock_finish(had, mdr);
else
rdlock_finish(had, mdr);
}
// lock
if (xlocks.count(*p)) {
if (!xlock_start(*p, mdr))
if (!xlock_start(*p, mdr))
return false;
dout(10) << "acquire_locks got xlock on " << **p << " " << *(*p)->get_parent() << endl;
} else if (wrlocks.count(*p)) {
if (!wrlock_start(*p, mdr))
return false;
dout(10) << "acquire_locks got wrlock on " << **p << " " << *(*p)->get_parent() << endl;
} else {
if (!rdlock_start(*p, mdr))
if (!rdlock_start(*p, mdr))
return false;
dout(10) << "acquire_locks got rdlock on " << **p << " " << *(*p)->get_parent() << endl;
}
@ -228,8 +245,10 @@ bool Locker::acquire_locks(MDRequest *mdr,
while (existing != mdr->locks.end()) {
dout(10) << "acquire_locks unlocking " << *existing
<< " " << *(*existing)->get_parent() << endl;
if ((*existing)->get_xlocked_by() == mdr)
if (mdr->xlocks.count(*existing))
xlock_finish(*existing, mdr);
else if (mdr->wrlocks.count(*existing))
wrlock_finish(*existing, mdr);
else
rdlock_finish(*existing, mdr);
}
@ -241,23 +260,13 @@ bool Locker::acquire_locks(MDRequest *mdr,
// generics
/*
bool Locker::rdlock_try(SimpleLock *lock, Context *con)
{
switch (lock->get_type()) {
case LOCK_OTYPE_IFILE:
return file_rdlock_try((FileLock*)lock, con);
default:
return simple_rdlock_try(lock, con);
}
}
*/
bool Locker::rdlock_start(SimpleLock *lock, MDRequest *mdr)
{
switch (lock->get_type()) {
case LOCK_OTYPE_IFILE:
return file_rdlock_start((FileLock*)lock, mdr);
case LOCK_OTYPE_IDIR:
return scatter_rdlock_start((ScatterLock*)lock, mdr);
default:
return simple_rdlock_start(lock, mdr);
}
@ -268,16 +277,40 @@ void Locker::rdlock_finish(SimpleLock *lock, MDRequest *mdr)
switch (lock->get_type()) {
case LOCK_OTYPE_IFILE:
return file_rdlock_finish((FileLock*)lock, mdr);
case LOCK_OTYPE_IDIR:
return scatter_rdlock_finish((ScatterLock*)lock, mdr);
default:
return simple_rdlock_finish(lock, mdr);
}
}
bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mdr)
{
switch (lock->get_type()) {
case LOCK_OTYPE_IDIR:
return scatter_wrlock_start((ScatterLock*)lock, mdr);
default:
assert(0);
}
}
void Locker::wrlock_finish(SimpleLock *lock, MDRequest *mdr)
{
switch (lock->get_type()) {
case LOCK_OTYPE_IDIR:
return scatter_wrlock_finish((ScatterLock*)lock, mdr);
default:
assert(0);
}
}
bool Locker::xlock_start(SimpleLock *lock, MDRequest *mdr)
{
switch (lock->get_type()) {
case LOCK_OTYPE_IFILE:
return file_xlock_start((FileLock*)lock, mdr);
case LOCK_OTYPE_IDIR:
assert(0);
default:
return simple_xlock_start(lock, mdr);
}
@ -288,6 +321,8 @@ void Locker::xlock_finish(SimpleLock *lock, MDRequest *mdr)
switch (lock->get_type()) {
case LOCK_OTYPE_IFILE:
return file_xlock_finish((FileLock*)lock, mdr);
case LOCK_OTYPE_IDIR:
assert(0);
default:
return simple_xlock_finish(lock, mdr);
}
@ -712,6 +747,18 @@ void Locker::handle_lock(MLock *m)
}
break;
case LOCK_OTYPE_IDIR:
{
CInode *in = mdcache->get_inode(m->get_ino());
if (!in) {
dout(7) << "dont' have ino " << m->get_ino() << endl;
delete m;
return;
}
handle_scatter_lock(&in->dirlock, m);
}
break;
default:
dout(7) << "handle_lock got otype " << m->get_otype() << endl;
assert(0);
@ -721,6 +768,11 @@ void Locker::handle_lock(MLock *m)
// ==========================================================================
// simple lock
void Locker::handle_simple_lock(SimpleLock *lock, MLock *m)
{
int from = m->get_asker();
@ -739,11 +791,11 @@ void Locker::handle_simple_lock(SimpleLock *lock, MLock *m)
//|| lock->get_state() == LOCK_GLOCKR);
// wait for readers to finish?
if (lock->get_num_rdlock() > 0) {
dout(7) << "handle_simple_lock has reader, waiting before ack on " << *lock << " on " << *lock->get_parent()
<< endl;
if (lock->is_rdlocked()) {
dout(7) << "handle_simple_lock has reader, waiting before ack on " << *lock
<< " on " << *lock->get_parent() << endl;
lock->set_state(LOCK_GLOCKR);
lock->add_waiter(SimpleLock::WAIT_NORD, new C_MDS_RetryMessage(mds, m));
lock->add_waiter(SimpleLock::WAIT_NOLOCKS, new C_MDS_RetryMessage(mds, m));
return;
}
@ -836,7 +888,7 @@ void Locker::simple_eval(SimpleLock *lock)
switch (lock->get_state()) {
case LOCK_GLOCKR:
lock->set_state(LOCK_LOCK);
lock->finish_waiters(SimpleLock::WAIT_LOCK|SimpleLock::WAIT_STABLE);
lock->finish_waiters(SimpleLock::WAIT_STABLE);
break;
default:
@ -851,7 +903,8 @@ void Locker::simple_eval(SimpleLock *lock)
if (lock->get_state() != LOCK_SYNC &&
lock->get_parent()->is_replicated() &&
!lock->is_waiter_for(SimpleLock::WAIT_WR)) {
dout(7) << "simple_eval stable, syncing " << *lock << " on " << *lock->get_parent() << endl;
dout(7) << "simple_eval stable, syncing " << *lock
<< " on " << *lock->get_parent() << endl;
simple_sync(lock);
}
@ -965,9 +1018,9 @@ void Locker::simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr)
dout(7) << "simple_rdlock_finish on " << *lock << " on " << *lock->get_parent() << endl;
if (lock->get_state() == LOCK_GLOCKR &&
lock->get_num_rdlock() == 0) {
!lock->is_rdlocked()) {
lock->set_state(LOCK_SYNC); // return state to sync, in case the unpinner flails
lock->finish_waiters(SimpleLock::WAIT_NORD);
lock->finish_waiters(SimpleLock::WAIT_NOLOCKS);
}
}
@ -1003,7 +1056,7 @@ bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr)
return true;
} else {
// wait for lock
lock->add_waiter(SimpleLock::WAIT_LOCK, new C_MDS_RetryRequest(mdcache, mdr));
lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr));
return false;
}
} else {
@ -1118,9 +1171,264 @@ void Locker::dentry_anon_rdlock_trace_finish(vector<CDentry*>& trace)
// ==========================================================================
// scatter lock
bool Locker::scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr)
{
dout(7) << "scatter_rdlock_start on " << *lock
<< " on " << *lock->get_parent() << endl;
// pre-twiddle?
if (lock->get_state() == LOCK_SCATTER &&
lock->get_parent()->is_auth() &&
!lock->get_parent()->is_replicated() &&
!lock->is_wrlocked())
scatter_sync(lock);
// can rdlock?
if (lock->can_rdlock(mdr)) {
lock->get_rdlock();
mdr->rdlocks.insert(lock);
mdr->locks.insert(lock);
return true;
}
// wait for read.
lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
// initiate sync?
if (lock->get_state() == LOCK_SCATTER &&
lock->get_parent()->is_auth())
scatter_sync(lock);
return false;
}
void Locker::scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr)
{
dout(7) << "scatter_rdlock_finish on " << *lock
<< " on " << *lock->get_parent() << endl;
lock->put_rdlock();
if (mdr) {
mdr->rdlocks.erase(lock);
mdr->locks.erase(lock);
}
scatter_eval(lock);
}
// ===============================
bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr)
{
dout(7) << "scatter_wrlock_start on " << *lock
<< " on " << *lock->get_parent() << endl;
// pre-twiddle?
if (lock->get_state() == LOCK_SYNC &&
lock->get_parent()->is_auth() &&
!lock->get_parent()->is_replicated() &&
!lock->is_rdlocked())
scatter_scatter(lock);
// can wrlock?
if (lock->can_wrlock()) {
lock->get_wrlock();
mdr->wrlocks.insert(lock);
mdr->locks.insert(lock);
return true;
}
// wait for write.
lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr));
// initiate scatter?
if (lock->get_state() == LOCK_SYNC &&
lock->get_parent()->is_auth())
scatter_scatter(lock);
return false;
}
void Locker::scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr)
{
dout(7) << "scatter_wrlock_finish on " << *lock
<< " on " << *lock->get_parent() << endl;
lock->put_wrlock();
if (mdr) {
mdr->wrlocks.erase(lock);
mdr->locks.erase(lock);
}
scatter_eval(lock);
}
void Locker::scatter_eval(ScatterLock *lock)
{
if (!lock->get_parent()->is_auth()) {
// REPLICA
if (lock->get_state() == LOCK_GSYNCS &&
!lock->is_wrlocked()) {
dout(10) << "scatter_eval no wrlocks, acking sync" << endl;
bufferlist data;
lock->encode_locked_state(data);
mds->send_message_mds(new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid(), data),
lock->get_parent()->authority().first, MDS_PORT_LOCKER);
lock->set_state(LOCK_SYNC);
}
} else {
// AUTH
// gsyncs -> sync?
if (lock->get_state() == LOCK_GSYNCS &&
!lock->is_gathering() &&
!lock->is_wrlocked()) {
dout(7) << "scatter_eval finished gather/un-wrlock on " << *lock
<< " on " << *lock->get_parent() << endl;
lock->set_state(LOCK_SYNC);
lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD|SimpleLock::WAIT_NOLOCKS);
}
// gscatters -> scatter?
if (lock->get_state() == LOCK_GSCATTERS &&
!lock->is_rdlocked()) {
assert(lock->get_parent()->is_auth());
if (lock->get_parent()->is_replicated()) {
// encode and bcast
bufferlist data;
lock->encode_locked_state(data);
send_lock_message(lock, LOCK_AC_SCATTER, data);
}
lock->set_state(LOCK_SCATTER);
lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
}
// waiting for rd?
if (lock->get_state() == LOCK_SCATTER &&
!lock->is_wrlocked() &&
lock->is_waiter_for(SimpleLock::WAIT_RD)) {
dout(10) << "scatter_eval no wrlocks, read waiter, syncing" << endl;
scatter_sync(lock);
}
// re-scatter?
if (lock->get_state() == LOCK_SYNC &&
!lock->is_rdlocked()) {
dout(10) << "scatter_eval no rdlocks, scattering" << endl;
scatter_scatter(lock);
}
}
}
void Locker::scatter_sync(ScatterLock *lock)
{
dout(10) << "scatter_sync " << *lock
<< " on " << *lock->get_parent() << endl;
assert(lock->get_parent()->is_auth());
if (lock->get_state() == LOCK_SYNC) return;
assert(lock->get_state() == LOCK_SCATTER);
// bcast
if (lock->get_parent()->is_replicated()) {
send_lock_message(lock, LOCK_AC_SYNC);
lock->set_state(LOCK_GSYNCS);
lock->init_gather();
}
else if (lock->is_wrlocked()) {
lock->set_state(LOCK_GSYNCS);
} else {
lock->set_state(LOCK_SYNC);
lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
}
}
void Locker::scatter_scatter(ScatterLock *lock)
{
dout(10) << "scatter_scatter " << *lock
<< " on " << *lock->get_parent() << endl;
assert(lock->get_parent()->is_auth());
if (lock->get_state() == LOCK_SCATTER) return;
assert(lock->get_state() == LOCK_SYNC);
if (lock->is_rdlocked()) {
lock->set_state(LOCK_GSCATTERS);
} else {
if (lock->get_parent()->is_replicated()) {
// encode and bcast
bufferlist data;
lock->encode_locked_state(data);
send_lock_message(lock, LOCK_AC_SCATTER, data);
}
lock->set_state(LOCK_SCATTER);
lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
}
}
void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m)
{
int from = m->get_asker();
switch (m->get_action()) {
// -- replica --
case LOCK_AC_SYNC:
assert(lock->get_state() == LOCK_SCATTER);
// wait for wrlocks to close?
if (lock->is_wrlocked()) {
dout(7) << "handle_scatter_lock has wrlocks, waiting on " << *lock
<< " on " << *lock->get_parent() << endl;
lock->set_state(LOCK_GSYNCS);
} else {
// encode and reply
bufferlist data;
lock->encode_locked_state(data);
mds->send_message_mds(new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid(), data),
from, MDS_PORT_LOCKER);
}
break;
case LOCK_AC_SCATTER:
assert(lock->get_state() == LOCK_SYNC);
lock->decode_locked_state(m->get_data());
lock->set_state(LOCK_SCATTER);
lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
break;
// -- for auth --
case LOCK_AC_SYNCACK:
assert(lock->get_state() == LOCK_GSYNCS);
assert(lock->is_gathering(from));
lock->remove_gather(from);
lock->decode_locked_state(m->get_data());
if (lock->is_gathering()) {
dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent()
<< " from " << from << ", still gathering " << lock->get_gather_set()
<< endl;
} else {
dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent()
<< " from " << from << ", last one"
<< endl;
simple_eval(lock);
}
break;
}
delete m;
}
// ==========================================================================
// file lock
@ -1154,7 +1462,7 @@ bool Locker::file_rdlock_start(FileLock *lock, MDRequest *mdr)
mdr->rdlocks.insert(lock);
mdr->locks.insert(lock);
lock->finish_waiters(SimpleLock::WAIT_LOCK|SimpleLock::WAIT_STABLE);
lock->finish_waiters(SimpleLock::WAIT_STABLE);
return true;
}
} else {
@ -1202,8 +1510,8 @@ void Locker::file_rdlock_finish(FileLock *lock, MDRequest *mdr)
dout(7) << "rdlock_finish on " << *lock << " on " << *lock->get_parent() << endl;
if (lock->get_num_rdlock() == 0) {
lock->finish_waiters(SimpleLock::WAIT_NORD);
if (!lock->is_rdlocked()) {
lock->finish_waiters(SimpleLock::WAIT_NOLOCKS);
file_eval(lock);
}
}
@ -1262,7 +1570,7 @@ void Locker::file_xlock_finish(FileLock *lock, MDRequest *mdr)
dout(7) << "file_xlock_finish on " << *lock << " on " << *lock->get_parent() << endl;
// drop lock?
if (!lock->is_waiter_for(SimpleLock::WAIT_LOCK))
if (!lock->is_waiter_for(SimpleLock::WAIT_STABLE))
file_eval(lock);
}
@ -1297,7 +1605,7 @@ void Locker::file_eval(FileLock *lock)
// waiters
lock->get_rdlock();
lock->finish_waiters(SimpleLock::WAIT_LOCK|SimpleLock::WAIT_STABLE);
lock->finish_waiters(SimpleLock::WAIT_STABLE);
lock->put_rdlock();
}
break;
@ -1415,7 +1723,7 @@ void Locker::file_eval(FileLock *lock)
<< endl;
// * -> loner?
if (lock->get_num_rdlock() == 0 &&
if (!lock->is_rdlocked() &&
!lock->is_waiter_for(SimpleLock::WAIT_WR) &&
(wanted & CAP_FILE_WR) &&
loner &&
@ -1425,7 +1733,7 @@ void Locker::file_eval(FileLock *lock)
}
// * -> mixed?
else if (lock->get_num_rdlock() == 0 &&
else if (!lock->is_rdlocked() &&
!lock->is_waiter_for(SimpleLock::WAIT_WR) &&
(wanted & CAP_FILE_RD) &&
(wanted & CAP_FILE_WR) &&
@ -1796,9 +2104,9 @@ void Locker::handle_file_lock(FileLock *lock, MLock *m)
dout(7) << "handle_file_lock client readers, gathering caps on " << *in << endl;
issue_caps(in);
}
if (lock->get_num_rdlock() > 0) {
dout(7) << "handle_file_lock readers, waiting before ack on " << *in << endl;
in->add_waiter(SimpleLock::WAIT_NORD, new C_MDS_RetryMessage(mds, m));
if (lock->is_rdlocked()) {
dout(7) << "handle_file_lock rdlocked, waiting before ack on " << *in << endl;
in->add_waiter(SimpleLock::WAIT_NOLOCKS, new C_MDS_RetryMessage(mds, m));
lock->set_state(LOCK_GLOCKR);
assert(0);// i am broken.. why retry message when state captures all the info i need?
return;

View File

@ -45,6 +45,7 @@ class Capability;
class SimpleLock;
class FileLock;
class ScatterLock;
class Locker {
private:
@ -63,13 +64,15 @@ private:
// -- locks --
bool acquire_locks(MDRequest *mdr,
set<SimpleLock*> &rdlocks,
set<SimpleLock*> &wrlocks,
set<SimpleLock*> &xlocks);
bool rdlock_try(SimpleLock *lock, Context *con);
bool rdlock_start(SimpleLock *lock, MDRequest *mdr);
void rdlock_finish(SimpleLock *lock, MDRequest *mdr);
bool xlock_start(SimpleLock *lock, MDRequest *mdr);
void xlock_finish(SimpleLock *lock, MDRequest *mdr);
bool wrlock_start(SimpleLock *lock, MDRequest *mdr);
void wrlock_finish(SimpleLock *lock, MDRequest *mdr);
// simple
void handle_simple_lock(SimpleLock *lock, MLock *m);
@ -86,6 +89,16 @@ private:
void dentry_anon_rdlock_trace_start(vector<CDentry*>& trace);
void dentry_anon_rdlock_trace_finish(vector<CDentry*>& trace);
// scatter
void handle_scatter_lock(ScatterLock *lock, MLock *m);
void scatter_eval(ScatterLock *lock);
void scatter_sync(ScatterLock *lock);
void scatter_scatter(ScatterLock *lock);
bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr);
void scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr);
bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr);
void scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr);
// file
void handle_file_lock(FileLock *lock, MLock *m);
void file_eval(FileLock *lock);

View File

@ -3348,27 +3348,8 @@ void MDCache::request_drop_locks(MDRequest *mdr)
mds->locker->xlock_finish(*mdr->xlocks.begin(), mdr);
while (!mdr->rdlocks.empty())
mds->locker->rdlock_finish(*mdr->rdlocks.begin(), mdr);
/*
// foreign xlocks?
if (active_requests[req].foreign_xlocks.size()) {
set<CDentry*> dns = active_requests[req].foreign_xlocks;
active_requests[req].foreign_xlocks.clear();
for (set<CDentry*>::iterator it = dns.begin();
it != dns.end();
it++) {
CDentry *dn = *it;
dout(7) << "request_cleanup sending unxlock for foreign xlock on " << *dn << endl;
assert(dn->is_xlocked());
int dauth = dn->dir->dentry_authority(dn->name).first;
MLock *m = new MLock(LOCK_AC_UNXLOCK, mds->get_nodeid());
m->set_dn(dn->dir->dirfrag(), dn->name);
mds->send_message_mds(m, dauth, MDS_PORT_CACHE);
}
}
*/
while (!mdr->wrlocks.empty())
mds->locker->wrlock_finish(*mdr->wrlocks.begin(), mdr);
// make sure ref and trace are empty
// if we are doing our own locking, we can't use them!

View File

@ -79,7 +79,8 @@ struct MDRequest {
// held locks
set< SimpleLock* > rdlocks; // always local.
set< SimpleLock* > xlocks; // may be remote.
set< SimpleLock* > wrlocks; // always local.
set< SimpleLock* > xlocks; // local or remote.
set< SimpleLock*, SimpleLock::ptr_lt > locks; // full ordering
// projected updates

View File

@ -0,0 +1,102 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#ifndef __SCATTERLOCK_H
#define __SCATTERLOCK_H
#include "SimpleLock.h"
// lock state machine states.
#define LOCK_SYNC__ // rdlocks allowed (e.g., for stat)
#define LOCK_GSYNCS -20 // waiting for replicas to gather
#define LOCK_SCATTER 21 // mtime updates on replicas allowed, no reads.
#define LOCK_GSCATTERS 22 // waiting for rdlocks to release
inline const char *get_scatterlock_state_name(int s) {
switch(s) {
case LOCK_SYNC: return "sync";
case LOCK_GSYNCS: return "gsyncs";
case LOCK_SCATTER: return "scatter";
case LOCK_GSCATTERS: return "gscatters";
default: assert(0);
}
}
class ScatterLock : public SimpleLock {
int num_wrlock;
public:
ScatterLock(MDSCacheObject *o, int t, int wo) : SimpleLock(o, t, wo) {}
char get_replica_state() {
switch (state) {
case LOCK_SYNC:
case LOCK_GSYNCS:
case LOCK_GSCATTERS:
return LOCK_SYNC;
case LOCK_SCATTER:
return LOCK_SCATTER;
default:
assert(0);
}
}
void replicate_relax() {
if (state == LOCK_SYNC && !is_rdlocked())
state = LOCK_SCATTER;
}
// rdlock
bool can_rdlock(MDRequest *mdr) {
return state == LOCK_SYNC;
}
bool can_rdlock_soon() {
return state == LOCK_SYNC || state == LOCK_GSYNCS;
}
// wrlock
bool can_wrlock() {
return state == LOCK_SCATTER;
}
void get_wrlock() {
assert(state == LOCK_SCATTER);
++num_wrlock;
}
void put_wrlock() {
--num_wrlock;
}
bool is_wrlocked() { return num_wrlock > 0; }
int get_num_wrlocks() { return num_wrlock; }
};
inline ostream& operator<<(ostream& out, ScatterLock& l)
{
out << "(";
//out << get_lock_type_name(l.get_type()) << " ";
out << get_scatterlock_state_name(l.get_state());
if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set();
if (l.is_rdlocked())
out << " r=" << l.get_num_rdlocks();
//if (l.is_xlocked())
//out << " x=" << l.get_xlocked_by();
if (l.is_wrlocked())
out << " wr=" << l.get_num_wrlocks();
out << ")";
return out;
}
#endif

View File

@ -721,13 +721,12 @@ CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, bool want_auth)
}
// lock the path
set<SimpleLock*> rdlocks;
set<SimpleLock*> xlocks;
set<SimpleLock*> rdlocks, empty;
for (unsigned i=0; i<trace.size(); i++)
rdlocks.insert(&trace[i]->lock);
if (!mds->locker->acquire_locks(mdr, rdlocks, xlocks))
if (!mds->locker->acquire_locks(mdr, rdlocks, empty, empty))
return 0;
// set and pin ref
@ -796,17 +795,17 @@ CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mus
}
// -- lock --
set<SimpleLock*> rdlocks;
set<SimpleLock*> xlocks;
set<SimpleLock*> rdlocks, wrlocks, xlocks;
for (unsigned i=0; i<trace.size(); i++)
rdlocks.insert(&trace[i]->lock);
if (dn->is_null())
xlocks.insert(&dn->lock); // new dn, xlock
else
if (dn->is_null()) {
xlocks.insert(&dn->lock); // new dn, xlock
wrlocks.insert(&dn->dir->inode->dirlock); // also, wrlock on dir mtime
} else
rdlocks.insert(&dn->lock); // existing dn, rdlock
if (!mds->locker->acquire_locks(mdr, rdlocks, xlocks))
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return 0;
// save the locked trace.
@ -900,23 +899,24 @@ void Server::handle_client_stat(MDRequest *mdr)
CInode *ref = rdlock_path_pin_ref(mdr, false);
if (!ref) return;
// FIXME: this is really not the way to handle the statlite mask.
// do I need file info?
// which inode locks do I want?
/* note: this works because we include existing locks in our lists,
and because all new locks are on inodes and sort to the right of
the dentry rdlocks previous acquired by rdlock_path_pin_ref(). */
set<SimpleLock*> rdlocks = mdr->rdlocks;
set<SimpleLock*> wrlocks = mdr->wrlocks;
set<SimpleLock*> xlocks = mdr->xlocks;
int mask = req->args.stat.mask;
if (mask & (INODE_MASK_SIZE|INODE_MASK_MTIME)) {
// yes. do a full stat.
if (!mds->locker->rdlock_start(&ref->filelock, mdr))
return; // syncing
mds->locker->rdlock_finish(&ref->filelock, mdr);
} else {
// nope! easy peasy.
}
mds->balancer->hit_inode(ref, META_POP_IRD);
if (mask & INODE_MASK_LINK) rdlocks.insert(&ref->linklock);
if (mask & INODE_MASK_AUTH) rdlocks.insert(&ref->authlock);
if (ref->is_file() && mask & INODE_MASK_FILE) rdlocks.insert(&ref->filelock);
if (ref->is_dir() && mask & INODE_MASK_MTIME) rdlocks.insert(&ref->dirlock);
mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
// reply
//dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl;
dout(10) << "reply to stat on " << *req << endl;
MClientReply *reply = new MClientReply(req);
reply_request(mdr, reply, ref);
}
@ -1451,17 +1451,17 @@ void Server::handle_client_link(MDRequest *mdr)
if (!dn) return;
// create lock lists
set<SimpleLock*> rdlocks;
set<SimpleLock*> xlocks;
set<SimpleLock*> rdlocks, wrlocks, xlocks;
for (unsigned i=0; i<linktrace.size(); i++)
rdlocks.insert(&linktrace[i]->lock);
xlocks.insert(&dn->lock);
wrlocks.insert(&dn->dir->inode->dirlock);
for (unsigned i=0; i<targettrace.size(); i++)
rdlocks.insert(&targettrace[i]->lock);
xlocks.insert(&targeti->linklock);
if (!mds->locker->acquire_locks(mdr, rdlocks, xlocks))
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
// go!
@ -1699,15 +1699,15 @@ void Server::handle_client_unlink(MDRequest *mdr)
}
// lock
set<SimpleLock*> rdlocks;
set<SimpleLock*> xlocks;
set<SimpleLock*> rdlocks, wrlocks, xlocks;
for (unsigned i=0; i<trace.size()-1; i++)
rdlocks.insert(&trace[i]->lock);
xlocks.insert(&dn->lock);
wrlocks.insert(&dn->dir->inode->dirlock);
xlocks.insert(&in->linklock);
if (!mds->locker->acquire_locks(mdr, rdlocks, xlocks))
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
// ok!
@ -2087,23 +2087,24 @@ void Server::handle_client_rename(MDRequest *mdr)
// -- locks --
set<SimpleLock*> rdlocks;
set<SimpleLock*> xlocks;
set<SimpleLock*> rdlocks, wrlocks, xlocks;
// rdlock sourcedir path, xlock src dentry
for (unsigned i=0; i<srctrace.size()-1; i++)
rdlocks.insert(&srctrace[i]->lock);
xlocks.insert(&srcdn->lock);
wrlocks.insert(&srcdn->dir->inode->dirlock);
// rdlock destdir path, xlock dest dentry
for (unsigned i=0; i<desttrace.size(); i++)
rdlocks.insert(&desttrace[i]->lock);
xlocks.insert(&destdn->lock);
wrlocks.insert(&destdn->dir->inode->dirlock);
// xlock oldin
if (oldin) xlocks.insert(&oldin->linklock);
if (!mds->locker->acquire_locks(mdr, rdlocks, xlocks))
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;

View File

@ -23,8 +23,9 @@
#define LOCK_OTYPE_IAUTH 3
#define LOCK_OTYPE_ILINK 4
#define LOCK_OTYPE_IDIRFRAGTREE 5
#define LOCK_OTYPE_IDIR 6
#define LOCK_OTYPE_DIR 7 // not used
//#define LOCK_OTYPE_DIR 7 // not used
inline const char *get_lock_type_name(int t) {
switch (t) {
@ -33,6 +34,7 @@ inline const char *get_lock_type_name(int t) {
case LOCK_OTYPE_IAUTH: return "inode_auth";
case LOCK_OTYPE_ILINK: return "inode_link";
case LOCK_OTYPE_IDIRFRAGTREE: return "inode_dirfragtree";
case LOCK_OTYPE_IDIR: return "inode_dir";
default: assert(0);
}
}
@ -59,12 +61,12 @@ class MDRequest;
class SimpleLock {
public:
static const int WAIT_RD = (1<<0); // to read
static const int WAIT_NORD = (1<<1); // for last rdlock to finish
static const int WAIT_WR = (1<<2); // to write
static const int WAIT_LOCK = (1<<3); // for locked state
static const int WAIT_STABLE = (1<<4); // for a stable state
static const int WAIT_REMOTEXLOCK = (1<<5); // for a remote xlock
static const int WAIT_BITS = 6;
static const int WAIT_WR = (1<<1); // to write
static const int WAIT_NOLOCKS = (1<<2); // for last rdlock to finish
//static const int WAIT_LOCK = (1<<3); // for locked state
static const int WAIT_STABLE = (1<<3); // for a stable state
static const int WAIT_REMOTEXLOCK = (1<<4); // for a remote xlock
static const int WAIT_BITS = 5;
protected:
// parent (what i lock)
@ -154,7 +156,7 @@ public:
assert(num_rdlock>0);
return --num_rdlock;
}
int get_num_rdlock() { return num_rdlock; }
int get_num_rdlocks() { return num_rdlock; }
void get_xlock(MDRequest *who) {
assert(xlock_by == 0);
@ -248,8 +250,8 @@ inline ostream& operator<<(ostream& out, SimpleLock& l)
//out << get_lock_type_name(l.get_type()) << " ";
out << get_simplelock_state_name(l.get_state());
if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set();
if (l.get_num_rdlock())
out << " r=" << l.get_num_rdlock();
if (l.is_rdlocked())
out << " r=" << l.get_num_rdlocks();
if (l.is_xlocked())
out << " w=" << l.get_xlocked_by();
out << ")";

View File

@ -26,8 +26,7 @@
#define LOCK_AC_REQXLOCKACK -4 // req dentry xlock
#define LOCK_AC_REQXLOCKNAK -5 // req dentry xlock
#define LOCK_AC_FOR_REPLICA(a) ((a) < 0)
#define LOCK_AC_FOR_AUTH(a) ((a) > 0)
#define LOCK_AC_SCATTER -6
// for auth
#define LOCK_AC_SYNCACK 1
@ -42,6 +41,9 @@
#define LOCK_AC_FINISH 8
#define LOCK_AC_FOR_REPLICA(a) ((a) < 0)
#define LOCK_AC_FOR_AUTH(a) ((a) > 0)
class MLock : public Message {
int asker; // who is initiating this request
@ -79,6 +81,14 @@ class MLock : public Message {
this->action = action;
this->asker = asker;
}
MLock(SimpleLock *lock, int action, int asker, bufferlist& bl) :
Message(MSG_MDS_LOCK) {
this->otype = lock->get_type();
lock->get_parent()->set_mlock_info(this);
this->action = action;
this->asker = asker;
data.claim(bl);
}
virtual char *get_type_name() { return "ILock"; }
void set_ino(inodeno_t ino, char ot) {
@ -88,10 +98,12 @@ class MLock : public Message {
void set_ino(inodeno_t ino) {
this->ino = ino;
}
/*
void set_dirfrag(dirfrag_t df) {
otype = LOCK_OTYPE_DIR;
this->dirfrag = df;
}
*/
void set_dn(dirfrag_t df, const string& dn) {
otype = LOCK_OTYPE_DN;
this->dirfrag = df;