From eb36db6f2a93347f1016befbcf192c6f16c194a7 Mon Sep 17 00:00:00 2001 From: sageweil Date: Sat, 17 Mar 2007 14:03:33 +0000 Subject: [PATCH] * singleauth waiting; discover waits * moved auth_pins out of Locker; explicitly in Server.cc handlers now * prepare_null_dentry/prepare_new_inode cleanup * reqeust_auth_pin stuff git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1258 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 6 + .../sage/cephmds2/client/SyntheticClient.cc | 15 +- .../sage/cephmds2/client/SyntheticClient.h | 2 +- branches/sage/cephmds2/mds/CDir.cc | 41 ++- branches/sage/cephmds2/mds/CDir.h | 16 +- branches/sage/cephmds2/mds/CInode.cc | 18 +- branches/sage/cephmds2/mds/CInode.h | 2 + branches/sage/cephmds2/mds/Locker.cc | 29 -- branches/sage/cephmds2/mds/MDCache.cc | 158 ++++++--- branches/sage/cephmds2/mds/MDCache.h | 7 + branches/sage/cephmds2/mds/Migrator.cc | 2 +- branches/sage/cephmds2/mds/Server.cc | 307 +++++++++++++----- branches/sage/cephmds2/mds/Server.h | 22 +- branches/sage/cephmds2/mds/events/EMetaBlob.h | 8 +- 14 files changed, 415 insertions(+), 218 deletions(-) diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index d0b07bda951..0587e979280 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -37,6 +37,12 @@ mds - rejoin and replicas that are not in recovered node's cache... fetch storm? +- locking madness +/ - request_auth_pin, request_drop_auth_pins, and _link/_unlink_local should pre-pin dn dir and targeti. + - move auth_pinning _out_ of locking _start and _finish methods + - clean up multi-auth_pin code paths (e.g. link_local) + + - mds failure vs clients - clean up client op redirection - idempotent ops diff --git a/branches/sage/cephmds2/client/SyntheticClient.cc b/branches/sage/cephmds2/client/SyntheticClient.cc index 101451e8b26..0c8f7081abb 100644 --- a/branches/sage/cephmds2/client/SyntheticClient.cc +++ b/branches/sage/cephmds2/client/SyntheticClient.cc @@ -126,7 +126,7 @@ void parse_syn_options(vector& args) syn_iargs.push_back( atoi(args[++i]) ); syn_iargs.push_back( atoi(args[++i]) ); syn_iargs.push_back( atoi(args[++i]) ); - + syn_iargs.push_back( atoi(args[++i]) ); } else if (strcmp(args[i],"foo") == 0) { syn_modes.push_back( SYNCLIENT_MODE_FOO ); @@ -355,9 +355,10 @@ int SyntheticClient::run() int iarg1 = iargs.front(); iargs.pop_front(); int iarg2 = iargs.front(); iargs.pop_front(); int iarg3 = iargs.front(); iargs.pop_front(); + int iarg4 = iargs.front(); iargs.pop_front(); if (run_me()) { dout(2) << "thrashlinks " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; - thrash_links(sarg1.c_str(), iarg1, iarg2, iarg3); + thrash_links(sarg1.c_str(), iarg1, iarg2, iarg3, iarg4); } } break; @@ -1336,17 +1337,15 @@ void SyntheticClient::foo() client->rmdir("d"); } -int SyntheticClient::thrash_links(const char *basedir, int dirs, int files, int depth) +int SyntheticClient::thrash_links(const char *basedir, int dirs, int files, int depth, int n) { - dout(1) << "thrash_links " << basedir << " " << dirs << " " << files << " " << depth << endl; + dout(1) << "thrash_links " << basedir << " " << dirs << " " << files << " " << depth + << " links " << n + << endl; if (time_to_stop()) return 0; - // first make dir/file tree - make_dirs(basedir,dirs,files,depth); - // now link shit up - int n = files*dirs; for (int i=0; iparent->dir->add_waiter(tag, c); return; } } + + // at subtree root? + if (tag & WAIT_ATSUBTREEROOT) { + if (!is_subtree_root()) { + // try parent + dout(10) << "add_waiter " << tag << " " << c << " should be ATSUBTREEROOT, " << *this << " is not root, trying parent" << endl; + inode->parent->dir->add_waiter(tag, c); + return; + } + } + // this dir. if (waiting.empty() && waiting_on_dentry.size() == 0) @@ -571,9 +582,12 @@ void CDir::fetch(Context *c) { dout(10) << "fetch on " << *this << endl; + assert(is_auth()); + assert(!is_complete()); + if (c) add_waiter(WAIT_COMPLETE, c); - // alrady fetching? + // already fetching? if (state_test(CDir::STATE_FETCHING)) { dout(7) << "already fetching; waiting" << endl; return; @@ -604,6 +618,7 @@ void CDir::_fetched(bufferlist &bl) //ondisk_size = 0; // kick waiters? + state_clear(CDir::STATE_FETCHING); finish_waiting(WAIT_COMPLETE, -1); return; } @@ -780,7 +795,7 @@ void CDir::commit(version_t want, Context *c) if (want == 0) want = version; // preconditions - assert(want <= version); // can't commit the future + assert(want <= version || version == 0); // can't commit the future assert(committed_version < want); // the caller is stupid assert(is_auth()); assert(can_auth_pin()); @@ -827,7 +842,7 @@ void CDir::_commit(version_t want) // we can't commit things in the future. // (even the projected future.) - assert(want <= version); + assert(want <= version || version == 0); // check pre+postconditions. assert(is_auth()); @@ -1080,6 +1095,7 @@ void CDir::set_dir_auth(pair a, bool iamauth) << " on " << *this << endl; bool was_subtree = is_subtree_root(); + bool was_ambiguous = dir_auth.second >= 0; // set it. dir_auth = a; @@ -1105,6 +1121,13 @@ void CDir::set_dir_auth(pair a, bool iamauth) if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) inode->auth_pin(); } + + // newly single auth? + if (was_ambiguous && dir_auth.second == CDIR_AUTH_UNKNOWN) { + list ls; + take_waiting(WAIT_SINGLEAUTH, ls); + cache->mds->queue_finished(ls); + } } diff --git a/branches/sage/cephmds2/mds/CDir.h b/branches/sage/cephmds2/mds/CDir.h index 9f4c21ab06a..3e2b9474616 100644 --- a/branches/sage/cephmds2/mds/CDir.h +++ b/branches/sage/cephmds2/mds/CDir.h @@ -156,21 +156,13 @@ class CDir : public MDSCacheObject { // -- wait masks -- static const int WAIT_DENTRY = (1<<0); // wait for item to be in cache - // waiters: path_traverse - // trigger: handle_discover, fetch_dir_2 static const int WAIT_COMPLETE = (1<<1); // wait for complete dir contents - // waiters: fetch_dir, commit_dir - // trigger: fetch_dir_2 static const int WAIT_FREEZEABLE = (1<<2); // hard_pins removed - // waiters: freeze, freeze_finish - // trigger: auth_unpin, adjust_nested_auth_pins static const int WAIT_UNFREEZE = (1<<3); // unfreeze - // waiters: path_traverse, handle_discover, handle_inode_update, - // export_dir_frozen (mdcache) - // handle_client_readdir (mds) - // trigger: unfreeze static const int WAIT_AUTHPINNABLE = WAIT_UNFREEZE; static const int WAIT_IMPORTED = (1<<4); // import finish + static const int WAIT_SINGLEAUTH = (1<<5); + static const int WAIT_DNREAD = (1<<20); static const int WAIT_DNLOCK = (1<<21); static const int WAIT_DNUNPINNED = (1<<22); @@ -178,8 +170,8 @@ class CDir : public MDSCacheObject { static const int WAIT_DNREQXLOCK = (1<<23); static const int WAIT_ANY = (0xffffffff); - static const int WAIT_ATFREEZEROOT = (WAIT_AUTHPINNABLE|\ - WAIT_UNFREEZE); // hmm, same same + static const int WAIT_ATFREEZEROOT = (WAIT_AUTHPINNABLE|WAIT_UNFREEZE); + static const int WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH); diff --git a/branches/sage/cephmds2/mds/CInode.cc b/branches/sage/cephmds2/mds/CInode.cc index 1172c0367c6..1be0a2dbaa2 100644 --- a/branches/sage/cephmds2/mds/CInode.cc +++ b/branches/sage/cephmds2/mds/CInode.cc @@ -461,23 +461,23 @@ bool CInode::waiting_for(int tag) return waiting.count(tag) > 0; } -void CInode::add_waiter(int tag, Context *c) { - // waiting on hierarchy? +void CInode::add_waiter(int tag, Context *c) +{ + // wait on the directory? if (tag & WAIT_AUTHPINNABLE) { - assert(tag == WAIT_AUTHPINNABLE); - assert(is_freezing() || is_frozen()); - - // wait on the directory parent->dir->add_waiter(CDir::WAIT_AUTHPINNABLE, c); return; } - + if (tag & WAIT_SINGLEAUTH) { + parent->dir->add_waiter(CDir::WAIT_SINGLEAUTH, c); + return; + } + // this inode. - if (waiting.size() == 0) + if (waiting.empty()) get(PIN_WAITER); waiting.insert(pair(tag,c)); dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl; - } void CInode::take_waiting(int mask, list& ls) diff --git a/branches/sage/cephmds2/mds/CInode.h b/branches/sage/cephmds2/mds/CInode.h index 964ca18e1ca..6b7250d8fe2 100644 --- a/branches/sage/cephmds2/mds/CInode.h +++ b/branches/sage/cephmds2/mds/CInode.h @@ -108,6 +108,8 @@ class CInode : public MDSCacheObject { // waiters: write_hard_start, read_file_start, write_file_start (mdcache) // handle_client_chmod, handle_client_touch (mds) // trigger: (see CDIR_WAIT_UNFREEZE) + static const int WAIT_SINGLEAUTH = (1<<11); + static const int WAIT_DIR = (1<<13); // waiters: traverse_path // triggers: handle_disocver_reply diff --git a/branches/sage/cephmds2/mds/Locker.cc b/branches/sage/cephmds2/mds/Locker.cc index 7089176f16e..ebb78b14c84 100644 --- a/branches/sage/cephmds2/mds/Locker.cc +++ b/branches/sage/cephmds2/mds/Locker.cc @@ -589,13 +589,6 @@ bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m) // can write? grab ref. if (in->hardlock.can_write(in->is_auth())) { assert(in->is_auth()); - if (!in->can_auth_pin()) { - dout(7) << "inode_hard_write_start waiting for authpinnable on " << *in << endl; - in->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - - in->auth_pin(); // ugh, can't condition this on nwrite==0 bc we twiddle that in handle_lock_* in->hardlock.get_write(m); return true; } @@ -631,7 +624,6 @@ void Locker::inode_hard_write_finish(CInode *in) // drop ref //assert(in->hardlock.can_write(in->is_auth())); in->hardlock.put_write(); - in->auth_unpin(); dout(7) << "inode_hard_write_finish on " << *in << endl; // others waiting? @@ -974,13 +966,6 @@ bool Locker::inode_file_write_start(CInode *in, MClientRequest *m) if (in->filelock.can_write(in->is_auth())) { // can i auth pin? assert(in->is_auth()); - if (!in->can_auth_pin()) { - dout(7) << "inode_file_write_start waiting for authpinnable on " << *in << endl; - in->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - - in->auth_pin(); in->filelock.get_write(m); return true; } else { @@ -1747,14 +1732,6 @@ bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref) assert(dn->lockstate == DN_LOCK_SYNC || dn->lockstate == DN_LOCK_UNPINNING); - // dir auth pinnable? - if (!dn->dir->can_auth_pin()) { - dout(7) << "dentry " << *dn << " dir not pinnable, waiting" << endl; - dn->dir->add_waiter(CDir::WAIT_AUTHPINNABLE, - new C_MDS_RetryRequest(mds,m,ref)); - return false; - } - // is dentry path pinned? if (dn->is_pinned()) { dout(7) << "dentry " << *dn << " pinned, waiting" << endl; @@ -1781,9 +1758,6 @@ bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref) mdcache->active_requests[m].traces[trace[trace.size()-1]] = trace; } } - - // pin dir! - dn->dir->auth_pin(); // mine! dn->xlockedby = m; @@ -1859,9 +1833,6 @@ void Locker::dentry_xlock_finish(CDentry *dn, bool quiet) } } - // unpin dir - dn->dir->auth_unpin(); - // kick waiters list finished; dn->dir->take_waiting(CDir::WAIT_DNREAD, finished); diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index 67335723e43..10a00a64b94 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -1654,13 +1654,18 @@ bool MDCache::trim(int max) else { assert(dn->is_null()); } - dir->remove_dentry(dn); // adjust the dir state - dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete! + // NOTE: we can safely remove a clean, null dentry without effecting + // directory completeness. + if (!(dn->is_null() && dn->is_clean())) + dir->state_clear(CDir::STATE_COMPLETE); + + // remove dentry + dir->remove_dentry(dn); // reexport? - if (dir->get_size() == 0) + if (dir->get_size() == 0 && dir->is_subtree_root()) migrator->export_empty_import(dir); if (mds->logger) mds->logger->inc("cex"); @@ -1699,6 +1704,8 @@ void MDCache::trim_dirfrag(CDir *dir, CDir *con, map& expire { assert(dir->get_num_ref() == 0); + dout(15) << "trim_dirfrag " << *dir << endl; + CInode *in = dir->get_inode(); if (!dir->is_auth()) { @@ -2397,21 +2404,28 @@ int MDCache::path_traverse(filepath& origpath, curdir = cur->get_or_open_dirfrag(this, fg); } else { - // discover dir from/via inode auth + // discover? assert(!cur->is_auth()); if (cur->waiting_for(CInode::WAIT_DIR)) { - dout(10) << "traverse: need dir for " << *cur << ", already doing discover" << endl; - } else { - filepath want = path.postfixpath(depth); - dout(10) << "traverse: need dir for " << *cur << ", doing discover, want " << want.get_path() << endl; - mds->send_message_mds(new MDiscover(mds->get_nodeid(), + dout(10) << "traverse: need dir, already doing discover for " << *cur << endl; + } + else if (cur->auth_is_ambiguous()) { + dout(10) << "traverse: need dir, waiting for single auth on " << *cur << endl; + cur->add_waiter(CInode::WAIT_SINGLEAUTH, ondelay); + if (onfinish) delete onfinish; + return 1; + } else { + filepath want = path.postfixpath(depth); + dout(10) << "traverse: need dir, doing discover, want " << want.get_path() + << " from " << *cur << endl; + mds->send_message_mds(new MDiscover(mds->get_nodeid(), cur->ino(), want, true), // need this dir too cur->authority().first, MDS_PORT_CACHE); dir_discovers[cur->ino()].insert(cur->authority().first); - } - cur->add_waiter(CInode::WAIT_DIR, ondelay); + } + cur->add_waiter(CInode::WAIT_DIR, ondelay); if (onfinish) delete onfinish; return 1; } @@ -2594,39 +2608,29 @@ int MDCache::path_traverse(filepath& origpath, // dirfrag/dentry is not mine. pair dauth = curdir->authority(); - /* no, let's let auth handle the discovery/replication .. - if (onfail == MDS_TRAVERSE_FORWARD && - onfinish == 0 && // no funnyness - cur->dir->is_rep()) { - dout(5) << "trying to discover in popular dir " << *cur->dir << endl; - onfail = MDS_TRAVERSE_DISCOVER; - } - */ - if ((onfail == MDS_TRAVERSE_DISCOVER || onfail == MDS_TRAVERSE_DISCOVERXLOCK)) { - // discover - + // discover? filepath want = path.postfixpath(depth); + if (curdir->waiting_for(CDir::WAIT_DENTRY, path[depth])) { - dout(7) << "traverse: already waiting for discover on " << *curdir << " for " << want.get_path() << endl; - } else { - dout(7) << "traverse: discover on " << *curdir << " for " << want.get_path() << endl; - + dout(7) << "traverse: already waiting for discover " << want.get_path() + << " from " << *curdir << endl; + } + else if (curdir->auth_is_ambiguous()) { + dout(7) << "traverse: waiting for single auth on " << *curdir << endl; + curdir->add_waiter(CDir::WAIT_SINGLEAUTH, + new C_MDC_TraverseDiscover(onfinish, ondelay)); + return 1; + } else { + dout(7) << "traverse: discover " << want << " from " << *curdir << endl; touch_inode(cur); - + mds->send_message_mds(new MDiscover(mds->get_nodeid(), cur->ino(), want, false), dauth.first, MDS_PORT_CACHE); - if (dauth.second >= 0) - mds->send_message_mds(new MDiscover(mds->get_nodeid(), - cur->ino(), - want, - false), - dauth.second, MDS_PORT_CACHE); - if (mds->logger) mds->logger->inc("dis"); } @@ -2643,20 +2647,31 @@ int MDCache::path_traverse(filepath& origpath, if (onfail == MDS_TRAVERSE_FORWARD) { // forward dout(7) << "traverse: not auth for " << path << " in " << *curdir << endl; + + if (curdir->auth_is_ambiguous()) { + // wait + dout(7) << "traverse: waiting for single auth in " << *curdir << endl; + curdir->add_waiter(CDir::WAIT_SINGLEAUTH, ondelay); + if (onfinish) delete onfinish; + return 1; + } else { + dout(7) << "traverse: forwarding, not auth for " << *curdir << endl; - if (is_client_req && curdir->is_rep()) { - dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " << *curdir << " req " << *(MClientRequest*)req << endl; - ((MClientRequest*)req)->set_mds_wants_replica_in_dirino(curdir->ino()); - req->clear_payload(); // reencode! - } - - mds->forward_message_mds(req, dauth.first, req->get_dest_port()); - //show_subtrees(); - - if (mds->logger) mds->logger->inc("cfw"); - if (onfinish) delete onfinish; - delete ondelay; - return 2; + // request replication? + if (is_client_req && curdir->is_rep()) { + dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " + << *curdir << " req " << *(MClientRequest*)req << endl; + ((MClientRequest*)req)->set_mds_wants_replica_in_dirino(curdir->ino()); + req->clear_payload(); // reencode! + } + + mds->forward_message_mds(req, dauth.first, req->get_dest_port()); + + if (mds->logger) mds->logger->inc("cfw"); + if (onfinish) delete onfinish; + delete ondelay; + return 2; + } } if (onfail == MDS_TRAVERSE_FAIL) { delete ondelay; @@ -2893,6 +2908,51 @@ void MDCache::request_pin_dir(Message *req, CDir *dir) } } +void MDCache::request_auth_pin(Message *req, CDir *dir) +{ + if (active_requests[req].dir_auth_pins.count(dir) == 0) { + dir->auth_pin(); + active_requests[req].dir_auth_pins.insert(dir); + } +} + +void MDCache::request_auth_pin(Message *req, CInode *in) +{ + if (active_requests[req].inode_auth_pins.count(in) == 0) { + in->auth_pin(); + active_requests[req].inode_auth_pins.insert(in); + } +} + +bool MDCache::request_auth_pinned(Message *req, CDir *dir) +{ + return active_requests[req].dir_auth_pins.count(dir); +} + +bool MDCache::request_auth_pinned(Message *req, CInode *in) +{ + return active_requests[req].inode_auth_pins.count(in); +} + +void MDCache::request_drop_auth_pins(Message *req) +{ + // dirs + for (set::iterator p = active_requests[req].dir_auth_pins.begin(); + p != active_requests[req].dir_auth_pins.end(); + ++p) + (*p)->auth_unpin(); + active_requests[req].dir_auth_pins.clear(); + + // inodes + for (set::iterator p = active_requests[req].inode_auth_pins.begin(); + p != active_requests[req].inode_auth_pins.end(); + ++p) + (*p)->auth_unpin(); + active_requests[req].inode_auth_pins.clear(); +} + + + void MDCache::request_cleanup(Message *req) { @@ -2961,6 +3021,10 @@ void MDCache::request_cleanup(Message *req) (*it)->request_pin_put(); } + // auth pins + request_drop_auth_pins(req); + + // remove from map active_requests.erase(req); diff --git a/branches/sage/cephmds2/mds/MDCache.h b/branches/sage/cephmds2/mds/MDCache.h index 66cb2ebac8e..9f8bd395ed0 100644 --- a/branches/sage/cephmds2/mds/MDCache.h +++ b/branches/sage/cephmds2/mds/MDCache.h @@ -71,6 +71,8 @@ typedef struct { map< CDentry*, vector > traces; // path pins held set< CDentry* > xlocks; // xlocks (local) set< CDentry* > foreign_xlocks; // xlocks on foreign hosts + set< CDir* > dir_auth_pins; + set< CInode* > inode_auth_pins; } active_request_t; namespace __gnu_cxx { @@ -348,6 +350,11 @@ public: void request_forward(Message *req, int mds, int port=0); void request_pin_inode(Message *req, CInode *in); void request_pin_dir(Message *req, CDir *dir); + void request_auth_pin(Message *req, CDir *dir); + void request_auth_pin(Message *req, CInode *in); + bool request_auth_pinned(Message *req, CDir *dir); + bool request_auth_pinned(Message *req, CInode *in); + void request_drop_auth_pins(Message *req); // -- anchors -- public: diff --git a/branches/sage/cephmds2/mds/Migrator.cc b/branches/sage/cephmds2/mds/Migrator.cc index 020967bc543..8ee0b06dae7 100644 --- a/branches/sage/cephmds2/mds/Migrator.cc +++ b/branches/sage/cephmds2/mds/Migrator.cc @@ -1910,7 +1910,7 @@ int Migrator::decode_import_dir(bufferlist& bl, } // add dentry to journal entry - if (le) + if (le) le->metablob.add_dentry(dn, true); // Hmm: might we do dn->is_dirty() here instead? } diff --git a/branches/sage/cephmds2/mds/Server.cc b/branches/sage/cephmds2/mds/Server.cc index 084b02d4384..3041505ce66 100644 --- a/branches/sage/cephmds2/mds/Server.cc +++ b/branches/sage/cephmds2/mds/Server.cc @@ -443,6 +443,8 @@ void Server::dispatch_request(Message *m, CInode *ref) // MClientRequest. + dout(7) << "handle_client " << *m << " ref " << *ref << endl; + switch (req->get_op()) { // files @@ -651,6 +653,14 @@ public: void Server::handle_client_utime(MClientRequest *req, CInode *cur) { + // auth pin + if (!cur->can_auth_pin()) { + dout(7) << "waiting for authpinnable on " << *cur << endl; + cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); + return; + } + mdcache->request_auth_pin(req, cur); + // write if (!mds->locker->inode_file_write_start(cur, req)) return; // fw or (wait for) sync @@ -716,6 +726,14 @@ public: void Server::handle_client_chmod(MClientRequest *req, CInode *cur) { + // auth pin + if (!cur->can_auth_pin()) { + dout(7) << "waiting for authpinnable on " << *cur << endl; + cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); + return; + } + mdcache->request_auth_pin(req, cur); + // write if (!mds->locker->inode_hard_write_start(cur, req)) return; // fw or (wait for) lock @@ -774,6 +792,14 @@ public: void Server::handle_client_chown(MClientRequest *req, CInode *cur) { + // auth pin + if (!cur->can_auth_pin()) { + dout(7) << "waiting for authpinnable on " << *cur << endl; + cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); + return; + } + mdcache->request_auth_pin(req, cur); + // write if (!mds->locker->inode_hard_write_start(cur, req)) return; // fw or (wait for) lock @@ -943,16 +969,21 @@ public: void Server::handle_client_mknod(MClientRequest *req, CInode *diri) { CDir *dir = 0; - CInode *newi = 0; CDentry *dn = 0; - - // make dentry and inode, xlock dentry. - if (!prepare_mknod(req, diri, &dir, &newi, &dn)) + + // create null dentry + if (!prepare_null_dentry(req, diri, &dir, &dn)) return; assert(dir); - assert(newi); assert(dn); + // xlock dentry + if (!mds->locker->dentry_xlock_start(dn, req, diri)) + return; + + CInode *newi = prepare_new_inode(req, dir); + assert(newi); + // it's a file. dn->pre_dirty(); newi->inode.mode = req->args.mknod.mode; @@ -973,7 +1004,8 @@ void Server::handle_client_mknod(MClientRequest *req, CInode *diri) -/* +/** validate_dentry_dir + * * verify that the dir exists and would own the dname. * do not check if the dentry exists. */ @@ -993,6 +1025,7 @@ CDir *Server::validate_dentry_dir(MClientRequest *req, CInode *diri, string& nam if (!dir) return 0; + /* // dir auth pinnable? if (!dir->can_auth_pin()) { dout(7) << "validate_dentry_dir: dir " << *dir << " not pinnable, waiting" << endl; @@ -1000,6 +1033,7 @@ CDir *Server::validate_dentry_dir(MClientRequest *req, CInode *diri, string& nam new C_MDS_RetryRequest(mds, req, diri)); return false; } + */ // frozen? if (dir->is_frozen()) { @@ -1012,7 +1046,8 @@ CDir *Server::validate_dentry_dir(MClientRequest *req, CInode *diri, string& nam return dir; } -/* +/** prepare_null_dentry + * * prepare a mknod-type operation (mknod, mkdir, symlink, open+create). * create the inode and dentry, but do not link them. * pre_dirty the dentry+dir. @@ -1023,11 +1058,11 @@ CDir *Server::validate_dentry_dir(MClientRequest *req, CInode *diri, string& nam * 1 - created * 2 - already exists (only if okexist=true) */ -int Server::prepare_mknod(MClientRequest *req, CInode *diri, - CDir **pdir, CInode **pin, CDentry **pdn, - bool okexist) +int Server::prepare_null_dentry(MClientRequest *req, CInode *diri, + CDir **pdir, CDentry **pdn, + bool okexist) { - dout(10) << "prepare_mknod " << req->get_filepath() << " in " << *diri << endl; + dout(10) << "prepare_null_dentry " << req->get_filepath() << " in " << *diri << endl; // get containing directory (without last bit) filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1); @@ -1049,7 +1084,6 @@ int Server::prepare_mknod(MClientRequest *req, CInode *diri, // name already exists if (okexist) { dout(10) << "dentry " << name << " exists in " << *dir << endl; - *pin = (*pdn)->inode; return 2; } else { dout(10) << "dentry " << name << " exists in " << *dir << endl; @@ -1067,29 +1101,34 @@ int Server::prepare_mknod(MClientRequest *req, CInode *diri, } // create null dentry - if (!*pdn) + if (!*pdn) { *pdn = dir->add_dentry(name, 0); - - // xlock dentry - bool res = mds->locker->dentry_xlock_start(*pdn, req, diri); - if (!res) - return 0; - - // yay! - - // create inode? - if (pin) { - *pin = mdcache->create_inode(); - (*pin)->inode.uid = req->get_caller_uid(); - (*pin)->inode.gid = req->get_caller_gid(); - (*pin)->inode.ctime = (*pin)->inode.mtime = (*pin)->inode.atime = g_clock.gettime(); // now - // note: inode.version will get set by finisher's mark_dirty. + dout(10) << "prepare_null_dentry added " << **pdn << endl; + } else { + dout(10) << "prepare_null_dentry had " << **pdn << endl; } + + return 1; +} + + +/** prepare_new_inode + * + * create a new inode. set c/m/atime. hit dir pop. + */ +CInode* Server::prepare_new_inode(MClientRequest *req, CDir *dir) +{ + CInode *in = mdcache->create_inode(); + in->inode.uid = req->get_caller_uid(); + in->inode.gid = req->get_caller_gid(); + in->inode.ctime = in->inode.mtime = in->inode.atime = g_clock.gettime(); // now + dout(10) << "prepare_new_inode " << *in << endl; + // bump modify pop mds->balancer->hit_dir(dir, META_POP_DWR); - return 1; + return in; } @@ -1101,16 +1140,22 @@ int Server::prepare_mknod(MClientRequest *req, CInode *diri, void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) { CDir *dir = 0; - CInode *newi = 0; CDentry *dn = 0; - // make dentry and inode, xlock dentry. - if (!prepare_mknod(req, diri, &dir, &newi, &dn)) + // make dentry + if (!prepare_null_dentry(req, diri, &dir, &dn)) return; assert(dir); - assert(newi); assert(dn); + // xlock + if (!mds->locker->dentry_xlock_start(dn, req, diri)) + return; + + // new inode + CInode *newi = prepare_new_inode(req, dir); + assert(newi); + // it's a directory. dn->pre_dirty(); newi->inode.mode = req->args.mkdir.mode; @@ -1158,16 +1203,21 @@ void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) void Server::handle_client_symlink(MClientRequest *req, CInode *diri) { CDir *dir = 0; - CInode *newi = 0; CDentry *dn = 0; - // make dentry and inode, xlock dentry. - if (!prepare_mknod(req, diri, &dir, &newi, &dn)) + // make null dentry + if (!prepare_null_dentry(req, diri, &dir, &dn)) return; assert(dir); - assert(newi); assert(dn); + // xlock + if (!mds->locker->dentry_xlock_start(dn, req, diri)) + return; + + CInode *newi = prepare_new_inode(req, dir); + assert(newi); + // it's a symlink dn->pre_dirty(); newi->inode.mode &= ~INODE_TYPE_MASK; @@ -1210,11 +1260,12 @@ public: void Server::handle_client_link(MClientRequest *req, CInode *ref) { - // figure out name string dname = req->get_filepath().last_dentry(); - dout(7) << "handle_client_link dname is " << dname << endl; - - // validate dir + dout(7) << "handle_client_link " << dname << " in " << *ref + << " to " << req->get_sarg() + << endl; + + // make sure we own the dname CDir *dir = validate_dentry_dir(req, ref, dname); if (!dir) return; @@ -1246,7 +1297,7 @@ void Server::handle_client_link_2(int r, MClientRequest *req, CInode *diri, vect if (trace.size()) targeti = trace[trace.size()-1]->inode; assert(targeti); - // dir? + // not a dir? dout(7) << "target is " << *targeti << endl; if (targeti->is_dir()) { dout(7) << "target is a dir, failing" << endl; @@ -1254,25 +1305,38 @@ void Server::handle_client_link_2(int r, MClientRequest *req, CInode *diri, vect return; } + // does the target need an anchor? + if (targeti->is_auth()) { + if (targeti->get_parent_dir()->get_inode() == diri) { + dout(7) << "target is in the same dir, sweet" << endl; + } + else if (targeti->is_anchored() && !targeti->is_unanchoring()) { + dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl; + } else { + dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl; + + mdcache->anchor_create(targeti, + new C_MDS_RetryRequest(mds, req, diri)); + return; + } + } + // can we create the dentry? CDir *dir = 0; CDentry *dn = 0; // make dentry and inode, xlock dentry. - r = prepare_mknod(req, diri, &dir, 0, &dn); + r = prepare_null_dentry(req, diri, &dir, &dn); if (!r) return; // wait on something assert(dir); assert(dn); - // ok! - assert(dn->is_xlockedbyme(req)); - // local or remote? if (targeti->is_auth()) - link_local(req, diri, dn, targeti); + _link_local(req, diri, dn, targeti); else - link_remote(req, diri, dn, targeti); + _link_remote(req, diri, dn, targeti); } @@ -1297,28 +1361,37 @@ public: }; -void Server::link_local(MClientRequest *req, CInode *diri, +void Server::_link_local(MClientRequest *req, CInode *diri, CDentry *dn, CInode *targeti) { - dout(10) << "link_local " << *dn << " to " << *targeti << endl; + dout(10) << "_link_local " << *dn << " to " << *targeti << endl; - // anchor target? - if (targeti->get_parent_dir() == dn->get_dir()) { - dout(7) << "target is in the same dir, sweet" << endl; - } - else if (targeti->is_anchored() && !targeti->is_unanchoring()) { - dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl; - } else { - dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl; - - mdcache->anchor_create(targeti, - new C_MDS_RetryRequest(mds, req, diri)); + // first, auth pin the dentry dir and targeti. + if (!mdcache->request_auth_pinned(req, dn->get_dir()) && + !dn->get_dir()->can_auth_pin()) { + dn->get_dir()->add_waiter(CDir::WAIT_AUTHPINNABLE, + new C_MDS_RetryRequest(mds, req, diri)); + mdcache->request_drop_auth_pins(req); return; } + if (!mdcache->request_auth_pinned(req, targeti) && + !targeti->can_auth_pin()) { + targeti->add_waiter(CDir::WAIT_AUTHPINNABLE, + new C_MDS_RetryRequest(mds, req, diri)); + mdcache->request_drop_auth_pins(req); + return; + } + mdcache->request_auth_pin(req, dn->get_dir()); + mdcache->request_auth_pin(req, targeti); + + // sweet. let's get our locks. + // lock dentry + if (!mds->locker->dentry_xlock_start(dn, req, diri)) + return; - // wrlock the target inode + // lock target inode if (!mds->locker->inode_hard_write_start(targeti, req)) - return; // fw or (wait for) lock + return; // ok, let's do it. // prepare log entry @@ -1376,10 +1449,10 @@ void Server::_link_local_finish(MClientRequest *req, CDentry *dn, CInode *target -void Server::link_remote(MClientRequest *req, CInode *ref, +void Server::_link_remote(MClientRequest *req, CInode *ref, CDentry *dn, CInode *targeti) { - dout(10) << "link_remote " << *dn << " to " << *targeti << endl; + dout(10) << "_link_remote " << *dn << " to " << *targeti << endl; // pin the target replica in our cache assert(!targeti->is_auth()); @@ -1467,8 +1540,7 @@ public: // UNLINK -void Server::handle_client_unlink(MClientRequest *req, - CInode *diri) +void Server::handle_client_unlink(MClientRequest *req, CInode *diri) { // rmdir or unlink? bool rmdir = false; @@ -1583,10 +1655,6 @@ void Server::handle_client_unlink(MClientRequest *req, return; } - // xlock dentry - if (!mds->locker->dentry_xlock_start(dn, req, diri)) - return; - mds->balancer->hit_dir(dn->dir, META_POP_DWR); // ok! @@ -1622,14 +1690,40 @@ public: void Server::_unlink_local(MClientRequest *req, CDentry *dn, CInode *in) { dout(10) << "_unlink_local " << *dn << endl; - + // if we're not the only link, wrlock the target (we need to nlink--) if (in->inode.nlink > 1) { assert(dn->is_remote()); // unlinking primary is handled like a rename.. not here + dout(10) << "_unlink_local nlink>1, will wrlock " << *in << endl; - dout(10) << "_unlink_local nlink>1, wrlocking " << *in << endl; + // auth pin + if (!dn->get_dir()->can_auth_pin()) { + dn->get_dir()->add_waiter(CDir::WAIT_AUTHPINNABLE, + new C_MDS_RetryRequest(mds, req, dn->get_dir()->get_inode())); + mdcache->request_drop_auth_pins(req); + return; + } + if (!in->can_auth_pin()) { + in->add_waiter(CInode::WAIT_AUTHPINNABLE, + new C_MDS_RetryRequest(mds, req, dn->get_dir()->get_inode())); + mdcache->request_drop_auth_pins(req); + return; + } + mdcache->request_auth_pin(req, dn->get_dir()); + mdcache->request_auth_pin(req, in); + + // lock + if (!mds->locker->dentry_xlock_start(dn, req, dn->get_dir()->get_inode())) + return; if (!mds->locker->inode_hard_write_start(in, req)) - return; // fw or (wait for) lock + return; + } else { + // the inode will go away. + dout(10) << "_unlink_local nlink==1, will destroy " << *in << endl; + + // just xlock dentry. + if (!mds->locker->dentry_xlock_start(dn, req, dn->get_dir()->get_inode())) + return; } // ok, let's do it. @@ -2308,6 +2402,14 @@ void Server::handle_client_rename_local(MClientRequest *req, void Server::handle_client_truncate(MClientRequest *req, CInode *cur) { + // auth pin + if (!cur->can_auth_pin()) { + dout(7) << "waiting for authpinnable on " << *cur << endl; + cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); + return; + } + mdcache->request_auth_pin(req, cur); + // write if (!mds->locker->inode_file_write_start(cur, req)) return; // fw or (wait for) lock @@ -2335,8 +2437,7 @@ void Server::handle_client_truncate(MClientRequest *req, CInode *cur) // =========================== // open, openc, close -void Server::handle_client_open(MClientRequest *req, - CInode *cur) +void Server::handle_client_open(MClientRequest *req, CInode *cur) { int flags = req->args.open.flags; int mode = req->args.open.mode; @@ -2364,12 +2465,20 @@ void Server::handle_client_open(MClientRequest *req, // O_TRUNC if (flags & O_TRUNC) { + // auth pin + if (!cur->can_auth_pin()) { + dout(7) << "waiting for authpinnable on " << *cur << endl; + cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); + return; + } + mdcache->request_auth_pin(req, cur); + // write if (!mds->locker->inode_file_write_start(cur, req)) return; // fw or (wait for) lock // do update - cur->inode.size = req->get_sizearg(); + cur->inode.size = 0; cur->_mark_dirty(); // fixme mds->locker->inode_file_write_finish(cur); @@ -2433,20 +2542,27 @@ void Server::handle_client_openc(MClientRequest *req, CInode *diri) dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl; CDir *dir = 0; - CInode *in = 0; CDentry *dn = 0; // make dentry and inode, xlock dentry. bool excl = (req->args.open.flags & O_EXCL); - int r = prepare_mknod(req, diri, &dir, &in, &dn, !excl); // okexist = !excl - if (r <= 0) - return; // wait on something + int r = prepare_null_dentry(req, diri, &dir, &dn, !excl); // okexist = !excl + if (r == 0) return; // wait on something assert(dir); - assert(in); assert(dn); + if (r == 1) { - // created. + // created null dn. + + // xlock + if (!mds->locker->dentry_xlock_start(dn, req, diri)) + return; + + // create inode. + CInode *in = prepare_new_inode(req, dir); + assert(in); + // it's a file. dn->pre_dirty(); in->inode.mode = 0644; // FIXME req should have a umask @@ -2469,16 +2585,33 @@ void Server::handle_client_openc(MClientRequest *req, CInode *diri) */ } else { // exists! - + // O_EXCL? if (req->args.open.flags & O_EXCL) { // fail. dout(10) << "O_EXCL, target exists, failing with -EEXIST" << endl; - reply_request(req, -EEXIST, in); + reply_request(req, -EEXIST, diri); return; } - - // FIXME: do i need to repin path based existant inode? hmm. + + // get inode + CInode *in = dn->inode; + if (!in) { + assert(dn->is_remote()); + in = mdcache->get_inode(dn->get_remote_ino()); + if (in) { + dout(7) << "linking in remote in " << *in << endl; + dn->link_remote(in); + } else { + dout(10) << "remote dn, opening inode for " << *dn << endl; + mdcache->open_remote_ino(dn->get_remote_ino(), req, + new C_MDS_RetryRequest(mds, req, diri)); + return; + } + } + assert(in); + + // FIXME: do i need to repin path based existent inode? hmm. handle_client_open(req, in); } } diff --git a/branches/sage/cephmds2/mds/Server.h b/branches/sage/cephmds2/mds/Server.h index d70a5dbeea7..7e67d434988 100644 --- a/branches/sage/cephmds2/mds/Server.h +++ b/branches/sage/cephmds2/mds/Server.h @@ -96,13 +96,13 @@ public: // link void handle_client_link(MClientRequest *req, CInode *ref); void handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector& trace); - void link_local(MClientRequest *req, CInode *diri, - CDentry *dn, CInode *targeti); + void _link_local(MClientRequest *req, CInode *diri, + CDentry *dn, CInode *targeti); void _link_local_finish(MClientRequest *req, CDentry *dn, CInode *targeti, version_t, time_t, version_t); - void link_remote(MClientRequest *req, CInode *diri, - CDentry *dn, CInode *targeti); + void _link_remote(MClientRequest *req, CInode *diri, + CDentry *dn, CInode *targeti); // unlink void handle_client_unlink(MClientRequest *req, CInode *ref); @@ -129,21 +129,19 @@ public: // file - void handle_client_open(MClientRequest *req, CInode *ref); - void handle_client_openc(MClientRequest *req, CInode *ref); + void handle_client_open(MClientRequest *req, CInode *in); + void handle_client_openc(MClientRequest *req, CInode *diri); void handle_client_release(MClientRequest *req, CInode *in); void handle_client_truncate(MClientRequest *req, CInode *in); void handle_client_fsync(MClientRequest *req, CInode *in); // some helpers - CInode *mknod(MClientRequest *req, CInode *ref, bool okexist=false); // used by mknod, symlink, mkdir, openc - CDir *validate_dentry_dir(MClientRequest *req, CInode *diri, string& dname); - int prepare_mknod(MClientRequest *req, CInode *diri, - CDir **pdir, CInode **pin, CDentry **pdn, - bool okexist=false); - + int prepare_null_dentry(MClientRequest *req, CInode *diri, + CDir **pdir, CDentry **pdn, + bool okexist=false); + CInode *prepare_new_inode(MClientRequest *req, CDir *dir); }; diff --git a/branches/sage/cephmds2/mds/events/EMetaBlob.h b/branches/sage/cephmds2/mds/events/EMetaBlob.h index 0820a3f45c0..1286202777a 100644 --- a/branches/sage/cephmds2/mds/events/EMetaBlob.h +++ b/branches/sage/cephmds2/mds/events/EMetaBlob.h @@ -287,10 +287,12 @@ class EMetaBlob { if (dn->is_remote()) { add_remote_dentry(dn, dirty); return 0; - } else { - assert(dn->is_primary()); - return add_primary_dentry(dn, dirty); + } else if (dn->is_null()) { + add_null_dentry(dn, dirty); + return 0; } + assert(dn->is_primary()); + return add_primary_dentry(dn, dirty); }