// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef __CINODE_H
#define __CINODE_H

#include "config.h"
#include "include/types.h"
#include "include/lru.h"

#include "mdstypes.h"

#include "CDentry.h"
#include "Lock.h"
#include "Capability.h"

// NOTE(review): the names of these six system headers were lost (everything
// between angle brackets was stripped from this copy of the file).  They are
// restored from the facilities this header uses (assert, list, vector, set,
// map/multimap, ostream) -- confirm against the upstream file.
#include <cassert>
#include <list>
#include <vector>
#include <set>
#include <map>
#include <iostream>
using namespace std;


// wait reasons
#define CINODE_WAIT_AUTHPINNABLE  CDIR_WAIT_UNFREEZE
    // waiters: write_hard_start, read_file_start, write_file_start  (mdcache)
    //          handle_client_chmod, handle_client_touch             (mds)
    // trigger: (see CDIR_WAIT_UNFREEZE)
#define CINODE_WAIT_GETREPLICA    (1<<11)  // update/replicate individual inode
    // waiters: import_dentry_inode
    // trigger: handle_inode_replicate_ack

#define CINODE_WAIT_DIR           (1<<13)
    // waiters: traverse_path
    // triggers: handle_discover_reply

#define CINODE_WAIT_LINK          (1<<14)  // as in remotely nlink++
#define CINODE_WAIT_ANCHORED      (1<<15)
#define CINODE_WAIT_UNLINK        (1<<16)  // as in remotely nlink--
#define CINODE_WAIT_HARDR         (1<<17)  // 131072
#define CINODE_WAIT_HARDW         (1<<18)  // 262144
#define CINODE_WAIT_HARDB (1<<19) #define CINODE_WAIT_HARDRWB (CINODE_WAIT_HARDR|CINODE_WAIT_HARDW|CINODE_WAIT_HARDB) #define CINODE_WAIT_HARDSTABLE (1<<20) #define CINODE_WAIT_HARDNORD (1<<21) #define CINODE_WAIT_FILER (1<<22) #define CINODE_WAIT_FILEW (1<<23) #define CINODE_WAIT_FILEB (1<<24) #define CINODE_WAIT_FILERWB (CINODE_WAIT_FILER|CINODE_WAIT_FILEW|CINODE_WAIT_FILEB) #define CINODE_WAIT_FILESTABLE (1<<25) #define CINODE_WAIT_FILENORD (1<<26) #define CINODE_WAIT_FILENOWR (1<<27) #define CINODE_WAIT_RENAMEACK (1<<28) #define CINODE_WAIT_RENAMENOTIFYACK (1<<29) #define CINODE_WAIT_CAPS (1<<30) #define CINODE_WAIT_ANY 0xffffffff // misc #define CINODE_EXPORT_NONCE 1 // nonce given to replicas created by export #define CINODE_HASHREPLICA_NONCE 1 // hashed inodes that are duped ???FIXME??? class Context; class CDentry; class CDir; class Message; class CInode; class CInodeDiscover; class MDCache; ostream& operator<<(ostream& out, CInode& in); // cached inode wrapper class CInode : public MDSCacheObject { public: // -- pins -- static const int PIN_CACHED = 1; static const int PIN_DIR = 2; static const int PIN_DIRTY = 4; // must flush static const int PIN_PROXY = 5; // can't expire yet static const int PIN_WAITER = 6; // waiter static const int PIN_CAPS = 7; // local fh's static const int PIN_AUTHPIN = 8; static const int PIN_IMPORTING = 9; // multipurpose, for importing static const int PIN_REQUEST = 10; // request is logging, finishing static const int PIN_RENAMESRC = 11; // pinned on dest for foreign rename static const int PIN_ANCHORING = 12; static const int PIN_OPENINGDIR = 13; static const int PIN_DENTRYLOCK = 14; static const char *pin_name(int p) { switch (p) { case PIN_CACHED: return "cached"; case PIN_DIR: return "dir"; case PIN_DIRTY: return "dirty"; case PIN_PROXY: return "proxy"; case PIN_WAITER: return "waiter"; case PIN_CAPS: return "caps"; case PIN_AUTHPIN: return "authpin"; case PIN_IMPORTING: return "importing"; case PIN_REQUEST: return 
"request"; case PIN_RENAMESRC: return "renamesrc"; case PIN_ANCHORING: return "anchoring"; case PIN_OPENINGDIR: return "openingdir"; case PIN_DENTRYLOCK: return "dentrylock"; default: assert(0); } } // state static const int STATE_AUTH = (1<<0); static const int STATE_ROOT = (1<<1); static const int STATE_DIRTY = (1<<2); static const int STATE_UNSAFE = (1<<3); // not logged yet static const int STATE_DANGLING = (1<<4); // delete me when i expire; i have no dentry static const int STATE_UNLINKING = (1<<5); static const int STATE_PROXY = (1<<6); // can't expire yet static const int STATE_EXPORTING = (1<<7); // on nonauth bystander. static const int STATE_ANCHORING = (1<<8); static const int STATE_OPENINGDIR = (1<<9); //static const int STATE_RENAMING = (1<<8); // moving me //static const int STATE_RENAMINGTO = (1<<9); // rename target (will be unlinked) public: MDCache *mdcache; inode_t inode; // the inode itself CDir *dir; // directory, if we have it opened. string symlink; // symlink dest, if symlink protected: // parent dentries in cache int num_parents; CDentry *parent; // primary link set remote_parents; // if hard linked // -- distributed caching int dangling_auth; // explicit auth, when dangling. 
int num_request_pins; // waiters multimap waiting; // -- distributed state -- public: // inode metadata locks CLock hardlock; CLock filelock; protected: // file capabilities map client_caps; // client -> caps map mds_caps_wanted; // [auth] mds -> caps wanted int replica_caps_wanted; // [replica] what i've requested from auth utime_t replica_caps_wanted_keep_until; private: // lock nesting int auth_pins; int nested_auth_pins; public: meta_load_t popularity[MDS_NPOP]; // friends friend class Server; friend class Locker; friend class Migrator; friend class MDCache; friend class CDir; friend class CInodeExport; friend class CInodeDiscover; public: // --------------------------- CInode(MDCache *c, bool auth=true); ~CInode(); // -- accessors -- bool is_file() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_FILE) ? true:false; } bool is_symlink() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) ? true:false; } bool is_dir() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR) ? 
true:false; } bool is_anchored() { return inode.anchored; } bool is_root() { return state & STATE_ROOT; } bool is_proxy() { return state & STATE_PROXY; } bool is_auth() { return state & STATE_AUTH; } void set_auth(bool auth); inodeno_t ino() { return inode.ino; } inode_t& get_inode() { return inode; } CDentry* get_parent_dn() { return parent; } CDir *get_parent_dir(); CInode *get_parent_inode(); CInode *get_realm_root(); // import, hash, or root CDir *get_or_open_dir(MDCache *mdcache); CDir *set_dir(CDir *newdir); void close_dir(); bool dir_is_auth(); // -- misc -- void make_path(string& s); void make_anchor_trace(vector& trace); // -- state -- bool is_unsafe() { return state & STATE_UNSAFE; } bool is_dangling() { return state & STATE_DANGLING; } bool is_unlinking() { return state & STATE_UNLINKING; } void mark_unsafe() { state |= STATE_UNSAFE; } void mark_safe() { state &= ~STATE_UNSAFE; } // -- state encoding -- //void encode_basic_state(bufferlist& r); //void decode_basic_state(bufferlist& r, int& off); void encode_file_state(bufferlist& r); void decode_file_state(bufferlist& r, int& off); void encode_hard_state(bufferlist& r); void decode_hard_state(bufferlist& r, int& off); // -- dirtyness -- version_t get_version() { return inode.version; } bool is_dirty() { return state & STATE_DIRTY; } bool is_clean() { return !is_dirty(); } version_t pre_dirty(); void _mark_dirty(); void mark_dirty(version_t projected_dirv); void mark_clean(); CInodeDiscover* replicate_to(int rep); // -- waiting -- bool waiting_for(int tag); void add_waiter(int tag, Context *c); void take_waiting(int tag, list& ls); void finish_waiting(int mask, int result = 0); bool is_hardlock_write_wanted() { return waiting_for(CINODE_WAIT_HARDW); } bool is_filelock_write_wanted() { return waiting_for(CINODE_WAIT_FILEW); } // -- caps -- (new) // client caps map& get_client_caps() { return client_caps; } void add_client_cap(int client, Capability& cap) { if (client_caps.empty()) get(PIN_CAPS); 
assert(client_caps.count(client) == 0); client_caps[client] = cap; } void remove_client_cap(int client) { assert(client_caps.count(client) == 1); client_caps.erase(client); if (client_caps.empty()) put(PIN_CAPS); } Capability* get_client_cap(int client) { if (client_caps.count(client)) return &client_caps[client]; return 0; } /* void set_client_caps(map& cl) { if (client_caps.empty() && !cl.empty()) get(PIN_CAPS); client_caps.clear(); client_caps = cl; } */ void take_client_caps(map& cl) { if (!client_caps.empty()) put(PIN_CAPS); cl = client_caps; client_caps.clear(); } void merge_client_caps(map& cl, set& new_client_caps) { if (client_caps.empty() && !cl.empty()) get(PIN_CAPS); for (map::iterator it = cl.begin(); it != cl.end(); it++) { new_client_caps.insert(it->first); if (client_caps.count(it->first)) { // merge client_caps[it->first].merge(it->second); } else { // new client_caps[it->first] = it->second; } } } // caps issued, wanted int get_caps_issued() { int c = 0; for (map::iterator it = client_caps.begin(); it != client_caps.end(); it++) c |= it->second.issued(); return c; } int get_caps_wanted() { int w = 0; for (map::iterator it = client_caps.begin(); it != client_caps.end(); it++) { w |= it->second.wanted(); //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl; } if (is_auth()) for (map::iterator it = mds_caps_wanted.begin(); it != mds_caps_wanted.end(); it++) { w |= it->second; //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl; } return w; } void replicate_relax_locks() { assert(is_auth()); assert(!is_replicated()); dout(10) << " relaxing locks on " << *this << endl; if (hardlock.get_state() == LOCK_LOCK && !hardlock.is_used()) { dout(10) << " hard now sync " << *this << endl; hardlock.set_state(LOCK_SYNC); } if (filelock.get_state() == LOCK_LOCK) { if (!filelock.is_used() && (get_caps_issued() & CAP_FILE_WR) == 0) { filelock.set_state(LOCK_SYNC); dout(10) << " file 
now sync " << *this << endl; } else { dout(10) << " can't relax filelock on " << *this << endl; } } } // -- authority -- int authority(); // -- auth pins -- int is_auth_pinned() { return auth_pins; } int adjust_nested_auth_pins(int a); bool can_auth_pin(); void auth_pin(); void auth_unpin(); // -- freeze -- bool is_frozen(); bool is_frozen_dir(); bool is_freezing(); // -- reference counting -- /* these can be pinned any # of times, and are linked to an active_request, so they're automatically cleaned up when a request is finished. pin at will! */ void request_pin_get() { if (num_request_pins == 0) get(PIN_REQUEST); num_request_pins++; } void request_pin_put() { num_request_pins--; if (num_request_pins == 0) put(PIN_REQUEST); assert(num_request_pins >= 0); } void bad_put(int by) { dout(7) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl; assert(ref_set.count(by) == 1); assert(ref > 0); } void bad_get(int by) { dout(7) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl; assert(ref_set.count(by) == 0); } void first_get(); void last_put(); // -- hierarchy stuff -- private: void get_parent(); void put_parent(); public: void set_primary_parent(CDentry *p) { assert(parent == 0); parent = p; get_parent(); } void remove_primary_parent(CDentry *dn) { assert(dn == parent); parent = 0; put_parent(); } void add_remote_parent(CDentry *p) { if (remote_parents.empty()) get_parent(); remote_parents.insert(p); } void remove_remote_parent(CDentry *p) { remote_parents.erase(p); if (remote_parents.empty()) put_parent(); } int num_remote_parents() { return remote_parents.size(); } /* // for giving to clients void get_dist_spec(set& ls, int auth, timepair_t& now) { if (( is_dir() && popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold) || (!is_dir() && popularity[MDS_POP_JUSTME].get(now) > g_conf.mds_bal_replicate_threshold)) { //if 
(!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; ls = cached_by; } } */ // dbg void dump(int d = 0); }; // -- encoded state // discover class CInodeDiscover { inode_t inode; int replica_nonce; int hardlock_state; int filelock_state; public: CInodeDiscover() {} CInodeDiscover(CInode *in, int nonce) { inode = in->inode; replica_nonce = nonce; hardlock_state = in->hardlock.get_replica_state(); filelock_state = in->filelock.get_replica_state(); } inodeno_t get_ino() { return inode.ino; } int get_replica_nonce() { return replica_nonce; } void update_inode(CInode *in) { in->inode = inode; in->replica_nonce = replica_nonce; in->hardlock.set_state(hardlock_state); in->filelock.set_state(filelock_state); } void _encode(bufferlist& bl) { bl.append((char*)&inode, sizeof(inode)); bl.append((char*)&replica_nonce, sizeof(replica_nonce)); bl.append((char*)&hardlock_state, sizeof(hardlock_state)); bl.append((char*)&filelock_state, sizeof(filelock_state)); } void _decode(bufferlist& bl, int& off) { bl.copy(off,sizeof(inode_t), (char*)&inode); off += sizeof(inode_t); bl.copy(off, sizeof(int), (char*)&replica_nonce); off += sizeof(int); bl.copy(off, sizeof(hardlock_state), (char*)&hardlock_state); off += sizeof(hardlock_state); bl.copy(off, sizeof(filelock_state), (char*)&filelock_state); off += sizeof(filelock_state); } }; // export class CInodeExport { struct { inode_t inode; meta_load_t popularity_justme; meta_load_t popularity_curdom; bool is_dirty; // dirty inode? 
int num_caps; } st; map replicas; map cap_map; CLock hardlock,filelock; //int remaining_issued; public: CInodeExport() {} CInodeExport(CInode *in) { st.inode = in->inode; st.is_dirty = in->is_dirty(); replicas = in->replicas; hardlock = in->hardlock; filelock = in->filelock; st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] ); st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] ); in->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom; in->popularity[MDS_POP_NESTED] -= st.popularity_curdom; // steal WRITER caps from inode in->take_client_caps(cap_map); //remaining_issued = in->get_caps_issued(); } ~CInodeExport() { } inodeno_t get_ino() { return st.inode.ino; } void update_inode(CInode *in, set& new_client_caps) { in->inode = st.inode; in->popularity[MDS_POP_JUSTME] += st.popularity_justme; in->popularity[MDS_POP_CURDOM] += st.popularity_curdom; in->popularity[MDS_POP_ANYDOM] += st.popularity_curdom; in->popularity[MDS_POP_NESTED] += st.popularity_curdom; if (st.is_dirty) in->_mark_dirty(); in->replicas = replicas; if (!replicas.empty()) in->get(CInode::PIN_CACHED); in->hardlock = hardlock; in->filelock = filelock; // caps in->merge_client_caps(cap_map, new_client_caps); } void _encode(bufferlist& bl) { st.num_caps = cap_map.size(); bl.append((char*)&st, sizeof(st)); // cached_by + nonce ::_encode(replicas, bl); hardlock.encode_state(bl); filelock.encode_state(bl); // caps for (map::iterator it = cap_map.begin(); it != cap_map.end(); it++) { bl.append((char*)&it->first, sizeof(it->first)); it->second._encode(bl); } } int _decode(bufferlist& bl, int off = 0) { bl.copy(off, sizeof(st), (char*)&st); off += sizeof(st); ::_decode(replicas, bl, off); hardlock.decode_state(bl, off); filelock.decode_state(bl, off); // caps for (int i=0; i