// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- /* * Ceph - scalable distributed file system * * Copyright (C) 2004-2006 Sage Weil * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #ifndef __CLIENT_H #define __CLIENT_H #include "mds/MDSMap.h" #include "osd/OSDMap.h" #include "mon/MonMap.h" #include "msg/Message.h" #include "msg/Dispatcher.h" #include "msg/Messenger.h" #include "msg/SerialMessenger.h" #include "messages/MClientRequest.h" #include "messages/MClientReply.h" //#include "msgthread.h" #include "include/types.h" #include "include/lru.h" #include "include/filepath.h" #include "include/interval_set.h" #include "common/Mutex.h" #include "FileCache.h" // stl #include #include using namespace std; #include using namespace __gnu_cxx; #define O_LAZY 01000000 class Filer; class Objecter; class ObjectCacher; extern class LogType client_logtype; extern class Logger *client_logger; // ============================================ // types for my local metadata cache /* basic structure: - Dentries live in an LRU loop. they get expired based on last access. see include/lru.h. items can be bumped to "mid" or "top" of list, etc. - Inode has ref count for each Fh, Dir, or Dentry that points to it. - when Inode ref goes to 0, it's expired. - when Dir is empty, it's removed (and it's Inode ref--) */ typedef int fh_t; class Dir; class Inode; class Dentry : public LRUObject { public: string name; // sort of lame //const char *name; Dir *dir; Inode *inode; int ref; // 1 if there's a dir beneath me. void get() { assert(ref == 0); ref++; lru_pin(); } void put() { assert(ref == 1); ref--; lru_unpin(); } Dentry() : dir(0), inode(0), ref(0) { } /*Dentry() : name(0), dir(0), inode(0), ref(0) { } Dentry(string& n) : name(0), dir(0), inode(0), ref(0) { name = new char[n.length()+1]; strcpy((char*)name, n.c_str()); } ~Dentry() { delete[] name; }*/ }; class Dir { public: Inode *parent_inode; // my inode //hash_map, eqstr> dentries; hash_map dentries; Dir(Inode* in) { parent_inode = in; } bool is_empty() { return dentries.empty(); } }; class InodeCap { public: int caps; long seq; InodeCap() : caps(0), seq(0) {} }; class Inode { public: inode_t inode; // the actual inode time_t valid_until; // about the dir (if this is one!) int dir_auth; set dir_contacts; bool dir_hashed, dir_replicated; // per-mds caps map caps; // mds -> InodeCap map stale_caps; // mds -> cap .. stale time_t file_wr_mtime; // [writers] time of last write off_t file_wr_size; // [writers] largest offset we've written to int num_open_rd, num_open_wr, num_open_lazy; // num readers, writers int ref; // ref count. 1 for each dentry, fh that links to me. Dir *dir; // if i'm a dir. Dentry *dn; // if i'm linked to a dentry. string *symlink; // symlink content, if it's a symlink // for caching i/o mode FileCache fc; // for sync i/o mode int sync_reads; // sync reads in progress int sync_writes; // sync writes in progress list waitfor_write; list waitfor_read; list waitfor_lazy; list waitfor_no_read, waitfor_no_write; void get() { ref++; //cout << "inode.get on " << hex << inode.ino << dec << " now " << ref << endl; } void put() { ref--; assert(ref >= 0); //cout << "inode.put on " << hex << inode.ino << dec << " now " << ref << endl; } Inode(inode_t _inode, ObjectCacher *_oc) : inode(_inode), valid_until(0), dir_auth(-1), dir_hashed(false), dir_replicated(false), file_wr_mtime(0), file_wr_size(0), num_open_rd(0), num_open_wr(0), num_open_lazy(0), ref(0), dir(0), dn(0), symlink(0), fc(_oc, _inode), sync_reads(0), sync_writes(0) { } ~Inode() { if (symlink) { delete symlink; symlink = 0; } } inodeno_t ino() { return inode.ino; } bool is_dir() { return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR; } int file_caps() { int c = 0; for (map::iterator it = caps.begin(); it != caps.end(); it++) c |= it->second.caps; for (map::iterator it = stale_caps.begin(); it != stale_caps.end(); it++) c |= it->second.caps; return c; } int file_caps_wanted() { int w = 0; if (num_open_rd) w |= CAP_FILE_RD|CAP_FILE_RDCACHE; if (num_open_wr) w |= CAP_FILE_WR|CAP_FILE_WRBUFFER; if (num_open_lazy) w |= CAP_FILE_LAZYIO; return w; } int authority(MDSMap *mdsmap) { //cout << "authority on " << inode.ino << " .. dir_auth is " << dir_auth<< endl; // parent? if (dn && dn->dir && dn->dir->parent_inode) { // parent hashed? if (dn->dir->parent_inode->dir_hashed) { // hashed assert(0); // fixme //return mdcluster->hash_dentry( dn->dir->parent_inode->ino(), //dn->name ); } if (dir_auth >= 0) return dir_auth; else return dn->dir->parent_inode->authority(mdsmap); } if (dir_auth >= 0) return dir_auth; assert(0); // !!! return 0; } int dentry_authority(const char *dn, MDSMap *mdsmap) { assert(0); return 0; //return ->hash_dentry( ino(), //dn ); } int pick_replica(MDSMap *mdsmap) { // replicas? if (ino() > 1ULL && dir_contacts.size()) { //cout << "dir_contacts if " << dir_contacts << endl; set::iterator it = dir_contacts.begin(); if (dir_contacts.size() == 1) return *it; else { int r = rand() % dir_contacts.size(); while (r--) it++; return *it; } } if (dir_replicated || ino() == 1) { //cout << "num_mds is " << mdcluster->get_num_mds() << endl; return rand() % mdsmap->get_num_mds(); // huh.. pick a random mds! } else return authority(mdsmap); } // open Dir for an inode. if it's not open, allocated it (and pin dentry in memory). Dir *open_dir() { if (!dir) { if (dn) dn->get(); // pin dentry get(); dir = new Dir(this); } return dir; } }; // file handle for any open file state struct Fh { Inode *inode; off_t pos; int mds; // have to talk to mds we opened with (for now) int mode; // the mode i opened the file with bool is_lazy() { return mode & O_LAZY; } Fh() : inode(0), pos(0), mds(0), mode(0) {} }; // ======================================================== // client interface class Client : public Dispatcher { public: /* getdir result */ struct DirResult { string path; map contents; map::iterator p; int off; int size; struct dirent_plus dp; struct dirent_lite dl; DirResult() : p(contents.end()), off(-1), size(0) {} }; protected: Messenger *messenger; int whoami; MonMap *monmap; // mds fake RPC tid_t last_tid; map mds_rpc_cond; map mds_rpc_reply; map mds_rpc_dispatch_cond; // cluster descriptors MDSMap *mdsmap; OSDMap *osdmap; bool mounted; bool unmounting; Cond mount_cond; int unsafe_sync_write; public: entity_name_t get_myname() { return messenger->get_myname(); } void hack_sync_write_safe(); protected: Filer *filer; ObjectCacher *objectcacher; Objecter *objecter; // (non-blocking) osd interface // cache hash_map inode_map; Inode* root; LRU lru; // lru list of Dentry's in our local metadata cache. // cap weirdness map > cap_reap_queue; // ino -> mds -> msg .. set of (would-be) stale caps to reap // file handles, etc. string cwd; interval_set free_fh_set; // unused fh's hash_map fh_map; fh_t get_fh() { fh_t fh = free_fh_set.start(); free_fh_set.erase(fh, 1); return fh; } void put_fh(fh_t fh) { free_fh_set.insert(fh, 1); } void mkabspath(const char *rel, string& abs) { if (rel[0] == '/') { abs = rel; } else { abs = cwd; abs += "/"; abs += rel; } } // global client lock // - protects Client and buffer cache both! Mutex client_lock; // -- metadata cache stuff // decrease inode ref. delete if dangling. void put_inode(Inode *in) { in->put(); if (in->ref == 0) { inode_map.erase(in->inode.ino); if (in == root) root = 0; delete in; } } void close_dir(Dir *dir) { assert(dir->is_empty()); Inode *in = dir->parent_inode; if (in->dn) in->dn->put(); // unpin dentry delete in->dir; in->dir = 0; put_inode(in); } int get_cache_size() { return lru.lru_get_size(); } void set_cache_size(int m) { lru.lru_set_max(m); } Dentry* link(Dir *dir, const string& name, Inode *in) { Dentry *dn = new Dentry; dn->name = name; // link to dir dn->dir = dir; dir->dentries[dn->name] = dn; // link to inode dn->inode = in; in->dn = dn; in->get(); lru.lru_insert_mid(dn); // mid or top? return dn; } void unlink(Dentry *dn) { Inode *in = dn->inode; // unlink from inode dn->inode = 0; in->dn = 0; put_inode(in); // unlink from dir dn->dir->dentries.erase(dn->name); if (dn->dir->is_empty()) close_dir(dn->dir); dn->dir = 0; // delete den lru.lru_remove(dn); delete dn; } Dentry *relink(Dentry *dn, Dir *dir, const string& name) { // first link new dn to dir /* char *oldname = (char*)dn->name; dn->name = new char[name.length()+1]; strcpy((char*)dn->name, name.c_str()); dir->dentries[dn->name] = dn; */ dir->dentries[name] = dn; // unlink from old dir dn->dir->dentries.erase(dn->name); //delete[] oldname; if (dn->dir->is_empty()) close_dir(dn->dir); // fix up dn dn->name = name; dn->dir = dir; return dn; } // move dentry to top of lru void touch_dn(Dentry *dn) { lru.lru_touch(dn); } // trim cache. void trim_cache(); void dump_inode(Inode *in, set& did); void dump_cache(); // debug // find dentry based on filepath Dentry *lookup(filepath& path); // make blocking mds request MClientReply *make_request(MClientRequest *req, bool auth_best=false, int use_auth=-1); MClientReply* sendrecv(MClientRequest *req, int mds); void handle_client_reply(MClientReply *reply); void fill_stat(inode_t& inode, struct stat *st); void fill_statlite(inode_t& inode, struct statlite *st); // friends friend class SyntheticClient; public: Client(Messenger *m, MonMap *mm); ~Client(); void tear_down_cache(); int get_nodeid() { return whoami; } void init(); void shutdown(); // messaging void dispatch(Message *m); void handle_mount_ack(class MClientMountAck*); void handle_unmount_ack(Message*); void handle_mds_map(class MMDSMap *m); // file caps void handle_file_caps(class MClientFileCaps *m); void implemented_caps(class MClientFileCaps *m, Inode *in); void release_caps(Inode *in, int retain=0); void update_caps_wanted(Inode *in); void close_release(Inode *in); void close_safe(Inode *in); // metadata cache Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn); void update_inode_dist(Inode *in, InodeStat *st); Inode* insert_trace(MClientReply *reply); // ---------------------- // fs ops. int mount(); int unmount(); // these shoud (more or less) mirror the actual system calls. int statfs(const char *path, struct statvfs *stbuf); // crap int chdir(const char *s); // namespace ops int getdir(const char *path, list& contents); int getdir(const char *path, map& contents); DIR *opendir(const char *name); int closedir(DIR *dir); struct dirent *readdir(DIR *dir); void rewinddir(DIR *dir); off_t telldir(DIR *dir); void seekdir(DIR *dir, off_t offset); struct dirent_plus *readdirplus(DIR *dirp); int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); struct dirent_lite *readdirlite(DIR *dirp); int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); int link(const char *existing, const char *newname); int unlink(const char *path); int rename(const char *from, const char *to); // dirs int mkdir(const char *path, mode_t mode); int rmdir(const char *path); // symlinks int readlink(const char *path, char *buf, off_t size); int symlink(const char *existing, const char *newname); // inode stuff int _lstat(const char *path, int mask, Inode **in); int lstat(const char *path, struct stat *stbuf); int lstatlite(const char *path, struct statlite *buf); int chmod(const char *path, mode_t mode); int chown(const char *path, uid_t uid, gid_t gid); int utime(const char *path, struct utimbuf *buf); // file ops int mknod(const char *path, mode_t mode); int open(const char *path, int mode); int close(fh_t fh); int read(fh_t fh, char *buf, off_t size, off_t offset=-1); int write(fh_t fh, const char *buf, off_t size, off_t offset=-1); int truncate(const char *file, off_t size); //int truncate(fh_t fh, long long size); int fsync(fh_t fh, bool syncdataonly); // hpc lazyio int lazyio_propogate(int fd, off_t offset, size_t count); int lazyio_synchronize(int fd, off_t offset, size_t count); int describe_layout(char *fn, list& result); void ms_handle_failure(Message*, const entity_inst_t& inst); }; #endif