diff --git a/src/client/Client.cc b/src/client/Client.cc index 478e5b8f728..6f9c50fc2fb 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -494,6 +494,11 @@ Inode * Client::add_update_inode(InodeStat *st, utime_t from, int mds) in->dirstat = st->dirstat; in->rstat = st->rstat; + if (in->is_dir()) { + in->dir_layout = st->dir_layout; + dout(20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl; + } + in->layout = st->layout; in->ctime = st->ctime; in->max_size = st->max_size; // right? @@ -651,6 +656,10 @@ Inode* Client::insert_trace(MetaRequest *request, utime_t from, int mds) return NULL; } + Connection *con = request->reply->get_connection(); + int features = con->get_features(); + dout(10) << " features 0x" << hex << features << dec << dendl; + // snap trace if (reply->snapbl.length()) update_snap_trace(reply->snapbl); @@ -667,7 +676,7 @@ Inode* Client::insert_trace(MetaRequest *request, utime_t from, int mds) InodeStat ist; if (reply->head.is_dentry) { - dirst.decode(p); + dirst.decode(p, features); dst.decode(p); ::decode(dname, p); ::decode(dlease, p); @@ -675,7 +684,7 @@ Inode* Client::insert_trace(MetaRequest *request, utime_t from, int mds) Inode *in = 0; if (reply->head.is_target) { - ist.decode(p); + ist.decode(p, features); in = add_update_inode(&ist, from, mds); } @@ -759,7 +768,7 @@ Inode* Client::insert_trace(MetaRequest *request, utime_t from, int mds) for (unsigned i=0; iinode) { in = req->inode; + if (req->path.depth()) { + hash = ceph_str_hash(in->dir_layout.dl_dir_hash, + req->path[0].data(), + req->path[0].length()); + dout(20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << " on " << req->path[0] + << " => " << hash << dendl; + is_hash = true; + + } } else if (req->dentry) { if (req->dentry->inode) { in = req->dentry->inode; } else { in = req->dentry->dir->parent_inode; - hash = ceph_str_hash_linux(req->dentry->name.data(), - req->dentry->name.length()); + hash = ceph_str_hash(in->dir_layout.dl_dir_hash, + req->dentry->name.data(), + req->dentry->name.length()); + dout(20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << " on " << req->dentry->name + << " => " << hash << dendl; is_hash = true; } } diff --git a/src/client/Client.h b/src/client/Client.h index 13aae7926a9..9af2439e0fa 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -387,6 +387,7 @@ class Inode { int32_t nlink; // file (data access) + ceph_dir_layout dir_layout; ceph_file_layout layout; uint64_t size; // on directory, # dentries uint32_t truncate_seq; diff --git a/src/cmds.cc b/src/cmds.cc index 3fe9e28559c..a0696308e85 100644 --- a/src/cmds.cc +++ b/src/cmds.cc @@ -114,8 +114,9 @@ int main(int argc, const char **argv) return 1; uint64_t supported = - CEPH_FEATURE_UID | - CEPH_FEATURE_NOSRCADDR; + CEPH_FEATURE_UID | + CEPH_FEATURE_NOSRCADDR | + CEPH_FEATURE_DIRLAYOUTHASH; messenger->set_default_policy(SimpleMessenger::Policy::client(supported, 0)); messenger->set_policy(entity_name_t::TYPE_MON, SimpleMessenger::Policy::client(supported, diff --git a/src/config.cc b/src/config.cc index 44d7d6060cd..ec5f13edd5d 100644 --- a/src/config.cc +++ b/src/config.cc @@ -96,6 +96,7 @@ std::map g_fake_kill_after; md_config_t g_conf; bool g_daemon = false; + #include #include @@ -440,6 +441,7 @@ static struct config_option config_optionsp[] = { OPTION(mds_early_reply, 0, OPT_BOOL, true), OPTION(mds_short_reply_trace, 0, OPT_BOOL, true), OPTION(mds_use_tmap, 0, OPT_BOOL, true), // use trivialmap for dir updates + OPTION(mds_default_dir_hash, 0, OPT_INT, CEPH_STR_HASH_RJENKINS), OPTION(mds_log, 0, OPT_BOOL, true), OPTION(mds_log_unsafe, 0, OPT_BOOL, false), // only wait for log sync, when it's mostly safe to do so OPTION(mds_log_skip_corrupt_events, 0, OPT_BOOL, false), diff --git a/src/config.h b/src/config.h index 447365406cd..6382c95f838 100644 --- a/src/config.h +++ b/src/config.h @@ -261,6 +261,8 @@ struct md_config_t { bool mds_use_tmap; + int mds_default_dir_hash; + bool mds_log; bool mds_log_unsafe; bool mds_log_skip_corrupt_events; diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 83953f77dde..272dccfc705 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -46,6 +46,7 @@ #define CEPH_FEATURE_SUBSCRIBE2 (1<<4) #define CEPH_FEATURE_MONNAMES (1<<5) #define CEPH_FEATURE_RECONNECT_SEQ (1<<6) +#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) /* @@ -58,10 +59,10 @@ struct ceph_file_layout { __le32 fl_stripe_count; /* over this many objects */ __le32 fl_object_size; /* until objects are this big, then move to new objects */ - __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */ + __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */ /* pg -> disk layout */ - __le32 fl_object_stripe_unit; /* for per-object parity, if any */ + __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */ /* object -> pg layout */ __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ @@ -72,6 +73,12 @@ struct ceph_file_layout { int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); +struct ceph_dir_layout { + __u8 dl_dir_hash; /* see ceph_hash.h for ids */ + __u8 dl_unused1; + __u16 dl_unused2; + __u32 dl_unused3; +} __attribute__ ((packed)); /* crypto algorithms */ #define CEPH_CRYPTO_NONE 0x0 @@ -463,7 +470,7 @@ struct ceph_mds_reply_inode { struct ceph_timespec rctime; struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */ } __attribute__ ((packed)); -/* followed by frag array, then symlink string, then xattr blob */ +/* followed by frag array, symlink string, dir layout, xattr blob */ /* reply_lease follows dname, and reply_inode */ struct ceph_mds_reply_lease { diff --git a/src/include/types.h b/src/include/types.h index 4855f645848..b04d348985a 100644 --- a/src/include/types.h +++ b/src/include/types.h @@ -211,6 +211,7 @@ struct ltstr WRITE_RAW_ENCODER(ceph_fsid) WRITE_RAW_ENCODER(ceph_file_layout) +WRITE_RAW_ENCODER(ceph_dir_layout) WRITE_RAW_ENCODER(ceph_pg_pool) WRITE_RAW_ENCODER(ceph_mds_session_head) WRITE_RAW_ENCODER(ceph_mds_request_head) diff --git a/src/mds/Anchor.h b/src/mds/Anchor.h index 813701d1e44..52e71f9ea68 100644 --- a/src/mds/Anchor.h +++ b/src/mds/Anchor.h @@ -40,10 +40,6 @@ public: Anchor() : dn_hash(0), nref(0), updated(0) {} Anchor(inodeno_t i, inodeno_t di, __u32 hash, int nr, version_t u) : ino(i), dirino(di), dn_hash(hash), nref(nr), updated(u) { } - Anchor(inodeno_t i, inodeno_t di, const string &dname, int nr, version_t u) : - ino(i), dirino(di), - dn_hash(ceph_str_hash_linux(dname.data(), dname.length())), - nref(nr), updated(u) { } void encode(bufferlist &bl) const { __u8 struct_v = 1; diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc index 8c87d97dc79..99c9cec2409 100644 --- a/src/mds/CDentry.cc +++ b/src/mds/CDentry.cc @@ -262,7 +262,7 @@ void CDentry::make_anchor_trace(vector& trace, CInode *in) dir->inode->make_anchor_trace(trace); // add this inode (in my dirfrag) to the end - trace.push_back(Anchor(in->ino(), dir->ino(), name, 0, 0)); + trace.push_back(Anchor(in->ino(), dir->ino(), get_hash(), 0, 0)); dout(10) << "make_anchor_trace added " << trace.back() << dendl; } diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h index 9143e394a1e..ca3f2f97649 100644 --- a/src/mds/CDentry.h +++ b/src/mds/CDentry.h @@ -96,6 +96,7 @@ public: public: string name; + __u32 hash; snapid_t first, last; dentry_key_t key() { @@ -163,9 +164,9 @@ public: public: // cons - CDentry(const string& n, + CDentry(const string& n, __u32 h, snapid_t f, snapid_t l) : - name(n), + name(n), hash(h), first(f), last(l), dir(0), version(0), projected_version(0), @@ -176,9 +177,9 @@ public: g_num_dn++; g_num_dna++; } - CDentry(const string& n, inodeno_t ino, unsigned char dt, + CDentry(const string& n, __u32 h, inodeno_t ino, unsigned char dt, snapid_t f, snapid_t l) : - name(n), + name(n), hash(h), first(f), last(l), dir(0), version(0), projected_version(0), @@ -200,6 +201,8 @@ public: CDir *get_dir() const { return dir; } const string& get_name() const { return name; } + __u32 get_hash() const { return hash; } + /* CInode *get_inode() const { return linkage.inode; } inodeno_t get_remote_ino() { return linkage.remote_ino; } diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 7203ee92407..27ed87acff0 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -229,7 +229,7 @@ CDentry* CDir::add_null_dentry(const string& dname, assert(lookup_exact_snap(dname, last) == 0); // create dentry - CDentry* dn = new CDentry(dname, first, last); + CDentry* dn = new CDentry(dname, inode->hash_dentry_name(dname), first, last); if (is_auth()) dn->state_set(CDentry::STATE_AUTH); cache->lru.lru_insert_mid(dn); @@ -265,7 +265,7 @@ CDentry* CDir::add_primary_dentry(const string& dname, CInode *in, assert(lookup_exact_snap(dname, last) == 0); // create dentry - CDentry* dn = new CDentry(dname, first, last); + CDentry* dn = new CDentry(dname, inode->hash_dentry_name(dname), first, last); if (is_auth()) dn->state_set(CDentry::STATE_AUTH); cache->lru.lru_insert_mid(dn); @@ -303,9 +303,9 @@ CDentry* CDir::add_remote_dentry(const string& dname, inodeno_t ino, unsigned ch { // foreign assert(lookup_exact_snap(dname, last) == 0); - + // create dentry - CDentry* dn = new CDentry(dname, ino, d_type, first, last); + CDentry* dn = new CDentry(dname, inode->hash_dentry_name(dname), ino, d_type, first, last); if (is_auth()) dn->state_set(CDentry::STATE_AUTH); cache->lru.lru_insert_mid(dn); diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index de9f2ae6001..401530d8da1 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -420,12 +420,20 @@ void CInode::pop_projected_snaprealm(sr_t *next_snaprealm) // dirfrags +__u32 CInode::hash_dentry_name(const string &dn) +{ + int which = inode.dir_layout.dl_dir_hash; + if (!which) + which = CEPH_STR_HASH_LINUX; + return ceph_str_hash(which, dn.data(), dn.length()); +} + frag_t CInode::pick_dirfrag(const string& dn) { if (dirfragtree.empty()) return frag_t(); // avoid the string hash if we can. - __u32 h = ceph_str_hash_linux(dn.data(), dn.length()); + __u32 h = hash_dentry_name(dn); return dirfragtree[h]; } @@ -2076,6 +2084,8 @@ int CInode::encode_inodestat(bufferlist& bl, Session *session, { int client = session->inst.name.num(); assert(snapid); + + assert(session->connection); bool valid = true; @@ -2299,6 +2309,10 @@ int CInode::encode_inodestat(bufferlist& bl, Session *session, ::encode(p->second, bl); } ::encode(symlink, bl); + if (session->connection->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) { + i = pfile ? pi : oi; + ::encode(i->dir_layout, bl); + } ::encode(xbl, bl); return valid; diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 71098250e08..c6f7ad9d196 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -358,6 +358,7 @@ private: int stickydir_ref; public: + __u32 hash_dentry_name(const string &dn); frag_t pick_dirfrag(const string &dn); bool has_dirfrags() { return !dirfrags.empty(); } CDir* get_dirfrag(frag_t fg) { diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 4e4a420c141..c106a07d6ca 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -256,10 +256,14 @@ CInode *MDCache::create_system_inode(inodeno_t ino, int mode) in->inode.mtime = g_clock.now(); in->inode.nlink = 1; in->inode.truncate_size = -1ull; - if (in->inode.is_dir()) + + memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout)); + if (in->inode.is_dir()) { memset(&in->inode.layout, 0, sizeof(in->inode.layout)); - else + in->inode.dir_layout.dl_dir_hash = g_conf.mds_default_dir_hash; + } else { in->inode.layout = default_file_layout; + } if (in->is_base()) { if (in->is_root()) @@ -6845,7 +6849,7 @@ void MDCache::anchor_create(MDRequest *mdr, CInode *in, Context *onfinish) in->make_anchor_trace(trace); if (!trace.size()) { assert(MDS_INO_IS_BASE(in->ino())); - trace.push_back(Anchor(in->ino(), in->ino(), "", 0, 0)); + trace.push_back(Anchor(in->ino(), in->ino(), 0, 0, 0)); } // do it diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 03c7d1b6f61..9e43cec8f8a 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1652,6 +1652,13 @@ CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir, inodeno_t useino, u in->inode.version = 1; in->inode.nlink = 1; // FIXME + + in->inode.mode = mode; + + memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout)); + if (in->inode.is_dir()) + in->inode.dir_layout.dl_dir_hash = g_conf.mds_default_dir_hash; + if (layout) in->inode.layout = *layout; else if (in->inode.is_dir()) @@ -1677,7 +1684,6 @@ CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir, inodeno_t useino, u in->inode.gid = mdr->client_request->get_caller_gid(); in->inode.uid = mdr->client_request->get_caller_uid(); - in->inode.mode = mode; in->inode.ctime = in->inode.mtime = in->inode.atime = mdr->now; // now diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index 7014b11b198..253f84642c5 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -917,6 +917,7 @@ struct inode_t { bool anchored; // auth only? // file (data access) + ceph_dir_layout dir_layout; // [dir only] ceph_file_layout layout; uint64_t size; // on directory, # dentries uint32_t truncate_seq; @@ -994,7 +995,7 @@ struct inode_t { } void encode(bufferlist &bl) const { - __u8 v = 3; + __u8 v = 4; ::encode(v, bl); ::encode(ino, bl); @@ -1008,6 +1009,7 @@ struct inode_t { ::encode(nlink, bl); ::encode(anchored, bl); + ::encode(dir_layout, bl); ::encode(layout, bl); ::encode(size, bl); ::encode(truncate_seq, bl); @@ -1042,6 +1044,10 @@ struct inode_t { ::decode(nlink, p); ::decode(anchored, p); + if (v >= 4) + ::decode(dir_layout, p); + else + memset(&dir_layout, 0, sizeof(dir_layout)); ::decode(layout, p); ::decode(size, p); ::decode(truncate_seq, p); diff --git a/src/messages/MClientReply.h b/src/messages/MClientReply.h index 4d2d444c580..6db029a4bf4 100644 --- a/src/messages/MClientReply.h +++ b/src/messages/MClientReply.h @@ -113,15 +113,18 @@ struct InodeStat { version_t xattr_version; bufferlist xattrbl; + + ceph_dir_layout dir_layout; + //map xattrs; public: InodeStat() {} - InodeStat(bufferlist::iterator& p) { - decode(p); + InodeStat(bufferlist::iterator& p, int features) { + decode(p, features); } - void decode(bufferlist::iterator &p) { + void decode(bufferlist::iterator &p, int features) { struct ceph_mds_reply_inode e; ::decode(e, p); vino.ino = inodeno_t(e.ino); @@ -160,6 +163,11 @@ struct InodeStat { } ::decode(symlink, p); + if (features & CEPH_FEATURE_DIRLAYOUTHASH) + ::decode(dir_layout, p); + else + memset(&dir_layout, 0, sizeof(dir_layout)); + xattr_version = e.xattr_version; ::decode(xattrbl, p); } diff --git a/src/msg/SimpleMessenger.h b/src/msg/SimpleMessenger.h index a4355003266..1b8c0d7e42c 100644 --- a/src/msg/SimpleMessenger.h +++ b/src/msg/SimpleMessenger.h @@ -57,7 +57,8 @@ using namespace __gnu_cxx; CEPH_FEATURE_SUBSCRIBE2 | \ CEPH_FEATURE_MONNAMES | \ CEPH_FEATURE_FLOCK | \ - CEPH_FEATURE_RECONNECT_SEQ + CEPH_FEATURE_RECONNECT_SEQ | \ + CEPH_FEATURE_DIRLAYOUTHASH class SimpleMessenger : public Messenger { public: