From fd7c1f3f99386282ca26561f8e4c56294b6c29d2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 15 Jun 2009 10:02:27 -0700 Subject: [PATCH 01/13] mds: mark CDir objects with parent ino, dname --- src/mds/CDir.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index c452c596034..e9e8b9f81dd 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1486,6 +1486,15 @@ void CDir::_commit(version_t want) inode->make_path_string(path); m.setxattr("path", path); + CDentry *pdn = inode->get_parent_dn(); + if (pdn) { + bufferlist parent(16 + pdn->name.length()); + __u64 ino = pdn->get_dir()->get_inode()->ino(); + ::encode(ino, parent); + ::encode(pdn->name, parent); + m.setxattr("parent", parent); + } + object_t oid = get_ondisk_object(); OSDMap *osdmap = cache->mds->objecter->osdmap; ceph_object_layout ol = osdmap->make_object_layout(oid, From 60ec1cbe0ecc1e57d22c0eccffcb3b30faf7f5b9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 15 Jun 2009 15:35:10 -0700 Subject: [PATCH 02/13] kclient: fix I_COMPLETE The previous use of I_READDIR vs I_COMPLETE was flawed, mainly because the state was maintained on a per-inode basis, but readdir proceeds on a per-file basis. Instead of flags, maintain a counter in the inode that is incremented each time a dentry is released. When readdir starts, note the counter, and if it is the same when readdir completes, AND we did not do any forward seeks on the file handle, AND prepopulate succeeded on each hunk, then we can set I_COMPLETE. 
--- src/kernel/dir.c | 28 +++++++++++++++------------- src/kernel/inode.c | 5 ++++- src/kernel/mds_client.h | 2 ++ src/kernel/super.h | 3 ++- src/mds/Server.cc | 2 +- 5 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/kernel/dir.c b/src/kernel/dir.c index 0c472e7ad39..fdeff4b1e24 100644 --- a/src/kernel/dir.c +++ b/src/kernel/dir.c @@ -157,7 +157,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - int complete = 0, len; + int len; const int max_entries = client->mount_args.max_readdir; dout(5, "readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); @@ -165,8 +165,8 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) return 0; if (filp->f_pos == 0) { - /* set I_READDIR at start of readdir */ - ceph_i_set(inode, CEPH_I_READDIR); + /* note dir version at start of readdir */ + fi->dir_release_count = ci->i_release_count; dout(10, "readdir off 0 -> '.'\n"); if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), @@ -242,8 +242,10 @@ more: (int)req->r_reply_info.dir_end, (int)req->r_reply_info.dir_complete); - if (req->r_reply_info.dir_complete) - complete = 1; + if (!req->r_did_prepopulate) { + dout(10, "readdir !did_prepopulate"); + fi->dir_release_count--; + } fi->off = fi->next_off; kfree(fi->last_name); @@ -312,15 +314,14 @@ more: fi->at_end = 1; /* - * if I_READDIR is still set, no dentries were released - * during the whole readdir, and we should have the complete - * dir contents in our cache. + * if dir_release_count still matches the dir, no dentries + * were released during the whole readdir, and we should have + * the complete dir contents in our cache. 
*/ spin_lock(&inode->i_lock); - if (complete && (ci->i_ceph_flags & CEPH_I_READDIR)) { + if (ci->i_release_count == fi->dir_release_count) { dout(10, " marking %p complete\n", inode); ci->i_ceph_flags |= CEPH_I_COMPLETE; - ci->i_ceph_flags &= ~CEPH_I_READDIR; ci->i_max_offset = filp->f_pos; } spin_unlock(&inode->i_lock); @@ -364,9 +365,9 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) fi->at_end = 0; } - /* clear I_READDIR if we did a forward seek */ + /* bump dir_release_count if we did a forward seek */ if (offset > old_offset) - ceph_inode(inode)->i_ceph_flags &= ~CEPH_I_READDIR; + fi->dir_release_count--; } mutex_unlock(&inode->i_mutex); return retval; @@ -903,7 +904,8 @@ static void ceph_dentry_release(struct dentry *dentry) if (ci->i_rdcache_gen == di->lease_rdcache_gen) { dout(10, " clearing %p complete (d_release)\n", parent_inode); - ci->i_ceph_flags &= ~(CEPH_I_COMPLETE|CEPH_I_READDIR); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ci->i_release_count++; } spin_unlock(&parent_inode->i_lock); } diff --git a/src/kernel/inode.c b/src/kernel/inode.c index 5473ab64552..7c2a90c892d 100644 --- a/src/kernel/inode.c +++ b/src/kernel/inode.c @@ -251,6 +251,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; + ci->i_release_count = 0; ci->i_symlink = NULL; ci->i_fragtree = RB_ROOT; @@ -854,7 +855,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ceph_inode(req->r_locked_dir); dout(10, " clearing %p complete (empty trace)\n", req->r_locked_dir); - ci->i_ceph_flags &= ~(CEPH_I_READDIR | CEPH_I_COMPLETE); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ci->i_release_count++; } return 0; } @@ -1134,6 +1136,7 @@ retry_lookup: req->r_session, req->r_request_started); dput(dn); } + req->r_did_prepopulate = true; out: if (snapdir) { diff --git a/src/kernel/mds_client.h b/src/kernel/mds_client.h index 4661e35cd2c..4316c5b2cbe 100644 --- 
a/src/kernel/mds_client.h +++ b/src/kernel/mds_client.h @@ -220,6 +220,8 @@ struct ceph_mds_request { struct list_head r_unsafe_item; /* per-session unsafe list item */ bool r_got_unsafe, r_got_safe; + bool r_did_prepopulate; + struct ceph_cap_reservation r_caps_reservation; int r_num_caps; }; diff --git a/src/kernel/super.h b/src/kernel/super.h index 4651284acef..4a0381c009d 100644 --- a/src/kernel/super.h +++ b/src/kernel/super.h @@ -274,7 +274,6 @@ struct ceph_inode_xattrs_info { * Ceph inode. */ #define CEPH_I_COMPLETE 1 /* we have complete directory cached */ -#define CEPH_I_READDIR 2 /* no dentries trimmed since readdir start */ #define CEPH_I_NODELAY 4 /* do not delay cap release */ #define CEPH_I_FLUSH 8 /* do not delay cap send */ @@ -285,6 +284,7 @@ struct ceph_inode_info { u32 i_time_warp_seq; unsigned i_ceph_flags; + unsigned long i_release_count; struct ceph_file_layout i_layout; char *i_symlink; @@ -622,6 +622,7 @@ struct ceph_file_info { unsigned next_off; struct dentry *dentry; int at_end; + unsigned long dir_release_count; /* used for -o dirstat read() on directory thing */ char *dir_info; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index a894243b102..4212f20f51b 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2209,7 +2209,7 @@ void Server::handle_client_readdir(MDRequest *mdr) } __u8 end = (it == dir->end()); - __u8 complete = (end && !offset); + __u8 complete = (end && !offset); // FIXME: what purpose does this serve // final blob bufferlist dirbl; From 0a7e55946a0dc8204e6ffb895c487191da22df8b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 15 Jun 2009 16:16:57 -0700 Subject: [PATCH 03/13] kclient: fix di->off calculation The dentry dir offset calculation wasn't taking into account the possibility of multiple readdir requests, which in turn meant bad results for readdir-from-dcache. Since doing this on the client side was a mess, the MDS includes a dentry offset for each readdir dentry within the dirfrag. 
This value is stored in di->offset (with adjustment in leftmost frag for . and ..), and that's the value that's passed back via filldir. --- src/client/Client.cc | 2 ++ src/include/ceph_fs.h | 2 +- src/kernel/dir.c | 10 ++++++---- src/kernel/inode.c | 4 ++-- src/kernel/mds_client.c | 8 ++++++-- src/kernel/mds_client.h | 1 + src/mds/Server.cc | 9 ++++++--- 7 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index dea62357915..4cd6ffc7b4c 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -638,6 +638,8 @@ Inode* Client::insert_trace(MetaRequest *request, utime_t from, int mds) string dname; LeaseStat dlease; while (numdn) { + __u32 pos; // dentry pos within the fragment + ::decode(pos, p); ::decode(dname, p); ::decode(dlease, p); InodeStat ist(p); diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 3be87ed6b20..d443ada1bef 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -27,7 +27,7 @@ #define CEPH_MDS_PROTOCOL 9 /* cluster internal */ #define CEPH_MON_PROTOCOL 4 /* cluster internal */ #define CEPH_OSDC_PROTOCOL 17 /* public/client */ -#define CEPH_MDSC_PROTOCOL 21 /* public/client */ +#define CEPH_MDSC_PROTOCOL 22 /* public/client */ #define CEPH_MONC_PROTOCOL 12 /* public/client */ diff --git a/src/kernel/dir.c b/src/kernel/dir.c index fdeff4b1e24..c0fca4da626 100644 --- a/src/kernel/dir.c +++ b/src/kernel/dir.c @@ -279,22 +279,24 @@ more: dout(10, "readdir frag %x num %d off %d fragoff %d skew %d\n", frag, rinfo->dir_nr, off, fi->off, skew); while (off >= skew && off+skew < rinfo->dir_nr) { - dout(10, "readdir off %d -> %d / %d name '%.*s'\n", + u64 pos = ceph_make_fpos(frag, rinfo->dir_pos[off+skew]); + + dout(10, "readdir off %d -> %d / %d %lld name '%.*s'\n", off, off+skew, - rinfo->dir_nr, rinfo->dir_dname_len[off+skew], + rinfo->dir_nr, pos, rinfo->dir_dname_len[off+skew], rinfo->dir_dname[off+skew]); ftype = le32_to_cpu(rinfo->dir_in[off+skew].in->mode) >> 12; 
if (filldir(dirent, rinfo->dir_dname[off+skew], rinfo->dir_dname_len[off+skew], - ceph_make_fpos(frag, off), + pos, le64_to_cpu(rinfo->dir_in[off+skew].in->ino), ftype) < 0) { dout(20, "filldir stopping us...\n"); return 0; } off++; - filp->f_pos++; + filp->f_pos = pos + 1; } if (fi->last_name) { diff --git a/src/kernel/inode.c b/src/kernel/inode.c index 7c2a90c892d..6c03b50212d 100644 --- a/src/kernel/inode.c +++ b/src/kernel/inode.c @@ -1107,8 +1107,8 @@ retry_lookup: } di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, - i + (frag_is_leftmost(frag) ? 2 : 0)); + di->offset = ceph_make_fpos(frag, rinfo->dir_pos[i] + + (frag_is_leftmost(frag) ? 2 : 0)); /* inode */ if (dn->d_inode) { diff --git a/src/kernel/mds_client.c b/src/kernel/mds_client.c index 68a82aa24e5..3d949a28849 100644 --- a/src/kernel/mds_client.c +++ b/src/kernel/mds_client.c @@ -136,19 +136,23 @@ static int parse_reply_info_dir(void **p, void *end, info->dir_in = kmalloc(num * (sizeof(*info->dir_in) + sizeof(*info->dir_dname) + sizeof(*info->dir_dname_len) + + sizeof(*info->dir_pos) + sizeof(*info->dir_dlease)), GFP_NOFS); if (info->dir_in == NULL) { err = -ENOMEM; goto out_bad; } - info->dir_dname = (void *)(info->dir_in + num); + info->dir_pos = (void *)(info->dir_in + num); + info->dir_dname = (void *)(info->dir_pos + num); info->dir_dname_len = (void *)(info->dir_dname + num); info->dir_dlease = (void *)(info->dir_dname_len + num); while (num) { /* dentry */ - ceph_decode_32_safe(p, end, info->dir_dname_len[i], bad); + ceph_decode_need(p, end, sizeof(u32)*2, bad); + ceph_decode_32(p, info->dir_pos[i]); + ceph_decode_32(p, info->dir_dname_len[i]); ceph_decode_need(p, end, info->dir_dname_len[i], bad); info->dir_dname[i] = *p; *p += info->dir_dname_len[i]; diff --git a/src/kernel/mds_client.h b/src/kernel/mds_client.h index 4316c5b2cbe..fe090697305 100644 --- a/src/kernel/mds_client.h +++ b/src/kernel/mds_client.h @@ -86,6 +86,7 @@ struct ceph_mds_reply_info_parsed { u32 *dir_dname_len; 
struct ceph_mds_reply_lease **dir_dlease; struct ceph_mds_reply_info_in *dir_in; + u32 *dir_pos; u8 dir_complete, dir_end; /* encoded blob describing snapshot contexts for certain diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 4212f20f51b..fe39108acd4 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2143,13 +2143,11 @@ void Server::handle_client_readdir(MDRequest *mdr) __u32 numfiles = 0; + __u32 pos = 0; while (it != dir->end() && numfiles < max) { CDentry *dn = it->second; it++; - if (offset && strcmp(dn->get_name().c_str(), offset) <= 0) - continue; - if (dn->state_test(CDentry::STATE_PURGING)) continue; @@ -2168,6 +2166,10 @@ void Server::handle_client_readdir(MDRequest *mdr) if (dn->last < snapid || dn->first > snapid) continue; + __u32 dpos = pos++; + if (offset && strcmp(dn->get_name().c_str(), offset) <= 0) + continue; + CInode *in = dnl->get_inode(); // remote link? @@ -2195,6 +2197,7 @@ void Server::handle_client_readdir(MDRequest *mdr) // dentry dout(12) << "including dn " << *dn << dendl; + ::encode(dpos, dnbl); ::encode(dn->name, dnbl); mds->locker->issue_client_lease(dn, client, dnbl, mdr->now, mdr->session); From ed34a0c7dc0849b8483201c00ddc6588436f12fb Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Mon, 15 Jun 2009 16:22:44 -0700 Subject: [PATCH 04/13] Client-side support for creating/removing pool snapshots. 
--- src/Makefile.am | 2 ++ src/include/librados.h | 4 +-- src/librados.cc | 59 ++++++++++++++++++++++++++++++++- src/messages/MPoolSnap.h | 53 ++++++++++++++++++++++++++++++ src/messages/MPoolSnapReply.h | 48 +++++++++++++++++++++++++++ src/msg/Message.cc | 10 +++++- src/msg/Message.h | 3 ++ src/osdc/Objecter.cc | 61 +++++++++++++++++++++++++++++++++++ src/osdc/Objecter.h | 21 +++++++++++- 9 files changed, 256 insertions(+), 5 deletions(-) create mode 100644 src/messages/MPoolSnap.h create mode 100644 src/messages/MPoolSnapReply.h diff --git a/src/Makefile.am b/src/Makefile.am index 7d815d73b2f..518465e648b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -573,6 +573,8 @@ noinst_HEADERS = \ messages/MPGStats.h\ messages/MPGStatsAck.h\ messages/MPing.h\ + messages/MPoolSnap.h\ + messages/MPoolSnapReply.h\ messages/MRemoveSnaps.h\ messages/MStatfs.h\ messages/MStatfsReply.h\ diff --git a/src/include/librados.h b/src/include/librados.h index 3b477fa43f5..f632c6ea395 100644 --- a/src/include/librados.h +++ b/src/include/librados.h @@ -114,8 +114,8 @@ public: std::map& stats); int get_fs_stats(rados_statfs_t& result); - int snap_create(rados_pool_t pool, const char *snapname); - int snap_remove(rados_pool_t pool, const char *snapname); + int snap_create(rados_pool_t pool, char *snapname); + int snap_remove(rados_pool_t pool, char *snapname); int snap_list(rados_pool_t pool, vector *snaps); int snap_get_name(rados_pool_t pool, rados_snap_t snap, std::string *name); int snap_get_stamp(rados_pool_t pool, rados_snap_t snap, time_t *t); diff --git a/src/librados.cc b/src/librados.cc index 7400795b242..8028e3cb908 100644 --- a/src/librados.cc +++ b/src/librados.cc @@ -89,6 +89,8 @@ public: int snap_lookup(PoolCtx *pool, const char *name, rados_snap_t *snapid); int snap_get_name(PoolCtx *pool, rados_snap_t snapid, std::string *s); int snap_get_stamp(PoolCtx *pool, rados_snap_t snapid, time_t *t); + int snap_create(rados_pool_t pool, char* snapname); + int 
snap_remove(rados_pool_t pool, char* snapname); // io int write(PoolCtx& pool, const object_t& oid, off_t off, bufferlist& bl, size_t len); @@ -371,8 +373,11 @@ bool RadosClient::_dispatch(Message *m) case CEPH_MSG_STATFS_REPLY: objecter->handle_fs_stats_reply((MStatfsReply*)m); - break; + break; + case MSG_POOLSNAPREPLY: + objecter->handle_pool_snap_reply((MPoolSnapReply*)m); + break; default: return false; } @@ -445,8 +450,49 @@ int RadosClient::get_fs_stats( rados_statfs_t& result ) { } + // SNAPS +int RadosClient::snap_create( rados_pool_t pool, char *snapName) { + int reply; + int poolID = ((PoolCtx *)pool)->poolid; + + Mutex mylock ("RadosClient::snap_create::mylock"); + Cond cond; + bool done; + lock.Lock(); + objecter->create_pool_snap(&reply, + poolID, + snapName, + new C_SafeCond(&mylock, &cond, &done)); + lock.Unlock(); + + mylock.Lock(); + while(!done) cond.Wait(mylock); + mylock.Unlock(); + return reply; +} + +int RadosClient::snap_remove( rados_pool_t pool, char *snapName) { + int reply; + int poolID = ((PoolCtx *)pool)->poolid; + + Mutex mylock ("RadosClient::snap_remove::mylock"); + Cond cond; + bool done; + lock.Lock(); + objecter->delete_pool_snap(&reply, + poolID, + snapName, + new C_SafeCond(&mylock, &cond, &done)); + lock.Unlock(); + + mylock.Lock(); + while(!done) cond.Wait(mylock); + mylock.Unlock(); + return reply; +} + int RadosClient::snap_list(PoolCtx *pool, vector *snaps) { Mutex::Locker l(lock); @@ -885,6 +931,17 @@ int Rados::close_pool(rados_pool_t pool) // SNAPS +int Rados::snap_create(rados_pool_t pool, char *snapname) { + if (!client) return -EINVAL; + return client->snap_create(pool, snapname); +} + +int Rados::snap_remove(rados_pool_t pool, char *snapname) { + if (!client) return -EINVAL; + return client->snap_remove(pool, snapname); +} + + void Rados::set_snap(rados_pool_t pool, snapid_t seq) { if (!client) diff --git a/src/messages/MPoolSnap.h b/src/messages/MPoolSnap.h new file mode 100644 index 00000000000..0edea587074 --- 
/dev/null +++ b/src/messages/MPoolSnap.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MPOOLSNAP_H +#define __MPOOLSNAP_H + + +class MPoolSnap : public Message { +public: + ceph_fsid_t fsid; + tid_t tid; + int pool; + char *name; + bool create; + + MPoolSnap() : Message(MSG_POOLSNAP) {} + MPoolSnap( ceph_fsid_t& f, tid_t t, int p, char *n, bool c) : + Message(MSG_POOLSNAP), fsid(f), tid(t), pool(p), name(n), create(c) {} + + const char *get_type_name() { return "poolsnap"; } + void print(ostream& out) { + out << "poolsnap(" << tid << " " << name << ")"; + } + + void encode_payload() { + ::encode(fsid, payload); + ::encode(tid, payload); + ::encode(pool, payload); + ::encode(name, payload); + ::encode(create, payload); + } + void decode_payload() { + bufferlist::iterator p = payload.begin(); + ::decode(fsid, p); + ::decode(tid, p); + ::decode(pool, p); + ::decode(*name, p); + ::decode(create, p); + } +}; + +#endif diff --git a/src/messages/MPoolSnapReply.h b/src/messages/MPoolSnapReply.h new file mode 100644 index 00000000000..a73031f908b --- /dev/null +++ b/src/messages/MPoolSnapReply.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MPOOLSNAPREPLY_H +#define __MPOOLSNAPREPLY_H + + +class MPoolSnapReply : public Message { +public: + ceph_fsid_t fsid; + tid_t tid; + int replyCode; + + MPoolSnapReply() : Message(MSG_POOLSNAPREPLY) {} + MPoolSnapReply( ceph_fsid_t& f, tid_t t, int rc) : + Message(MSG_POOLSNAPREPLY), fsid(f), tid(t), replyCode(rc) {} + + const char *get_type_name() { return "poolsnapreply"; } + + void print(ostream& out) { + out << "poolsnapreply(" << tid <<")"; + } + + void encode_payload() { + ::encode(fsid, payload); + ::encode(tid, payload); + ::encode(replyCode, payload); + } + void decode_payload() { + bufferlist::iterator p = payload.begin(); + ::decode(fsid, p); + ::decode(tid, p); + ::decode(replyCode, p); + } +}; + +#endif diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 1a0d9605d28..b0714d801b6 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -19,6 +19,9 @@ using namespace std; #include "messages/MGetPoolStats.h" #include "messages/MGetPoolStatsReply.h" +#include "messages/MPoolSnap.h" +#include "messages/MPoolSnapReply.h" + #include "messages/MMonCommand.h" #include "messages/MMonCommandAck.h" #include "messages/MMonPaxos.h" @@ -168,7 +171,12 @@ Message *decode_message(ceph_msg_header& header, ceph_msg_footer& footer, case MSG_GETPOOLSTATSREPLY: m = new MGetPoolStatsReply; break; - + case MSG_POOLSNAP: + m = new MPoolSnap; + break; + case MSG_POOLSNAPREPLY: + m = new MPoolSnapReply; + break; case MSG_MON_COMMAND: m = new MMonCommand; break; diff --git a/src/msg/Message.h b/src/msg/Message.h index 5ccb2b065b1..d6c2feba4cf 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -35,6 +35,9 @@ #define MSG_GETPOOLSTATS 58 #define MSG_GETPOOLSTATSREPLY 59 +#define MSG_POOLSNAP 49 +#define MSG_POOLSNAPREPLY 48 + // osd internal #define MSG_OSD_PING 70 #define MSG_OSD_BOOT 71 diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index a91603ae13b..e29d9407627 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -25,6 
+25,9 @@ #include "messages/MOSDMap.h" #include "messages/MOSDGetMap.h" +#include "messages/MPoolSnap.h" +#include "messages/MPoolSnapReply.h" + #include "messages/MGetPoolStats.h" #include "messages/MGetPoolStatsReply.h" #include "messages/MStatfs.h" @@ -76,6 +79,10 @@ void Objecter::dispatch(Message *m) handle_fs_stats_reply((MStatfsReply*)m); break; + case MSG_POOLSNAPREPLY: + handle_pool_snap_reply((MPoolSnapReply*)m); + break; + default: dout(1) << "don't know message type " << m->get_type() << dendl; assert(0); @@ -525,6 +532,60 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m) delete m; } +//snapshots + +void Objecter::create_pool_snap(int *reply, int pool, char* snapName, Context *onfinish) { + dout(10) << "create_pool_snap; pool: " << pool << "; snap: " << snapName << dendl; + SnapOp *op = new SnapOp; + op->tid = ++last_tid; + op->pool = pool; + op->name = snapName; + op->onfinish = onfinish; + op->create = true; + op->replyCode = reply; + op_snap[op->tid] = op; + + pool_snap_submit(op); +} + +void Objecter::delete_pool_snap(int *reply, int pool, char* snapName, Context *onfinish) { + dout(10) << "delete_pool_snap; pool: " << pool << "; snap: " << snapName << dendl; + SnapOp *op = new SnapOp; + op->tid = ++last_tid; + op->pool = pool; + op->name = snapName; + op->onfinish = onfinish; + op->create = false; + op->replyCode = reply; + op_snap[op->tid] = op; + + pool_snap_submit(op); +} + +void Objecter::pool_snap_submit(SnapOp *op) { + dout(10) << "pool_snap_submit " << op->tid << dendl; + MPoolSnap *m = new MPoolSnap(monmap->fsid, op->tid, op->pool, op->name, op->create); + int mon = monmap->pick_mon(); + messenger->send_message(m, monmap->get_inst(mon)); +} + +void Objecter::handle_pool_snap_reply(MPoolSnapReply *m) { + dout(10) << "handle_pool_snap_reply " << *m << dendl; + tid_t tid = m->tid; + if (op_snap.count(tid)) { + SnapOp *op = op_snap[tid]; + dout(10) << "have request " << tid << " at " << op << " Create: " << op->create << dendl; + 
*(op->replyCode) = m->replyCode; + op->onfinish->finish(0); + delete op->onfinish; + op_snap.erase(tid); + delete op; + } else { + dout(10) << "unknown request " << tid << dendl; + } + dout(10) << "done" << dendl; + delete m; +} // pool stats diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 3b729eef8d7..1445b9759f0 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -35,6 +35,8 @@ class OSDMap; class MonMap; class Message; +class MPoolSnapReply; + class MGetPoolStatsReply; class MStatfsReply; @@ -259,11 +261,21 @@ class Objecter { Context *onfinish; }; + struct SnapOp { + tid_t tid; + int pool; + char *name; + Context *onfinish; + bool create; + int* replyCode; + }; + private: // pending ops hash_map op_osd; map op_poolstat; map op_statfs; + map op_snap; list waiting_for_map; @@ -462,7 +474,14 @@ private: o->snapc = snapc; return op_submit(o); } - + // ------------------------- + // snapshots +private: + void pool_snap_submit(SnapOp *op); +public: + void create_pool_snap(int *reply, int pool, char* snapName, Context *onfinish); + void delete_pool_snap(int *reply, int pool, char* snapName, Context *onfinish); + void handle_pool_snap_reply(MPoolSnapReply *m); // -------------------------- // pool stats From 9ce5802a5e4b8fe8b307d3aca0772332df7b590a Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 16 Jun 2009 14:22:32 -0700 Subject: [PATCH 05/13] mon:Added server-side handling of MPoolSnap. Currently assumes it's a snap-create message. 
--- src/messages/MPoolSnapReply.h | 6 +++-- src/mon/Monitor.cc | 4 ++++ src/mon/OSDMonitor.cc | 45 +++++++++++++++++++++++++++++++++++ src/mon/OSDMonitor.h | 15 ++++++++++++ 4 files changed, 68 insertions(+), 2 deletions(-) diff --git a/src/messages/MPoolSnapReply.h b/src/messages/MPoolSnapReply.h index a73031f908b..63885173f86 100644 --- a/src/messages/MPoolSnapReply.h +++ b/src/messages/MPoolSnapReply.h @@ -21,10 +21,12 @@ public: ceph_fsid_t fsid; tid_t tid; int replyCode; + int epoch; + MPoolSnapReply() : Message(MSG_POOLSNAPREPLY) {} - MPoolSnapReply( ceph_fsid_t& f, tid_t t, int rc) : - Message(MSG_POOLSNAPREPLY), fsid(f), tid(t), replyCode(rc) {} + MPoolSnapReply( ceph_fsid_t& f, tid_t t, int rc, int e) : + Message(MSG_POOLSNAPREPLY), fsid(f), tid(t), replyCode(rc), epoch(e) {} const char *get_type_name() { return "poolsnapreply"; } diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index a7e061fcb68..e6eb6cdb971 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -419,6 +419,10 @@ bool Monitor::dispatch_impl(Message *m) paxos_service[PAXOS_PGMAP]->dispatch(m); break; + case MSG_POOLSNAP: + paxos_service[PAXOS_OSDMAP]->dispatch(m); + break; + // log case MSG_LOG: paxos_service[PAXOS_LOG]->dispatch(m); diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index fc8a5283a4b..911b5b19983 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -271,6 +271,9 @@ bool OSDMonitor::preprocess_query(Message *m) return preprocess_out((MOSDOut*)m); */ + case MSG_POOLSNAP: + return preprocess_pool_snap((MPoolSnap*)m); + case MSG_REMOVE_SNAPS: return preprocess_remove_snaps((MRemoveSnaps*)m); @@ -301,6 +304,8 @@ bool OSDMonitor::prepare_update(Message *m) case MSG_OSD_OUT: return prepare_out((MOSDOut*)m); */ + case MSG_POOLSNAP: + return prepare_pool_snap((MPoolSnap*)m); case MSG_REMOVE_SNAPS: return prepare_remove_snaps((MRemoveSnaps*)m); @@ -1264,4 +1269,44 @@ out: return false; } +bool OSDMonitor::preprocess_pool_snap ( MPoolSnap *m) { + if 
(m->pool < 0 ) { + ss << "unrecognized pool '" << m->pool << "'"; + err = -ENOENT; + //create reply, set replyCode to badness + _pool_snap(m->fsid, m->tid, -1, pending_inc.epoch); + return true; + } + +} +bool OSDMonitor::prepare_pool_snap ( MPoolSnap *m) { + const pg_pool_t *p = &osdmap.get_pg_pool(m->pool); + pg_pool_t *pp = 0; + if (pending_inc.new_pools.count(pool)) pp = &pending_inc.new_pools[pool]; + //if the snapname is already in use, we have a problem + if (p->snap_exists(m->name) || + pp && pp->snap_exists(m->name)) { + ss << "pool " << m->pool << " snap " << m->name << " already exists"; + err = -EEXIST; + _pool_snap(m->fsid, m->tid, -2, pending_inc.epoch); + return false; + } else { + if(!pp) { + pp = &pending_inc.new_pools[pool]; + *pp = *p; + } + pp->add_snap(m->name, g_clock.now()); + pp->set_snap_epoch(pending_inc.epoch); + ss << "created pool " << m->pool << " snap " << m->name; + getline(ss, rs); + paxos->wait_for_commit(new Monitor::C_Snap(mon, m, 0, pending_inc.epoch)); + return true; + } +} + + void _pool_snap(ceph_fsid_t fsid, tid_t tid, int replyCode, int epoch) { + MPoolSnapReply *m = new MPoolSnapReply(fsid, tid, replyCode, epoch); + mon->messenger->send_message(m, m->get_orig_source_inst()); + delete m; + } diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 2a93bda44b5..502b25315b2 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -83,6 +83,10 @@ private: bool prepare_alive(class MOSDAlive *m); void _alive(MOSDAlive *m); + bool preprocess_pool_snap ( class MPoolSnap *m); + bool prepare_pool_snap (MPoolSnap *m); + void _pool_snap(ceph_fsid_t fsid, tid_t tid, int replyCode, int epoch); + struct C_Booted : public Context { OSDMonitor *cmon; MOSDBoot *m; @@ -115,6 +119,17 @@ private: cmon->dispatch((Message*)m); } }; + struct C_Snap : public Context { + OSDMonitor *osdmon; + MPoolSnap *m; + int replyCode; + int epoch; + C_Snap(OSDMonitor * osd, MPoolSnap *m_, int rc, int e) : + osdmon(osd), m(m_), replyCode(rc), 
epoch(e) {} + void finish(int r) { + osdmon->_pool_snap(m->fsid, m->tid, replyCode, epoch); + } + }; bool preprocess_out(class MOSDOut *m); bool prepare_out(class MOSDOut *m); From dd7c2f2cc0502224cff8b079452a184da11619eb Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 16 Jun 2009 14:25:02 -0700 Subject: [PATCH 06/13] mon: Now handles snapshot removals too. --- src/include/librados.h | 4 +- src/librados.cc | 18 +++++---- src/messages/MPoolSnap.h | 6 +-- src/mon/OSDMonitor.cc | 81 +++++++++++++++++++++++----------------- src/mon/OSDMonitor.h | 7 ++-- src/osdc/Objecter.cc | 4 +- src/osdc/Objecter.h | 6 +-- 7 files changed, 71 insertions(+), 55 deletions(-) diff --git a/src/include/librados.h b/src/include/librados.h index f632c6ea395..3b477fa43f5 100644 --- a/src/include/librados.h +++ b/src/include/librados.h @@ -114,8 +114,8 @@ public: std::map& stats); int get_fs_stats(rados_statfs_t& result); - int snap_create(rados_pool_t pool, char *snapname); - int snap_remove(rados_pool_t pool, char *snapname); + int snap_create(rados_pool_t pool, const char *snapname); + int snap_remove(rados_pool_t pool, const char *snapname); int snap_list(rados_pool_t pool, vector *snaps); int snap_get_name(rados_pool_t pool, rados_snap_t snap, std::string *name); int snap_get_stamp(rados_pool_t pool, rados_snap_t snap, time_t *t); diff --git a/src/librados.cc b/src/librados.cc index 8028e3cb908..69b372d109b 100644 --- a/src/librados.cc +++ b/src/librados.cc @@ -89,8 +89,8 @@ public: int snap_lookup(PoolCtx *pool, const char *name, rados_snap_t *snapid); int snap_get_name(PoolCtx *pool, rados_snap_t snapid, std::string *s); int snap_get_stamp(PoolCtx *pool, rados_snap_t snapid, time_t *t); - int snap_create(rados_pool_t pool, char* snapname); - int snap_remove(rados_pool_t pool, char* snapname); + int snap_create(rados_pool_t pool, const char* snapname); + int snap_remove(rados_pool_t pool, const char* snapname); // io int write(PoolCtx& pool, const object_t& oid, off_t off, 
bufferlist& bl, size_t len); @@ -453,9 +453,10 @@ int RadosClient::get_fs_stats( rados_statfs_t& result ) { // SNAPS -int RadosClient::snap_create( rados_pool_t pool, char *snapName) { +int RadosClient::snap_create( rados_pool_t pool, const char *snapName) { int reply; int poolID = ((PoolCtx *)pool)->poolid; + string sName = string(snapName); Mutex mylock ("RadosClient::snap_create::mylock"); Cond cond; @@ -463,7 +464,7 @@ int RadosClient::snap_create( rados_pool_t pool, char *snapName) { lock.Lock(); objecter->create_pool_snap(&reply, poolID, - snapName, + sName, new C_SafeCond(&mylock, &cond, &done)); lock.Unlock(); @@ -473,9 +474,10 @@ int RadosClient::snap_create( rados_pool_t pool, char *snapName) { return reply; } -int RadosClient::snap_remove( rados_pool_t pool, char *snapName) { +int RadosClient::snap_remove( rados_pool_t pool, const char *snapName) { int reply; int poolID = ((PoolCtx *)pool)->poolid; + string sName = string(snapName); Mutex mylock ("RadosClient::snap_remove::mylock"); Cond cond; @@ -483,7 +485,7 @@ int RadosClient::snap_remove( rados_pool_t pool, char *snapName) { lock.Lock(); objecter->delete_pool_snap(&reply, poolID, - snapName, + sName, new C_SafeCond(&mylock, &cond, &done)); lock.Unlock(); @@ -931,12 +933,12 @@ int Rados::close_pool(rados_pool_t pool) // SNAPS -int Rados::snap_create(rados_pool_t pool, char *snapname) { +int Rados::snap_create(rados_pool_t pool, const char *snapname) { if (!client) return -EINVAL; return client->snap_create(pool, snapname); } -int Rados::snap_remove(rados_pool_t pool, char *snapname) { +int Rados::snap_remove(rados_pool_t pool, const char *snapname) { if (!client) return -EINVAL; return client->snap_remove(pool, snapname); } diff --git a/src/messages/MPoolSnap.h b/src/messages/MPoolSnap.h index 0edea587074..33aa7799671 100644 --- a/src/messages/MPoolSnap.h +++ b/src/messages/MPoolSnap.h @@ -21,11 +21,11 @@ public: ceph_fsid_t fsid; tid_t tid; int pool; - char *name; + string name; bool create; 
MPoolSnap() : Message(MSG_POOLSNAP) {} - MPoolSnap( ceph_fsid_t& f, tid_t t, int p, char *n, bool c) : + MPoolSnap( ceph_fsid_t& f, tid_t t, int p, string& n, bool c) : Message(MSG_POOLSNAP), fsid(f), tid(t), pool(p), name(n), create(c) {} const char *get_type_name() { return "poolsnap"; } @@ -45,7 +45,7 @@ public: ::decode(fsid, p); ::decode(tid, p); ::decode(pool, p); - ::decode(*name, p); + ::decode(name, p); ::decode(create, p); } }; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 911b5b19983..7bf2557847d 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -26,6 +26,8 @@ #include "messages/MOSDGetMap.h" #include "messages/MOSDBoot.h" #include "messages/MOSDAlive.h" +#include "messages/MPoolSnap.h" +#include "messages/MPoolSnapReply.h" #include "messages/MMonCommand.h" #include "messages/MRemoveSnaps.h" #include "messages/MOSDScrub.h" @@ -619,7 +621,7 @@ bool OSDMonitor::preprocess_remove_snaps(MRemoveSnaps *m) q != m->snaps.end(); q++) { if (!osdmap.have_pg_pool(q->first)) { - dout(10) << " ignoring removed_snaps " << q->second << " on non-existant pool " << q->first << dendl; + dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl; continue; } const pg_pool_t& pi = osdmap.get_pg_pool(q->first); @@ -1271,42 +1273,53 @@ out: bool OSDMonitor::preprocess_pool_snap ( MPoolSnap *m) { if (m->pool < 0 ) { - ss << "unrecognized pool '" << m->pool << "'"; - err = -ENOENT; - //create reply, set replyCode to badness - _pool_snap(m->fsid, m->tid, -1, pending_inc.epoch); - return true; + _pool_snap(m, -ENOENT, pending_inc.epoch); + return true; //done with this message } - -} - -bool OSDMonitor::prepare_pool_snap ( MPoolSnap *m) { - const pg_pool_t *p = &osdmap.get_pg_pool(m->pool); + bool snap_exists = false; pg_pool_t *pp = 0; - if (pending_inc.new_pools.count(pool)) pp = &pending_inc.new_pools[pool]; - //if the snapname is already in use, we have a problem - if (p->snap_exists(m->name) || - pp 
&& pp->snap_exists(m->name)) { - ss << "pool " << m->pool << " snap " << m->name << " already exists"; - err = -EEXIST; - _pool_snap(m->fsid, m->tid, -2, pending_inc.epoch); - return false; - } else { - if(!pp) { - pp = &pending_inc.new_pools[pool]; - *pp = *p; + if (pending_inc.new_pools.count(m->pool)) pp = &pending_inc.new_pools[m->pool]; + //check if the snapname exists + if ((osdmap.get_pg_pool(m->pool).snap_exists(m->name.c_str())) || + (pp && pp->snap_exists(m->name.c_str()))) snap_exists = true; + + if (m->create) { //if it's a snap creation request + if(snap_exists) { + _pool_snap(m, -EEXIST, pending_inc.epoch); + return true; } - pp->add_snap(m->name, g_clock.now()); - pp->set_snap_epoch(pending_inc.epoch); - ss << "created pool " << m->pool << " snap " << m->name; - getline(ss, rs); - paxos->wait_for_commit(new Monitor::C_Snap(mon, m, 0, pending_inc.epoch)); - return true; + else return false; //this message needs to go through preparation } + //it's a snap deletion request if we make it here + if (!snap_exists) { + _pool_snap(m, -ENOENT, pending_inc.epoch); + return true; //done with this message + } + return false; } - void _pool_snap(ceph_fsid_t fsid, tid_t tid, int replyCode, int epoch) { - MPoolSnapReply *m = new MPoolSnapReply(fsid, tid, replyCode, epoch); - mon->messenger->send_message(m, m->get_orig_source_inst()); - delete m; - } +bool OSDMonitor::prepare_pool_snap ( MPoolSnap *m) +{ + const pg_pool_t *p = &osdmap.get_pg_pool(m->pool); + pg_pool_t* pp = 0; + //if the pool isn't already in the update, add it + if (!pending_inc.new_pools.count(m->pool)) pending_inc.new_pools[m->pool] = *p; + pp = &pending_inc.new_pools[m->pool]; + + if (m->create) { //it's a snap creation message + pp->add_snap(m->name.c_str(), g_clock.now()); + pp->set_snap_epoch(pending_inc.epoch); + } + else { //it's a snap removal message + pp->remove_snap(pp->snap_exists(m->name.c_str())); + } + paxos->wait_for_commit(new OSDMonitor::C_Snap(this, m, 0, pending_inc.epoch)); + 
return true; +} + +void OSDMonitor::_pool_snap(MPoolSnap *m, int replyCode, int epoch) +{ + MPoolSnapReply *reply = new MPoolSnapReply(m->fsid, m->tid, replyCode, epoch); + mon->messenger->send_message(reply, m->get_orig_source_inst()); + delete m; +} diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 502b25315b2..10c91cb8565 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -32,6 +32,7 @@ using namespace std; class Monitor; class MOSDBoot; class MMonCommand; +class MPoolSnap; class OSDMonitor : public PaxosService { public: @@ -85,7 +86,7 @@ private: bool preprocess_pool_snap ( class MPoolSnap *m); bool prepare_pool_snap (MPoolSnap *m); - void _pool_snap(ceph_fsid_t fsid, tid_t tid, int replyCode, int epoch); + void _pool_snap(MPoolSnap *m, int replyCode, int epoch); struct C_Booted : public Context { OSDMonitor *cmon; @@ -124,10 +125,10 @@ private: MPoolSnap *m; int replyCode; int epoch; - C_Snap(OSDMonitor * osd, MPoolSnap *m_, int rc, int e) : + C_Snap(OSDMonitor * osd, MPoolSnap *m_, int rc, int e) : osdmon(osd), m(m_), replyCode(rc), epoch(e) {} void finish(int r) { - osdmon->_pool_snap(m->fsid, m->tid, replyCode, epoch); + osdmon->_pool_snap(m, replyCode, epoch); } }; diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index e29d9407627..1c7562b9cf2 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -534,7 +534,7 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m) //snapshots -void Objecter::create_pool_snap(int *reply, int pool, char* snapName, Context *onfinish) { +void Objecter::create_pool_snap(int *reply, int pool, string& snapName, Context *onfinish) { dout(10) << "create_pool_snap; pool: " << pool << "; snap: " << snapName << dendl; SnapOp *op = new SnapOp; op->tid = ++last_tid; @@ -548,7 +548,7 @@ void Objecter::create_pool_snap(int *reply, int pool, char* snapName, Context *o pool_snap_submit(op); } -void Objecter::delete_pool_snap(int *reply, int pool, char* snapName, Context *onfinish) { +void 
Objecter::delete_pool_snap(int *reply, int pool, string& snapName, Context *onfinish) { dout(10) << "delete_pool_snap; pool: " << pool << "; snap: " << snapName << dendl; SnapOp *op = new SnapOp; op->tid = ++last_tid; diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 1445b9759f0..7e68aa69c33 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -264,7 +264,7 @@ class Objecter { struct SnapOp { tid_t tid; int pool; - char *name; + string name; Context *onfinish; bool create; int* replyCode; @@ -479,8 +479,8 @@ private: private: void pool_snap_submit(SnapOp *op); public: - void create_pool_snap(int *reply, int pool, char* snapName, Context *onfinish); - void delete_pool_snap(int *reply, int pool, char* snapName, Context *onfinish); + void create_pool_snap(int *reply, int pool, string& snapName, Context *onfinish); + void delete_pool_snap(int *reply, int pool, string& snapName, Context *onfinish); void handle_pool_snap_reply(MPoolSnapReply *m); // -------------------------- From ac64f0578ac03183e847b9baa1ca8b77960b84e4 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 16 Jun 2009 15:00:24 -0700 Subject: [PATCH 07/13] rados: submit snap creation and removal to the system. 
--- src/include/librados.h | 8 ++++---- src/librados.cc | 12 ++++++------ src/rados.cc | 20 +++++++++++++++++++- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/include/librados.h b/src/include/librados.h index 3b477fa43f5..dad9c33ac52 100644 --- a/src/include/librados.h +++ b/src/include/librados.h @@ -51,8 +51,8 @@ void rados_pool_close_ctx(rados_list_ctx_t *ctx); int rados_pool_list_next(rados_pool_t pool, const char **entry, rados_list_ctx_t *ctx); /* snapshots */ -int rados_snap_create(rados_pool_t pool, const char *snapname); -int rados_snap_remove(rados_pool_t pool, const char *snapname); +int rados_snap_create(const rados_pool_t pool, const char *snapname); +int rados_snap_remove(const rados_pool_t pool, const char *snapname); int rados_snap_list(rados_pool_t pool, rados_snap_t *snaps, int maxlen); int rados_snap_get_name(rados_pool_t pool, rados_snap_t id, char *name, int maxlen); @@ -114,8 +114,8 @@ public: std::map& stats); int get_fs_stats(rados_statfs_t& result); - int snap_create(rados_pool_t pool, const char *snapname); - int snap_remove(rados_pool_t pool, const char *snapname); + int snap_create(const rados_pool_t pool, const char *snapname); + int snap_remove(const rados_pool_t pool, const char *snapname); int snap_list(rados_pool_t pool, vector *snaps); int snap_get_name(rados_pool_t pool, rados_snap_t snap, std::string *name); int snap_get_stamp(rados_pool_t pool, rados_snap_t snap, time_t *t); diff --git a/src/librados.cc b/src/librados.cc index 69b372d109b..6d397d985e5 100644 --- a/src/librados.cc +++ b/src/librados.cc @@ -89,8 +89,8 @@ public: int snap_lookup(PoolCtx *pool, const char *name, rados_snap_t *snapid); int snap_get_name(PoolCtx *pool, rados_snap_t snapid, std::string *s); int snap_get_stamp(PoolCtx *pool, rados_snap_t snapid, time_t *t); - int snap_create(rados_pool_t pool, const char* snapname); - int snap_remove(rados_pool_t pool, const char* snapname); + int snap_create(const rados_pool_t pool, const char* 
snapname); + int snap_remove(const rados_pool_t pool, const char* snapname); // io int write(PoolCtx& pool, const object_t& oid, off_t off, bufferlist& bl, size_t len); @@ -453,7 +453,7 @@ int RadosClient::get_fs_stats( rados_statfs_t& result ) { // SNAPS -int RadosClient::snap_create( rados_pool_t pool, const char *snapName) { +int RadosClient::snap_create( const rados_pool_t pool, const char *snapName) { int reply; int poolID = ((PoolCtx *)pool)->poolid; string sName = string(snapName); @@ -474,7 +474,7 @@ int RadosClient::snap_create( rados_pool_t pool, const char *snapName) { return reply; } -int RadosClient::snap_remove( rados_pool_t pool, const char *snapName) { +int RadosClient::snap_remove( const rados_pool_t pool, const char *snapName) { int reply; int poolID = ((PoolCtx *)pool)->poolid; string sName = string(snapName); @@ -933,12 +933,12 @@ int Rados::close_pool(rados_pool_t pool) // SNAPS -int Rados::snap_create(rados_pool_t pool, const char *snapname) { +int Rados::snap_create(const rados_pool_t pool, const char *snapname) { if (!client) return -EINVAL; return client->snap_create(pool, snapname); } -int Rados::snap_remove(rados_pool_t pool, const char *snapname) { +int Rados::snap_remove(const rados_pool_t pool, const char *snapname) { if (!client) return -EINVAL; return client->snap_remove(pool, snapname); } diff --git a/src/rados.cc b/src/rados.cc index b3bdb0c8bb4..4d886ad9d9d 100644 --- a/src/rados.cc +++ b/src/rados.cc @@ -426,7 +426,25 @@ int main(int argc, const char **argv) } cout << snaps.size() << " snaps" << std::endl; } - + + else if (strcmp(nargs[0], "mksnap") == 0) { + if ( nargs.size() < 2) usage(); + + cout << "Submitting snap to backend." << std::endl; + int result = rados.snap_create(p, nargs[1]); + if (result == 0 ) cout << "Success! Created snapshot " << nargs[1] << std::endl; + else cout << "Failure. 
Attempt to create snapshot returned " << result << std::endl; + } + + else if (strcmp(nargs[0], "rmsnap") == 0) { + if ( nargs.size() < 2) usage(); + + cout << "Submitting snap removal to backend." << std::endl; + int result = rados.snap_remove(p, nargs[1]); + if (result == 0 ) cout << "Success! Removed snapshot " << nargs[1] << std::endl; + else cout << "Failure. Attempt to remove snapshot returned " << result << std::endl; + } + else if (strcmp(nargs[0], "bench") == 0) { if (nargs.size() < 2) usage(); From 3e6819052a6f0d83c9bf828488afa9726892630c Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 16 Jun 2009 15:22:16 -0700 Subject: [PATCH 08/13] mon/Objecter: Snapshot creation/removal now waits for proper epoch to return. --- src/messages/MPoolSnapReply.h | 2 +- src/mon/OSDMonitor.cc | 2 +- src/mon/OSDMonitor.h | 2 +- src/osdc/Objecter.cc | 15 ++++++++++++--- src/osdc/Objecter.h | 6 +++--- 5 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/messages/MPoolSnapReply.h b/src/messages/MPoolSnapReply.h index 63885173f86..bba43a2b50a 100644 --- a/src/messages/MPoolSnapReply.h +++ b/src/messages/MPoolSnapReply.h @@ -21,7 +21,7 @@ public: ceph_fsid_t fsid; tid_t tid; int replyCode; - int epoch; + epoch_t epoch; MPoolSnapReply() : Message(MSG_POOLSNAPREPLY) {} diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 7bf2557847d..b86518d5e30 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1317,7 +1317,7 @@ bool OSDMonitor::prepare_pool_snap ( MPoolSnap *m) return true; } -void OSDMonitor::_pool_snap(MPoolSnap *m, int replyCode, int epoch) +void OSDMonitor::_pool_snap(MPoolSnap *m, int replyCode, epoch_t epoch) { MPoolSnapReply *reply = new MPoolSnapReply(m->fsid, m->tid, replyCode, epoch); mon->messenger->send_message(reply, m->get_orig_source_inst()); diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 10c91cb8565..d6712735053 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -86,7 +86,7 @@ private: 
bool preprocess_pool_snap ( class MPoolSnap *m); bool prepare_pool_snap (MPoolSnap *m); - void _pool_snap(MPoolSnap *m, int replyCode, int epoch); + void _pool_snap(MPoolSnap *m, int replyCode, epoch_t epoch); struct C_Booted : public Context { OSDMonitor *cmon; diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 1c7562b9cf2..09179dc8f41 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -184,7 +184,12 @@ void Objecter::handle_osd_map(MOSDMap *m) kick_requests(changed_pgs); } - finish_contexts(waiting_for_map); + map >::iterator p = waiting_for_map.begin(); + while (p != waiting_for_map.end() && + p->first <= osdmap->get_epoch()) { + finish_contexts(p->second); + waiting_for_map.erase(p++); + } delete m; } @@ -576,8 +581,12 @@ void Objecter::handle_pool_snap_reply(MPoolSnapReply *m) { SnapOp *op = op_snap[tid]; dout(10) << "have request " << tid << " at " << op << " Create: " << op->create << dendl; *(op->replyCode) = m->replyCode; - op->onfinish->finish(0); - delete op->onfinish; + if (osdmap->get_epoch() < m->epoch) + wait_for_new_map(op->onfinish, m->epoch); + else { + op->onfinish->finish(0); + delete op->onfinish; + } op_snap.erase(tid); delete op; } else { diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 7e68aa69c33..2d279d0e824 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -277,7 +277,7 @@ class Objecter { map op_statfs; map op_snap; - list waiting_for_map; + map > waiting_for_map; /** * track pending ops by pg @@ -350,8 +350,8 @@ private: int get_client_incarnation() const { return client_inc; } void set_client_incarnation(int inc) { client_inc = inc; } - void wait_for_new_map(Context *c) { - waiting_for_map.push_back(c); + void wait_for_new_map(Context *c, epoch_t epoch) { + waiting_for_map[epoch].push_back(c); } // mid-level helpers From e3105a06df9f2e7e5388d051f8938441c95a08b0 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 16 Jun 2009 15:49:34 -0700 Subject: [PATCH 09/13] Objecter: fixed a bug in 
snaps. Worked around a situation where you could delete a callback before it was followed. --- src/osdc/Objecter.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 09179dc8f41..79da15aefb5 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -581,14 +581,17 @@ void Objecter::handle_pool_snap_reply(MPoolSnapReply *m) { SnapOp *op = op_snap[tid]; dout(10) << "have request " << tid << " at " << op << " Create: " << op->create << dendl; *(op->replyCode) = m->replyCode; - if (osdmap->get_epoch() < m->epoch) + if (osdmap->get_epoch() < m->epoch) { + dout(20) << "waiting for client to reach epoch " << m->epoch << " before calling back" << dendl; wait_for_new_map(op->onfinish, m->epoch); + } else { op->onfinish->finish(0); delete op->onfinish; } - op_snap.erase(tid); + op->onfinish = NULL; delete op; + op_snap.erase(tid); } else { dout(10) << "unknown request " << tid << dendl; } From a07894d17ecd1c52f00490c758eb73e1994cc7c3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Jun 2009 16:00:31 -0700 Subject: [PATCH 10/13] osd: fix cloned object context Screws up subsequent reads on cloned objects. And the bad ref counting was leaking memory. 
--- src/osd/ReplicatedPG.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 4321b260d45..4cd50fd78e3 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -1333,6 +1333,8 @@ void ReplicatedPG::make_writeable(OpContext *ctx) ctx->clone_obc->obs.oi.last_reqid = oi.last_reqid; ctx->clone_obc->obs.oi.mtime = oi.mtime; ctx->clone_obc->obs.oi.snaps = snaps; + ctx->clone_obc->obs.exists = true; + ctx->clone_obc->get(); ctx->clone_obc->force_start_write(); if (is_primary()) From 92c48b0374bfe6d96236c68faa7d2cbd5434d6ea Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Jun 2009 16:04:12 -0700 Subject: [PATCH 11/13] kclient: specify smallish blksize for directories This is mainly just because /bin/ls will use the size, or blocks, or blksize to decide how big of a buffer to allocate for getdents, and the default of 4MB is unreasonably big. 64k seems like an okay number, I guess. --- src/kernel/inode.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/kernel/inode.c b/src/kernel/inode.c index 6c03b50212d..c7ed3b2566d 100644 --- a/src/kernel/inode.c +++ b/src/kernel/inode.c @@ -1562,16 +1562,19 @@ int ceph_permission(struct inode *inode, int mask) int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { + struct inode *inode = dentry->d_inode; int err; - err = ceph_do_getattr(dentry->d_inode, CEPH_STAT_CAP_INODE_ALL); + err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); if (!err) { - generic_fillattr(dentry->d_inode, stat); - stat->ino = dentry->d_inode->i_ino; - if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) - stat->dev = ceph_snap(dentry->d_inode); + generic_fillattr(inode, stat); + stat->ino = inode->i_ino; + if (ceph_snap(inode) != CEPH_NOSNAP) + stat->dev = ceph_snap(inode); else stat->dev = 0; + if (S_ISDIR(inode->i_mode)) + stat->blksize = 65536; } return err; } From 3efb869af1ba93202ac8b6bd8505a09009cebfec Mon Sep 17 00:00:00 
2001 From: Sage Weil Date: Tue, 16 Jun 2009 16:14:55 -0700 Subject: [PATCH 12/13] kclient: fix non-dcache readdir offset calculation Needs to factor in frag_is_leftmost to account for . and .., just like the fi->offset calculation in readdir_prepopulate. Fixes the problem where an ls on a large dir returns duplicate entries. --- src/kernel/dir.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/kernel/dir.c b/src/kernel/dir.c index c0fca4da626..689ac54db87 100644 --- a/src/kernel/dir.c +++ b/src/kernel/dir.c @@ -279,11 +279,12 @@ more: dout(10, "readdir frag %x num %d off %d fragoff %d skew %d\n", frag, rinfo->dir_nr, off, fi->off, skew); while (off >= skew && off+skew < rinfo->dir_nr) { - u64 pos = ceph_make_fpos(frag, rinfo->dir_pos[off+skew]); + u64 pos = ceph_make_fpos(frag, rinfo->dir_pos[off+skew] + + (frag_is_leftmost(frag) ? 2 : 0)); - dout(10, "readdir off %d -> %d / %d %lld name '%.*s'\n", - off, off+skew, - rinfo->dir_nr, pos, rinfo->dir_dname_len[off+skew], + dout(10, "readdir off %d (%d/%d) -> %lld '%.*s'\n", + off, off+skew, rinfo->dir_nr, pos, + rinfo->dir_dname_len[off+skew], rinfo->dir_dname[off+skew]); ftype = le32_to_cpu(rinfo->dir_in[off+skew].in->mode) >> 12; if (filldir(dirent, From 114ad5177f281b96ba6bbebe9ed85e2a4aec783f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Jun 2009 16:31:23 -0700 Subject: [PATCH 13/13] kclient: fix skipped entries in dcache_readdir I'm not sure why the old code incremented f_pos and looked at the prev entry, but it was wrong. --- src/kernel/dir.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/kernel/dir.c b/src/kernel/dir.c index 689ac54db87..e24ceda52e7 100644 --- a/src/kernel/dir.c +++ b/src/kernel/dir.c @@ -70,8 +70,7 @@ static int __dcache_readdir(struct file *filp, p = parent->d_subdirs.prev; dout(10, " initial p %p/%p\n", p->prev, p->next); } else { - p = last->d_u.d_child.prev; - filp->f_pos++; + p = &last->d_u.d_child; } more: