From 41c67e0236c348e285b4fd064f650884441176b7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Jul 2013 15:01:53 -0700 Subject: [PATCH 01/22] osd: make ms_handle_reset debug more useful Signed-off-by: Sage Weil Reviewed-by: Samuel Just --- src/osd/OSD.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 2cecd60c18b..d75375f6537 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -3371,8 +3371,8 @@ void OSD::ms_handle_connect(Connection *con) bool OSD::ms_handle_reset(Connection *con) { - dout(1) << "OSD::ms_handle_reset()" << dendl; OSD::Session *session = (OSD::Session *)con->get_priv(); + dout(1) << "ms_handle_reset con " << con << " session " << session << dendl; if (!session) return false; session->wstate.reset(); From 561ac0b173161a429b0bfecb78676fb38af14e5c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Jul 2013 15:02:02 -0700 Subject: [PATCH 02/22] osd: break con <-> session cycle when marking down old peers Signed-off-by: Sage Weil Reviewed-by: Samuel Just --- src/osd/OSD.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index d75375f6537..2dc59b32f4f 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -5575,7 +5575,9 @@ bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch) << " msg was " << m->get_source_inst().addr << " expected " << (osdmap->have_inst(from) ? osdmap->get_cluster_addr(from) : entity_addr_t()) << dendl; - cluster_messenger->mark_down(m->get_connection()); + ConnectionRef con = m->get_connection(); + con->set_priv(NULL); // break ref <-> session cycle, if any + cluster_messenger->mark_down(con.get()); return false; } } From bfadcd2a0eb48c0a46666db9647a6ad9fe24a038 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Jul 2013 15:02:07 -0700 Subject: [PATCH 03/22] osd/ReplicatedPG: fix obc leak on invalid LIST_SNAPS op Signed-off-by: Sage Weil Reviewed-by: Samuel Just --- src/osd/ReplicatedPG.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 14708e38cd9..453fdacfb76 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -821,6 +821,7 @@ void ReplicatedPG::do_op(OpRequestRef op) if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS && m->get_snapid() != CEPH_SNAPDIR) { dout(10) << "LIST_SNAPS with incorrect context" << dendl; + put_object_context(obc); osd->reply_op_error(op, -EINVAL); return; } From 8dcf0b199af36f0f3b3fb81103949050d53750e4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Jul 2013 22:43:26 -0700 Subject: [PATCH 04/22] msgr: generate reset event on mark_down to addr (not con) If the caller is marking down an addr, they presumably don't have the Connection* handy, so we should generate a reset event to help them clean up con <-> session ref cycles. Signed-off-by: Sage Weil --- src/msg/SimpleMessenger.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/msg/SimpleMessenger.cc b/src/msg/SimpleMessenger.cc index afee0952630..441ed432af0 100644 --- a/src/msg/SimpleMessenger.cc +++ b/src/msg/SimpleMessenger.cc @@ -599,9 +599,12 @@ void SimpleMessenger::mark_down(const entity_addr_t& addr) p->pipe_lock.Lock(); p->stop(); if (p->connection_state) { - // do not generate a reset event for the caller in this case, - // since they asked for it. - p->connection_state->clear_pipe(p); + // generate a reset event for the caller in this case, even + // though they asked for it, since this is the addr-based (and + // not Connection* based) interface + ConnectionRef con = p->connection_state; + if (con && con->clear_pipe(p)) + dispatch_queue.queue_reset(con.get()); } p->pipe_lock.Unlock(); } else { From 27868ca5ac6bd09f7de8836d61776a17e21657e0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Jul 2013 10:53:04 -0700 Subject: [PATCH 05/22] msgr: update docs for mark_down, mark_down_all semantics * RESET events * note that the reset detection only happens if it is enabled in the policy. Signed-off-by: Sage Weil --- src/msg/Messenger.h | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h index 28643e10767..be0dcd1738e 100644 --- a/src/msg/Messenger.h +++ b/src/msg/Messenger.h @@ -476,22 +476,30 @@ public: */ virtual int send_keepalive(Connection *con) = 0; /** - * Mark down a Connection to a remote. This will cause us to - * discard our outgoing queue for them, and if they try - * to reconnect they will discard their queue when we - * inform them of the session reset. If there is no - * Connection to the given dest, it is a no-op. - * It does not generate any notifications to the Dispatcher. + * Mark down a Connection to a remote. + * + * This will cause us to discard our outgoing queue for them, and if + * reset detection is enabled in the policy and the endpoint tries + * to reconnect they will discard their queue when we inform them of + * the session reset. + * + * If there is no Connection to the given dest, it is a no-op. + * + * This generates a RESET notification to the Dispatcher. * * @param a The address to mark down. */ virtual void mark_down(const entity_addr_t& a) = 0; /** - * Mark down the given Connection. This will cause us to - * discard its outgoing queue, and if the endpoint tries - * to reconnect they will discard their queue when we - * inform them of the session reset. + * Mark down the given Connection. + * + * This will cause us to discard its outgoing queue, and if reset + * detection is enabled in the policy and the endpoint tries to + * reconnect they will discard their queue when we inform them of + * the session reset. + * * If the Connection* is NULL, this is a no-op. + * * It does not generate any notifications to the Dispatcher. * * @param con The Connection to mark down. @@ -500,6 +508,14 @@ public: void mark_down(const ConnectionRef& con) { mark_down(con.get()); } + /** + * Mark all the existing Connections down. This is equivalent + * to iterating over all Connections and calling mark_down() + * on each. + * + * This will generate a RESET event for each closed connections. + */ + virtual void mark_down_all() = 0; /** * Unlike mark_down, this function will try and deliver * all messages before ending the connection, and it will use @@ -529,12 +545,6 @@ public: * @param con The Connection to mark as disposable. */ virtual void mark_disposable(Connection *con) = 0; - /** - * Mark all the existing Connections down. This is equivalent - * to iterating over all Connections and calling mark_down() - * on each. - */ - virtual void mark_down_all() = 0; /** * @} // Connection Management */ From 564075c9ad37edc6f63bbb4c372775d422a9e8c7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Jul 2013 11:28:09 -0700 Subject: [PATCH 06/22] msg/SimpleMessenger: remove duplicated interface docs Document these in the interface, not the implementation; having two copies clutters the header and invites them to get out of sync. Signed-off-by: Sage Weil --- src/msg/Messenger.h | 16 ++- src/msg/SimpleMessenger.h | 217 ++------------------------------------ 2 files changed, 16 insertions(+), 217 deletions(-) diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h index be0dcd1738e..94a737c1c11 100644 --- a/src/msg/Messenger.h +++ b/src/msg/Messenger.h @@ -329,17 +329,13 @@ public: */ virtual int bind(const entity_addr_t& bind_addr) = 0; /** - * This is an optional function for implementations - * to override. For those implementations that do - * implement it, this function shall perform a full - * restart of the Messenger component, whatever that means. - * Other entities who connect to this Messenger post-rebind() - * should perceive it as a new entity which they have not - * previously contacted, and it MUST bind to a different - * address than it did previously. If avoid_port is non-zero - * it must additionally avoid that port. + * This function performs a full restart of the Messenger component, + * whatever that means. Other entities who connect to this + * Messenger post-rebind() should perceive it as a new entity which + * they have not previously contacted, and it MUST bind to a + * different address than it did previously. * - * @param avoid_port An additional port to avoid binding to. + * @param avoid_ports Additional port to avoid binding to. */ virtual int rebind(const set& avoid_ports) { return -EOPNOTSUPP; } /** diff --git a/src/msg/SimpleMessenger.h b/src/msg/SimpleMessenger.h index 4538b0f18bc..6860c6c21a3 100644 --- a/src/msg/SimpleMessenger.h +++ b/src/msg/SimpleMessenger.h @@ -92,28 +92,12 @@ public: /** @defgroup Accessors * @{ */ - /** - * Set the IP this SimpleMessenger is using. This is useful if it's unset - * but another SimpleMessenger on the same interface has already learned its - * IP. Of course, this function does not change the port, since the - * SimpleMessenger always knows the correct setting for that. - * If the SimpleMesssenger's IP is already set, this function is a no-op. - * - * @param addr The IP address to set internally. - */ void set_addr_unknowns(entity_addr_t& addr); - /** - * Get the number of Messages which the SimpleMessenger has received - * but not yet dispatched. - * @return The length of the Dispatch queue. - */ + int get_dispatch_queue_len() { return dispatch_queue.get_queue_len(); } - /** - * Get age of oldest undelivered message - * (0 if the queue is empty) - */ + double get_dispatch_queue_max_age(utime_t now) { return dispatch_queue.get_max_age(now); } @@ -123,52 +107,21 @@ public: * @defgroup Configuration functions * @{ */ - /** - * Set the cluster protocol in use by this daemon. - * This is an init-time function and cannot be called after calling - * start() or bind(). - * - * @param p The cluster protocol to use. Defined externally. - */ void set_cluster_protocol(int p) { assert(!started && !did_bind); cluster_protocol = p; } - /** - * Set a policy which is applied to all peers who do not have a type-specific - * Policy. - * This is an init-time function and cannot be called after calling - * start() or bind(). - * - * @param p The Policy to apply. - */ + void set_default_policy(Policy p) { Mutex::Locker l(policy_lock); default_policy = p; } - /** - * Set a policy which is applied to all peers of the given type. - * This is an init-time function and cannot be called after calling - * start() or bind(). - * - * @param type The peer type this policy applies to. - * @param p The policy to apply. - */ + void set_policy(int type, Policy p) { Mutex::Locker l(policy_lock); policy_map[type] = p; } - /** - * Set a Throttler which is applied to all Messages from the given - * type of peer. - * This is an init-time function and cannot be called after calling - * start() or bind(). - * - * @param type The peer type this Throttler will apply to. - * @param t The Throttler to apply. SimpleMessenger does not take - * ownership of this pointer, but you must not destroy it before - * you destroy SimpleMessenger. - */ + void set_policy_throttlers(int type, Throttle *byte_throttle, Throttle *msg_throttle) { Mutex::Locker l(policy_lock); if (policy_map.count(type)) { @@ -179,50 +132,18 @@ public: default_policy.throttler_messages = msg_throttle; } } - /** - * Bind the SimpleMessenger to a specific address. If bind_addr - * is not completely filled in the system will use the - * valid portions and cycle through the unset ones (eg, the port) - * in an unspecified order. - * - * @param bind_addr The address to bind to. - * @return 0 on success, or -1 if the SimpleMessenger is already running, or - * -errno if an error is returned from a system call. - */ + int bind(const entity_addr_t& bind_addr); - /** - * This function performs a full restart of the SimpleMessenger. It - * calls mark_down_all() and binds to a new port. (If avoid_port - * is set it additionally avoids that specific port.) - * - * @param avoid_port An additional port to avoid binding to. - */ int rebind(const set& avoid_ports); + /** @} Configuration functions */ /** * @defgroup Startup/Shutdown * @{ */ - /** - * Start up the SimpleMessenger. Create worker threads as necessary. - * @return 0 - */ virtual int start(); - /** - * Wait until the SimpleMessenger is ready to shut down (triggered by a - * call to the shutdown() function), then handle - * stopping its threads and cleaning up Pipes and various queues. - * Once this function returns, the SimpleMessenger is fully shut down and - * can be deleted. - */ virtual void wait(); - /** - * Tell the SimpleMessenger to shut down. This function does not - * complete the shutdown; it just triggers it. - * - * @return 0 - */ virtual int shutdown(); /** @} // Startup/Shutdown */ @@ -231,60 +152,18 @@ public: * @defgroup Messaging * @{ */ - /** - * Queue the given Message for the given entity. - * Success in this function does not guarantee Message delivery, only - * success in queueing the Message. Other guarantees may be provided based - * on the Connection policy associated with the dest. - * - * @param m The Message to send. The Messenger consumes a single reference - * when you pass it in. - * @param dest The entity to send the Message to. - * - * @return 0 on success, or -EINVAL if the dest's address is empty. - */ virtual int send_message(Message *m, const entity_inst_t& dest) { return _send_message(m, dest, false); } - /** - * Queue the given Message to send out on the given Connection. - * Success in this function does not guarantee Message delivery, only - * success in queueing the Message (or else a guaranteed-safe drop). - * Other guarantees may be provided based on the Connection policy. - * - * @param m The Message to send. The Messenger consumes a single reference - * when you pass it in. - * @param con The Connection to send the Message out on. - * - * @return 0 on success. - */ + virtual int send_message(Message *m, Connection *con) { return _send_message(m, con, false); } - /** - * Lazily queue the given Message for the given entity. Unlike with - * send_message(), lazy_send_message() will not establish a - * Connection if none exists, re-establish the connection if it - * has broken, or queue the Message if the connection is broken. - * - * @param m The Message to send. The Messenger consumes a single reference - * when you pass it in. - * @param dest The entity to send the Message to. - * - * @return 0 on success, or -EINVAL if the dest's address is empty. - */ + virtual int lazy_send_message(Message *m, const entity_inst_t& dest) { return _send_message(m, dest, true); } - /** - * Lazily queue the given Message for the given Connection. - * - * @param m The Message to send. The Messenger consumes a single reference - * when you pass it in. - * @param con The Connection to send the Message out on. - * - * @return 0. - */ + virtual int lazy_send_message(Message *m, Connection *con) { return _send_message(m, con, true); } @@ -294,90 +173,14 @@ public: * @defgroup Connection Management * @{ */ - /** - * Get the Connection object associated with a given entity. If a - * Connection does not exist, create one and establish a logical connection. - * The caller owns a reference when this returns. Call ->put() when you're - * done! - * - * @param dest The entity to get a connection for. - * @return The requested Connection, as a pointer whose reference you own. - */ virtual ConnectionRef get_connection(const entity_inst_t& dest); virtual ConnectionRef get_loopback_connection(); - /** - * Send a "keepalive" ping to the given dest, if it has a working Connection. - * If the Messenger doesn't already have a Connection, or if the underlying - * connection has broken, this function does nothing. - * - * @param dest The entity to send the keepalive to. - * @return 0, or -EINVAL if we don't already have a Connection, or - * -EPIPE if a Pipe for the dest doesn't exist. - */ virtual int send_keepalive(const entity_inst_t& addr); - /** - * Send a "keepalive" ping along the given Connection, if it's working. - * If the underlying connection has broken, this function does nothing. - * - * @param dest The entity to send the keepalive to. - * @return 0, or -EPIPE if the Connection doesn't have a running Pipe. - */ virtual int send_keepalive(Connection *con); - /** - * Mark down a Connection to a remote. This will cause us to - * discard our outgoing queue for them, and if they try - * to reconnect they will discard their queue when we - * inform them of the session reset. If there is no - * Connection to the given dest, it is a no-op. - * It does not generate any notifications to the Dispatcher. - * - * @param a The address to mark down. - */ virtual void mark_down(const entity_addr_t& addr); - /** - * Mark down the given Connection. This will cause us to - * discard its outgoing queue, and if the endpoint tries - * to reconnect they will discard their queue when we - * inform them of the session reset. - * It does not generate any notifications to the Dispatcher. - * - * @param con The Connection to mark down. - */ virtual void mark_down(Connection *con); - /** - * Unlike mark_down, this function will try and deliver - * all messages before ending the connection, and it will use - * the Pipe's existing semantics to do so. Once the Messages - * all been sent out (and acked, if using reliable delivery) - * the Connection will be closed. - * This function means that you will get standard delivery to endpoints, - * and then the Connection will be cleaned up. It does not - * generate any notifications to the Dispatcher. - * - * @param con The Connection to mark down. - */ virtual void mark_down_on_empty(Connection *con); - /** - * Mark a Connection as "disposable", setting it to lossy - * (regardless of initial Policy). Unlike mark_down_on_empty() - * this does not immediately close the Connection once - * Messages have been delivered, so as long as there are no errors you can - * continue to receive responses; but it will not attempt - * to reconnect for message delivery or preserve your old - * delivery semantics, either. - * You can compose this with mark_down, in which case the Pipe - * will make sure to send all Messages and wait for an ack before - * closing, but if there's a failure it will simply shut down. It - * does not generate any notifications to the Dispatcher. - * - * @param con The Connection to mark as disposable. - */ virtual void mark_disposable(Connection *con); - /** - * Mark all the existing Connections down. This is equivalent - * to iterating over all Connections and calling mark_down() - * on each. - */ virtual void mark_down_all(); /** @} // Connection Management */ protected: From 30de04066d874921b7d721bfcb0f363df8a735f1 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Jul 2013 14:44:17 -0700 Subject: [PATCH 07/22] mon: break con <-> session ref cycle in mon even if shutting down If we get a reset during shutdown, we should still break the cycle to avoid tripping the valgrind leak detection. Note that we are touching no internal Monitor state here and the locking has not changed. Signed-off-by: Sage Weil --- src/mon/Monitor.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 9ae3e93a111..3ff0a6418b2 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -3242,9 +3242,6 @@ bool Monitor::ms_handle_reset(Connection *con) { dout(10) << "ms_handle_reset " << con << " " << con->get_peer_addr() << dendl; - if (is_shutdown()) - return false; - // ignore lossless monitor sessions if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) return false; @@ -3253,6 +3250,12 @@ bool Monitor::ms_handle_reset(Connection *con) if (!s) return false; + // break any con <-> session ref cycle + s->con->set_priv(NULL); + + if (is_shutdown()) + return false; + Mutex::Locker l(lock); dout(10) << "reset/close on session " << s->inst << dendl; From 000d4d38a47871002a870bc815893833dc5a5773 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Jul 2013 14:46:57 -0700 Subject: [PATCH 08/22] mon: mark_down session by con, not addr We have the ConnectionRef here; use it. This avoids generating a spurious RESET event for the connection. Signed-off-by: Sage Weil --- src/mon/Monitor.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 3ff0a6418b2..90750dd7b11 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -3477,16 +3477,16 @@ void Monitor::tick() continue; if (!s->until.is_zero() && s->until < now) { - dout(10) << " trimming session " << s->inst + dout(10) << " trimming session " << s->con << " " << s->inst << " (until " << s->until << " < now " << now << ")" << dendl; - messenger->mark_down(s->inst.addr); + messenger->mark_down(s->con); remove_session(s); } else if (!exited_quorum.is_zero()) { if (now > (exited_quorum + 2 * g_conf->mon_lease)) { // boot the client Session because we've taken too long getting back in - dout(10) << " trimming session " << s->inst - << " because we've been out of quorum too long" << dendl; - messenger->mark_down(s->inst.addr); + dout(10) << " trimming session " << s->con << " " << s->inst + << " because we've been out of quorum too long" << dendl; + messenger->mark_down(s->con); remove_session(s); } } From 11c47cc4e3ddac54a5b2bb2623cf59b77917f555 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Jul 2013 14:50:32 -0700 Subject: [PATCH 09/22] client: mark_down by con We have the con handy; use it. This avoids generate a spurious RESET event, which we do not need or do anything useful with. Note that in this case we are not attaching anything to the Connection priv field. Signed-off-by: Sage Weil --- src/client/Client.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index ae7ddf65db4..eb7502c1530 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -1882,7 +1882,7 @@ void Client::handle_mds_map(MMDSMap* m) int newstate = mdsmap->get_state(p->first); if (!mdsmap->is_up(p->first) || mdsmap->get_inst(p->first) != p->second->inst) { - messenger->mark_down(p->second->inst.addr); + messenger->mark_down(p->second->con); if (mdsmap->is_up(p->first)) p->second->inst = mdsmap->get_inst(p->first); } else if (oldstate == newstate) From d1b83be14c4d6dc996e352b5fe6558f50176cbab Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Jul 2013 15:05:22 -0700 Subject: [PATCH 10/22] msgr: mark addr-based [lazy_]send_message and get_connection deprecated Signed-off-by: Sage Weil --- src/msg/Messenger.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h index 94a737c1c11..42feaf227df 100644 --- a/src/msg/Messenger.h +++ b/src/msg/Messenger.h @@ -388,6 +388,9 @@ public: * when you pass it in. * @param dest The entity to send the Message to. * + * DEPRECATED: please do not use this interface for any new code; + * use the Connection* variant. + * * @return 0 on success, or -errno on failure. */ virtual int send_message(Message *m, const entity_inst_t& dest) = 0; @@ -417,6 +420,9 @@ public: * when you pass it in. * @param dest The entity to send the Message to. * + * DEPRECATED: please do not use this interface for any new code; + * use the Connection* variant. + * * @return 0. */ virtual int lazy_send_message(Message *m, const entity_inst_t& dest) = 0; @@ -483,6 +489,9 @@ public: * * This generates a RESET notification to the Dispatcher. * + * DEPRECATED: please do not use this interface for any new code; + * use the Connection* variant. + * * @param a The address to mark down. */ virtual void mark_down(const entity_addr_t& a) = 0; From 0de708516c48907a548856d64d1657d2fc576e32 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 18 Jul 2013 16:58:50 -0700 Subject: [PATCH 11/22] mon/MonClient: fix small leak We need to delete the version_req_d here. Signed-off-by: Sage Weil --- src/mon/MonClient.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc index 57f30063fa7..8139b0259bf 100644 --- a/src/mon/MonClient.cc +++ b/src/mon/MonClient.cc @@ -516,6 +516,7 @@ void MonClient::_reopen_session(int rank, string name) // throw out version check requests while (!version_requests.empty()) { finisher.queue(version_requests.begin()->second->context, -EAGAIN); + delete version_requests.begin()->second; version_requests.erase(version_requests.begin()); } From 4f4bdbd5cb84bc84fd578d56fc3340ef4173b025 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Wed, 17 Jul 2013 16:34:50 -0700 Subject: [PATCH 12/22] rgw: fix bucket re-creation on secondary region We had a problem with bucket recreation, where we identified that bucket has already existed, but missed the fact that it's the same bucket, so removal of the bucket index was wrong. Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_rados.cc | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 067a1adbabd..308adff5ae9 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -1835,21 +1835,7 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket, info.creation_time = creation_time; ret = put_linked_bucket_info(info, exclusive, 0, &attrs, true); if (ret == -EEXIST) { - /* remove bucket meta instance */ - string entry; - get_bucket_instance_entry(bucket, entry); - r = rgw_bucket_instance_remove_entry(this, entry, &info.objv_tracker); - if (r < 0) - return r; - - /* remove bucket index */ - librados::IoCtx index_ctx; // context for new bucket - int r = open_bucket_index_ctx(bucket, index_ctx); - if (r < 0) - return r; - - /* we need to reread the info and return it, caller will have a use for it */ - index_ctx.remove(dir_oid); + /* we need to reread the info and return it, caller will have a use for it */ r = get_bucket_info(NULL, bucket.name, info, NULL, NULL); if (r < 0) { if (r == -ENOENT) { @@ -1858,6 +1844,24 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket, ldout(cct, 0) << "get_bucket_info returned " << r << dendl; return r; } + + /* only remove it if it's a different bucket instance */ + if (info.bucket.bucket_id != bucket.bucket_id) { + /* remove bucket meta instance */ + string entry; + get_bucket_instance_entry(bucket, entry); + r = rgw_bucket_instance_remove_entry(this, entry, &info.objv_tracker); + if (r < 0) + return r; + + /* remove bucket index */ + librados::IoCtx index_ctx; // context for new bucket + int r = open_bucket_index_ctx(bucket, index_ctx); + if (r < 0) + return r; + + index_ctx.remove(dir_oid); + } /* ret == -ENOENT here */ } return ret; From 2e518235636149dae50870fc897459b27b24e31d Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Wed, 17 Jul 2013 17:20:30 -0700 Subject: [PATCH 13/22] rgw: forward x_amz_meta headers when forwarding a request Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_rest_client.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/rgw/rgw_rest_client.cc b/src/rgw/rgw_rest_client.cc index 3e712e7e023..2075e535525 100644 --- a/src/rgw/rgw_rest_client.cc +++ b/src/rgw/rgw_rest_client.cc @@ -224,6 +224,11 @@ int RGWRESTSimpleRequest::forward_request(RGWAccessKey& key, req_info& info, siz headers.push_back(make_pair(iter->first, iter->second)); } + map& meta_map = new_info.x_meta_map; + for (iter = meta_map.begin(); iter != meta_map.end(); ++iter) { + headers.push_back(make_pair(iter->first, iter->second)); + } + string params_str; map& args = new_info.args.get_params(); get_params_str(args, params_str); From 989a4d93d88d5342c8369c7f9c22af66601bfdbf Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 18 Jul 2013 10:48:39 -0700 Subject: [PATCH 14/22] rgw: adjust error for bucket removal on secondary region Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_op.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 05c31d61689..f0c57e78cab 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -1074,8 +1074,13 @@ void RGWDeleteBucket::execute() bufferlist in_data; JSONParser jp; ret = forward_request_to_master(s, store, in_data, &jp); - if (ret < 0) + if (ret < 0) { + if (ret == -ENOENT) { /* adjust error, + we want to return with NoSuchBucket and not NoSuchKey */ + ret = -ERR_NO_SUCH_BUCKET; + } return; + } JSONDecoder::decode_json("object_ver", objv_tracker.read_version, &jp); } From 85f3f09b0a064f02c09ce7e014bddfe2b6217cbd Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 18 Jul 2013 11:16:15 -0700 Subject: [PATCH 15/22] rgw: forward delete bucket request to master after removal We can only forward the bucket removal to the master if it was successfully removed locally. The master region has no knowledge about whether the bucket can be removed or not, e.g., there are still objects in the bucket. If we send it to the master first, then it'll happily remove it even though it might fail in the end. Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_op.cc | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index f0c57e78cab..94ce60c4876 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -1070,6 +1070,19 @@ void RGWDeleteBucket::execute() if (!s->bucket_name) return; + ret = store->delete_bucket(s->bucket, objv_tracker); + + if (ret == 0) { + ret = rgw_unlink_bucket(store, s->user.user_id, s->bucket.name, false); + if (ret < 0) { + ldout(s->cct, 0) << "WARNING: failed to unlink bucket: ret=" << ret << dendl; + } + } + + if (ret < 0) { + return; + } + if (!store->region.is_master) { bufferlist in_data; JSONParser jp; @@ -1085,14 +1098,6 @@ void RGWDeleteBucket::execute() JSONDecoder::decode_json("object_ver", objv_tracker.read_version, &jp); } - ret = store->delete_bucket(s->bucket, objv_tracker); - - if (ret == 0) { - ret = rgw_unlink_bucket(store, s->user.user_id, s->bucket.name, false); - if (ret < 0) { - ldout(s->cct, 0) << "WARNING: failed to remove bucket: ret=" << ret << dendl; - } - } } int RGWPutObj::verify_permission() From 89ecba209b4f7e8932e92768ac2f028fa2d59b71 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 18 Jul 2013 13:07:55 -0700 Subject: [PATCH 16/22] rgw: remove s->objv_tracker was never initialized correctly anyway. It was only supposed to be used for buckets, but it was never initialized in that case. Using s->bucket_info.objv_tracker instead. Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_common.h | 2 -- src/rgw/rgw_op.cc | 30 ++++++++++++++---------------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index c885724efbd..9d7c3d41542 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -807,8 +807,6 @@ struct req_state { map bucket_attrs; bool bucket_exists; - RGWObjVersionTracker objv_tracker; - bool has_bad_meta; RGWUserInfo user; diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 94ce60c4876..1dcd8fb8c93 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -178,8 +178,7 @@ static int decode_policy(CephContext *cct, bufferlist& bl, RGWAccessControlPolic static int get_bucket_policy_from_attr(CephContext *cct, RGWRados *store, void *ctx, RGWBucketInfo& bucket_info, map& bucket_attrs, - RGWAccessControlPolicy *policy, rgw_obj& obj, - RGWObjVersionTracker *objv_tracker) + RGWAccessControlPolicy *policy, rgw_obj& obj) { int ret; map::iterator aiter = bucket_attrs.find(RGW_ATTR_ACL); @@ -203,13 +202,12 @@ static int get_bucket_policy_from_attr(CephContext *cct, RGWRados *store, void * static int get_obj_policy_from_attr(CephContext *cct, RGWRados *store, void *ctx, RGWBucketInfo& bucket_info, map& bucket_attrs, - RGWAccessControlPolicy *policy, rgw_obj& obj, - RGWObjVersionTracker *objv_tracker) + RGWAccessControlPolicy *policy, rgw_obj& obj) { bufferlist bl; int ret = 0; - ret = store->get_attr(ctx, obj, RGW_ATTR_ACL, bl, objv_tracker); + ret = store->get_attr(ctx, obj, RGW_ATTR_ACL, bl, NULL); if (ret >= 0) { ret = decode_policy(cct, bl, policy); if (ret < 0) @@ -237,7 +235,7 @@ static int get_obj_policy_from_attr(CephContext *cct, RGWRados *store, void *ctx */ static int get_policy_from_attr(CephContext *cct, RGWRados *store, void *ctx, RGWBucketInfo& bucket_info, map& bucket_attrs, - RGWAccessControlPolicy *policy, rgw_obj& obj, RGWObjVersionTracker *objv_tracker) + RGWAccessControlPolicy *policy, rgw_obj& obj) { if (obj.bucket.name.empty()) { return 0; @@ -245,10 +243,10 @@ static int get_policy_from_attr(CephContext *cct, RGWRados *store, void *ctx, if (obj.object.empty()) { return get_bucket_policy_from_attr(cct, store, ctx, bucket_info, bucket_attrs, - policy, obj, objv_tracker); + policy, obj); } return get_obj_policy_from_attr(cct, store, ctx, bucket_info, bucket_attrs, - policy, obj, objv_tracker); + policy, obj); } static int get_obj_attrs(RGWRados *store, struct req_state *s, rgw_obj& obj, map& attrs, @@ -282,14 +280,14 @@ static int read_policy(RGWRados *store, struct req_state *s, } else { obj.init(bucket, oid); } - int ret = get_policy_from_attr(s->cct, store, s->obj_ctx, bucket_info, bucket_attrs, policy, obj, &s->objv_tracker); + int ret = get_policy_from_attr(s->cct, store, s->obj_ctx, bucket_info, bucket_attrs, policy, obj); if (ret == -ENOENT && object.size()) { /* object does not exist checking the bucket's ACL to make sure that we send a proper error code */ RGWAccessControlPolicy bucket_policy(s->cct); string no_object; rgw_obj no_obj(bucket, no_object); - ret = get_policy_from_attr(s->cct, store, s->obj_ctx, bucket_info, bucket_attrs, &bucket_policy, no_obj, &s->objv_tracker); + ret = get_policy_from_attr(s->cct, store, s->obj_ctx, bucket_info, bucket_attrs, &bucket_policy, no_obj); if (ret < 0) return ret; string& owner = bucket_policy.get_owner().get_id(); @@ -976,7 +974,7 @@ void RGWCreateBucket::execute() s->bucket_owner.set_name(s->user.display_name); if (s->bucket_exists) { r = get_policy_from_attr(s->cct, store, s->obj_ctx, s->bucket_info, s->bucket_attrs, - &old_policy, obj, &s->objv_tracker); + &old_policy, obj); if (r >= 0) { if (old_policy.get_owner().get_id().compare(s->user.user_id) != 0) { ret = -EEXIST; @@ -1070,7 +1068,7 @@ void RGWDeleteBucket::execute() if (!s->bucket_name) return; - ret = store->delete_bucket(s->bucket, objv_tracker); + ret = store->delete_bucket(s->bucket, s->bucket_info.objv_tracker); if (ret == 0) { ret = rgw_unlink_bucket(store, s->user.user_id, s->bucket.name, false); @@ -1465,7 +1463,7 @@ void RGWPutMetadata::execute() rgw_get_request_metadata(s->cct, s->info, attrs); /* no need to track object versioning, need it for bucket's data only */ - RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->objv_tracker); + RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker); /* check if obj exists, read orig attrs */ ret = get_obj_attrs(store, s, obj, orig_attrs, NULL, ptracker); @@ -1783,7 +1781,7 @@ void RGWPutACLs::execute() *_dout << dendl; } - RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->objv_tracker); + RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker); new_policy.encode(bl); obj.init(s->bucket, s->object_str); @@ -1861,7 +1859,7 @@ void RGWPutCORS::execute() *_dout << dendl; } - RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->objv_tracker); + RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker); string no_obj; cors_config->encode(bl); @@ -1893,7 +1891,7 @@ void RGWDeleteCORS::execute() map orig_attrs, attrs, rmattrs; map::iterator iter; - RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->objv_tracker); + RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker); /* check if obj exists, read orig attrs */ ret = get_obj_attrs(store, s, obj, orig_attrs, NULL, ptracker); From 7cd0bd85d458b2cf8f48036362890cf3b22895b3 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 18 Jul 2013 17:40:52 -0700 Subject: [PATCH 17/22] rgw: bucket entry point object ver fixes Multiple fixes: - sync master, secondary entry point ver on creation - use correct entry point version when removing entry point - check correct version on bucket removal Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_cache.h | 8 ++++---- src/rgw/rgw_common.h | 1 + src/rgw/rgw_metadata.cc | 2 +- src/rgw/rgw_op.cc | 35 +++++++++++++++++++++++++++-------- src/rgw/rgw_op.h | 1 + src/rgw/rgw_rados.cc | 30 +++++++++++++++++++++++------- src/rgw/rgw_rados.h | 7 ++++--- src/rgw/rgw_rest_conn.cc | 8 +++++++- src/rgw/rgw_rest_conn.h | 2 +- src/rgw/rgw_rest_s3.cc | 1 + 10 files changed, 70 insertions(+), 25 deletions(-) diff --git a/src/rgw/rgw_cache.h b/src/rgw/rgw_cache.h index 1a36e1a78d2..b6c4e15eede 100644 --- a/src/rgw/rgw_cache.h +++ b/src/rgw/rgw_cache.h @@ -208,7 +208,7 @@ public: int obj_stat(void *ctx, rgw_obj& obj, uint64_t *psize, time_t *pmtime, uint64_t *epoch, map *attrs, bufferlist *first_chunk, RGWObjVersionTracker *objv_tracker); - int delete_obj(void *ctx, rgw_obj& obj); + int delete_obj(void *ctx, rgw_obj& obj, RGWObjVersionTracker *objv_tracker); }; template @@ -224,13 +224,13 @@ void RGWCache::normalize_bucket_and_obj(rgw_bucket& src_bucket, string& src_o } template -int RGWCache::delete_obj(void *ctx, rgw_obj& obj) +int RGWCache::delete_obj(void *ctx, rgw_obj& obj, RGWObjVersionTracker *objv_tracker) { rgw_bucket bucket; string oid; normalize_bucket_and_obj(obj.bucket, obj.object, bucket, oid); if (bucket.name[0] != '.') - return T::delete_obj(ctx, obj); + return T::delete_obj(ctx, obj, objv_tracker); string name = normal_name(obj); cache.remove(name); @@ -238,7 +238,7 @@ int RGWCache::delete_obj(void *ctx, rgw_obj& obj) ObjectCacheInfo info; distribute_cache(name, obj, info, REMOVE_OBJ); - return T::delete_obj(ctx, obj); + return T::delete_obj(ctx, obj, objv_tracker); } template diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 9d7c3d41542..1d3596d4418 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -653,6 +653,7 @@ struct RGWBucketInfo string placement_rule; bool has_instance_obj; RGWObjVersionTracker objv_tracker; /* we don't need to serialize this, for runtime tracking */ + obj_version ep_objv; /* entry point object version, for runtime tracking only */ void encode(bufferlist& bl) const { ENCODE_START(8, 4, bl); diff --git a/src/rgw/rgw_metadata.cc b/src/rgw/rgw_metadata.cc index c370addc293..7be73e6ca0c 100644 --- a/src/rgw/rgw_metadata.cc +++ b/src/rgw/rgw_metadata.cc @@ -588,7 +588,7 @@ int RGWMetadataManager::remove_entry(RGWMetadataHandler *handler, string& key, R rgw_obj obj(bucket, oid); - ret = store->delete_obj(NULL, obj); + ret = store->delete_obj(NULL, obj, objv_tracker); /* cascading ret into post_modify() */ ret = post_modify(handler, section, key, log_data, objv_tracker, ret); diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 1dcd8fb8c93..644f8a8921d 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -920,7 +920,7 @@ int RGWCreateBucket::verify_permission() return 0; } -static int forward_request_to_master(struct req_state *s, RGWRados *store, bufferlist& in_data, JSONParser *jp) +static int forward_request_to_master(struct req_state *s, obj_version *objv, RGWRados *store, bufferlist& in_data, JSONParser *jp) { if (!store->rest_master_conn) { ldout(s->cct, 0) << "rest connection is invalid" << dendl; @@ -929,7 +929,7 @@ static int forward_request_to_master(struct req_state *s, RGWRados *store, buffe ldout(s->cct, 0) << "sending create_bucket request to master region" << dendl; bufferlist response; #define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response - int ret = store->rest_master_conn->forward(s->user.user_id, s->info, MAX_REST_RESPONSE, &in_data, &response); + int ret = store->rest_master_conn->forward(s->user.user_id, s->info, objv, MAX_REST_RESPONSE, &in_data, &response); if (ret < 0) return ret; @@ -989,10 +989,11 @@ void RGWCreateBucket::execute() if (!store->region.is_master) { JSONParser jp; - ret = forward_request_to_master(s, store, in_data, &jp); + ret = forward_request_to_master(s, NULL, store, in_data, &jp); if (ret < 0) return; + JSONDecoder::decode_json("entry_point_object_ver", ep_objv, &jp); JSONDecoder::decode_json("object_ver", objv, &jp); JSONDecoder::decode_json("bucket_info", master_info, &jp); ldout(s->cct, 20) << "parsed: objv.tag=" << objv.tag << " objv.ver=" << objv.ver << dendl; @@ -1022,7 +1023,7 @@ void RGWCreateBucket::execute() s->bucket.name = s->bucket_name_str; ret = store->create_bucket(s->user, s->bucket, region_name, placement_rule, attrs, info, pobjv, - creation_time, pmaster_bucket, true); + &ep_objv, creation_time, pmaster_bucket, true); /* continue if EEXIST and create_bucket will fail below. this way we can recover * from a partial create by retrying it. */ ldout(s->cct, 20) << "rgw_create_bucket returned ret=" << ret << " bucket=" << s->bucket << dendl; @@ -1068,7 +1069,27 @@ void RGWDeleteBucket::execute() if (!s->bucket_name) return; - ret = store->delete_bucket(s->bucket, s->bucket_info.objv_tracker); + RGWObjVersionTracker ot; + ot.read_version = s->bucket_info.ep_objv; + + if (s->system_request) { + string tag = s->info.args.get(RGW_SYS_PARAM_PREFIX "tag"); + string ver_str = s->info.args.get(RGW_SYS_PARAM_PREFIX "ver"); + if (!tag.empty()) { + ot.read_version.tag = tag; + uint64_t ver; + string err; + ver = strict_strtol(ver_str.c_str(), 10, &err); + if (!err.empty()) { + ldout(s->cct, 0) << "failed to parse ver param" << dendl; + ret = -EINVAL; + return; + } + ot.read_version.ver = ver; + } + } + + ret = store->delete_bucket(s->bucket, ot); if (ret == 0) { ret = rgw_unlink_bucket(store, s->user.user_id, s->bucket.name, false); @@ -1084,7 +1105,7 @@ void RGWDeleteBucket::execute() if (!store->region.is_master) { bufferlist in_data; JSONParser jp; - ret = forward_request_to_master(s, store, in_data, &jp); + ret = forward_request_to_master(s, &ot.read_version, store, in_data, &jp); if (ret < 0) { if (ret == -ENOENT) { /* adjust error, we want to return with NoSuchBucket and not NoSuchKey */ @@ -1092,8 +1113,6 @@ void RGWDeleteBucket::execute() } return; } - - JSONDecoder::decode_json("object_ver", objv_tracker.read_version, &jp); } } diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h index 7a2e4920ba8..7bca53b5e43 100644 --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@ -243,6 +243,7 @@ protected: string location_constraint; string placement_rule; RGWBucketInfo info; + obj_version ep_objv; bufferlist in_data; diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 308adff5ae9..a8061e99bf1 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -1776,6 +1776,7 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket, map& attrs, RGWBucketInfo& info, obj_version *pobjv, + obj_version *pep_objv, time_t creation_time, rgw_bucket *pmaster_bucket, bool exclusive) @@ -1833,7 +1834,7 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket, time(&info.creation_time); else info.creation_time = creation_time; - ret = put_linked_bucket_info(info, exclusive, 0, &attrs, true); + ret = put_linked_bucket_info(info, exclusive, 0, pep_objv, &attrs, true); if (ret == -EEXIST) { /* we need to reread the info and return it, caller will have a use for it */ r = get_bucket_info(NULL, bucket.name, info, NULL, NULL); @@ -3074,7 +3075,7 @@ int RGWRados::defer_gc(void *ctx, rgw_obj& obj) * obj: name of the object to delete * Returns: 0 on success, -ERR# otherwise. */ -int RGWRados::delete_obj_impl(void *ctx, rgw_obj& obj) +int RGWRados::delete_obj_impl(void *ctx, rgw_obj& obj, RGWObjVersionTracker *objv_tracker) { rgw_bucket bucket; std::string oid, key; @@ -3100,6 +3101,11 @@ int RGWRados::delete_obj_impl(void *ctx, rgw_obj& obj) r = prepare_update_index(state, bucket, CLS_RGW_OP_DEL, obj, tag); if (r < 0) return r; + + if (objv_tracker) { + objv_tracker->prepare_op_for_write(&op); + } + cls_refcount_put(op, tag, true); r = io_ctx.operate(oid, &op); bool removed = (r >= 0); @@ -3133,11 +3139,11 @@ int RGWRados::delete_obj_impl(void *ctx, rgw_obj& obj) return 0; } -int RGWRados::delete_obj(void *ctx, rgw_obj& obj) +int RGWRados::delete_obj(void *ctx, rgw_obj& obj, RGWObjVersionTracker *objv_tracker) { int r; - r = delete_obj_impl(ctx, obj); + r = delete_obj_impl(ctx, obj, objv_tracker); if (r == -ECANCELED) r = 0; @@ -4606,7 +4612,8 @@ int RGWRados::get_bucket_info(void *ctx, string& bucket_name, RGWBucketInfo& inf RGWBucketEntryPoint entry_point; time_t ep_mtime; - int ret = get_bucket_entrypoint_info(ctx, bucket_name, entry_point, NULL, &ep_mtime); + RGWObjVersionTracker ot; + int ret = get_bucket_entrypoint_info(ctx, bucket_name, entry_point, &ot, &ep_mtime); if (ret < 0) { info.bucket.name = bucket_name; /* only init this field */ return ret; @@ -4614,6 +4621,7 @@ int RGWRados::get_bucket_info(void *ctx, string& bucket_name, RGWBucketInfo& inf if (entry_point.has_bucket_info) { info = entry_point.old_bucket_info; + info.ep_objv = ot.read_version; ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl; return 0; } @@ -4629,6 +4637,7 @@ int RGWRados::get_bucket_info(void *ctx, string& bucket_name, RGWBucketInfo& inf get_bucket_meta_oid(entry_point.bucket, oid); ret = get_bucket_instance_from_oid(ctx, oid, info, pmtime, pattrs); + info.ep_objv = ot.read_version; if (ret < 0) { info.bucket.name = bucket_name; return ret; @@ -4657,7 +4666,7 @@ int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, return rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime); } -int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, time_t mtime, +int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, time_t mtime, obj_version *pep_objv, map *pattrs, bool create_entry_point) { bufferlist bl; @@ -4678,7 +4687,14 @@ int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, time_t entry_point.creation_time = info.creation_time; entry_point.linked = true; RGWObjVersionTracker ot; - ot.generate_new_write_ver(cct); + if (pep_objv && !pep_objv->tag.empty()) { + ot.write_version = *pep_objv; + } else { + ot.generate_new_write_ver(cct); + if (pep_objv) { + *pep_objv = ot.write_version; + } + } ret = put_bucket_entrypoint_info(info.bucket.name, entry_point, exclusive, ot, mtime); if (ret < 0) return ret; diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index fb1a1756ba8..99e66a91b1c 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -843,7 +843,7 @@ class RGWRados v.push_back(info); return clone_objs(ctx, dst_obj, v, attrs, category, pmtime, true, false); } - int delete_obj_impl(void *ctx, rgw_obj& src_obj); + int delete_obj_impl(void *ctx, rgw_obj& src_obj, RGWObjVersionTracker *objv_tracker); int complete_atomic_overwrite(RGWRadosCtx *rctx, RGWObjState *state, rgw_obj& obj); int update_placement_map(); @@ -983,6 +983,7 @@ public: map& attrs, RGWBucketInfo& bucket_info, obj_version *pobjv, + obj_version *pep_objv, time_t creation_time, rgw_bucket *master_bucket, bool exclusive = true); @@ -1142,7 +1143,7 @@ public: int bucket_suspended(rgw_bucket& bucket, bool *suspended); /** Delete an object.*/ - virtual int delete_obj(void *ctx, rgw_obj& src_obj); + virtual int delete_obj(void *ctx, rgw_obj& src_obj, RGWObjVersionTracker *objv_tracker = NULL); /** Remove an object from the bucket index */ int delete_obj_index(rgw_obj& obj); @@ -1294,7 +1295,7 @@ public: virtual int get_bucket_info(void *ctx, string& bucket_name, RGWBucketInfo& info, time_t *pmtime, map *pattrs = NULL); - virtual int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, time_t mtime, + virtual int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, time_t mtime, obj_version *pep_objv, map *pattrs, bool create_entry_point); int cls_rgw_init_index(librados::IoCtx& io_ctx, librados::ObjectWriteOperation& op, string& oid); diff --git a/src/rgw/rgw_rest_conn.cc b/src/rgw/rgw_rest_conn.cc index 5caf3ce0bcd..35a8ac258e6 100644 --- a/src/rgw/rgw_rest_conn.cc +++ b/src/rgw/rgw_rest_conn.cc @@ -27,7 +27,7 @@ int RGWRESTConn::get_url(string& endpoint) return 0; } -int RGWRESTConn::forward(const string& uid, req_info& info, size_t max_response, bufferlist *inbl, bufferlist *outbl) +int RGWRESTConn::forward(const string& uid, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl) { string url; int ret = get_url(url); @@ -36,6 +36,12 @@ int RGWRESTConn::forward(const string& uid, req_info& info, size_t max_response, list > params; params.push_back(make_pair(RGW_SYS_PARAM_PREFIX "uid", uid)); params.push_back(make_pair(RGW_SYS_PARAM_PREFIX "region", region)); + if (objv) { + params.push_back(make_pair(RGW_SYS_PARAM_PREFIX "tag", objv->tag)); + char buf[16]; + snprintf(buf, sizeof(buf), "%lld", (long long)objv->ver); + params.push_back(make_pair(RGW_SYS_PARAM_PREFIX "ver", buf)); + } RGWRESTSimpleRequest req(cct, url, NULL, ¶ms); return req.forward_request(key, info, max_response, inbl, outbl); } diff --git a/src/rgw/rgw_rest_conn.h b/src/rgw/rgw_rest_conn.h index 6fe572d2cf7..4a0b6087d26 100644 --- a/src/rgw/rgw_rest_conn.h +++ b/src/rgw/rgw_rest_conn.h @@ -20,7 +20,7 @@ public: int get_url(string& endpoint); /* sync request */ - int forward(const string& uid, req_info& info, size_t max_response, bufferlist *inbl, bufferlist *outbl); + int forward(const string& uid, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl); /* async request */ int put_obj_init(const string& uid, rgw_obj& obj, uint64_t obj_size, diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc index 6e482e8a251..9e8ec3f88a5 100644 --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@ -449,6 +449,7 @@ void RGWCreateBucket_ObjStore_S3::send_response() JSONFormatter f; /* use json formatter for system requests output */ f.open_object_section("info"); + encode_json("entry_point_object_ver", ep_objv, &f); encode_json("object_ver", info.objv_tracker.read_version, &f); encode_json("bucket_info", info, &f); f.close_section(); From 0024e5aa2240c37da11488264d03b52dbe8b9cab Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 18 Jul 2013 21:50:51 -0700 Subject: [PATCH 18/22] rgw: fix time parsing in replica log Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_rest_replica_log.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/rgw/rgw_rest_replica_log.cc b/src/rgw/rgw_rest_replica_log.cc index 863a979a22e..600a8edb78c 100644 --- a/src/rgw/rgw_rest_replica_log.cc +++ b/src/rgw/rgw_rest_replica_log.cc @@ -27,13 +27,13 @@ #define REPLICA_INPUT_MAX_LEN (512*1024) static int parse_to_utime(string& in, utime_t& out) { - struct tm tm; - - if (!parse_iso8601(in.c_str(), &tm)) - return -EINVAL; + uint64_t sec = 0; + uint64_t nsec = 0; + int ret = utime_t::parse_date(in.c_str(), &sec, &nsec); + if (ret < 0) + return ret; - time_t tt = mktime(&tm); - out = utime_t(tt, 0); + out = utime_t(sec, nsec); return 0; } From e4d2787b023a10d782438e1dc9bb32be8d8ccd76 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Wed, 17 Jul 2013 16:14:02 -0700 Subject: [PATCH 19/22] rgw-admin: link / unlink should report errors Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_admin.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc index 0b7cb143400..239ec016389 100644 --- a/src/rgw/rgw_admin.cc +++ b/src/rgw/rgw_admin.cc @@ -1387,11 +1387,19 @@ int main(int argc, char **argv) } if (opt_cmd == OPT_BUCKET_LINK) { - RGWBucketAdminOp::link(store, bucket_op); + int r = RGWBucketAdminOp::link(store, bucket_op); + if (r < 0) { + cerr << "failure: " << cpp_strerror(-r) << std::endl; + return -r; + } } if (opt_cmd == OPT_BUCKET_UNLINK) { - RGWBucketAdminOp::unlink(store, bucket_op); + int r = RGWBucketAdminOp::unlink(store, bucket_op); + if (r < 0) { + cerr << "failure: " << cpp_strerror(-r) << std::endl; + return -r; + } } if (opt_cmd == OPT_TEMP_REMOVE) { From 4e05786a58a1218e1b68d5fbcddefa7a72ac03ce Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 19 Jul 2013 09:44:43 -0700 Subject: [PATCH 20/22] rgw: replace logic that compares regions The logic was a bit broken. Basically, we want to make sure that region names are the same. However, if region name is not set then we need to check whether it's the master region. This can happen in upgrade cases where originally we didn't have a region name set. Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_op.cc | 6 ++---- src/rgw/rgw_rados.cc | 20 +++++++++++--------- src/rgw/rgw_rados.h | 1 + 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 644f8a8921d..6a9d2319de7 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -334,8 +334,7 @@ static int rgw_build_policies(RGWRados *store, struct req_state *s, bool only_bu ret = store->get_bucket_info(s->obj_ctx, copy_source_str, source_info, NULL); if (ret == 0) { string& region = source_info.region; - s->local_source = (region.empty() && store->region.is_master) || - (region == store->region.name); + s->local_source = store->region.equals(region); } } @@ -362,8 +361,7 @@ static int rgw_build_policies(RGWRados *store, struct req_state *s, bool only_bu s->bucket_owner = s->bucket_acl->get_owner(); string& region = s->bucket_info.region; - if (s->bucket_exists && ((region.empty() && !store->region.is_master) || - (region != store->region.name))) { + if (s->bucket_exists && !store->region.equals(region)) { ldout(s->cct, 0) << "NOTICE: request for data in a different region (" << region << " != " << store->region.name << ")" << dendl; /* we now need to make sure that the operation actually requires copy source, that is * it's a copy operation diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index a8061e99bf1..e3312d9d373 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -259,6 +259,13 @@ int RGWRegion::store_info(bool exclusive) return ret; } +int RGWRegion::equals(const string& other_region) +{ + if (is_master && other_region.empty()) + return true; + + return (name == other_region); +} void RGWZoneParams::init_default(RGWRados *store) { @@ -429,8 +436,7 @@ int RGWRegionMap::update(RGWRegion& region) { Mutex::Locker l(lock); - if (region.is_master && !master_region.empty() && - master_region.compare(region.name) != 0) { + if (region.is_master && !region.equals(master_region)) { derr << "cannot update region map, master_region conflict" << dendl; return -EINVAL; } @@ -1938,8 +1944,7 @@ int RGWRados::set_bucket_location_by_rule(const string& location_rule, const std map::iterator piter = zone.placement_pools.find(location_rule); if (piter == zone.placement_pools.end()) { /* couldn't find, means we cannot really place data for this bucket in this zone */ - if ((region_name.empty() && region.is_master) || - region_name == region.name) { + if (region.equals(region_name)) { /* that's a configuration error, zone should have that rule, as we're within the requested * region */ return -EINVAL; @@ -2486,11 +2491,8 @@ int RGWRados::copy_obj(void *ctx, append_rand_alpha(cct, dest_obj.object, shadow_oid, 32); shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns); - remote_dest = ((dest_bucket_info.region.empty() && !region.is_master) || - (dest_bucket_info.region != region.name)); - - remote_src = ((src_bucket_info.region.empty() && !region.is_master) || - (src_bucket_info.region != region.name)); + remote_dest = !region.equals(dest_bucket_info.region); + remote_src = !region.equals(src_bucket_info.region); if (remote_src && remote_dest) { ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl; diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 99e66a91b1c..e083879b582 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -619,6 +619,7 @@ struct RGWRegion { int read_info(const string& region_name); int read_default(RGWDefaultRegionInfo& default_region); int set_as_default(); + int equals(const string& other_region); static string get_pool_name(CephContext *cct); From d44082e421c7caa9637a6e42957dee64c1ce4e9e Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 19 Jul 2013 11:19:05 -0700 Subject: [PATCH 21/22] cls_rgw: quiet down verbose log message Signed-off-by: Yehuda Sadeh --- src/cls/rgw/cls_rgw.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc index a972d6a1fdd..de2abe5665b 100644 --- a/src/cls/rgw/cls_rgw.cc +++ b/src/cls/rgw/cls_rgw.cc @@ -656,7 +656,7 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist } list::iterator remove_iter; - CLS_LOG(0, "rgw_bucket_complete_op(): remove_objs.size()=%d\n", (int)op.remove_objs.size()); + CLS_LOG(20, "rgw_bucket_complete_op(): remove_objs.size()=%d\n", (int)op.remove_objs.size()); for (remove_iter = op.remove_objs.begin(); remove_iter != op.remove_objs.end(); ++remove_iter) { string& remove_oid_name = *remove_iter; CLS_LOG(1, "rgw_bucket_complete_op(): removing entries, read_index_entry name=%s\n", remove_oid_name.c_str()); From da8584f15f220ebde1015a1dcf52954b26287bea Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 19 Jul 2013 13:06:53 -0700 Subject: [PATCH 22/22] rgw: remove extra unused param from RGWRados::get_attr() No user for the extra obj_version param. Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_bucket.cc | 2 +- src/rgw/rgw_op.cc | 4 ++-- src/rgw/rgw_rados.cc | 9 ++------- src/rgw/rgw_rados.h | 3 +-- 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc index aae7d31e21c..7b22f44790b 100644 --- a/src/rgw/rgw_bucket.cc +++ b/src/rgw/rgw_bucket.cc @@ -718,7 +718,7 @@ int RGWBucket::get_policy(RGWBucketAdminOpState& op_state, ostream& o) bufferlist bl; rgw_obj obj(bucket, object_name); - int ret = store->get_attr(NULL, obj, RGW_ATTR_ACL, bl, NULL); + int ret = store->get_attr(NULL, obj, RGW_ATTR_ACL, bl); if (ret < 0) return ret; diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 6a9d2319de7..17a3aaa8439 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -207,7 +207,7 @@ static int get_obj_policy_from_attr(CephContext *cct, RGWRados *store, void *ctx bufferlist bl; int ret = 0; - ret = store->get_attr(ctx, obj, RGW_ATTR_ACL, bl, NULL); + ret = store->get_attr(ctx, obj, RGW_ATTR_ACL, bl); if (ret >= 0) { ret = decode_policy(cct, bl, policy); if (ret < 0) @@ -2499,7 +2499,7 @@ int RGWHandler::read_cors_config(void) string no_object; rgw_obj no_obj(s->bucket, no_object); if (no_obj.bucket.name.size()) { - ret = store->get_attr(s->obj_ctx, no_obj, RGW_ATTR_CORS, bl, NULL); + ret = store->get_attr(s->obj_ctx, no_obj, RGW_ATTR_CORS, bl); if (ret >= 0) { bufferlist::iterator iter = bl.begin(); s->bucket_cors = new RGWCORSConfiguration(); diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index e3312d9d373..087fdcf8e09 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -3265,8 +3265,7 @@ int RGWRados::get_obj_state(RGWRadosCtx *rctx, rgw_obj& obj, RGWObjState **state * dest: bufferlist to store the result in * Returns: 0 on success, -ERR# otherwise. */ -int RGWRados::get_attr(void *ctx, rgw_obj& obj, const char *name, bufferlist& dest, - RGWObjVersionTracker *objv_tracker) +int RGWRados::get_attr(void *ctx, rgw_obj& obj, const char *name, bufferlist& dest) { rgw_bucket bucket; std::string oid, key; @@ -3301,10 +3300,6 @@ int RGWRados::get_attr(void *ctx, rgw_obj& obj, const char *name, bufferlist& de ObjectReadOperation op; - if (objv_tracker) { - objv_tracker->prepare_op_for_read(&op); - } - int rval; op.getxattr(name, &dest, &rval); @@ -3604,7 +3599,7 @@ int RGWRados::prepare_get_obj(void *ctx, rgw_obj& obj, } } if (if_match || if_nomatch) { - r = get_attr(rctx, obj, RGW_ATTR_ETAG, etag, NULL); + r = get_attr(rctx, obj, RGW_ATTR_ETAG, etag); if (r < 0) goto done_err; diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index e083879b582..6422c182adc 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -1157,8 +1157,7 @@ public: * dest: bufferlist to store the result in * Returns: 0 on success, -ERR# otherwise. */ - virtual int get_attr(void *ctx, rgw_obj& obj, const char *name, - bufferlist& dest, RGWObjVersionTracker *objv_tracker); + virtual int get_attr(void *ctx, rgw_obj& obj, const char *name, bufferlist& dest); /** * Set an attr on an object.