From db7d12103a42263e08a660ec33c74eeb70b6a20f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 13 Jun 2013 15:13:18 -0700 Subject: [PATCH 01/12] osdc/Objecter: add perfcounters for commands This matches the other counters we maintain for other kinds of ops. Signed-off-by: Sage Weil --- src/osdc/Objecter.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 0afc56805ea..ad7481f42ca 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -111,6 +111,10 @@ enum { l_osdc_statfs_send, l_osdc_statfs_resend, + l_osdc_command_active, + l_osdc_command_send, + l_osdc_command_resend, + l_osdc_map_epoch, l_osdc_map_full, l_osdc_map_inc, @@ -190,6 +194,10 @@ void Objecter::init_unlocked() pcb.add_u64_counter(l_osdc_statfs_send, "statfs_send"); pcb.add_u64_counter(l_osdc_statfs_resend, "statfs_resend"); + pcb.add_u64(l_osdc_command_active, "command_active"); + pcb.add_u64_counter(l_osdc_command_send, "command_send"); + pcb.add_u64_counter(l_osdc_command_resend, "command_resend"); + pcb.add_u64(l_osdc_map_epoch, "map_epoch"); pcb.add_u64_counter(l_osdc_map_full, "map_full"); pcb.add_u64_counter(l_osdc_map_inc, "map_inc"); @@ -2353,6 +2361,8 @@ int Objecter::_submit_command(CommandOp *c, tid_t *ptid) if (c->map_check_error) _send_command_map_check(c); *ptid = tid; + + logger->set(l_osdc_command_active, command_ops.size()); return 0; } @@ -2409,6 +2419,7 @@ void Objecter::_send_command(CommandOp *c) m->set_data(c->inbl); m->set_tid(c->tid); messenger->send_message(m, c->session->con); + logger->inc(l_osdc_command_send); } void Objecter::_finish_command(CommandOp *c, int r, string rs) @@ -2421,4 +2432,6 @@ void Objecter::_finish_command(CommandOp *c, int r, string rs) c->onfinish->complete(r); command_ops.erase(c->tid); c->put(); + + logger->set(l_osdc_command_active, command_ops.size()); } From 68a91995ba4ae7a0dccbe73b9c007e71d86b0d9d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 13 Jun 2013 15:13:47 
-0700 Subject: [PATCH 02/12] osdc/Objecter: kick command ops on osd con resets Resend osd/pg commands on the OSDSession, just as we do with other request types. Signed-off-by: Sage Weil --- src/osdc/Objecter.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index ad7481f42ca..685d8c5fe5e 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1013,6 +1013,17 @@ void Objecter::kick_requests(OSDSession *session) send_linger(lresend.begin()->second); lresend.erase(lresend.begin()); } + + // resend commands + map cresend; // resend in order + for (xlist::iterator k = session->command_ops.begin(); !k.end(); ++k) { + logger->inc(l_osdc_command_resend); + cresend[(*k)->tid] = *k; + } + while (!cresend.empty()) { + _send_command(cresend.begin()->second); + cresend.erase(cresend.begin()); + } } void Objecter::schedule_tick() From a6876ad7d949c9d9757e632f10d0bb3bcffddd0a Mon Sep 17 00:00:00 2001 From: Dan Mick Date: Thu, 13 Jun 2013 15:30:38 -0700 Subject: [PATCH 03/12] ceph.in: argparsing cleanup: suppress --completion, add help Options -v, --verbose, --concise didn't have helpstrings Option --completion doesn't quite work yet, and should be hidden anyway Signed-off-by: Dan Mick --- src/ceph.in | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ceph.in b/src/ceph.in index 7d6c62a7303..dcb120931ca 100755 --- a/src/ceph.in +++ b/src/ceph.in @@ -839,7 +839,8 @@ def parse_cmdargs(args=None, target=''): # format our own help parser = AP(description='Frontend for ceph CLI', add_help=False) - parser.add_argument('--completion', action='store_true') + parser.add_argument('--completion', action='store_true', + help=argparse.SUPPRESS) parser.add_argument('-h', '--help', help='request mon help', action='store_true') @@ -878,9 +879,10 @@ def parse_cmdargs(args=None, target=''): parser.add_argument('--watch-error', action='store_true', help='watch error events') - parser.add_argument('-v', 
action="store_true") - parser.add_argument('--verbose', action="store_true") - parser.add_argument('--concise', dest='verbose', action="store_false") + parser.add_argument('-v', action="store_true", help="display version") + parser.add_argument('--verbose', action="store_true", help="make verbose") + parser.add_argument('--concise', dest='verbose', action="store_false", + help="make less verbose") parser.add_argument('-f', '--format', choices=['json', 'json-pretty', 'xml', 'xml-pretty', 'plain'], dest='output_format') From e4f9dce7a5d1055115a065c3b2677cc65fcef6de Mon Sep 17 00:00:00 2001 From: Dan Mick Date: Thu, 13 Jun 2013 15:48:32 -0700 Subject: [PATCH 04/12] ceph.in: refuse 'ceph <type> tell' commands; suggest 'ceph tell <type>.<id>' Signed-off-by: Dan Mick Reviewed-by: Sage Weil --- src/ceph.in | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ceph.in b/src/ceph.in index dcb120931ca..a6504006870 100755 --- a/src/ceph.in +++ b/src/ceph.in @@ -1,4 +1,4 @@ -# +#& # Processed in Makefile to add python #! line and version variable # # @@ -1454,6 +1454,14 @@ def main(): if '--' in childargs: childargs.remove('--') + # special deprecation warning for 'ceph tell' + # someday 'mds' will be here too + if len(childargs) >= 2 and \ + childargs[0] in ['mon', 'osd'] and \ + childargs[1] == 'tell': + print >> sys.stderr, '"{0} tell" is deprecated; try "tell {0}." instead (id can be "*") '.format(childargs[0]) + return 1 + try: cluster_handle.connect() except KeyboardInterrupt: From 6e73d999afa4189789c4b93e47ecda3c65494c98 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 13 Jun 2013 15:57:57 -0700 Subject: [PATCH 05/12] osdc/Objecter: ping osds for which we have pending commands As with ops and linger_ops, this ensures we detect connection resets.
Signed-off-by: Sage Weil --- src/osdc/Objecter.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 685d8c5fe5e..3674d361f14 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1071,6 +1071,17 @@ void Objecter::tick() ldout(cct, 10) << " lingering tid " << p->first << " does not have session" << dendl; } } + for (map::iterator p = command_ops.begin(); + p != command_ops.end(); + ++p) { + CommandOp *op = p->second; + if (op->session) { + ldout(cct, 10) << " pinging osd that serves command tid " << p->first << " (osd." << op->session->osd << ")" << dendl; + toping.insert(op->session); + } else { + ldout(cct, 10) << " command tid " << p->first << " does not have session" << dendl; + } + } logger->set(l_osdc_op_laggy, laggy_ops); logger->set(l_osdc_osd_laggy, toping.size()); From 2bda9db1c24530cbaaa161b7ff0a80efa913aa78 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 13 Jun 2013 16:01:31 -0700 Subject: [PATCH 06/12] osdc/Objecter: dump command ops Dump command_ops along with everything else. 
Signed-off-by: Sage Weil --- src/osdc/Objecter.cc | 24 ++++++++++++++++++++++++ src/osdc/Objecter.h | 1 + 2 files changed, 25 insertions(+) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 3674d361f14..e6b07a248ed 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -2189,6 +2189,7 @@ void Objecter::dump_requests(Formatter& fmt) const dump_pool_ops(fmt); dump_pool_stat_ops(fmt); dump_statfs_ops(fmt); + dump_command_ops(fmt); fmt.close_section(); // requests object } @@ -2245,6 +2246,29 @@ void Objecter::dump_linger_ops(Formatter& fmt) const fmt.close_section(); // linger_ops array } +void Objecter::dump_command_ops(Formatter& fmt) const +{ + fmt.open_array_section("command_ops"); + for (map::const_iterator p = command_ops.begin(); + p != command_ops.end(); + ++p) { + CommandOp *op = p->second; + fmt.open_object_section("command_op"); + fmt.dump_unsigned("command_id", op->tid); + fmt.dump_int("osd", op->session ? op->session->osd : -1); + fmt.open_array_section("command"); + for (vector::const_iterator q = op->cmd.begin(); q != op->cmd.end(); ++q) + fmt.dump_string("word", *q); + fmt.close_section(); + if (op->target_osd >= 0) + fmt.dump_int("target_osd", op->target_osd); + else + fmt.dump_stream("target_pg") << op->target_pg; + fmt.close_section(); // command_op object + } + fmt.close_section(); // command_ops array +} + void Objecter::dump_pool_ops(Formatter& fmt) const { fmt.open_array_section("pool_ops"); diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index b64c8f77ed0..5f28d6d7eac 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1226,6 +1226,7 @@ private: void dump_requests(Formatter& fmt) const; void dump_ops(Formatter& fmt) const; void dump_linger_ops(Formatter& fmt) const; + void dump_command_ops(Formatter& fmt) const; void dump_pool_ops(Formatter& fmt) const; void dump_pool_stat_ops(Formatter& fmt) const; void dump_statfs_ops(Formatter& fmt) const; From 93505bb3c794312814f7e4d099296a53f3bb9db0 Mon Sep 17 
00:00:00 2001 From: Sage Weil Date: Thu, 13 Jun 2013 16:39:30 -0700 Subject: [PATCH 07/12] librados: wait for osdmap for commands that need it In commit 7e1cf87b5158c870e2a118ed6d316be8cb9818ce we stopped waiting for the osdmap on start because the Objecter will normally wait, but for some commands we assume the osdmap is recent(ish). Signed-off-by: Sage Weil Reviewed-by: Josh Durgin --- src/librados/RadosClient.cc | 19 ++++++++++++++++++- src/librados/RadosClient.h | 2 ++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc index f5dccaffc09..f68125fb8c0 100644 --- a/src/librados/RadosClient.cc +++ b/src/librados/RadosClient.cc @@ -85,6 +85,7 @@ librados::RadosClient::RadosClient(CephContext *cct_) int64_t librados::RadosClient::lookup_pool(const char *name) { Mutex::Locker l(lock); + wait_for_osdmap(); int64_t ret = osdmap.lookup_pg_pool_name(name); if (ret < 0) return -ENOENT; @@ -100,6 +101,7 @@ const char *librados::RadosClient::get_pool_name(int64_t pool_id) int librados::RadosClient::pool_get_auid(uint64_t pool_id, unsigned long long *auid) { Mutex::Locker l(lock); + wait_for_osdmap(); const pg_pool_t *pg = osdmap.get_pg_pool(pool_id); if (!pg) return -ENOENT; @@ -110,6 +112,7 @@ int librados::RadosClient::pool_get_auid(uint64_t pool_id, unsigned long long *a int librados::RadosClient::pool_get_name(uint64_t pool_id, std::string *s) { Mutex::Locker l(lock); + wait_for_osdmap(); const char *str = osdmap.get_pool_name(pool_id); if (!str) return -ENOENT; @@ -123,7 +126,7 @@ int librados::RadosClient::get_fsid(std::string *s) return -EINVAL; Mutex::Locker l(lock); ostringstream oss; - oss << osdmap.get_fsid(); + oss << monclient.get_fsid(); *s = oss.str(); return 0; } @@ -354,9 +357,21 @@ bool librados::RadosClient::_dispatch(Message *m) return true; } +void librados::RadosClient::wait_for_osdmap() +{ + assert(lock.is_locked()); + if (osdmap.get_epoch() == 0) { + ldout(cct, 10) << __func__ << " 
waiting" << dendl; + while (osdmap.get_epoch() == 0) + cond.Wait(lock); + ldout(cct, 10) << __func__ << " done waiting" << dendl; + } +} + int librados::RadosClient::pool_list(std::list& v) { Mutex::Locker l(lock); + wait_for_osdmap(); for (map::const_iterator p = osdmap.get_pools().begin(); p != osdmap.get_pools().end(); ++p) @@ -453,6 +468,7 @@ int librados::RadosClient::pool_create_async(string& name, PoolAsyncCompletionIm int librados::RadosClient::pool_delete(const char *name) { lock.Lock(); + wait_for_osdmap(); int tmp_pool_id = osdmap.lookup_pg_pool_name(name); if (tmp_pool_id < 0) { lock.Unlock(); @@ -481,6 +497,7 @@ int librados::RadosClient::pool_delete(const char *name) int librados::RadosClient::pool_delete_async(const char *name, PoolAsyncCompletionImpl *c) { Mutex::Locker l(lock); + wait_for_osdmap(); int tmp_pool_id = osdmap.lookup_pg_pool_name(name); if (tmp_pool_id < 0) return -ENOENT; diff --git a/src/librados/RadosClient.h b/src/librados/RadosClient.h index 337beff5750..4f616d45331 100644 --- a/src/librados/RadosClient.h +++ b/src/librados/RadosClient.h @@ -70,6 +70,8 @@ private: void *log_cb_arg; string log_watch; + void wait_for_osdmap(); + public: Finisher finisher; From 99bd5c8f7b37d8a77635d982683aa7a18b10da62 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 13 Jun 2013 17:38:02 -0700 Subject: [PATCH 08/12] librados: add missing #include librados/librados.cc: In function 'int rados_mon_command_target(void*, const char*, const char**, size_t, const char*, size_t, char**, size_t*, char**, size_t*)': error: librados/librados.cc:1877: 'LONG_MAX' was not declared in this scope error: librados/librados.cc:1877: 'LONG_MIN' was not declared in this scope Signed-off-by: Sage Weil --- src/librados/librados.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/librados/librados.cc b/src/librados/librados.cc index 43c2584c390..0d4277d8fab 100644 --- a/src/librados/librados.cc +++ b/src/librados/librados.cc @@ -12,6 +12,8 @@ * */ +#include + 
#include "common/config.h" #include "common/errno.h" #include "common/ceph_argparse.h" From 10ba60cd088c15d4b4ea0b86ad681aa57f1051b6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 13 Jun 2013 11:03:37 -0700 Subject: [PATCH 09/12] ceph-disk: add 'zap' command Signed-off-by: Sage Weil --- src/ceph-disk | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/ceph-disk b/src/ceph-disk index 0389b5ce55b..6ee15da736b 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -1869,6 +1869,9 @@ def main_suppress(args): def main_unsuppress(args): unset_suppress(args.path) +def main_zap(args): + for dev in args.dev: + zap(dev) ########################### @@ -2028,6 +2031,17 @@ def parse_args(): func=main_unsuppress, ) + zap_parser = subparsers.add_parser('zap', help='Zap/erase/destroy a device\'s partition table (and contents)') + zap_parser.add_argument( + 'dev', + metavar='DEV', + nargs='*', + help='path to block device', + ) + zap_parser.set_defaults( + func=main_zap, + ) + args = parser.parse_args() return args From 8b3b59e01432090f7ae774e971862316203ade68 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 11 Jun 2013 18:35:01 -0700 Subject: [PATCH 10/12] ceph-disk: call partprobe outside of the prepare lock; drop udevadm settle After we change the final partition type, sgdisk may or may not trigger a udev event, depending on how well udev is behaving (it varies between distros, it seems). The old code would often settle and wait for udev to activate the device, and then partprobe would uselessly fail because it was already mounted. Call partprobe only at the very end, after prepare is done. This ensures that if partprobe calls udevadm settle (which it sometimes does) we do not get stuck. Drop the udevadm settle. I'm not sure what this accomplishes; take it out, at least until we determine we need it.
Signed-off-by: Sage Weil --- src/ceph-disk | 44 +++++++++++++------------------------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index 6ee15da736b..d5642c58a92 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -761,14 +761,6 @@ def prepare_journal_dev( # wait for udev event queue to clear 'udevadm', 'settle', - '--timeout=10', - ], - ) - subprocess.check_call( - args=[ - # also make sure the kernel refreshes the new table - 'partprobe', - journal, ], ) @@ -963,14 +955,6 @@ def prepare_dev( # wait for udev event queue to clear 'udevadm', 'settle', - '--timeout=10', - ], - ) - subprocess.check_call( - args=[ - # also make sure the kernel refreshes the new table - 'partprobe', - data, ], ) except subprocess.CalledProcessError as e: @@ -1037,21 +1021,6 @@ def prepare_dev( data, ], ) - subprocess.call( - args=[ - # wait for udev event queue to clear - 'udevadm', - 'settle', - '--timeout=10', - ], - ) - subprocess.check_call( - args=[ - # also make sure the kernel refreshes the new table - 'partprobe', - data, - ], - ) except subprocess.CalledProcessError as e: raise Error(e) @@ -1192,6 +1161,19 @@ def main_prepare(args): raise Error('not a dir or block device', args.data) prepare_lock.release() + if stat.S_ISBLK(dmode): + # try to make sure the kernel refreshes the table. note + # that if this gets ebusy, we are probably racing with + # udev because it already updated it.. ignore failure here. + LOG.debug('Calling partprobe on prepared device %s', args.data) + subprocess.call( + args=[ + 'partprobe', + args.data, + ], + ) + + except Error as e: if journal_dm_keypath: os.unlink(journal_dm_keypath) From a2a78e8d16db0a71b13fc15457abc5fe0091c84c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 13 Jun 2013 15:54:58 -0700 Subject: [PATCH 11/12] ceph-disk: implement 'activate-journal' Activate an osd via its journal device. 
udev populates its symlinks and triggers events in an order that is not related to whether the device is an osd data partition or a journal. That means that triggering 'ceph-disk activate' can happen before the journal (or journal symlink) is present and then fail. Similarly, it may be that they are on different disks that are hotplugged with the journal second. This can be wired up to the journal partition type to ensure that osds are started when the journal appears second. Include the udev rules to trigger this. Signed-off-by: Sage Weil --- src/ceph-disk | 82 ++++++++++++++++++++++++++++++++++++++++++ udev/95-ceph-osd.rules | 6 ++++ 2 files changed, 88 insertions(+) diff --git a/src/ceph-disk b/src/ceph-disk index d5642c58a92..13d9f8203ce 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -1606,6 +1606,64 @@ def main_activate(args): activate_lock.release() +########################### + +def get_journal_osd_uuid(path): + if not os.path.exists(path): + raise Error('%s does not exist', path) + + mode = os.stat(path).st_mode + if not stat.S_ISBLK(mode): + raise Error('%s is not a block device', path) + + try: + out = _check_output( + args=[ + 'ceph-osd', + '-i', '0', # this is ignored + '--get-journal-uuid', + '--osd-journal', + path, + ], + close_fds=True, + ) + except subprocess.CalledProcessError as e: + raise Error( + 'failed to get osd uuid/fsid from journal', + e, + ) + value = str(out).split('\n', 1)[0] + LOG.debug('Journal %s has OSD UUID %s', path, value) + return value + +def main_activate_journal(args): + if not os.path.exists(args.dev): + raise Error('%s does not exist', args.dev) + + cluster = None + osd_id = None + osd_uuid = None + activate_lock.acquire() + try: + osd_uuid = get_journal_osd_uuid(args.dev) + path = os.path.join('/dev/disk/by-partuuid/', osd_uuid.lower()) + + (cluster, osd_id) = mount_activate( + dev=path, + activate_key_template=args.activate_key_template, + init=args.mark_init, + ) + + start_daemon( + cluster=cluster, + osd_id=osd_id, 
+ ) + + activate_lock.release() + + except: + activate_lock.release() + raise ########################### @@ -1986,6 +2044,30 @@ def parse_args(): func=main_activate, ) + activate_journal_parser = subparsers.add_parser('activate-journal', help='Activate an OSD via its journal device') + activate_journal_parser.add_argument( + 'dev', + metavar='DEV', + help='path to journal block device', + ) + activate_journal_parser.add_argument( + '--activate-key', + metavar='PATH', + help='bootstrap-osd keyring path template (%(default)s)', + dest='activate_key_template', + ) + activate_journal_parser.add_argument( + '--mark-init', + metavar='INITSYSTEM', + help='init system to manage this dir', + default='auto', + choices=INIT_SYSTEMS, + ) + activate_journal_parser.set_defaults( + activate_key_template='/var/lib/ceph/bootstrap-osd/{cluster}.keyring', + func=main_activate_journal, + ) + list_parser = subparsers.add_parser('list', help='List disks, partitions, and Ceph OSDs') list_parser.set_defaults( func=main_list, diff --git a/udev/95-ceph-osd.rules b/udev/95-ceph-osd.rules index 77e6ef37c5d..9798e648483 100644 --- a/udev/95-ceph-osd.rules +++ b/udev/95-ceph-osd.rules @@ -4,6 +4,12 @@ ACTION=="add", SUBSYSTEM=="block", \ ENV{ID_PART_ENTRY_TYPE}=="4fbd7e29-9d25-41b8-afd0-062c0ceff05d", \ RUN+="/usr/sbin/ceph-disk-activate --mount /dev/$name" +# activate ceph-tagged partitions +ACTION=="add", SUBSYSTEM=="block", \ + ENV{DEVTYPE}=="partition", \ + ENV{ID_PART_ENTRY_TYPE}=="45b0969e-9b03-4f30-b4c6-b4b80ceff106", \ + RUN+="/usr/sbin/ceph-disk activate-journal /dev/$name" + # Map journal if using dm-crypt ACTION=="add" SUBSYSTEM=="block", \ ENV{DEVTYPE}=="partition", \ From 02599c43b4290f5474a6c4295d6caf6774ffceb1 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 13 Jun 2013 18:13:34 -0700 Subject: [PATCH 12/12] ceph-fuse: fix uninitialized variable There is a delete call in the out_mc_start_failed path. 
Signed-off-by: Sage Weil --- src/ceph_fuse.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc index 77c70f29df0..54616f60f99 100644 --- a/src/ceph_fuse.cc +++ b/src/ceph_fuse.cc @@ -108,7 +108,7 @@ int main(int argc, const char **argv, const char *envp[]) { g_ceph_context->_log->start(); // get monmap - Messenger *messenger; + Messenger *messenger = NULL; Client *client; CephFuse *cfuse;