// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * * Copyright (C) 2004-2006 Sage Weil * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #include #include #include #include #include #include #include "osd/OSD.h" #include "os/ObjectStore.h" #include "mon/MonClient.h" #include "include/ceph_features.h" #include "common/config.h" #include "mon/MonMap.h" #include "msg/Messenger.h" #include "common/Timer.h" #include "common/TracepointProvider.h" #include "common/ceph_argparse.h" #include "global/global_init.h" #include "global/signal_handler.h" #include "include/color.h" #include "common/errno.h" #include "common/pick_address.h" #include "perfglue/heap_profiler.h" #include "include/assert.h" #include "common/Preforker.h" #define dout_context g_ceph_context #define dout_subsys ceph_subsys_osd namespace { TracepointProvider::Traits osd_tracepoint_traits("libosd_tp.so", "osd_tracing"); TracepointProvider::Traits os_tracepoint_traits("libos_tp.so", "osd_objectstore_tracing"); #ifdef WITH_OSD_INSTRUMENT_FUNCTIONS TracepointProvider::Traits cyg_profile_traits("libcyg_profile_tp.so", "osd_function_tracing"); #endif } // anonymous namespace OSD *osd = nullptr; void handle_osd_signal(int signum) { if (osd) osd->handle_signal(signum); } static void usage() { cout << "usage: ceph-osd -i [flags]\n" << " --osd-data PATH data directory\n" << " --osd-journal PATH\n" << " journal file or block device\n" << " --mkfs create a [new] data directory\n" << " --mkkey generate a new secret key. This is normally used in combination with --mkfs\n" << " --monmap specify the path to the monitor map. This is normally used in combination with --mkfs\n" << " --osd-uuid specify the OSD's fsid. This is normally used in combination with --mkfs\n" << " --keyring specify a path to the osd keyring. This is normally used in combination with --mkfs\n" << " --convert-filestore\n" << " run any pending upgrade operations\n" << " --flush-journal flush all data out of journal\n" << " --mkjournal initialize a new journal\n" << " --check-wants-journal\n" << " check whether a journal is desired\n" << " --check-allows-journal\n" << " check whether a journal is allowed\n" << " --check-needs-journal\n" << " check whether a journal is required\n" << " --debug_osd set debug level (e.g. 10)\n" << " --get-device-fsid PATH\n" << " get OSD fsid for the given block device\n" << std::endl; generic_server_usage(); } int main(int argc, const char **argv) { vector args; argv_to_vec(argc, argv, args); if (args.empty()) { cerr << argv[0] << ": -h or --help for usage" << std::endl; exit(1); } if (ceph_argparse_need_usage(args)) { usage(); exit(0); } map defaults = { // We want to enable leveldb's log, while allowing users to override this // option, therefore we will pass it as a default argument to global_init(). { "leveldb_log", "" } }; auto cct = global_init( &defaults, args, CEPH_ENTITY_TYPE_OSD, CODE_ENVIRONMENT_DAEMON, 0, "osd_data"); ceph_heap_profiler_init(); Preforker forker; // osd specific args bool mkfs = false; bool mkjournal = false; bool check_wants_journal = false; bool check_allows_journal = false; bool check_needs_journal = false; bool mkkey = false; bool flushjournal = false; bool dump_journal = false; bool convertfilestore = false; bool get_osd_fsid = false; bool get_cluster_fsid = false; bool get_journal_fsid = false; bool get_device_fsid = false; string device_path; std::string dump_pg_log; std::string val; for (std::vector::iterator i = args.begin(); i != args.end(); ) { if (ceph_argparse_double_dash(args, i)) { break; } else if (ceph_argparse_flag(args, i, "--mkfs", (char*)NULL)) { mkfs = true; } else if (ceph_argparse_flag(args, i, "--mkjournal", (char*)NULL)) { mkjournal = true; } else if (ceph_argparse_flag(args, i, "--check-allows-journal", (char*)NULL)) { check_allows_journal = true; } else if (ceph_argparse_flag(args, i, "--check-wants-journal", (char*)NULL)) { check_wants_journal = true; } else if (ceph_argparse_flag(args, i, "--check-needs-journal", (char*)NULL)) { check_needs_journal = true; } else if (ceph_argparse_flag(args, i, "--mkkey", (char*)NULL)) { mkkey = true; } else if (ceph_argparse_flag(args, i, "--flush-journal", (char*)NULL)) { flushjournal = true; } else if (ceph_argparse_flag(args, i, "--convert-filestore", (char*)NULL)) { convertfilestore = true; } else if (ceph_argparse_witharg(args, i, &val, "--dump-pg-log", (char*)NULL)) { dump_pg_log = val; } else if (ceph_argparse_flag(args, i, "--dump-journal", (char*)NULL)) { dump_journal = true; } else if (ceph_argparse_flag(args, i, "--get-cluster-fsid", (char*)NULL)) { get_cluster_fsid = true; } else if (ceph_argparse_flag(args, i, "--get-osd-fsid", "--get-osd-uuid", (char*)NULL)) { get_osd_fsid = true; } else if (ceph_argparse_flag(args, i, "--get-journal-fsid", "--get-journal-uuid", (char*)NULL)) { get_journal_fsid = true; } else if (ceph_argparse_witharg(args, i, &device_path, "--get-device-fsid", (char*)NULL)) { get_device_fsid = true; } else { ++i; } } if (!args.empty()) { cerr << "unrecognized arg " << args[0] << std::endl; exit(1); } if (global_init_prefork(g_ceph_context) >= 0) { std::string err; int r = forker.prefork(err); if (r < 0) { cerr << err << std::endl; return r; } if (forker.is_parent()) { if (forker.parent_wait(err) != 0) { return -ENXIO; } return 0; } setsid(); global_init_postfork_start(g_ceph_context); } common_init_finish(g_ceph_context); global_init_chdir(g_ceph_context); if (get_journal_fsid) { device_path = g_conf->get_val("osd_journal"); get_device_fsid = true; } if (get_device_fsid) { uuid_d uuid; int r = ObjectStore::probe_block_device_fsid(g_ceph_context, device_path, &uuid); if (r < 0) { cerr << "failed to get device fsid for " << device_path << ": " << cpp_strerror(r) << std::endl; forker.exit(1); } cout << uuid << std::endl; forker.exit(0); } if (!dump_pg_log.empty()) { common_init_finish(g_ceph_context); bufferlist bl; std::string error; if (bl.read_file(dump_pg_log.c_str(), &error) >= 0) { pg_log_entry_t e; auto p = bl.cbegin(); while (!p.end()) { uint64_t pos = p.get_off(); try { decode(e, p); } catch (const buffer::error &e) { derr << "failed to decode LogEntry at offset " << pos << dendl; forker.exit(1); } derr << pos << ":\t" << e << dendl; } } else { derr << "unable to open " << dump_pg_log << ": " << error << dendl; } forker.exit(0); } // whoami char *end; const char *id = g_conf->name.get_id().c_str(); int whoami = strtol(id, &end, 10); std::string data_path = g_conf->get_val("osd_data"); if (*end || end == id || whoami < 0) { derr << "must specify '-i #' where # is the osd number" << dendl; forker.exit(1); } if (data_path.empty()) { derr << "must specify '--osd-data=foo' data path" << dendl; forker.exit(1); } // the store std::string store_type = g_conf->get_val("osd_objectstore"); { char fn[PATH_MAX]; snprintf(fn, sizeof(fn), "%s/type", data_path.c_str()); int fd = ::open(fn, O_RDONLY); if (fd >= 0) { bufferlist bl; bl.read_fd(fd, 64); if (bl.length()) { store_type = string(bl.c_str(), bl.length() - 1); // drop \n dout(5) << "object store type is " << store_type << dendl; } ::close(fd); } } std::string journal_path = g_conf->get_val("osd_journal"); uint32_t flags = g_conf->get_val("osd_os_flags"); ObjectStore *store = ObjectStore::create(g_ceph_context, store_type, data_path, journal_path, flags); if (!store) { derr << "unable to create object store" << dendl; forker.exit(-ENODEV); } if (mkkey) { common_init_finish(g_ceph_context); KeyRing *keyring = KeyRing::create_empty(); if (!keyring) { derr << "Unable to get a Ceph keyring." << dendl; forker.exit(1); } EntityName ename(g_conf->name); EntityAuth eauth; std::string keyring_path = g_conf->get_val("keyring"); int ret = keyring->load(g_ceph_context, keyring_path); if (ret == 0 && keyring->get_auth(ename, eauth)) { derr << "already have key in keyring " << keyring_path << dendl; } else { eauth.key.create(g_ceph_context, CEPH_CRYPTO_AES); keyring->add(ename, eauth); bufferlist bl; keyring->encode_plaintext(bl); int r = bl.write_file(keyring_path.c_str(), 0600); if (r) derr << TEXT_RED << " ** ERROR: writing new keyring to " << keyring_path << ": " << cpp_strerror(r) << TEXT_NORMAL << dendl; else derr << "created new key in keyring " << keyring_path << dendl; } } if (mkfs) { common_init_finish(g_ceph_context); if (g_conf->get_val("fsid").is_zero()) { derr << "must specify cluster fsid" << dendl; forker.exit(-EINVAL); } int err = OSD::mkfs(g_ceph_context, store, data_path, g_conf->get_val("fsid"), whoami); if (err < 0) { derr << TEXT_RED << " ** ERROR: error creating empty object store in " << data_path << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl; forker.exit(1); } dout(0) << "created object store " << data_path << " for osd." << whoami << " fsid " << g_conf->get_val("fsid") << dendl; } if (mkfs || mkkey) { forker.exit(0); } if (mkjournal) { common_init_finish(g_ceph_context); int err = store->mkjournal(); if (err < 0) { derr << TEXT_RED << " ** ERROR: error creating fresh journal " << journal_path << " for object store " << data_path << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl; forker.exit(1); } derr << "created new journal " << journal_path << " for object store " << data_path << dendl; forker.exit(0); } if (check_wants_journal) { if (store->wants_journal()) { cout << "wants journal: yes" << std::endl; forker.exit(0); } else { cout << "wants journal: no" << std::endl; forker.exit(1); } } if (check_allows_journal) { if (store->allows_journal()) { cout << "allows journal: yes" << std::endl; forker.exit(0); } else { cout << "allows journal: no" << std::endl; forker.exit(1); } } if (check_needs_journal) { if (store->needs_journal()) { cout << "needs journal: yes" << std::endl; forker.exit(0); } else { cout << "needs journal: no" << std::endl; forker.exit(1); } } if (flushjournal) { common_init_finish(g_ceph_context); int err = store->mount(); if (err < 0) { derr << TEXT_RED << " ** ERROR: error flushing journal " << journal_path << " for object store " << data_path << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl; goto flushjournal_out; } store->umount(); derr << "flushed journal " << journal_path << " for object store " << data_path << dendl; flushjournal_out: delete store; forker.exit(err < 0 ? 1 : 0); } if (dump_journal) { common_init_finish(g_ceph_context); int err = store->dump_journal(cout); if (err < 0) { derr << TEXT_RED << " ** ERROR: error dumping journal " << journal_path << " for object store " << data_path << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl; forker.exit(1); } derr << "dumped journal " << journal_path << " for object store " << data_path << dendl; forker.exit(0); } if (convertfilestore) { int err = store->mount(); if (err < 0) { derr << TEXT_RED << " ** ERROR: error mounting store " << data_path << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl; forker.exit(1); } err = store->upgrade(); store->umount(); if (err < 0) { derr << TEXT_RED << " ** ERROR: error converting store " << data_path << ": " << cpp_strerror(-err) << TEXT_NORMAL << dendl; forker.exit(1); } forker.exit(0); } string magic; uuid_d cluster_fsid, osd_fsid; int w; int r = OSD::peek_meta(store, magic, cluster_fsid, osd_fsid, w); if (r < 0) { derr << TEXT_RED << " ** ERROR: unable to open OSD superblock on " << data_path << ": " << cpp_strerror(-r) << TEXT_NORMAL << dendl; if (r == -ENOTSUP) { derr << TEXT_RED << " ** please verify that underlying storage " << "supports xattrs" << TEXT_NORMAL << dendl; } forker.exit(1); } if (w != whoami) { derr << "OSD id " << w << " != my id " << whoami << dendl; forker.exit(1); } if (strcmp(magic.c_str(), CEPH_OSD_ONDISK_MAGIC)) { derr << "OSD magic " << magic << " != my " << CEPH_OSD_ONDISK_MAGIC << dendl; forker.exit(1); } if (get_cluster_fsid) { cout << cluster_fsid << std::endl; forker.exit(0); } if (get_osd_fsid) { cout << osd_fsid << std::endl; forker.exit(0); } pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC |CEPH_PICK_ADDRESS_CLUSTER); entity_addr_t paddr = g_conf->get_val("public_addr"); entity_addr_t caddr = g_conf->get_val("cluster_addr"); if (paddr.is_blank_ip() && !caddr.is_blank_ip()) { derr << TEXT_YELLOW << " ** WARNING: specified cluster addr but not public addr; we recommend **\n" << " ** you specify neither or both. **" << TEXT_NORMAL << dendl; } std::string msg_type = g_conf->get_val("ms_type"); std::string public_msg_type = g_conf->get_val("ms_public_type"); std::string cluster_msg_type = g_conf->get_val("ms_cluster_type"); public_msg_type = public_msg_type.empty() ? msg_type : public_msg_type; cluster_msg_type = cluster_msg_type.empty() ? msg_type : cluster_msg_type; Messenger *ms_public = Messenger::create(g_ceph_context, public_msg_type, entity_name_t::OSD(whoami), "client", getpid(), Messenger::HAS_HEAVY_TRAFFIC | Messenger::HAS_MANY_CONNECTIONS); Messenger *ms_cluster = Messenger::create(g_ceph_context, cluster_msg_type, entity_name_t::OSD(whoami), "cluster", getpid(), Messenger::HAS_HEAVY_TRAFFIC | Messenger::HAS_MANY_CONNECTIONS); Messenger *ms_hb_back_client = Messenger::create(g_ceph_context, cluster_msg_type, entity_name_t::OSD(whoami), "hb_back_client", getpid(), Messenger::HEARTBEAT); Messenger *ms_hb_front_client = Messenger::create(g_ceph_context, public_msg_type, entity_name_t::OSD(whoami), "hb_front_client", getpid(), Messenger::HEARTBEAT); Messenger *ms_hb_back_server = Messenger::create(g_ceph_context, cluster_msg_type, entity_name_t::OSD(whoami), "hb_back_server", getpid(), Messenger::HEARTBEAT); Messenger *ms_hb_front_server = Messenger::create(g_ceph_context, public_msg_type, entity_name_t::OSD(whoami), "hb_front_server", getpid(), Messenger::HEARTBEAT); Messenger *ms_objecter = Messenger::create(g_ceph_context, public_msg_type, entity_name_t::OSD(whoami), "ms_objecter", getpid(), 0); if (!ms_public || !ms_cluster || !ms_hb_front_client || !ms_hb_back_client || !ms_hb_back_server || !ms_hb_front_server || !ms_objecter) forker.exit(1); ms_cluster->set_cluster_protocol(CEPH_OSD_PROTOCOL); ms_hb_front_client->set_cluster_protocol(CEPH_OSD_PROTOCOL); ms_hb_back_client->set_cluster_protocol(CEPH_OSD_PROTOCOL); ms_hb_back_server->set_cluster_protocol(CEPH_OSD_PROTOCOL); ms_hb_front_server->set_cluster_protocol(CEPH_OSD_PROTOCOL); cout << "starting osd." << whoami << " at " << ms_public->get_myaddr() << " osd_data " << data_path << " " << ((journal_path.empty()) ? "(no journal)" : journal_path) << std::endl; uint64_t message_size = g_conf->get_val("osd_client_message_size_cap"); boost::scoped_ptr client_byte_throttler( new Throttle(g_ceph_context, "osd_client_bytes", message_size)); // All feature bits 0 - 34 should be present from dumpling v0.67 forward uint64_t osd_required = CEPH_FEATURE_UID | CEPH_FEATURE_PGID64 | CEPH_FEATURE_OSDENC; ms_public->set_default_policy(Messenger::Policy::stateless_server(0)); ms_public->set_policy_throttlers(entity_name_t::TYPE_CLIENT, client_byte_throttler.get(), nullptr); ms_public->set_policy(entity_name_t::TYPE_MON, Messenger::Policy::lossy_client(osd_required)); ms_public->set_policy(entity_name_t::TYPE_MGR, Messenger::Policy::lossy_client(osd_required)); //try to poison pill any OSD connections on the wrong address ms_public->set_policy(entity_name_t::TYPE_OSD, Messenger::Policy::stateless_server(0)); ms_cluster->set_default_policy(Messenger::Policy::stateless_server(0)); ms_cluster->set_policy(entity_name_t::TYPE_MON, Messenger::Policy::lossy_client(0)); ms_cluster->set_policy(entity_name_t::TYPE_OSD, Messenger::Policy::lossless_peer(osd_required)); ms_cluster->set_policy(entity_name_t::TYPE_CLIENT, Messenger::Policy::stateless_server(0)); ms_hb_front_client->set_policy(entity_name_t::TYPE_OSD, Messenger::Policy::lossy_client(0)); ms_hb_back_client->set_policy(entity_name_t::TYPE_OSD, Messenger::Policy::lossy_client(0)); ms_hb_back_server->set_policy(entity_name_t::TYPE_OSD, Messenger::Policy::stateless_server(0)); ms_hb_front_server->set_policy(entity_name_t::TYPE_OSD, Messenger::Policy::stateless_server(0)); ms_objecter->set_default_policy(Messenger::Policy::lossy_client(CEPH_FEATURE_OSDREPLYMUX)); if (ms_public->bind(paddr) < 0) forker.exit(1); if (ms_cluster->bind(caddr) < 0) forker.exit(1); bool is_delay = g_conf->get_val("osd_heartbeat_use_min_delay_socket"); if (is_delay) { ms_hb_front_client->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY); ms_hb_back_client->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY); ms_hb_back_server->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY); ms_hb_front_server->set_socket_priority(SOCKET_PRIORITY_MIN_DELAY); } // hb back should bind to same ip as cluster_addr (if specified) entity_addr_t haddr = g_conf->get_val("osd_heartbeat_addr"); if (haddr.is_blank_ip()) { haddr = caddr; if (haddr.is_ip()) haddr.set_port(0); } if (ms_hb_back_server->bind(haddr) < 0) forker.exit(1); if (ms_hb_back_client->client_bind(haddr) < 0) forker.exit(1); // hb front should bind to same ip as public_addr entity_addr_t hb_front_addr = paddr; if (hb_front_addr.is_ip()) hb_front_addr.set_port(0); if (ms_hb_front_server->bind(hb_front_addr) < 0) forker.exit(1); if (ms_hb_front_client->client_bind(hb_front_addr) < 0) forker.exit(1); // install signal handlers init_async_signal_handler(); register_async_signal_handler(SIGHUP, sighup_handler); TracepointProvider::initialize(g_ceph_context); TracepointProvider::initialize(g_ceph_context); #ifdef WITH_OSD_INSTRUMENT_FUNCTIONS TracepointProvider::initialize(g_ceph_context); #endif srand(time(NULL) + getpid()); MonClient mc(g_ceph_context); if (mc.build_initial_monmap() < 0) return -1; global_init_chdir(g_ceph_context); if (global_init_preload_erasure_code(g_ceph_context) < 0) { forker.exit(1); } osd = new OSD(g_ceph_context, store, whoami, ms_cluster, ms_public, ms_hb_front_client, ms_hb_back_client, ms_hb_front_server, ms_hb_back_server, ms_objecter, &mc, data_path, journal_path); int err = osd->pre_init(); if (err < 0) { derr << TEXT_RED << " ** ERROR: osd pre_init failed: " << cpp_strerror(-err) << TEXT_NORMAL << dendl; forker.exit(1); } ms_public->start(); ms_hb_front_client->start(); ms_hb_back_client->start(); ms_hb_front_server->start(); ms_hb_back_server->start(); ms_cluster->start(); ms_objecter->start(); // start osd err = osd->init(); if (err < 0) { derr << TEXT_RED << " ** ERROR: osd init failed: " << cpp_strerror(-err) << TEXT_NORMAL << dendl; forker.exit(1); } // -- daemonize -- if (g_conf->daemonize) { global_init_postfork_finish(g_ceph_context); forker.daemonize(); } register_async_signal_handler_oneshot(SIGINT, handle_osd_signal); register_async_signal_handler_oneshot(SIGTERM, handle_osd_signal); osd->final_init(); if (g_conf->get_val("inject_early_sigterm")) kill(getpid(), SIGTERM); ms_public->wait(); ms_hb_front_client->wait(); ms_hb_back_client->wait(); ms_hb_front_server->wait(); ms_hb_back_server->wait(); ms_cluster->wait(); ms_objecter->wait(); unregister_async_signal_handler(SIGHUP, sighup_handler); unregister_async_signal_handler(SIGINT, handle_osd_signal); unregister_async_signal_handler(SIGTERM, handle_osd_signal); shutdown_async_signal_handler(); // done delete osd; delete ms_public; delete ms_hb_front_client; delete ms_hb_back_client; delete ms_hb_front_server; delete ms_hb_back_server; delete ms_cluster; delete ms_objecter; client_byte_throttler.reset(); // cd on exit, so that gmon.out (if any) goes into a separate directory for each node. char s[20]; snprintf(s, sizeof(s), "gmon/%d", getpid()); if ((mkdir(s, 0755) == 0) && (chdir(s) == 0)) { dout(0) << "ceph-osd: gmon.out should be in " << s << dendl; } return 0; }