// NOTE(review): this source appears to have been flattened onto four physical
// lines, and text that originally sat between angle brackets looks to be
// missing (the system #include targets, the template arguments of
// pair/map/vector/set, and parts of several for-loops). Because the next
// physical line starts with a line-comment marker, a compiler would treat all
// of it as a comment. Restore the file from version control before building;
// the notes below only describe what the surviving text shows.
//
// Visible contents of the next physical line, in order:
//  - License header (GNU LGPL 2.1), project includes, and the NUMMDS / NUMOSD /
//    NUMCLIENT macros, which expand to g_conf.num_mds / num_osd / num_client.
//  - C_Test: a Context whose finish(r) prints the value of r to cout.
//  - mpi_bootstrap_new(argc, argv, monmap), continued on the following line:
//      1. MPI_Init, then reads the world size and this process's rank.
//      2. MPI_Barrier followed by g_clock.tare().
//      3. rank.start_rank(); ranks below g_conf.num_mon record rank.my_addr
//         and MSG_ADDR_MON(mpi_rank) in their own moninst slot.
//      4. MPI_Gather of sizeof(entity_inst_t) chars per rank to rank 0, which
//         then copies the gathered entries (loop bound and destination are
//         partly missing from the text).
//      5. Rank 0 encodes monmap into bl and writes it to ".ceph_monmap"; the
//         other ranks size a receive buffer at g_conf.num_mon * 1000 bytes.
//    NOTE(review): moninst is an array sized by the runtime value mpi_world
//    (a variable-length array), which is a compiler extension in C++.
//    NOTE(review): on rank 0 the MPI_Gather send buffer (its own moninst slot)
//    lies inside the receive buffer (moninst); the MPI standard disallows
//    overlapping send/receive buffers unless MPI_IN_PLACE is used - confirm.
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- /* * Ceph - scalable distributed file system * * Copyright (C) 2004-2006 Sage Weil * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #include #include #include using namespace std; #include #include "config.h" #include "mds/MDS.h" #include "osd/OSD.h" #include "mon/Monitor.h" #include "client/Client.h" #include "client/SyntheticClient.h" #include "msg/SimpleMessenger.h" #include "common/Timer.h" #define NUMMDS g_conf.num_mds #define NUMOSD g_conf.num_osd #define NUMCLIENT g_conf.num_client class C_Test : public Context { public: void finish(int r) { cout << "C_Test->finish(" << r << ")" << endl; } }; /* * start up NewMessenger via MPI. */ #include pair mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap) { MPI_Init(&argc, &argv); int mpi_world; int mpi_rank; MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); // first, synchronize clocks. MPI_Barrier(MPI_COMM_WORLD); //dout(-10) << "tare" << endl; g_clock.tare(); // start up all monitors at known addresses. entity_inst_t moninst[mpi_world]; // only care about first g_conf.num_mon of these. rank.start_rank(); // bind and listen if (mpi_rank < g_conf.num_mon) { moninst[mpi_rank].addr = rank.my_addr; moninst[mpi_rank].name = MSG_ADDR_MON(mpi_rank); //cerr << mpi_rank << " at " << rank.get_listen_addr() << endl; } MPI_Gather( &moninst[mpi_rank], sizeof(entity_inst_t), MPI_CHAR, moninst, sizeof(entity_inst_t), MPI_CHAR, 0, MPI_COMM_WORLD); if (mpi_rank == 0) { for (int i=0; imon_inst[i] = moninst[i]; } } // distribute monmap bufferlist bl; if (mpi_rank == 0) { monmap->encode(bl); monmap->write(".ceph_monmap"); } else { int l = g_conf.num_mon * 1000; // nice'n big. 
// Visible contents of the next physical line, in order:
//  - Rest of mpi_bootstrap_new: non-root ranks append the receive buffer to
//    bl; MPI_Bcast sends bl from rank 0; ranks above 0 decode it into monmap;
//    a final MPI_Barrier; returns the pair (mpi_rank, mpi_world).
//    NOTE(review): rank 0 passes the encoded length as the broadcast count
//    while other ranks pass g_conf.num_mon * 1000; MPI_Bcast expects the same
//    amount of data on every rank - confirm this mismatch is tolerated.
//  - tick_start / tick_count globals and C_Tick: finish() logs the time
//    elapsed since tick_start, adds g_conf.tick to tick_count, and schedules
//    a new C_Tick on g_timer at tick_start plus tick_count seconds.
//  - C_Die: finish() prints "die" to cerr and calls exit(1).
//  - C_Debug: finish() memcpy's from g_debug_after_conf.debug over
//    g_conf.debug, using the distance between g_conf.debug and
//    g_conf.debug_after as the length, then logs a message.
//    NOTE(review): that length is a pointer difference (counted in elements
//    of the field type) but memcpy takes bytes - confirm the field types.
//  - Start of main: builds args via argv_to_vec; the argument-parsing loops
//    are largely missing from the text. It then calls mpi_bootstrap_new,
//    computes `need` (1 if ms_skip_rank0, plus NUMMDS, plus 1 if
//    ms_stripe_osds else NUMOSD, plus 1 if there are clients and they are not
//    overlaid) and asserts need is at most world. Rank 0 prints the counts;
//    ranks below g_conf.num_mon create and init a Monitor; then MPI_Barrier.
bufferptr bp(l); bl.append(bp); } MPI_Bcast(bl.c_str(), bl.length(), MPI_CHAR, 0, MPI_COMM_WORLD); if (mpi_rank > 0) { monmap->decode(bl); } // wait for everyone! MPI_Barrier(MPI_COMM_WORLD); return pair(mpi_rank, mpi_world); } utime_t tick_start; int tick_count = 0; class C_Tick : public Context { public: void finish(int) { utime_t now = g_clock.now() - tick_start; dout(0) << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << endl; tick_count += g_conf.tick; utime_t next = tick_start; next.sec_ref() += tick_count; g_timer.add_event_at(next, new C_Tick); } }; class C_Die : public Context { public: void finish(int) { cerr << "die" << endl; exit(1); } }; class C_Debug : public Context { public: void finish(int) { int size = &g_conf.debug_after - &g_conf.debug; memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); dout(0) << "debug_after flipping debug settings" << endl; } }; int main(int argc, char **argv) { vector args; argv_to_vec(argc, argv, args); map kill_osd_after; if (1) { vector nargs; for (unsigned i=0; i nargs; for (unsigned i=0; i mpiwho = mpi_bootstrap_new(argc, argv, monmap); int myrank = mpiwho.first; int world = mpiwho.second; int need = 0; if (g_conf.ms_skip_rank0) need++; need += NUMMDS; if (g_conf.ms_stripe_osds) need++; else need += NUMOSD; if (NUMCLIENT) { if (!g_conf.ms_overlay_clients) need += 1; } assert(need <= world); if (myrank == 0) cerr << "nummds " << NUMMDS << " numosd " << NUMOSD << " numclient " << NUMCLIENT << " .. need " << need << ", have " << world << endl; char hostname[100]; gethostname(hostname,100); int pid = getpid(); int started = 0; //if (myrank == 0) g_conf.debug = 20; // create mon if (myrank < g_conf.num_mon) { Monitor *mon = new Monitor(myrank, rank.register_entity(MSG_ADDR_MON(myrank)), monmap); mon->init(); } // wait for monitors to start. MPI_Barrier(MPI_COMM_WORLD); // okay, home free! 
// Visible contents of the next physical line (main, continued):
//  - MPI_Finalize(): MPI is not used after this point in the visible text.
//  - MDS creation loop (header and constructor call missing from the text);
//    when g_conf.mds_local_osd is set, an OSD numbered i+10000 is also
//    created and initialised for that MDS.
//  - OSD creation: max_osd_nodes = world - NUMMDS - g_conf.ms_skip_rank0 and
//    osds_per_node = (NUMOSD-1)/max_osd_nodes + 1; the loop itself is missing.
//    NOTE(review): nothing visible guards max_osd_nodes against being 0
//    before it is used as a divisor - confirm the earlier assert covers it.
//  - sleep(5) when g_conf.ms_overlay_clients is set.
//  - Client creation: client_nodes excludes the OSD ranks unless clients are
//    overlaid; clients_per_node is computed only when NUMCLIENT is nonzero
//    and client_nodes is positive. Each created client gets a SyntheticClient.
//  - For every id in clientlist: mount the client and start its synthetic
//    thread; then, in a second pass, join the thread, delete the
//    SyntheticClient, unmount, shut down and delete the client.
//  - Non-zero ranks that started nothing print an "idle at" line to cerr.
//  - rank.wait() blocks; afterwards "newsyn finishing" is printed if anything
//    was started, and main returns 0.
MPI_Finalize(); // create mds map mds; map mdsosd; for (int i=0; iinit(); started++; if (g_conf.mds_local_osd) { mdsosd[i] = new OSD(i+10000, rank.register_entity(MSG_ADDR_OSD(i+10000)), monmap); mdsosd[i]->init(); } } // create osd map osd; int max_osd_nodes = world - NUMMDS - g_conf.ms_skip_rank0; // assumes 0 clients, if we stripe. int osds_per_node = (NUMOSD-1)/max_osd_nodes + 1; for (int i=0; iinit(); started++; } if (g_conf.ms_overlay_clients) sleep(5); // create client int skip_osd = NUMOSD; if (g_conf.ms_overlay_clients) skip_osd = 0; // put clients with osds too! int client_nodes = world - NUMMDS - skip_osd - g_conf.ms_skip_rank0; int clients_per_node = 1; if (NUMCLIENT && client_nodes > 0) clients_per_node = (NUMCLIENT-1) / client_nodes + 1; set clientlist; map client;//[NUMCLIENT]; map syn;//[NUMCLIENT]; for (int i=0; iinit(); started++; syn[i] = new SyntheticClient(client[i]); } if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl; int nclients = 0; for (set::iterator it = clientlist.begin(); it != clientlist.end(); it++) { int i = *it; //cerr << "starting synthetic client" << i << " on rank " << myrank << endl; client[i]->mount(); syn[i]->start_thread(); nclients++; } if (nclients) { cerr << nclients << " clients at " << rank.my_addr << " " << hostname << "." << pid << endl; } for (set::iterator it = clientlist.begin(); it != clientlist.end(); it++) { int i = *it; // cout << "waiting for synthetic client" << i << " to finish" << endl; syn[i]->join_thread(); delete syn[i]; client[i]->unmount(); //cout << "client" << i << " unmounted" << endl; client[i]->shutdown(); delete client[i]; } if (myrank && !started) { //dout(1) << "IDLE" << endl; cerr << "idle at " << rank.my_addr << " " << hostname << "." << pid << endl; //rank.stop_rank(); } // wait for everything to finish rank.wait(); if (started) cerr << "newsyn finishing" << endl; return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?). 
// The next physical line is cleanup code that follows the `return 0;` above,
// so it is never reached: it deletes the values stored in the mds, mdsosd and
// osd maps, and contains commented-out deletes for client and syn. The text
// is cut off mid-statement at the end of this view.
// cleanup for (map::iterator i = mds.begin(); i != mds.end(); i++) delete i->second; for (map::iterator i = mdsosd.begin(); i != mdsosd.end(); i++) delete i->second; for (map::iterator i = osd.begin(); i != osd.end(); i++) delete i->second; /* for (map::iterator i = client.begin(); i != client.end(); i++) delete i->second; for (map::iterator i = syn.begin(); i != syn.end(); i++) delete i->second; */ /* for (int i=0; i