// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * * Copyright (C) 2004-2006 Sage Weil * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #include "config.h" #include "include/types.h" #include //#define MDS_CACHE_SIZE 4*10000 -> <20mb //#define MDS_CACHE_SIZE 80000 62mb #define AVG_PER_INODE_SIZE 450 #define MDS_CACHE_MB_TO_INODES(x) ((x)*1000000/AVG_PER_INODE_SIZE) //#define MDS_CACHE_SIZE MDS_CACHE_MB_TO_INODES( 50 ) //#define MDS_CACHE_SIZE 1500000 #define MDS_CACHE_SIZE 150000 // hack hack hack ugly FIXME #include "common/Mutex.h" long buffer_total_alloc = 0; Mutex bufferlock; #include "osd/osd_types.h" // debug output Mutex _dout_lock; ostream *_dout = &std::cout; ostream *_derr = &std::cerr; // file layouts struct ceph_file_layout g_OSD_FileLayout = { fl_stripe_unit: 1<<22, fl_stripe_count: 1, fl_object_size: 1<<22, fl_object_stripe_unit: 0, fl_pg_preferred: -1, fl_pg_type: CEPH_PG_TYPE_REP, fl_pg_size: 2 }; struct ceph_file_layout g_OSD_MDDirLayout = { fl_stripe_unit: 1<<22, fl_stripe_count: 1, fl_object_size: 1<<22, fl_object_stripe_unit: 0, fl_pg_preferred: -1, fl_pg_type: CEPH_PG_TYPE_REP, fl_pg_size: 2 }; struct ceph_file_layout g_OSD_MDLogLayout = { fl_stripe_unit: 1<<20, fl_stripe_count: 1, fl_object_size: 1<<20, fl_object_stripe_unit: 0, fl_pg_preferred: -1, fl_pg_type: CEPH_PG_TYPE_REP, fl_pg_size: 2 }; struct ceph_file_layout g_OSD_MDAnchorTableLayout = { fl_stripe_unit: 1<<20, fl_stripe_count: 1, fl_object_size: 1<<20, fl_object_stripe_unit: 0, fl_pg_preferred: -1, fl_pg_type: CEPH_PG_TYPE_REP, fl_pg_size: 2 }; #include // fake osd failures: osd -> time std::map g_fake_kill_after; std::map g_fake_osd_down; std::map g_fake_osd_out; entity_addr_t g_my_addr; md_config_t g_debug_after_conf; 
// NOTE(review): the line above is the whole file preamble collapsed onto one line:
// the license header, the includes, the MDS_CACHE_SIZE macros, the buffer and
// debug-output globals (_dout/_derr start out pointing at std::cout/std::cerr),
// four default ceph_file_layout values, the fake-OSD-failure maps, g_my_addr and
// g_debug_after_conf.
//  - g_OSD_FileLayout and g_OSD_MDDirLayout use 1<<22 for both fl_stripe_unit and
//    fl_object_size; g_OSD_MDLogLayout and g_OSD_MDAnchorTableLayout use 1<<20.
//    All four set fl_stripe_count 1, fl_pg_type CEPH_PG_TYPE_REP and fl_pg_size 2.
//  - Two #include directives above have no header name and the three std::map
//    globals have no template arguments, so this copy cannot build as it stands.
//    Presumably the angle-bracketed text was lost in extraction; TODO restore it
//    from the upstream original rather than guessing.
// g_conf holds the compiled-in default for each md_config_t option, written in
// the GNU `field: value` initializer form and grouped by subsystem (messenger,
// mon, client, objecter, journaler, mds, osd, fakestore, ebofs, block device,
// fakeclient, and bdbstore when USE_OSBDB is defined). After the initializer come
// the argument helpers env_to_vec, argv_to_vec, vec_to_argv, parse_ip_port and
// parse_config_options.
//
// NOTE(review) -- everything below should be confirmed against the upstream file:
//  - This copy has lost its line breaks and appears to have lost the text between
//    each '<' and the following '>'. Visible symptoms: two #include directives
//    with no header name, std::vector parameters with no element type, and loop
//    headers that run straight into the next function's signature. The bodies of
//    env_to_vec, argv_to_vec, vec_to_argv and nearly all of parse_config_options
//    are therefore missing and must be restored, not reconstructed by guesswork.
//  - env_to_vec copies CEPH_ARGS into a 1000-byte static buffer using
//    memcpy(buf, p, strlen(p)) and then writes buf[len]; no length check is
//    visible, so a value of 1000 characters or more would overrun the buffer.
//  - vec_to_argv sizes its malloc from the incoming argc, then resets argc to 1
//    and stores the string literal "asdf" in argv[0]. The fill loop is missing
//    here, so whether the allocation is large enough for args cannot be checked.
//  - osd_min_raid_width is 4 while osd_max_raid_width is 3 (its trailing comment
//    says 6): the minimum exceeds the maximum.
//  - parse_ip_port (visible part) stores the first four parsed numbers with
//    set_ipquad(count, val) and the fifth with set_port(val); it returns false
//    when a number has no digits or when one of the first three is not followed
//    by '.'.
//  - parse_config_options (visible tail) deletes the newly opened output stream
//    when is_open() fails, otherwise installs it as _dout, and finally replaces
//    args with nargs.
md_config_t g_conf = { num_mon: 1, num_mds: 1, num_osd: 4, num_client: 1, mkfs: false, // profiling and debugging log: true, log_interval: 1, log_name: (char*)0, log_messages: true, log_pins: true, logger_calc_variance: true, dout_dir: 0, fake_clock: false, fakemessenger_serialize: true, fake_osdmap_expand: 0, fake_osdmap_updates: 0, fake_osd_mttf: 0, fake_osd_mttr: 0, osd_remount_at: 0, kill_after: 0, tick: 0, debug: 0, debug_mds: 1, debug_mds_balancer: 1, debug_mds_log: 1, debug_mds_log_expire: 1, debug_mds_migrator: 1, debug_buffer: 0, debug_timer: 0, debug_filer: 0, debug_objecter: 0, debug_journaler: 0, debug_objectcacher: 0, debug_client: 0, debug_osd: 0, debug_ebofs: 1, debug_bdev: 1, // block device debug_ns: 0, debug_ms: 0, debug_mon: 1, debug_paxos: 0, debug_after: 0, // -- misc -- use_abspaths: false, // make monitorstore et al use absolute path (to workaround FUSE chdir("/")) // --- clock --- clock_lock: false, clock_tare: false, // --- messenger --- ms_tcp_nodelay: true, ms_retry_interval: 2.0, // how often to attempt reconnect ms_fail_interval: 15.0, // fail after this long ms_die_on_failure: false, ms_stripe_osds: false, ms_skip_rank0: false, ms_overlay_clients: false, // --- mon --- mon_tick_interval: 5, mon_osd_down_out_interval: 5, // seconds mon_lease: 5, // seconds // lease interval mon_lease_renew_interval: 3, // on leader, to renew the lease mon_lease_ack_timeout: 10.0, // on leader, if lease isn't acked by all peons mon_lease_timeout: 10.0, // on peon, if lease isn't extended mon_accept_timeout: 10.0, // on leader, if paxos update isn't accepted mon_stop_on_last_unmount: false, mon_stop_with_last_mds: false, mon_allow_mds_bully: false, // allow a booting mds to (forcibly) claim an mds # .. 
FIXME paxos_propose_interval: 1.0, // gather updates for this long before proposing a map update // --- client --- client_cache_size: 1000, client_cache_mid: .5, client_cache_stat_ttl: 0, // seconds until cached stat results become invalid client_cache_readdir_ttl: 1, // 1 second only client_use_random_mds: false, client_sync_writes: 0, client_mount_timeout: 10.0, // retry every N seconds client_hack_balance_reads: false, client_trace: 0, fuse_direct_io: 0, fuse_ll: true, // --- objectcacher --- client_oc: true, client_oc_size: 1024*1024* 10, // MB * n client_oc_max_dirty: 1024*1024* 10, // MB * n (dirty OR tx) client_oc_max_sync_write: 128*1024, // sync writes >= this use wrlock // --- objecter --- objecter_buffer_uncommitted: true, // this must be true for proper failure handling objecter_map_request_interval: 15.0, // request a new map every N seconds, if we have pending io objecter_tick_interval: 5.0, objecter_timeout: 10.0, // before we ask for a map // --- journaler --- journaler_allow_split_entries: true, journaler_safe: false, // wait for COMMIT on journal writes journaler_write_head_interval: 15, journaler_cache: false, // cache writes for later readback journaler_prefetch_periods: 50, // * journal object size (1~MB? see above) journaler_batch_interval: .001, // seconds.. 
max add'l latency we artificially incur journaler_batch_max: 16384, // max bytes we'll delay flushing // --- mds --- mds_cache_size: 300000, //MDS_CACHE_SIZE, mds_cache_mid: .7, mds_decay_halflife: 5, mds_beacon_interval: 4, //30.0, mds_beacon_grace: 15, //60*60.0, mds_log: true, mds_log_max_events: -1, //MDS_CACHE_SIZE / 3, mds_log_max_segments: 100, mds_log_max_expiring: 20, mds_log_pad_entry: 128,//256,//64, mds_log_eopen_size: 100, // # open inodes per log entry mds_bal_sample_interval: 3.0, // every 3 seconds mds_bal_replicate_threshold: 8000, mds_bal_unreplicate_threshold: 0,//500, mds_bal_split_size: 10000, mds_bal_split_rd: 25000, mds_bal_split_wr: 10000, mds_bal_merge_size: 50, mds_bal_merge_rd: 1000, mds_bal_merge_wr: 1000, mds_bal_interval: 10, // seconds mds_bal_fragment_interval: 2, // seconds mds_bal_idle_threshold: 0, //.1, mds_bal_max: -1, mds_bal_max_until: -1, mds_bal_mode: 0, mds_bal_min_rebalance: .1, // must be this much above average before we export anything mds_bal_min_start: .2, // if we need less than this, we don't do anything mds_bal_need_min: .8, // take within this range of what we need mds_bal_need_max: 1.2, mds_bal_midchunk: .3, // any sub bigger than this taken in full mds_bal_minchunk: .001, // never take anything smaller than this mds_trim_on_rejoin: true, mds_shutdown_check: 0, //30, mds_verify_export_dirauth: true, mds_local_osd: false, mds_thrash_exports: 0, mds_thrash_fragments: 0, mds_dump_cache_on_map: false, mds_dump_cache_after_rejoin: true, mds_hack_log_expire_for_better_stats: false, // --- osd --- osd_rep: OSD_REP_PRIMARY, osd_balance_reads: false, // send from client to replica osd_flash_crowd_iat_threshold: 0,//100, osd_flash_crowd_iat_alpha: 0.125, osd_balance_reads_temp: 100, osd_shed_reads: false, // forward from primary to replica osd_shed_reads_min_latency: .01, // min local latency osd_shed_reads_min_latency_diff: .01, // min latency difference osd_shed_reads_min_latency_ratio: 1.5, // 1.2 == 20% higher than 
peer osd_immediate_read_from_cache: false,//true, // osds to read from the cache immediately? osd_exclusive_caching: true, // replicas evict replicated writes osd_stat_refresh_interval: .5, osd_pg_bits: 4, // bits per osd osd_object_layout: CEPH_OBJECT_LAYOUT_HASHINO,//LINEAR,//HASHINO, osd_pg_layout: CEPH_PG_LAYOUT_CRUSH,//LINEAR,//CRUSH, osd_max_rep: 4, osd_min_raid_width: 4, osd_max_raid_width: 3, //6, osd_maxthreads: 2, // 0 == no threading osd_max_opq: 10, osd_mkfs: false, osd_age: .8, osd_age_time: 0, osd_heartbeat_interval: 1, osd_heartbeat_grace: 30, osd_pg_stats_interval: 5, osd_replay_window: 5, osd_max_pull: 2, osd_pad_pg_log: false, osd_auto_weight: false, osd_hack_fast_startup: false, // this breaks localized pgs. // --- fakestore --- fakestore_fake_sync: .2, // seconds fakestore_fsync: false,//true, fakestore_writesync: false, fakestore_syncthreads: 4, fakestore_fake_attrs: false, fakestore_fake_collections: false, fakestore_dev: 0, // --- ebofs --- ebofs: 1, ebofs_cloneable: false, ebofs_verify: false, ebofs_commit_ms: 1000, // 0 = no forced commit timeout (for debugging/tracing) ebofs_idle_commit_ms: 0, // 0 = no idle detection. UGLY HACK. use bdev_idle_kick_after_ms instead. ebofs_oc_size: 10000, // onode cache ebofs_cc_size: 10000, // cnode cache ebofs_bc_size: (5 *256), // 4k blocks, *256 for MB ebofs_bc_max_dirty: (3 *256), // before write() will block ebofs_max_prefetch: 1000, // 4k blocks ebofs_realloc: false, // hrm, this can cause bad fragmentation, don't use! ebofs_verify_csum_on_read: true, // --- block device --- bdev_lock: true, bdev_iothreads: 1, // number of ios to queue with kernel bdev_idle_kick_after_ms: 100, // ms bdev_el_fw_max_ms: 10000, // restart elevator at least once every 10000 ms bdev_el_bw_max_ms: 3000, // restart elevator at least once every 3000 ms bdev_el_bidir: false, // bidirectional elevator? 
bdev_iov_max: 512, // max # iov's to collect into a single readv()/writev() call bdev_debug_check_io_overlap: true, // [DEBUG] check for any pending io overlaps bdev_fake_mb: 0, bdev_fake_max_mb: 0, // --- fakeclient (mds regression testing) (ancient history) --- num_fakeclient: 100, fakeclient_requests: 100, fakeclient_deterministic: false, fakeclient_op_statfs: false, // loosely based on Roselli workload paper numbers fakeclient_op_stat: 610, fakeclient_op_lstat: false, fakeclient_op_utime: 0, fakeclient_op_chmod: 1, fakeclient_op_chown: 1, fakeclient_op_readdir: 2, fakeclient_op_mknod: 30, fakeclient_op_link: false, fakeclient_op_unlink: 20, fakeclient_op_rename: 0,//40, fakeclient_op_mkdir: 10, fakeclient_op_rmdir: 20, fakeclient_op_symlink: 20, fakeclient_op_openrd: 200, fakeclient_op_openwr: 0, fakeclient_op_openwrc: 0, fakeclient_op_read: false, // osd! fakeclient_op_write: false, // osd! fakeclient_op_truncate: false, fakeclient_op_fsync: false, fakeclient_op_close: 200 #ifdef USE_OSBDB , bdbstore: false, debug_bdbstore: 1, bdbstore_btree: false, bdbstore_ffactor: 0, bdbstore_nelem: 0, bdbstore_pagesize: 0, bdbstore_cachesize: 0, bdbstore_transactional: false #endif // USE_OSBDB }; #include #include void env_to_vec(std::vector& args) { const char *p = getenv("CEPH_ARGS"); if (!p) return; static char buf[1000]; int len = strlen(p); memcpy(buf, p, len); buf[len] = 0; //cout << "CEPH_ARGS " << buf << endl; int l = 0; for (int i=0; i& args) { for (int i=1; i& args, int& argc, char **&argv) { argv = (char**)malloc(sizeof(char*) * argc); argc = 1; argv[0] = "asdf"; for (unsigned i=0; i= '0' && *s <= '9') { int digit = *s - '0'; //cout << "digit " << digit << endl; val *= 10; val += digit; numdigits++; s++; off++; } if (numdigits == 0) { cerr << "no digits at off " << off << std::endl; return false; // no digits } if (count < 3 && *s != '.') { cerr << "should period at " << off << std::endl; return false; // should have 3 periods } s++; off++; if (count <= 3) 
a.set_ipquad(count, val); else a.set_port(val); count++; if (count == 4 && *(s-1) != ':') break; if (count == 5) break; } return true; } void parse_config_options(std::vector& args) { std::vector nargs; for (unsigned i=0; iis_open()) { std::cerr << "error opening output file " << fn << std::endl; delete out; } else { _dout = out; } } args = nargs; }