osd: set a default timeout of 30 seconds for Mon/OSD ops

This commit is contained in:
Yue Zhu 2019-10-09 13:19:32 -04:00
parent 444007fbfc
commit 8ec22c971f

View File

@ -21,6 +21,7 @@ import (
"net" "net"
"net/http" "net/http"
"os" "os"
"strconv"
"sync" "sync"
"syscall" "syscall"
"time" "time"
@ -134,10 +135,11 @@ func (c *CephExporter) Collect(ch chan<- prometheus.Metric) {
func main() { func main() {
var ( var (
addr = flag.String("telemetry.addr", ":9128", "host:port for ceph exporter") addr = flag.String("telemetry.addr", ":9128", "host:port for ceph exporter")
metricsPath = flag.String("telemetry.path", "/metrics", "URL path for surfacing collected metrics") metricsPath = flag.String("telemetry.path", "/metrics", "URL path for surfacing collected metrics")
cephConfig = flag.String("ceph.config", "", "path to ceph config file") cephConfig = flag.String("ceph.config", "", "path to Ceph config file")
cephUser = flag.String("ceph.user", "admin", "Ceph user to connect to cluster.") cephUser = flag.String("ceph.user", "admin", "Ceph user to connect to cluster.")
cephRadosOpTimeout = flag.Duration("ceph.rados_op_timeout", 30*time.Second, "Ceph rados_osd_op_timeout and rados_mon_op_timeout used to contact cluster (0s means no limit).")
rgwMode = flag.Int("rgw.mode", 0, "Enable collection of stats from RGW (0:disabled 1:enabled 2:background)") rgwMode = flag.Int("rgw.mode", 0, "Enable collection of stats from RGW (0:disabled 1:enabled 2:background)")
@ -164,6 +166,17 @@ func main() {
log.Fatalf("cannot read ceph config file: %s", err) log.Fatalf("cannot read ceph config file: %s", err)
} }
// Set rados_osd_op_timeout and rados_mon_op_timeout so that Mon and PG
// commands time out instead of hanging indefinitely.
// See https://github.com/ceph/ceph/blob/d4872ce97a2825afcb58876559cc73aaa1862c0f/src/common/legacy_config_opts.h#L1258-L1259
if err := conn.SetConfigOption("rados_osd_op_timeout", strconv.FormatFloat(cephRadosOpTimeout.Seconds(), 'f', -1, 64)); err != nil {
log.Fatalf("cannot set rados_osd_op_timeout for ceph cluster: %s", err)
}
if err := conn.SetConfigOption("rados_mon_op_timeout", strconv.FormatFloat(cephRadosOpTimeout.Seconds(), 'f', -1, 64)); err != nil {
log.Fatalf("cannot set rados_mon_op_timeout for ceph cluster: %s", err)
}
if err := conn.Connect(); err != nil { if err := conn.Connect(); err != nil {
log.Fatalf("cannot connect to ceph cluster: %s", err) log.Fatalf("cannot connect to ceph cluster: %s", err)
} }
@ -191,6 +204,17 @@ func main() {
log.Fatalf("cannot read ceph config file: %s", err) log.Fatalf("cannot read ceph config file: %s", err)
} }
// Set rados_osd_op_timeout and rados_mon_op_timeout so that Mon and PG
// commands time out instead of hanging indefinitely.
// See https://github.com/ceph/ceph/blob/d4872ce97a2825afcb58876559cc73aaa1862c0f/src/common/legacy_config_opts.h#L1258-L1259
if err := conn.SetConfigOption("rados_osd_op_timeout", strconv.FormatFloat(cephRadosOpTimeout.Seconds(), 'f', -1, 64)); err != nil {
log.Fatalf("cannot set rados_osd_op_timeout for ceph cluster: %s", err)
}
if err := conn.SetConfigOption("rados_mon_op_timeout", strconv.FormatFloat(cephRadosOpTimeout.Seconds(), 'f', -1, 64)); err != nil {
log.Fatalf("cannot set rados_mon_op_timeout for ceph cluster: %s", err)
}
if err := conn.Connect(); err != nil { if err := conn.Connect(); err != nil {
log.Fatalf("cannot connect to ceph cluster: %s", err) log.Fatalf("cannot connect to ceph cluster: %s", err)
} }