diff --git a/ceph/crashes.go b/ceph/crashes.go index 10ae575..e953942 100644 --- a/ceph/crashes.go +++ b/ceph/crashes.go @@ -15,19 +15,14 @@ package ceph import ( - "bufio" - "bytes" "encoding/json" "fmt" - "regexp" "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" ) var ( - crashLsLineRegex = regexp.MustCompile(`.*_[0-9a-f-]{36}\s+(\S+)\s*(\*)?`) - statusNames = map[bool]string{true: "new", false: "archived"} ) @@ -56,7 +51,7 @@ func NewCrashesCollector(exporter *Exporter) *CrashesCollector { crashReportsDesc: prometheus.NewDesc( fmt.Sprintf("%s_crash_reports", cephNamespace), "Count of crashes reports per daemon, according to `ceph crash ls`", - []string{"daemon", "status"}, + []string{"entity", "status"}, labels, ), } @@ -69,6 +64,11 @@ type crashEntry struct { isNew bool } +type cephCrashLs struct { + Entity string `json:"entity_name"` + Archived string `json:"archived"` +} + // getCrashLs runs the 'crash ls' command and parses its results func (c *CrashesCollector) getCrashLs() (map[crashEntry]int, error) { crashes := make(map[crashEntry]int) @@ -78,7 +78,7 @@ func (c *CrashesCollector) getCrashLs() (map[crashEntry]int, error) { // to process in an outage storm. cmd, err := json.Marshal(map[string]interface{}{ "prefix": "crash ls", - "format": "plain", + "format": "json", }) if err != nil { return crashes, err @@ -89,15 +89,13 @@ func (c *CrashesCollector) getCrashLs() (map[crashEntry]int, error) { return crashes, err } - scanner := bufio.NewScanner(bytes.NewBuffer(buf)) - for scanner.Scan() { - matched := crashLsLineRegex.FindStringSubmatch(scanner.Text()) - if len(matched) == 3 { - crashes[crashEntry{matched[1], matched[2] == "*"}]++ - } else if len(matched) == 2 { - // Just in case the line-end spaces were stripped - crashes[crashEntry{matched[1], false}]++ - } + var crashData []cephCrashLs + if err = json.Unmarshal(buf, &crashData); err != nil { + return crashes, err + } + + for _, crash := range crashData { + crashes[crashEntry{crash.Entity, len(crash.Archived) == 0}]++ } return crashes, nil diff --git a/ceph/crashes_test.go b/ceph/crashes_test.go index 5b665fe..626f2bb 100644 --- a/ceph/crashes_test.go +++ b/ceph/crashes_test.go @@ -36,64 +36,139 @@ func TestCrashesCollector(t *testing.T) { reMatch []*regexp.Regexp }{ { + // Example with the full output, further examples will be simpler name: "single new crash", input: ` -ID ENTITY NEW -2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 osd.0 * - `, +[ + { + "os_version_id": "7", + "assert_condition": "p != obs_call_gate.end()", + "utsname_release": "5.10.53-138-generic", + "os_name": "CentOS Linux", + "entity_name": "client.admin", + "assert_file": "/ceph/src/common/config_proxy.h", + "timestamp": "2022-01-25 21:03:38.371403Z", + "process_name": "rbd-nbd", + "utsname_machine": "x86_64", + "utsname_sysname": "Linux", + "os_version": "7 (Core)", + "os_id": "centos", + "assert_thread_name": "rbd-nbd", + "utsname_version": "#4745ab954 SMP Fri Oct 22 23:05:54 UTC 2021", + "backtrace": [ + "(()+0xe54d4) [0x5561b4a744d4]", + "(()+0xf630) [0x7f18aac9f630]", + "(gsignal()+0x37) [0x7f18a9256387]", + "(abort()+0x148) [0x7f18a9257a78]", + "(ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x199) [0x7f18ac7dce46]", + "(()+0x25cfbf) [0x7f18ac7dcfbf]", + "(ConfigProxy::call_gate_enter(ceph::md_config_obs_impl*)+0x79) [0x5561b4a6cc67]", + "(ConfigProxy::map_observer_changes(ceph::md_config_obs_impl*, std::string const&, std::map*, std::set, std::allocator >, std::less*>, std::allocator* const, std::set, std::allocator > > > >*)+0x120) [0x5561b4a6d0a2]", + "(ConfigProxy::_gather_changes(std::set, std::allocator >&, std::map*, std::set, std::allocator >, std::less*>, std::allocator* const, std::set, std::allocator > > > >*, std::ostream*)::{lambda(ceph::md_config_obs_impl*, std::string const&)#1}::operator()(ceph::md_config_obs_impl*, std::string const&) const+0x33) [0x5561b4a6d651]", + "(std::_Function_handler*, std::string const&), ConfigProxy::_gather_changes(std::set, std::allocator >&, std::map*, std::set, std::allocator >, std::less*>, std::allocator* const, std::set, std::allocator > > > >*, std::ostream*)::{lambda(ceph::md_config_obs_impl*, std::string const&)#1}>::_M_invoke(std::_Any_data const&, ceph::md_config_obs_impl*&&, std::string const&)+0x52) [0x5561b4a6f11e]", + "(std::function*, std::string const&)>::operator()(ceph::md_config_obs_impl*, std::string const&) const+0x61) [0x5561b4a6f05f]", + "(void ObserverMgr >::for_each_change(std::set, std::allocator > const&, ConfigProxy&, std::function*, std::string const&)>, std::ostream*)+0x1cb) [0x5561b4a6e343]", + "(ConfigProxy::_gather_changes(std::set, std::allocator >&, std::map*, std::set, std::allocator >, std::less*>, std::allocator* const, std::set, std::allocator > > > >*, std::ostream*)+0x76) [0x5561b4a6d6ca]", + "(ConfigProxy::apply_changes(std::ostream*)+0x7c) [0x5561b4a6d5aa]", + "(global_init(std::map, std::allocator > > const*, std::vector >&, unsigned int, code_environment_t, int, char const*, bool)+0x1022) [0x5561b4a6a806]", + "(()+0x9380c) [0x5561b4a2280c]", + "(()+0x9618a) [0x5561b4a2518a]", + "(main()+0x20) [0x5561b4a252e4]", + "(__libc_start_main()+0xf5) [0x7f18a9242555]", + "(()+0x907f9) [0x5561b4a1f7f9]" + ], + "utsname_hostname": "test-ceph-server.company.example", + "assert_msg": "/ceph/src/common/config_proxy.h: In function 'void ConfigProxy::call_gate_enter(ConfigProxy::md_config_obs_t*)' thread 7f18b63dfa00 time 2022-01-25 21:03:38.368357\n/ceph/src/common/config_proxy.h: 65: FAILED ceph_assert(p != obs_call_gate.end())\n", + "crash_id": "2022-01-25_21:03:38.371403Z_f9df5b64-32ef-4073-8b37-d1c5a1b3dcb8", + "assert_line": 65, + "ceph_version": "14.2.18", + "assert_func": "void ConfigProxy::call_gate_enter(ConfigProxy::md_config_obs_t*)" + } +]`, reMatch: []*regexp.Regexp{ - regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="new"} 1`), + regexp.MustCompile(`crash_reports{cluster="ceph",entity="client.admin",status="new"} 1`), }, }, { name: "single archived crash", input: ` -ID ENTITY NEW -2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 osd.0 +[ + { + "entity_name": "client.admin", + "timestamp": "2022-01-25 21:02:46.687015Z", + "archived": "2022-06-14 19:44:40.356826", + "crash_id": "2022-01-25_21:02:46.687015Z_d6513591-c16b-472f-8d40-5a143b28837d" + } +] `, reMatch: []*regexp.Regexp{ - regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="archived"} 1`), + regexp.MustCompile(`crash_reports{cluster="ceph",entity="client.admin",status="archived"} 1`), }, }, { name: "two new crashes same entity", input: ` -ID ENTITY NEW -2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 osd.0 * -2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a osd.0 * -`, +[ + { + "entity_name": "osd.0", + "timestamp": "2022-02-01 21:02:46.687015Z", + "crash_id": "2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9" + }, + { + "entity_name": "osd.0", + "timestamp": "2022-02-03 04:05:45.419226Z", + "crash_id": "2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a" + } +]`, reMatch: []*regexp.Regexp{ - regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="new"} 2`), + regexp.MustCompile(`crash_reports{cluster="ceph",entity="osd.0",status="new"} 2`), }, }, { name: "mix of crashes same entity", input: ` -ID ENTITY NEW -2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 osd.0 -2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a osd.0 * -`, +[ + { + "entity_name": "osd.0", + "timestamp": "2022-02-01 21:02:46.687015Z", + "crash_id": "2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9" + }, + { + "entity_name": "osd.0", + "timestamp": "2022-02-03 04:05:45.419226Z", + "archived": "2022-06-14 19:44:40.356826", + "crash_id": "2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a" + } +]`, reMatch: []*regexp.Regexp{ - regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="new"} 1`), - regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="archived"} 1`), + regexp.MustCompile(`crash_reports{cluster="ceph",entity="osd.0",status="new"} 1`), + regexp.MustCompile(`crash_reports{cluster="ceph",entity="osd.0",status="archived"} 1`), }, }, { name: "mix of crashes different entities", input: ` -ID ENTITY NEW -2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 mgr.mgr-node-01 * -2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a client.admin * -`, +[ + { + "entity_name": "mgr.mgr-node-01", + "timestamp": "2022-02-01 21:02:46.687015Z", + "crash_id": "2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9" + }, + { + "entity_name": "client.admin", + "timestamp": "2022-02-03 04:05:45.419226Z", + "crash_id": "2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a" + } +]`, reMatch: []*regexp.Regexp{ - regexp.MustCompile(`crash_reports{cluster="ceph",daemon="mgr.mgr-node-01",status="new"} 1`), - regexp.MustCompile(`crash_reports{cluster="ceph",daemon="client.admin",status="new"} 1`), + regexp.MustCompile(`crash_reports{cluster="ceph",entity="mgr.mgr-node-01",status="new"} 1`), + regexp.MustCompile(`crash_reports{cluster="ceph",entity="client.admin",status="new"} 1`), }, }, { // At least code shouldn't panic name: "no crashes", - input: ``, + input: `[]`, reMatch: []*regexp.Regexp{}, }, } {