mirror of
https://github.com/ceph/ceph
synced 2025-01-29 22:43:40 +00:00
Merge pull request #33065 from ifed01/wip-ifed-spurious-readerr-alert
os/bluestore: introduce health alert on spurious read errors. Reviewed-by: Josh Durgin <jdurgin@redhat.com> Signed-off-by: Igor Fedotov <ifedotov@suse.com>
This commit is contained in:
commit
70839d99dd
@ -530,6 +530,28 @@ are correctly installed and that the OSD daemon(s) have been
|
||||
restarted. If the problem persists, check the OSD log for any clues
|
||||
as to the source of the problem.
|
||||
|
||||
BLUESTORE_SPURIOUS_READ_ERRORS
|
||||
______________________________
|
||||
|
||||
One or more OSDs using BlueStore have detected spurious read errors on the main device.
|
||||
BlueStore has recovered from these errors by retrying disk reads.
|
||||
However, this might indicate issues with the underlying hardware, I/O subsystem,
|
||||
etc.,
|
||||
which could theoretically cause permanent data corruption.
|
||||
Some observations on the root cause can be found at
|
||||
https://tracker.ceph.com/issues/22464
|
||||
|
||||
This alert doesn't require an immediate response, but the corresponding host might need
|
||||
additional attention, e.g. upgrading to the latest OS/kernel versions and
|
||||
monitoring H/W resource utilization.
|
||||
|
||||
This warning can be disabled on all OSDs with::
|
||||
|
||||
ceph config set osd bluestore_warn_on_spurious_read_errors false
|
||||
|
||||
Alternatively, it can be disabled on a specific OSD with::
|
||||
|
||||
ceph config set osd.123 bluestore_warn_on_spurious_read_errors false
|
||||
|
||||
|
||||
Device health
|
||||
|
@ -1051,6 +1051,7 @@ OPTION(bluestore_debug_inject_csum_err_probability, OPT_FLOAT)
|
||||
OPTION(bluestore_fsck_error_on_no_per_pool_stats, OPT_BOOL)
|
||||
OPTION(bluestore_warn_on_bluefs_spillover, OPT_BOOL)
|
||||
OPTION(bluestore_warn_on_legacy_statfs, OPT_BOOL)
|
||||
OPTION(bluestore_warn_on_spurious_read_errors, OPT_BOOL)
|
||||
OPTION(bluestore_fsck_error_on_no_per_pool_omap, OPT_BOOL)
|
||||
OPTION(bluestore_warn_on_no_per_pool_omap, OPT_BOOL)
|
||||
OPTION(bluestore_log_op_age, OPT_DOUBLE)
|
||||
|
@ -4604,6 +4604,10 @@ std::vector<Option> get_global_options() {
|
||||
.set_default(true)
|
||||
.set_description("Enable health indication on lack of per-pool statfs reporting from bluestore"),
|
||||
|
||||
Option("bluestore_warn_on_spurious_read_errors", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
|
||||
.set_default(true)
|
||||
.set_description("Enable health indication when spurious read errors are observed by OSD"),
|
||||
|
||||
Option("bluestore_fsck_error_on_no_per_pool_omap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
|
||||
.set_default(false)
|
||||
.set_description("Make fsck error (instead of warn) when objects without per-pool omap are found"),
|
||||
|
@ -3234,7 +3234,10 @@ void PGMap::get_health_checks(
|
||||
summary += " have dangerous mismatch between BlueStore block device and free list sizes";
|
||||
} else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
|
||||
summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
|
||||
} else if (asum.first == "BLUESTORE_SPURIOUS_READ_ERRORS") {
|
||||
summary += " have spurious read errors";
|
||||
}
|
||||
|
||||
auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
|
||||
for (auto& s : asum.second.second) {
|
||||
d.detail.push_back(s);
|
||||
|
@ -9924,6 +9924,9 @@ int BlueStore::_do_read(
|
||||
logger->inc(l_bluestore_reads_with_retries);
|
||||
dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
|
||||
<< " failed " << std::dec << retry_count << " times before succeeding" << dendl;
|
||||
stringstream s;
|
||||
s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
|
||||
_set_spurious_read_errors_alert(s.str());
|
||||
}
|
||||
return r;
|
||||
}
|
||||
@ -15726,6 +15729,11 @@ void BlueStore::_log_alerts(osd_alert_list_t& alerts)
|
||||
{
|
||||
std::lock_guard l(qlock);
|
||||
|
||||
if (!spurious_read_errors_alert.empty()) {
|
||||
alerts.emplace(
|
||||
"BLUESTORE_SPURIOUS_READ_ERRORS",
|
||||
spurious_read_errors_alert);
|
||||
}
|
||||
if (!disk_size_mismatch_alert.empty()) {
|
||||
alerts.emplace(
|
||||
"BLUESTORE_DISK_SIZE_MISMATCH",
|
||||
|
@ -2947,6 +2947,7 @@ private:
|
||||
std::string legacy_statfs_alert;
|
||||
std::string no_per_pool_omap_alert;
|
||||
std::string disk_size_mismatch_alert;
|
||||
std::string spurious_read_errors_alert;
|
||||
|
||||
void _log_alerts(osd_alert_list_t& alerts);
|
||||
bool _set_compression_alert(bool cmode, const char* s) {
|
||||
@ -2979,6 +2980,10 @@ private:
|
||||
std::lock_guard l(qlock);
|
||||
disk_size_mismatch_alert = s;
|
||||
}
|
||||
// Store the text that _log_alerts() reports under the
// "BLUESTORE_SPURIOUS_READ_ERRORS" key. Holds qlock for the duration of the
// assignment and overwrites whatever text was stored previously with s.
void _set_spurious_read_errors_alert(const string& s) {
|
||||
std::lock_guard l(qlock);
|
||||
spurious_read_errors_alert = s;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user