Merge pull request #33065 from ifed01/wip-ifed-spurious-readerr-alert

os/bluestore: introduce health alert on spurious read errors.

Reviewed-by: Josh Durgin <jdurgin@redhat.com>
Signed-off-by: Igor Fedotov <ifedotov@suse.com>
This commit is contained in:
Igor Fedotov 2020-04-23 11:32:25 +03:00 committed by GitHub
commit 70839d99dd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 43 additions and 0 deletions

View File

@ -530,6 +530,28 @@ are correctly installed and that the OSD daemon(s) have been
restarted. If the problem persists, check the OSD log for any clues
as to the source of the problem.
BLUESTORE_SPURIOUS_READ_ERRORS
______________________________
One or more OSDs using BlueStore have detected spurious read errors on the main
device. BlueStore has recovered from these errors by retrying disk reads.
However, this might indicate issues with the underlying hardware, I/O subsystem,
etc., which theoretically might cause permanent data corruption.
Some observations on the root cause can be found at
https://tracker.ceph.com/issues/22464
This alert doesn't require an immediate response, but the corresponding host
might need additional attention, e.g. upgrading to the latest OS/kernel versions
and monitoring H/W resource utilization.
This warning can be disabled on all OSDs with::
ceph config set osd bluestore_warn_on_spurious_read_errors false
Alternatively, it can be disabled on a specific OSD with::
ceph config set osd.123 bluestore_warn_on_spurious_read_errors false
Device health

View File

@ -1051,6 +1051,7 @@ OPTION(bluestore_debug_inject_csum_err_probability, OPT_FLOAT)
OPTION(bluestore_fsck_error_on_no_per_pool_stats, OPT_BOOL)
OPTION(bluestore_warn_on_bluefs_spillover, OPT_BOOL)
OPTION(bluestore_warn_on_legacy_statfs, OPT_BOOL)
OPTION(bluestore_warn_on_spurious_read_errors, OPT_BOOL)
OPTION(bluestore_fsck_error_on_no_per_pool_omap, OPT_BOOL)
OPTION(bluestore_warn_on_no_per_pool_omap, OPT_BOOL)
OPTION(bluestore_log_op_age, OPT_DOUBLE)

View File

@ -4604,6 +4604,10 @@ std::vector<Option> get_global_options() {
.set_default(true)
.set_description("Enable health indication on lack of per-pool statfs reporting from bluestore"),
Option("bluestore_warn_on_spurious_read_errors", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(true)
.set_description("Enable health indication when spurious read errors are observed by OSD"),
Option("bluestore_fsck_error_on_no_per_pool_omap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(false)
.set_description("Make fsck error (instead of warn) when objects without per-pool omap are found"),

View File

@ -3234,7 +3234,10 @@ void PGMap::get_health_checks(
summary += " have dangerous mismatch between BlueStore block device and free list sizes";
} else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
summary += " reporting legacy (not per-pool) BlueStore omap usage stats";
} else if (asum.first == "BLUESTORE_SPURIOUS_READ_ERRORS") {
summary += " have spurious read errors";
}
auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
for (auto& s : asum.second.second) {
d.detail.push_back(s);

View File

@ -9924,6 +9924,9 @@ int BlueStore::_do_read(
logger->inc(l_bluestore_reads_with_retries);
dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
<< " failed " << std::dec << retry_count << " times before succeeding" << dendl;
stringstream s;
s << " reads with retries: " << logger->get(l_bluestore_reads_with_retries);
_set_spurious_read_errors_alert(s.str());
}
return r;
}
@ -15726,6 +15729,11 @@ void BlueStore::_log_alerts(osd_alert_list_t& alerts)
{
std::lock_guard l(qlock);
if (!spurious_read_errors_alert.empty()) {
alerts.emplace(
"BLUESTORE_SPURIOUS_READ_ERRORS",
spurious_read_errors_alert);
}
if (!disk_size_mismatch_alert.empty()) {
alerts.emplace(
"BLUESTORE_DISK_SIZE_MISMATCH",

View File

@ -2947,6 +2947,7 @@ private:
std::string legacy_statfs_alert;
std::string no_per_pool_omap_alert;
std::string disk_size_mismatch_alert;
std::string spurious_read_errors_alert;
void _log_alerts(osd_alert_list_t& alerts);
bool _set_compression_alert(bool cmode, const char* s) {
@ -2979,6 +2980,10 @@ private:
std::lock_guard l(qlock);
disk_size_mismatch_alert = s;
}
void _set_spurious_read_errors_alert(const string& s) {
std::lock_guard l(qlock);
spurious_read_errors_alert = s;
}
private: