osd: do not send ENXIO on misdirected op by default

In practice this tends to get bubbled up the stack as an error on
the caller, and they usually do not handle it properly.  For example,
with librbd, this turns into EIO and breaks the VM.

Instead, this will manifest as a hung op on the client.  That is
also not ideal, but given that the root cause here is generally a
bug, it's not clear what else would be better.

We already log an error in the cluster log, so teuthology runs will
continue to fail.

Signed-off-by: Sage Weil <sage@redhat.com>
This commit is contained in:
Sage Weil 2017-01-31 15:14:59 -05:00
parent 6995d2c142
commit 923e7f5ce5
3 changed files with 27 additions and 13 deletions

View File

@ -30,3 +30,11 @@
either the IoCtx methods on older librados versions or the
deprecated methods on any version of librados will lead to
incomplete results if/when the new OSD limits are enabled.
* In previous versions, if a client sent an op to the wrong OSD, the OSD
would reply with ENXIO. The rationale was that the client or OSD is
clearly buggy and we wanted to surface the error as clearly as possible.
We now only send the ENXIO reply if the osd_enxio_on_misdirected_op option
is enabled (it's off by default). This means that a VM using librbd that
previously would have gotten an EIO and gone read-only will now see a
blocked/hung IO instead.

View File

@ -853,6 +853,7 @@ OPTION(osd_debug_reject_backfill_probability, OPT_DOUBLE, 0)
OPTION(osd_debug_inject_copyfrom_error, OPT_BOOL, false) // inject failure during copyfrom completion
OPTION(osd_debug_randomize_hobject_sort_order, OPT_BOOL, false)
OPTION(osd_debug_misdirected_ops, OPT_BOOL, false)
OPTION(osd_enxio_on_misdirected_op, OPT_BOOL, false) // reply ENXIO to a misdirected op; when false the op is only logged to the cluster log and the client sees it hang
OPTION(osd_debug_verify_cached_snaps, OPT_BOOL, false)
OPTION(osd_enable_op_tracker, OPT_BOOL, true) // enable/disable OSD op tracking
OPTION(osd_num_op_tracker_shard, OPT_U32, 32) // The number of shards for holding the ops

View File

@ -1434,11 +1434,14 @@ void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
<< " pg " << m->get_pg()
<< " to osd." << whoami
<< " not " << pg->acting
<< " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch() << "\n";
reply_op_error(op, -ENXIO);
<< " pg " << m->get_pg()
<< " to osd." << whoami
<< " not " << pg->acting
<< " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch()
<< "\n";
if (g_conf->osd_enxio_on_misdirected_op) {
reply_op_error(op, -ENXIO);
}
}
@ -8785,14 +8788,16 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap)
if (!send_map->osd_is_valid_op_target(pgid.pgid, whoami)) {
dout(7) << "we are invalid target" << dendl;
clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
<< " pg " << m->get_pg()
<< " to osd." << whoami
<< " in e" << osdmap->get_epoch()
<< ", client e" << m->get_map_epoch()
<< " pg " << pgid
<< " features " << m->get_connection()->get_features()
<< "\n";
service.reply_op_error(op, -ENXIO);
<< " pg " << m->get_pg()
<< " to osd." << whoami
<< " in e" << osdmap->get_epoch()
<< ", client e" << m->get_map_epoch()
<< " pg " << pgid
<< " features " << m->get_connection()->get_features()
<< "\n";
if (g_conf->osd_enxio_on_misdirected_op) {
service.reply_op_error(op, -ENXIO);
}
return;
}