From 6a02bfef3d44a13589c1a90bec29ff0ac64f97aa Mon Sep 17 00:00:00 2001 From: David Zafman <dzafman@redhat.com> Date: Tue, 3 Oct 2017 18:32:20 -0700 Subject: [PATCH 1/7] osd: Better handle failure to get enough EC shards to backfill Fixes: http://tracker.ceph.com/issues/18162 Signed-off-by: David Zafman <dzafman@redhat.com> --- src/osd/ECBackend.cc | 3 +++ src/osd/PGBackend.h | 6 ++++++ src/osd/PrimaryLogPG.cc | 8 ++++++++ src/osd/PrimaryLogPG.h | 1 + 4 files changed, 18 insertions(+) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 780bdb676b3..52fd2e21fe3 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -217,6 +217,7 @@ void ECBackend::_failed_push(const hobject_t &hoid, dout(10) << __func__ << ": canceling recovery op for obj " << hoid << dendl; assert(recovery_ops.count(hoid)); + eversion_t v = recovery_ops[hoid].v; recovery_ops.erase(hoid); list<pg_shard_t> fl; @@ -224,6 +225,8 @@ void ECBackend::_failed_push(const hobject_t &hoid, fl.push_back(i.first); } get_parent()->failed_push(fl, hoid); + get_parent()->backfill_add_missing(hoid, v); + get_parent()->finish_degraded_object(hoid); } struct OnRecoveryReadComplete : diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index b9604dd2729..b53c615926f 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -106,6 +106,7 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef; const hobject_t oid) = 0; virtual void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) = 0; + virtual void finish_degraded_object(const hobject_t& oid) = 0; virtual void primary_failed(const hobject_t &soid) = 0; virtual bool primary_error(const hobject_t& soid, eversion_t v) = 0; virtual void cancel_pull(const hobject_t &soid) = 0; @@ -122,6 +123,11 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef; eversion_t v ) = 0; + virtual void backfill_add_missing( + const hobject_t &oid, + eversion_t v + ) = 0; + virtual void remove_missing_object(const hobject_t &oid, eversion_t v, Context *on_complete) = 0; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 99a4230da78..a9a6fa23342 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -525,6 +525,14 @@ void PrimaryLogPG::on_primary_error( dout(0) << __func__ << ": oid " << oid << " version " << v << dendl; primary_failed(oid); primary_error(oid, v); + backfill_add_missing(oid, v); +} + +void PrimaryLogPG::backfill_add_missing( + const hobject_t &oid, + eversion_t v) +{ + dout(0) << __func__ << ": oid " << oid << " version " << v << dendl; backfills_in_flight.erase(oid); missing_loc.add_missing(oid, v, eversion_t()); } diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h index 8606fc4f694..fb954178715 100644 --- a/src/osd/PrimaryLogPG.h +++ b/src/osd/PrimaryLogPG.h @@ -277,6 +277,7 @@ public: const hobject_t &soid, const object_stat_sum_t &delta_stats) override; void on_primary_error(const hobject_t &oid, eversion_t v) override; + void backfill_add_missing(const hobject_t &oid, eversion_t v) override; void remove_missing_object(const hobject_t &oid, eversion_t v, Context *on_complete) override; From b9de5eec267627c8bc5ff0759ddea6c4a8aa7bce Mon Sep 17 00:00:00 2001 From: David Zafman <dzafman@redhat.com> Date: Mon, 2 Oct 2017 13:51:17 -0700 Subject: [PATCH 2/7] test: Test case that reproduces tracker 18162 recover_replicas: object added to missing set for backfill, but is not in recovering, error! 
Signed-off-by: David Zafman <dzafman@redhat.com> --- .../erasure-code/test-erasure-eio.sh | 159 ++++++++++++++++++ 1 file changed, 159 insertions(+) diff --git a/qa/standalone/erasure-code/test-erasure-eio.sh b/qa/standalone/erasure-code/test-erasure-eio.sh index 7d26d70a1c0..0cbe6d6443e 100755 --- a/qa/standalone/erasure-code/test-erasure-eio.sh +++ b/qa/standalone/erasure-code/test-erasure-eio.sh @@ -56,6 +56,13 @@ function setup_osds() { grep 'load: jerasure.*lrc' $dir/osd.0.log || return 1 } +function get_state() { + local pgid=$1 + local sname=state + ceph --format json pg dump pgs 2>/dev/null | \ + jq -r ".[] | select(.pgid==\"$pgid\") | .$sname" +} + function create_erasure_coded_pool() { local poolname=$1 shift @@ -408,6 +415,158 @@ function TEST_ec_recovery_errors() { delete_pool $poolname } +# Test backfill with errors present +function TEST_ec_backfill_errors() { + local dir=$1 + local objname=myobject + local lastobj=300 + # Must be between 1 and $lastobj + local testobj=obj250 + + export CEPH_ARGS + CEPH_ARGS+=' --osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10' + setup_osds 5 || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname 3 2 || return 1 + + ceph pg dump pgs + + rados_put $dir $poolname $objname || return 1 + + local -a initial_osds=($(get_osds $poolname $objname)) + local last_osd=${initial_osds[-1]} + kill_daemons $dir TERM osd.${last_osd} 2>&2 < /dev/null || return 1 + ceph osd down ${last_osd} || return 1 + ceph osd out ${last_osd} || return 1 + + ceph pg dump pgs + + dd if=/dev/urandom of=${dir}/ORIGINAL bs=1024 count=4 + for i in $(seq 1 $lastobj) + do + rados --pool $poolname put obj${i} $dir/ORIGINAL || return 1 + done + + inject_eio ec data $poolname $testobj $dir 0 || return 1 + inject_eio ec data $poolname $testobj $dir 1 || return 1 + + run_osd $dir ${last_osd} || return 1 + ceph osd in ${last_osd} || return 1 + + sleep 15 + + while(true); do + state=$(get_state 2.0) + echo $state | grep -v backfilling + if [ "$?" = "0" ]; then + break + fi + echo -n "$state " + done + + ceph pg dump pgs + ceph pg 2.0 list_missing | grep -q $testobj || return 1 + + # Command should hang because object is unfound + timeout 5 rados -p $poolname get $testobj $dir/CHECK + test $? = "124" || return 1 + + ceph pg 2.0 mark_unfound_lost delete + + wait_for_clean || return 1 + + for i in $(seq 1 $lastobj) + do + if [ obj${i} = "$testobj" ]; then + # Doesn't exist anymore + ! 
rados -p $poolname get $testobj $dir/CHECK || return 1 + else + rados --pool $poolname get obj${i} $dir/CHECK || return 1 + diff -q $dir/ORIGINAL $dir/CHECK || return 1 + fi + done + + rm -f ${dir}/ORIGINAL ${dir}/CHECK + + delete_pool $poolname +} + +# Test recovery with errors present +function TEST_ec_recovery_errors() { + local dir=$1 + local objname=myobject + local lastobj=100 + # Must be between 1 and $lastobj + local testobj=obj75 + + setup_osds 5 || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname 3 2 || return 1 + + ceph pg dump pgs + + rados_put $dir $poolname $objname || return 1 + + local -a initial_osds=($(get_osds $poolname $objname)) + local last_osd=${initial_osds[-1]} + kill_daemons $dir TERM osd.${last_osd} 2>&2 < /dev/null || return 1 + ceph osd down ${last_osd} || return 1 + ceph osd out ${last_osd} || return 1 + + ceph pg dump pgs + + dd if=/dev/urandom of=${dir}/ORIGINAL bs=1024 count=4 + for i in $(seq 1 $lastobj) + do + rados --pool $poolname put obj${i} $dir/ORIGINAL || return 1 + done + + inject_eio ec data $poolname $testobj $dir 0 || return 1 + inject_eio ec data $poolname $testobj $dir 1 || return 1 + + run_osd $dir ${last_osd} || return 1 + ceph osd in ${last_osd} || return 1 + + sleep 15 + + while(true); do + state=$(get_state 2.0) + echo $state | grep -v recovering + if [ "$?" = "0" ]; then + break + fi + echo -n "$state " + done + + ceph pg dump pgs + ceph pg 2.0 list_missing | grep -q $testobj || return 1 + + # Command should hang because object is unfound + timeout 5 rados -p $poolname get $testobj $dir/CHECK + test $? = "124" || return 1 + + ceph pg 2.0 mark_unfound_lost delete + + wait_for_clean || return 1 + + for i in $(seq 1 $lastobj) + do + if [ obj${i} = "$testobj" ]; then + # Doesn't exist anymore + ! rados -p $poolname get $testobj $dir/CHECK || return 1 + else + rados --pool $poolname get obj${i} $dir/CHECK || return 1 + diff -q $dir/ORIGINAL $dir/CHECK || return 1 + fi + done + + rm -f ${dir}/ORIGINAL ${dir}/CHECK + + delete_pool $poolname +} + main test-erasure-eio "$@" # Local Variables: From bb2bcb95f51abc206e005e44ef383ee45b8f2209 Mon Sep 17 00:00:00 2001 From: David Zafman <dzafman@redhat.com> Date: Mon, 9 Oct 2017 08:17:29 -0700 Subject: [PATCH 3/7] osd: Add new UnfoundBackfill and UnfoundRecovery pg transitions Signed-off-by: David Zafman <dzafman@redhat.com> --- .../erasure-code/test-erasure-eio.sh | 22 +++++---- src/osd/PG.cc | 47 ++++++++++++++++++- src/osd/PG.h | 43 +++++++++++++++-- 3 files changed, 97 insertions(+), 15 deletions(-) diff --git a/qa/standalone/erasure-code/test-erasure-eio.sh b/qa/standalone/erasure-code/test-erasure-eio.sh index 0cbe6d6443e..8404f7e000b 100755 --- a/qa/standalone/erasure-code/test-erasure-eio.sh +++ b/qa/standalone/erasure-code/test-erasure-eio.sh @@ -415,8 +415,8 @@ function TEST_ec_recovery_errors() { delete_pool $poolname } -# Test backfill with errors present -function TEST_ec_backfill_errors() { +# Test backfill with unfound object +function TEST_ec_backfill_unfound() { local dir=$1 local objname=myobject local lastobj=300 @@ -456,13 +456,14 @@ function TEST_ec_backfill_errors() { sleep 15 - while(true); do + for tmp in $(seq 1 100); do state=$(get_state 2.0) - echo $state | grep -v backfilling + echo $state | grep backfill_unfound if [ "$?" 
= "0" ]; then break fi - echo -n "$state " + echo $state + sleep 1 done ceph pg dump pgs @@ -492,8 +493,8 @@ function TEST_ec_backfill_errors() { delete_pool $poolname } -# Test recovery with errors present -function TEST_ec_recovery_errors() { +# Test recovery with unfound object +function TEST_ec_recovery_unfound() { local dir=$1 local objname=myobject local lastobj=100 @@ -531,13 +532,14 @@ function TEST_ec_recovery_errors() { sleep 15 - while(true); do + for tmp in $(seq 1 100); do state=$(get_state 2.0) - echo $state | grep -v recovering + echo $state | grep recovery_unfound if [ "$?" = "0" ]; then break fi - echo -n "$state " + echo "$state " + sleep 1 done ceph pg dump pgs diff --git a/src/osd/PG.cc b/src/osd/PG.cc index da44df85415..831ea5fbc22 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -5775,7 +5775,7 @@ void PG::find_unfound(epoch_t queued, RecoveryCtx *rctx) new PG::CephPeeringEvt( queued, queued, - PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval))); + PG::UnfoundBackfill())); queue_peering_event(evt); action = "in backfill"; } else if (state_test(PG_STATE_RECOVERING)) { @@ -5783,7 +5783,7 @@ void PG::find_unfound(epoch_t queued, RecoveryCtx *rctx) new PG::CephPeeringEvt( queued, queued, - PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval))); + PG::UnfoundRecovery())); queue_peering_event(evt); action = "in recovery"; } else { @@ -6358,6 +6358,36 @@ PG::RecoveryState::Backfilling::react(const DeferBackfill &c) return transit<NotBackfilling>(); } +boost::statechart::result +PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl; + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + + pg->state_clear(PG_STATE_BACKFILLING); + + for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin(); + it != pg->backfill_targets.end(); + ++it) { + assert(*it != pg->pg_whoami); + ConnectionRef con = pg->osd->get_con_osd_cluster( + it->osd, pg->get_osdmap()->get_epoch()); + if (con) { + pg->osd->send_message_osd_cluster( + new MBackfillReserve( + MBackfillReserve::CANCEL, + spg_t(pg->info.pgid.pgid, it->shard), + pg->get_osdmap()->get_epoch()), + con.get()); + } + } + + pg->waiting_on_backfill.clear(); + + return transit<NotBackfilling>(); +} + boost::statechart::result PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &) { @@ -6537,6 +6567,7 @@ void PG::RecoveryState::NotBackfilling::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); PG *pg = context< RecoveryMachine >().pg; + pg->state_clear(PG_STATE_UNFOUND); utime_t dur = ceph_clock_now() - enter_time; pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur); } @@ -6555,6 +6586,7 @@ void PG::RecoveryState::NotRecovering::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); PG *pg = context< RecoveryMachine >().pg; + pg->state_clear(PG_STATE_UNFOUND); utime_t dur = ceph_clock_now() - enter_time; pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur); } @@ -6929,6 +6961,17 @@ PG::RecoveryState::Recovering::react(const DeferRecovery &evt) return transit<NotRecovering>(); } +boost::statechart::result +PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt) +{ + PG *pg = context< RecoveryMachine >().pg; + ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl; + pg->state_clear(PG_STATE_RECOVERING); + pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + 
release_reservations(true); + return transit<NotRecovering>(); +} + void PG::RecoveryState::Recovering::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); diff --git a/src/osd/PG.h b/src/osd/PG.h index 2692117714c..ce67cc1d960 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1815,6 +1815,18 @@ public: *out << "DeferRecovery: delay " << delay; } }; + struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> { + explicit UnfoundBackfill() {} + void print(std::ostream *out) const { + *out << "UnfoundBackfill"; + } + }; + struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> { + explicit UnfoundRecovery() {} + void print(std::ostream *out) const { + *out << "UnfoundRecovery"; + } + }; protected: TrivialEvent(Initialize) TrivialEvent(Load) @@ -2101,7 +2113,9 @@ protected: boost::statechart::custom_reaction< Backfilled >, boost::statechart::custom_reaction< AllReplicasActivated >, boost::statechart::custom_reaction< DeferRecovery >, - boost::statechart::custom_reaction< DeferBackfill > + boost::statechart::custom_reaction< DeferBackfill >, + boost::statechart::custom_reaction< UnfoundRecovery >, + boost::statechart::custom_reaction< UnfoundBackfill > > reactions; boost::statechart::result react(const QueryState& q); boost::statechart::result react(const ActMap&); @@ -2119,6 +2133,12 @@ protected: boost::statechart::result react(const DeferBackfill& evt) { return discard_event(); } + boost::statechart::result react(const UnfoundRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundBackfill& evt) { + return discard_event(); + } }; struct Clean : boost::statechart::state< Clean, Active >, NamedState { @@ -2147,11 +2167,13 @@ protected: typedef boost::mpl::list< boost::statechart::transition< Backfilled, Recovered >, boost::statechart::custom_reaction< DeferBackfill >, + boost::statechart::custom_reaction< UnfoundBackfill >, boost::statechart::custom_reaction< RemoteReservationRejected > > reactions; explicit Backfilling(my_context ctx); boost::statechart::result react(const RemoteReservationRejected& evt); boost::statechart::result react(const DeferBackfill& evt); + boost::statechart::result react(const UnfoundBackfill& evt); void exit(); }; @@ -2191,13 +2213,18 @@ protected: struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState { typedef boost::mpl::list< boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >, - boost::statechart::custom_reaction< DeferRecovery > + boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< UnfoundRecovery > > reactions; explicit NotRecovering(my_context ctx); boost::statechart::result react(const DeferRecovery& evt) { /* no-op */ return discard_event(); } + boost::statechart::result react(const UnfoundRecovery& evt) { + /* no-op */ + return discard_event(); + } void exit(); }; @@ -2214,7 +2241,9 @@ protected: boost::statechart::custom_reaction< MLogRec >, boost::statechart::custom_reaction< Activate >, boost::statechart::custom_reaction< DeferRecovery >, - boost::statechart::custom_reaction< DeferBackfill > + boost::statechart::custom_reaction< DeferBackfill >, + boost::statechart::custom_reaction< UnfoundRecovery >, + boost::statechart::custom_reaction< UnfoundBackfill > > reactions; boost::statechart::result react(const QueryState& q); boost::statechart::result react(const MInfoRec& infoevt); @@ -2228,6 +2257,12 @@ protected: boost::statechart::result react(const DeferBackfill& evt) { return 
discard_event(); } + boost::statechart::result react(const UnfoundRecovery& evt) { + return discard_event(); + } + boost::statechart::result react(const UnfoundBackfill& evt) { + return discard_event(); + } }; struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState { @@ -2295,6 +2330,7 @@ protected: typedef boost::mpl::list < boost::statechart::custom_reaction< AllReplicasRecovered >, boost::statechart::custom_reaction< DeferRecovery >, + boost::statechart::custom_reaction< UnfoundRecovery >, boost::statechart::custom_reaction< RequestBackfill > > reactions; explicit Recovering(my_context ctx); @@ -2302,6 +2338,7 @@ protected: void release_reservations(bool cancel = false); boost::statechart::result react(const AllReplicasRecovered &evt); boost::statechart::result react(const DeferRecovery& evt); + boost::statechart::result react(const UnfoundRecovery& evt); boost::statechart::result react(const RequestBackfill &evt); }; From 7f8b0ce9e681f727d8217e3ed74a1a3355f364f3 Mon Sep 17 00:00:00 2001 From: David Zafman <dzafman@redhat.com> Date: Mon, 9 Oct 2017 08:19:21 -0700 Subject: [PATCH 4/7] osd, mon: Add new pg states recovery_unfound and backfill_unfound Signed-off-by: David Zafman <dzafman@redhat.com> --- src/mon/PGMap.cc | 2 ++ src/osd/PG.cc | 6 ++++-- src/osd/PG.h | 2 ++ src/osd/osd_types.cc | 8 ++++++++ src/osd/osd_types.h | 4 ++-- 5 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 0b6d3480497..206983b54f8 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -2134,6 +2134,8 @@ void PGMap::get_health_checks( { PG_STATE_INCOMPLETE, {UNAVAILABLE, {}} }, { PG_STATE_REPAIR, {DAMAGED, {}} }, { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} }, + { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} }, + { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} }, { PG_STATE_BACKFILL_TOOFULL, {DEGRADED_FULL, {}} }, { PG_STATE_RECOVERY_TOOFULL, {DEGRADED_FULL, {}} }, { PG_STATE_DEGRADED, {DEGRADED, {}} }, diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 831ea5fbc22..325290d8ab5 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -6365,6 +6365,7 @@ PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c) ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl; pg->osd->local_reserver.cancel_reservation(pg->info.pgid); + pg->state_set(PG_STATE_BACKFILL_UNFOUND); pg->state_clear(PG_STATE_BACKFILLING); for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin(); @@ -6567,7 +6568,7 @@ void PG::RecoveryState::NotBackfilling::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); PG *pg = context< RecoveryMachine >().pg; - pg->state_clear(PG_STATE_UNFOUND); + pg->state_clear(PG_STATE_BACKFILL_UNFOUND); utime_t dur = ceph_clock_now() - enter_time; pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur); } @@ -6586,7 +6587,7 @@ void PG::RecoveryState::NotRecovering::exit() { context< RecoveryMachine >().log_exit(state_name, enter_time); PG *pg = context< RecoveryMachine >().pg; - pg->state_clear(PG_STATE_UNFOUND); + pg->state_clear(PG_STATE_RECOVERY_UNFOUND); utime_t dur = ceph_clock_now() - enter_time; pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur); } @@ -6966,6 +6967,7 @@ PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt) { PG *pg = context< RecoveryMachine >().pg; ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl; + pg->state_set(PG_STATE_RECOVERY_UNFOUND); pg->state_clear(PG_STATE_RECOVERING); 
pg->osd->local_reserver.cancel_reservation(pg->info.pgid); release_reservations(true); diff --git a/src/osd/PG.h b/src/osd/PG.h index ce67cc1d960..40e7c7c3be5 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -2613,6 +2613,8 @@ protected: bool is_activating() const { return state_test(PG_STATE_ACTIVATING); } bool is_peering() const { return state_test(PG_STATE_PEERING); } bool is_down() const { return state_test(PG_STATE_DOWN); } + bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); } + bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); } bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); } bool is_clean() const { return state_test(PG_STATE_CLEAN); } bool is_degraded() const { return state_test(PG_STATE_DEGRADED); } diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index ceced3cb40b..5a53342b3af 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -811,6 +811,10 @@ std::string pg_state_string(uint64_t state) oss << "forced_recovery+"; if (state & PG_STATE_DOWN) oss << "down+"; + if (state & PG_STATE_RECOVERY_UNFOUND) + oss << "recovery_unfound+"; + if (state & PG_STATE_BACKFILL_UNFOUND) + oss << "backfill_unfound+"; if (state & PG_STATE_UNDERSIZED) oss << "undersized+"; if (state & PG_STATE_DEGRADED) @@ -862,6 +866,10 @@ boost::optional<uint64_t> pg_string_state(const std::string& state) type = PG_STATE_CLEAN; else if (state == "down") type = PG_STATE_DOWN; + else if (state == "recovery_unfound") + type = PG_STATE_RECOVERY_UNFOUND; + else if (state == "backfill_unfound") + type = PG_STATE_BACKFILL_UNFOUND; else if (state == "scrubbing") type = PG_STATE_SCRUBBING; else if (state == "degraded") diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 33583e21639..acb9c89e5bd 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -978,8 +978,8 @@ inline ostream& operator<<(ostream& out, const osd_stat_t& s) { #define PG_STATE_ACTIVE (1ULL << 1) // i am active. (primary: replicas too) #define PG_STATE_CLEAN (1ULL << 2) // peers are complete, clean of stray replicas. #define PG_STATE_DOWN (1ULL << 4) // a needed replica is down, PG offline -//#define PG_STATE_REPLAY (1ULL << 5) // crashed, waiting for replay -//#define PG_STATE_STRAY (1ULL << 6) // i must notify the primary i exist. 
+#define PG_STATE_RECOVERY_UNFOUND (1ULL << 5) // recovery stopped due to unfound +#define PG_STATE_BACKFILL_UNFOUND (1ULL << 6) // backfill stopped due to unfound //#define PG_STATE_SPLITTING (1ULL << 7) // i am splitting #define PG_STATE_SCRUBBING (1ULL << 8) // scrubbing //#define PG_STATE_SCRUBQ (1ULL << 9) // queued for scrub From 689bff354a6bfa1cf47b0d0a04fa9f1b2ef68f75 Mon Sep 17 00:00:00 2001 From: David Zafman <dzafman@redhat.com> Date: Mon, 9 Oct 2017 14:03:23 -0700 Subject: [PATCH 5/7] osd: Better recovery/backfill restart for mark_unfound_lost delete/revert Signed-off-by: David Zafman <dzafman@redhat.com> --- src/osd/PrimaryLogPG.cc | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index a9a6fa23342..fc04412dcf8 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -702,7 +702,7 @@ void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef o void PrimaryLogPG::maybe_force_recovery() { - // no force if not in degraded/recovery/backfill stats + // no force if not in degraded/recovery/backfill states if (!is_degraded() && !state_test(PG_STATE_RECOVERING | PG_STATE_RECOVERY_WAIT | @@ -2314,6 +2314,7 @@ void PrimaryLogPG::do_op(OpRequestRef& op) // force recovery of the oldest missing object if too many logs maybe_force_recovery(); } + PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail( OpRequestRef op, bool write_ordered, @@ -10580,7 +10581,23 @@ void PrimaryLogPG::mark_all_unfound_lost( release_backoffs(p.first); } requeue_object_waiters(waiting_for_unreadable_object); - queue_recovery(); + if (is_recovery_unfound()) { + queue_peering_event( + CephPeeringEvtRef( + std::make_shared<CephPeeringEvt>( + get_osdmap()->get_epoch(), + get_osdmap()->get_epoch(), + DoRecovery()))); + } else if (is_backfill_unfound()) { + queue_peering_event( + CephPeeringEvtRef( + std::make_shared<CephPeeringEvt>( + get_osdmap()->get_epoch(), + get_osdmap()->get_epoch(), + RequestBackfill()))); + } else { + queue_recovery(); + } stringstream ss; ss << "pg has " << num_unfound From c2572bee3c6256419e0b265e9e2829e7f3afb76d Mon Sep 17 00:00:00 2001 From: David Zafman <dzafman@redhat.com> Date: Mon, 9 Oct 2017 14:15:51 -0700 Subject: [PATCH 6/7] test: Add replicated recovery/backfill test Signed-off-by: David Zafman <dzafman@redhat.com> --- qa/standalone/osd/osd-rep-recov-eio.sh | 305 +++++++++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100755 qa/standalone/osd/osd-rep-recov-eio.sh diff --git a/qa/standalone/osd/osd-rep-recov-eio.sh b/qa/standalone/osd/osd-rep-recov-eio.sh new file mode 100755 index 00000000000..d0fe538363c --- /dev/null +++ b/qa/standalone/osd/osd-rep-recov-eio.sh @@ -0,0 +1,305 @@ +#!/usr/bin/env bash +# +# Copyright (C) 2017 Red Hat <contact@redhat.com> +# +# +# Author: Kefu Chai <kchai@redhat.com> +# Author: David Zafman <dzafman@redhat.com> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
+# + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7140" # git grep '\<7140\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + run_mon $dir a || return 1 + run_mgr $dir x || return 1 + create_rbd_pool || return 1 + + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function setup_osds() { + local count=$1 + shift + + for id in $(seq 0 $(expr $count - 1)) ; do + run_osd $dir $id || return 1 + done + wait_for_clean || return 1 +} + +function get_state() { + local pgid=$1 + local sname=state + ceph --format json pg dump pgs 2>/dev/null | \ + jq -r ".[] | select(.pgid==\"$pgid\") | .$sname" +} + +function delete_pool() { + local poolname=$1 + + ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it +} + +function rados_put() { + local dir=$1 + local poolname=$2 + local objname=${3:-SOMETHING} + + for marker in AAA BBB CCCC DDDD ; do + printf "%*s" 1024 $marker + done > $dir/ORIGINAL + # + # get and put an object, compare they are equal + # + rados --pool $poolname put $objname $dir/ORIGINAL || return 1 +} + +function rados_get() { + local dir=$1 + local poolname=$2 + local objname=${3:-SOMETHING} + local expect=${4:-ok} + + # + # Expect a failure to get object + # + if [ $expect = "fail" ]; + then + ! rados --pool $poolname get $objname $dir/COPY + return + fi + # + # Expect hang trying to get object + # + if [ $expect = "hang" ]; + then + timeout 5 rados --pool $poolname get $objname $dir/COPY + test "$?" = "124" + return + fi + # + # get an object, compare with $dir/ORIGINAL + # + rados --pool $poolname get $objname $dir/COPY || return 1 + diff $dir/ORIGINAL $dir/COPY || return 1 + rm $dir/COPY +} + +function rados_get_data() { + local inject=$1 + shift + local dir=$1 + + local poolname=pool-rep + local objname=obj-$inject-$$ + rados_put $dir $poolname $objname || return 1 + inject_$inject rep data $poolname $objname $dir 0 || return 1 + rados_get $dir $poolname $objname || return 1 + + inject_$inject rep data $poolname $objname $dir 0 || return 1 + inject_$inject rep data $poolname $objname $dir 1 || return 1 + rados_get $dir $poolname $objname || return 1 + + inject_$inject rep data $poolname $objname $dir 0 || return 1 + inject_$inject rep data $poolname $objname $dir 1 || return 1 + inject_$inject rep data $poolname $objname $dir 2 || return 1 + rados_get $dir $poolname $objname hang || return 1 +} + +function TEST_rados_get_with_eio() { + local dir=$1 + + setup_osds 4 || return 1 + + local poolname=pool-rep + create_pool $poolname 1 1 || return 1 + wait_for_clean || return 1 + rados_get_data eio $dir || return 1 + + delete_pool $poolname +} + +# Test backfill with unfound object +function TEST_rep_backfill_unfound() { + local dir=$1 + local objname=myobject + local lastobj=300 + # Must be between 1 and $lastobj + local testobj=obj250 + + export CEPH_ARGS + CEPH_ARGS+=' --osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10' + setup_osds 3 || return 1 + + local poolname=test-pool + create_pool $poolname 1 1 || return 1 + wait_for_clean || return 1 + + ceph pg dump pgs + + rados_put $dir $poolname $objname || return 1 + + local -a initial_osds=($(get_osds $poolname $objname)) + local last_osd=${initial_osds[-1]} + kill_daemons $dir TERM osd.${last_osd} 2>&2 < /dev/null 
|| return 1 + ceph osd down ${last_osd} || return 1 + ceph osd out ${last_osd} || return 1 + + ceph pg dump pgs + + dd if=/dev/urandom of=${dir}/ORIGINAL bs=1024 count=4 + for i in $(seq 1 $lastobj) + do + rados --pool $poolname put obj${i} $dir/ORIGINAL || return 1 + done + + inject_eio rep data $poolname $testobj $dir 0 || return 1 + inject_eio rep data $poolname $testobj $dir 1 || return 1 + + run_osd $dir ${last_osd} || return 1 + ceph osd in ${last_osd} || return 1 + + sleep 15 + + for tmp in $(seq 1 100); do + state=$(get_state 2.0) + echo $state | grep backfill_unfound + if [ "$?" = "0" ]; then + break + fi + echo "$state " + sleep 1 + done + + ceph pg dump pgs + ceph pg 2.0 list_missing | grep -q $testobj || return 1 + + # Command should hang because object is unfound + timeout 5 rados -p $poolname get $testobj $dir/CHECK + test $? = "124" || return 1 + + ceph pg 2.0 mark_unfound_lost delete + + wait_for_clean || return 1 + + for i in $(seq 1 $lastobj) + do + if [ obj${i} = "$testobj" ]; then + # Doesn't exist anymore + ! rados -p $poolname get $testobj $dir/CHECK || return 1 + else + rados --pool $poolname get obj${i} $dir/CHECK || return 1 + diff -q $dir/ORIGINAL $dir/CHECK || return 1 + fi + done + + rm -f ${dir}/ORIGINAL ${dir}/CHECK + + delete_pool $poolname +} + +# Test recovery with unfound object +function TEST_rep_recovery_unfound() { + local dir=$1 + local objname=myobject + local lastobj=100 + # Must be between 1 and $lastobj + local testobj=obj75 + + setup_osds 3 || return 1 + + local poolname=test-pool + create_pool $poolname 1 1 || return 1 + wait_for_clean || return 1 + + ceph pg dump pgs + + rados_put $dir $poolname $objname || return 1 + + local -a initial_osds=($(get_osds $poolname $objname)) + local last_osd=${initial_osds[-1]} + kill_daemons $dir TERM osd.${last_osd} 2>&2 < /dev/null || return 1 + ceph osd down ${last_osd} || return 1 + ceph osd out ${last_osd} || return 1 + + ceph pg dump pgs + + dd if=/dev/urandom of=${dir}/ORIGINAL bs=1024 count=4 + for i in $(seq 1 $lastobj) + do + rados --pool $poolname put obj${i} $dir/ORIGINAL || return 1 + done + + inject_eio rep data $poolname $testobj $dir 0 || return 1 + inject_eio rep data $poolname $testobj $dir 1 || return 1 + + run_osd $dir ${last_osd} || return 1 + ceph osd in ${last_osd} || return 1 + + sleep 15 + + for tmp in $(seq 1 100); do + state=$(get_state 2.0) + echo $state | grep -v recovering + if [ "$?" = "0" ]; then + break + fi + echo "$state " + sleep 1 + done + + ceph pg dump pgs + ceph pg 2.0 list_missing | grep -q $testobj || return 1 + + # Command should hang because object is unfound + timeout 5 rados -p $poolname get $testobj $dir/CHECK + test $? = "124" || return 1 + + ceph pg 2.0 mark_unfound_lost delete + + wait_for_clean || return 1 + + for i in $(seq 1 $lastobj) + do + if [ obj${i} = "$testobj" ]; then + # Doesn't exist anymore + ! 
rados -p $poolname get $testobj $dir/CHECK || return 1 + else + rados --pool $poolname get obj${i} $dir/CHECK || return 1 + diff -q $dir/ORIGINAL $dir/CHECK || return 1 + fi + done + + rm -f ${dir}/ORIGINAL ${dir}/CHECK + + delete_pool $poolname +} + +main osd-rep-recov-eio.sh "$@" + +# Local Variables: +# compile-command: "cd ../../../build ; make -j4 && ../qa/run-standalone.sh osd-rep-recov-eio.sh" +# End: From 69b5fc54feb59f8b0a26a3ca3e925980c91b5b75 Mon Sep 17 00:00:00 2001 From: David Zafman <dzafman@redhat.com> Date: Wed, 11 Oct 2017 20:16:44 -0700 Subject: [PATCH 7/7] test: Cleanup test-erasure-eio.sh code Signed-off-by: David Zafman <dzafman@redhat.com> --- .../erasure-code/test-erasure-eio.sh | 93 ++++++------------- 1 file changed, 29 insertions(+), 64 deletions(-) diff --git a/qa/standalone/erasure-code/test-erasure-eio.sh b/qa/standalone/erasure-code/test-erasure-eio.sh index 8404f7e000b..13b64f1238b 100755 --- a/qa/standalone/erasure-code/test-erasure-eio.sh +++ b/qa/standalone/erasure-code/test-erasure-eio.sh @@ -123,39 +123,6 @@ function rados_get() { rm $dir/COPY } -function rados_put_get() { - local dir=$1 - local poolname=$2 - local objname=${3:-SOMETHING} - local recovery=$4 - - # - # get and put an object, compare they are equal - # - rados_put $dir $poolname $objname || return 1 - # We can read even though caller injected read error on one of the shards - rados_get $dir $poolname $objname || return 1 - - if [ -n "$recovery" ]; - then - # - # take out the last OSD used to store the object, - # bring it back, and check for clean PGs which means - # recovery didn't crash the primary. - # - local -a initial_osds=($(get_osds $poolname $objname)) - local last_osd=${initial_osds[-1]} - # Kill OSD - kill_daemons $dir TERM osd.${last_osd} >&2 < /dev/null || return 1 - ceph osd out ${last_osd} || return 1 - ! get_osds $poolname $objname | grep '\<'${last_osd}'\>' || return 1 - ceph osd in ${last_osd} || return 1 - run_osd $dir ${last_osd} || return 1 - wait_for_clean || return 1 - fi - - rm $dir/ORIGINAL -} function inject_remove() { local pooltype=$1 @@ -176,33 +143,15 @@ function inject_remove() { objectstore_tool $dir $osd_id $objname remove || return 1 } -function rados_get_data_recovery() { - local inject=$1 - shift - local dir=$1 - shift - local shard_id=$1 - - # inject eio to speificied shard - # - local poolname=pool-jerasure - local objname=obj-$inject-$$-$shard_id - inject_$inject ec data $poolname $objname $dir $shard_id || return 1 - rados_put_get $dir $poolname $objname recovery || return 1 - - shard_id=$(expr $shard_id + 1) - inject_$inject ec data $poolname $objname $dir $shard_id || return 1 - # Now 2 out of 3 shards get EIO, so should fail - rados_get $dir $poolname $objname fail || return 1 -} - # Test with an inject error -function rados_get_data() { +function rados_put_get_data() { local inject=$1 shift local dir=$1 shift local shard_id=$1 + shift + local arg=$1 # inject eio to speificied shard # @@ -212,10 +161,29 @@ function rados_get_data() { inject_$inject ec data $poolname $objname $dir $shard_id || return 1 rados_get $dir $poolname $objname || return 1 + if [ "$arg" = "recovery" ]; + then + # + # take out the last OSD used to store the object, + # bring it back, and check for clean PGs which means + # recovery didn't crash the primary. 
+ # + local -a initial_osds=($(get_osds $poolname $objname)) + local last_osd=${initial_osds[-1]} + # Kill OSD + kill_daemons $dir TERM osd.${last_osd} >&2 < /dev/null || return 1 + ceph osd out ${last_osd} || return 1 + ! get_osds $poolname $objname | grep '\<'${last_osd}'\>' || return 1 + ceph osd in ${last_osd} || return 1 + run_osd $dir ${last_osd} || return 1 + wait_for_clean || return 1 + fi + shard_id=$(expr $shard_id + 1) inject_$inject ec data $poolname $objname $dir $shard_id || return 1 # Now 2 out of 3 shards get an error, so should fail rados_get $dir $poolname $objname fail || return 1 + rm $dir/ORIGINAL } # Change the size of speificied shard @@ -273,6 +241,7 @@ function rados_get_data_bad_size() { shard_id=$(expr $shard_id + 1) set_size $objname $dir $shard_id $bytes $mode || return 1 rados_get $dir $poolname $objname fail || return 1 + rm $dir/ORIGINAL } # @@ -290,7 +259,7 @@ function TEST_rados_get_subread_eio_shard_0() { create_erasure_coded_pool $poolname 2 1 || return 1 # inject eio on primary OSD (0) and replica OSD (1) local shard_id=0 - rados_get_data eio $dir $shard_id || return 1 + rados_put_get_data eio $dir $shard_id || return 1 delete_pool $poolname } @@ -302,7 +271,7 @@ function TEST_rados_get_subread_eio_shard_1() { create_erasure_coded_pool $poolname 2 1 || return 1 # inject eio into replicas OSD (1) and OSD (2) local shard_id=1 - rados_get_data eio $dir $shard_id || return 1 + rados_put_get_data eio $dir $shard_id || return 1 delete_pool $poolname } @@ -317,7 +286,7 @@ function TEST_rados_get_subread_missing() { create_erasure_coded_pool $poolname 2 1 || return 1 # inject remove into replicas OSD (1) and OSD (2) local shard_id=1 - rados_get_data remove $dir $shard_id || return 1 + rados_put_get_data remove $dir $shard_id || return 1 delete_pool $poolname } @@ -366,23 +335,21 @@ function TEST_rados_get_with_subreadall_eio_shard_0() { local poolname=pool-jerasure create_erasure_coded_pool $poolname 2 1 || return 1 # inject eio on primary OSD (0) - local shard_id=0 - rados_get_data_recovery eio $dir $shard_id || return 1 + rados_put_get_data eio $dir $shard_id recovery || return 1 delete_pool $poolname } function TEST_rados_get_with_subreadall_eio_shard_1() { local dir=$1 - local shard_id=0 + local shard_id=1 setup_osds 4 || return 1 local poolname=pool-jerasure create_erasure_coded_pool $poolname 2 1 || return 1 # inject eio on replica OSD (1) - local shard_id=1 - rados_get_data_recovery eio $dir $shard_id || return 1 + rados_put_get_data eio $dir $shard_id recovery || return 1 delete_pool $poolname } @@ -410,8 +377,6 @@ function TEST_ec_recovery_errors() { # Cluster should recover this object wait_for_clean || return 1 - #rados_get_data_recovery eio $dir $shard_id || return 1 - delete_pool $poolname }
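
[Reviewer note, not part of the patch series]

Taken together, these patches stop a PG from endlessly retrying recovery or backfill when objects are unfound: patch 3 adds the UnfoundRecovery/UnfoundBackfill peering events, patch 4 surfaces them as the recovery_unfound and backfill_unfound PG states (mapped to DAMAGED in the PGMap health checks), and patch 5 makes mark_unfound_lost requeue DoRecovery or RequestBackfill so the PG resumes once the operator resolves the unfound objects. Below is a rough operator-level sketch, in bash, of the flow the standalone tests exercise; the pgid 2.0 is illustrative (taken from the test scripts) and the jq filter mirrors the get_state() helper added in patch 2.

    # Wait for the PG to park in one of the new unfound states instead of
    # spinning in recovering/backfilling. pgid 2.0 is illustrative, as in
    # the standalone tests.
    pgid=2.0
    while sleep 1; do
        state=$(ceph --format json pg dump pgs 2>/dev/null | \
                jq -r ".[] | select(.pgid==\"$pgid\") | .state")
        echo "$state"
        echo "$state" | grep -Eq 'recovery_unfound|backfill_unfound' && break
    done

    # Inspect the unfound objects, then give up on them. With patch 5 this
    # queues DoRecovery or RequestBackfill rather than a bare queue_recovery(),
    # so recovery/backfill restarts and the PG can return to active+clean.
    ceph pg "$pgid" list_missing
    ceph pg "$pgid" mark_unfound_lost delete    # or: revert

These commands mirror what TEST_ec_backfill_unfound and TEST_rep_recovery_unfound do; on a real cluster the pgid would be looked up (for example with ceph pg ls) rather than hard-coded.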