From 1908c061496817ee6dbc483186a41a9b373be01c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 17 Oct 2017 21:45:45 -0500 Subject: [PATCH] osd/ECBackend: fix on_write ordering w/ sync onreadable callbacks When we call handle_sub_write after a write completion, we may do a sync read completion and then call back into check_ops(). Attaching the on_write events to the op we're applying means that we don't ensure that the on_write event(s) happen before the next write in the queue is submitted (when we call back into check_ops()). For example, if we have op A, on_write event W, then op B, a sync applied completion would mean that we would queue the write for A, call back into SubWriteApplied -> handle_sub_write_reply -> check_ops and then process B... before getting to W. Resolve this by attaching the on_write callback to a separate Op that is placed into the queue, just like any other Op. This keeps the ordering logic clean, although it is a bit ugly with the polymorphism around Op being either an Op or an on_write callback. Signed-off-by: Sage Weil --- src/osd/ECBackend.cc | 53 ++++++++++++++++++++++++++------------------ src/osd/ECBackend.h | 6 ++++- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc index 19de8ad0642..89b928c3a94 100644 --- a/src/osd/ECBackend.cc +++ b/src/osd/ECBackend.cc @@ -1496,14 +1496,26 @@ void ECBackend::submit_transaction( dout(10) << "onreadable_sync: " << op->on_local_applied_sync << dendl; } -void ECBackend::call_write_ordered(std::function &&cb) { - if (!waiting_state.empty()) { - waiting_state.back().on_write.emplace_back(std::move(cb)); - } else if (!waiting_reads.empty()) { - waiting_reads.back().on_write.emplace_back(std::move(cb)); - } else { - // Nothing earlier in the pipeline, just call it +void ECBackend::call_write_ordered(std::function &&cb) +{ + if (waiting_state.empty() && + waiting_reads.empty()) { + dout(10) << __func__ << " sync" << dendl; cb(); + } else { + ceph_tid_t tid = parent->get_tid(); + Op& op = tid_to_op_map[tid]; + op.tid = tid; + op.on_write = std::move(cb); + if (!waiting_state.empty()) { + dout(10) << __func__ << " tid " << tid << " waiting_state" << dendl; + waiting_state.push_back(op); + } else if (!waiting_reads.empty()) { + dout(10) << __func__ << " tid " << tid << " waiting_reads" << dendl; + waiting_reads.push_back(op); + } else { + ceph_abort(); + } } } @@ -1884,6 +1896,12 @@ bool ECBackend::try_reads_to_commit() if (waiting_reads.empty()) return false; Op *op = &(waiting_reads.front()); + if (op->on_write) { + waiting_reads.pop_front(); + op->on_write(); + tid_to_op_map.erase(op->tid); + return true; + } if (op->read_in_progress()) return false; waiting_reads.pop_front(); @@ -2021,21 +2039,14 @@ bool ECBackend::try_reads_to_commit() } } if (should_write_local) { - handle_sub_write( - get_parent()->whoami_shard(), - op->client_op, - local_write_op, - op->trace, - op->on_local_applied_sync); - op->on_local_applied_sync = 0; + handle_sub_write( + get_parent()->whoami_shard(), + op->client_op, + local_write_op, + op->trace, + op->on_local_applied_sync); + op->on_local_applied_sync = 0; } - - for (auto i = op->on_write.begin(); - i != op->on_write.end(); - op->on_write.erase(i++)) { - (*i)(); - } - return true; } diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h index 53852583ae4..a854ef99ffa 100644 --- a/src/osd/ECBackend.h +++ b/src/osd/ECBackend.h @@ -469,7 +469,7 @@ public: map obc_map; /// see call_write_ordered - std::list > on_write; + std::function on_write; /// Generated internally set temp_added; @@ -507,6 +507,10 @@ public: Context *on_local_applied_sync = nullptr; Context *on_all_applied = nullptr; Context *on_all_commit = nullptr; + + Op() {} + Op(ceph_tid_t t, std::function&& cb) + : tid(t), on_write(cb) { } ~Op() { delete on_local_applied_sync; delete on_all_applied;