Merge PR #28229 into master

* refs/pull/28229/head:
	os/bluestore: do garbage collection if blob count is too high.
	common/perf_conters: make dump_formatted_xxx funcs as const.
	os/bluestore: store extents for GC within WriteContext.
	os/bluestore: GC class, make some members local.
	os/bluestore: vector -> interval set in GC to track extents to collect.
	tests/store_test: many-many spanning blobs test case

Reviewed-by: Sage Weil <sage@redhat.com>
This commit is contained in:
Sage Weil 2019-06-18 10:02:40 -05:00
commit d4719f508a
6 changed files with 189 additions and 58 deletions

View File

@ -126,7 +126,7 @@ void PerfCountersCollectionImpl::dump_formatted_generic(
bool schema,
bool histograms,
const std::string &logger,
const std::string &counter)
const std::string &counter) const
{
f->open_object_section("perfcounter_collection");
@ -350,7 +350,7 @@ void PerfCounters::reset()
}
void PerfCounters::dump_formatted_generic(Formatter *f, bool schema,
bool histograms, const std::string &counter)
bool histograms, const std::string &counter) const
{
f->open_object_section(m_name.c_str());

View File

@ -243,11 +243,11 @@ public:
void reset();
void dump_formatted(ceph::Formatter *f, bool schema,
const std::string &counter = "") {
const std::string &counter = "") const {
dump_formatted_generic(f, schema, false, counter);
}
void dump_formatted_histograms(ceph::Formatter *f, bool schema,
const std::string &counter = "") {
const std::string &counter = "") const {
dump_formatted_generic(f, schema, true, counter);
}
std::pair<uint64_t, uint64_t> get_tavg_ns(int idx) const;
@ -274,7 +274,7 @@ private:
PerfCounters(const PerfCounters &rhs);
PerfCounters& operator=(const PerfCounters &rhs);
void dump_formatted_generic(ceph::Formatter *f, bool schema, bool histograms,
const std::string &counter = "");
const std::string &counter = "") const;
typedef std::vector<perf_counter_data_any_d> perf_counter_data_vec_t;
@ -321,13 +321,13 @@ public:
void dump_formatted(ceph::Formatter *f, bool schema,
const std::string &logger = "",
const std::string &counter = "") {
const std::string &counter = "") const {
dump_formatted_generic(f, schema, false, logger, counter);
}
void dump_formatted_histograms(ceph::Formatter *f, bool schema,
const std::string &logger = "",
const std::string &counter = "") {
const std::string &counter = "") const {
dump_formatted_generic(f, schema, true, logger, counter);
}
@ -348,7 +348,7 @@ public:
private:
void dump_formatted_generic(ceph::Formatter *f, bool schema, bool histograms,
const std::string &logger = "",
const std::string &counter = "");
const std::string &counter = "") const;
perf_counters_set_t m_loggers;

View File

@ -18,6 +18,8 @@
#include <sys/stat.h>
#include <fcntl.h>
#include <boost/container/flat_set.hpp>
#include "include/cpp-btree/btree_set.h"
#include "BlueStore.h"
@ -813,7 +815,7 @@ void BlueStore::GarbageCollector::process_protrusive_extents(
bool bExit = false;
do {
if (it->blob.get() == b) {
extents_to_collect.emplace_back(it->logical_offset, it->length);
extents_to_collect.insert(it->logical_offset, it->length);
}
bExit = it == bi.last_lextent;
++it;
@ -839,8 +841,8 @@ int64_t BlueStore::GarbageCollector::estimate(
used_alloc_unit = boost::optional<uint64_t >();
blob_info_counted = nullptr;
gc_start_offset = start_offset;
gc_end_offset = start_offset + length;
uint64_t gc_start_offset = start_offset;
uint64_t gc_end_offset = start_offset + length;
uint64_t end_offset = start_offset + length;
@ -11579,12 +11581,29 @@ void BlueStore::_do_write_small(
prev_ep = end; // to avoid this extent check as it's a duplicate
}
boost::container::flat_set<const bluestore_blob_t*> inspected_blobs;
// We don't want to have more blobs than min alloc units fit
// into 2 max blobs
size_t blob_threshold = max_blob_size / min_alloc_size * 2 + 1;
bool above_blob_threshold = false;
inspected_blobs.reserve(blob_threshold);
uint64_t max_off = 0;
auto start_ep = ep;
auto end_ep = ep; // exclusively
do {
any_change = false;
if (ep != end && ep->logical_offset < offset + max_bsize) {
BlobRef b = ep->blob;
if (!above_blob_threshold) {
inspected_blobs.insert(&b->get_blob());
above_blob_threshold = inspected_blobs.size() >= blob_threshold;
}
max_off = ep->logical_end();
auto bstart = ep->blob_start();
dout(20) << __func__ << " considering " << *b
<< " bstart 0x" << std::hex << bstart << std::dec << dendl;
if (bstart >= end_offs) {
@ -11783,12 +11802,18 @@ void BlueStore::_do_write_small(
}
}
++ep;
end_ep = ep;
any_change = true;
} // if (ep != end && ep->logical_offset < offset + max_bsize)
// check extent for reuse in reverse order
if (prev_ep != end && prev_ep->logical_offset >= min_off) {
BlobRef b = prev_ep->blob;
if (!above_blob_threshold) {
inspected_blobs.insert(&b->get_blob());
above_blob_threshold = inspected_blobs.size() >= blob_threshold;
}
start_ep = prev_ep;
auto bstart = prev_ep->blob_start();
dout(20) << __func__ << " considering " << *b
<< " bstart 0x" << std::hex << bstart << std::dec << dendl;
@ -11835,6 +11860,24 @@ void BlueStore::_do_write_small(
} // if (prev_ep != end && prev_ep->logical_offset >= min_off)
} while (any_change);
if (above_blob_threshold) {
dout(10) << __func__ << " request GC, blobs >= " << inspected_blobs.size()
<< " " << std::hex << min_off << "~" << max_off << std::dec
<< dendl;
ceph_assert(start_ep != end_ep);
for (auto ep = start_ep; ep != end_ep; ++ep) {
dout(20) << __func__ << " inserting for GC "
<< std::hex << ep->logical_offset << "~" << ep->length
<< std::dec << dendl;
wctx->extents_to_gc.union_insert(ep->logical_offset, ep->length);
}
// insert newly written extent to GC
wctx->extents_to_gc.union_insert(offset, length);
dout(20) << __func__ << " inserting (last) for GC "
<< std::hex << offset << "~" << length
<< std::dec << dendl;
}
// new blob.
BlobRef b = c->new_blob();
uint64_t b_off = p2phase<uint64_t>(offset, alloc_len);
@ -12453,34 +12496,38 @@ int BlueStore::_do_gc(
TransContext *txc,
CollectionRef& c,
OnodeRef o,
const GarbageCollector& gc,
const WriteContext& wctx,
uint64_t *dirty_start,
uint64_t *dirty_end)
{
auto& extents_to_collect = gc.get_extents_to_collect();
bool dirty_range_updated = false;
WriteContext wctx_gc;
wctx_gc.fork(wctx); // make a clone for garbage collection
auto & extents_to_collect = wctx.extents_to_gc;
for (auto it = extents_to_collect.begin();
it != extents_to_collect.end();
++it) {
bufferlist bl;
int r = _do_read(c.get(), o, it->offset, it->length, bl, 0);
ceph_assert(r == (int)it->length);
auto offset = (*it).first;
auto length = (*it).second;
dout(20) << __func__ << " processing " << std::hex
<< offset << "~" << length << std::dec
<< dendl;
int r = _do_read(c.get(), o, offset, length, bl, 0);
ceph_assert(r == (int)length);
_do_write_data(txc, c, o, it->offset, it->length, bl, &wctx_gc);
logger->inc(l_bluestore_gc_merged, it->length);
_do_write_data(txc, c, o, offset, length, bl, &wctx_gc);
logger->inc(l_bluestore_gc_merged, length);
if (*dirty_start > it->offset) {
*dirty_start = it->offset;
if (*dirty_start > offset) {
*dirty_start = offset;
dirty_range_updated = true;
}
if (*dirty_end < it->offset + it->length) {
*dirty_end = it->offset + it->length;
if (*dirty_end < offset + length) {
*dirty_end = offset + length;
dirty_range_updated = true;
}
}
@ -12528,7 +12575,7 @@ int BlueStore::_do_write(
uint64_t end = offset + length;
GarbageCollector gc(c->store->cct);
int64_t benefit;
int64_t benefit = 0;
auto dirty_start = offset;
auto dirty_end = end;
@ -12543,14 +12590,18 @@ int BlueStore::_do_write(
goto out;
}
if (wctx.extents_to_gc.empty() ||
wctx.extents_to_gc.range_start() > offset ||
wctx.extents_to_gc.range_end() < offset + length) {
benefit = gc.estimate(offset,
length,
o->extent_map,
wctx.old_extents,
min_alloc_size);
}
// NB: _wctx_finish() will empty old_extents
// so we must do gc estimation before that
benefit = gc.estimate(offset,
length,
o->extent_map,
wctx.old_extents,
min_alloc_size);
_wctx_finish(txc, c, o, &wctx);
if (end > o->onode.size) {
dout(20) << __func__ << " extending size to 0x" << std::hex << end
@ -12559,18 +12610,24 @@ int BlueStore::_do_write(
}
if (benefit >= g_conf()->bluestore_gc_enable_total_threshold) {
if (!gc.get_extents_to_collect().empty()) {
dout(20) << __func__ << " perform garbage collection, "
<< "expected benefit = " << benefit << " AUs" << dendl;
r = _do_gc(txc, c, o, gc, wctx, &dirty_start, &dirty_end);
if (r < 0) {
derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
<< dendl;
goto out;
}
dout(20)<<__func__<<" gc range is " << std::hex << dirty_start
<< "~" << dirty_end - dirty_start << std::dec << dendl;
wctx.extents_to_gc.union_of(gc.get_extents_to_collect());
dout(20) << __func__
<< " perform garbage collection for compressed extents, "
<< "expected benefit = " << benefit << " AUs" << dendl;
}
if (!wctx.extents_to_gc.empty()) {
dout(20) << __func__ << " perform garbage collection" << dendl;
r = _do_gc(txc, c, o,
wctx,
&dirty_start, &dirty_end);
if (r < 0) {
derr << __func__ << " _do_gc failed with " << cpp_strerror(r)
<< dendl;
goto out;
}
dout(20)<<__func__<<" gc range is " << std::hex << dirty_start
<< "~" << dirty_end - dirty_start << std::dec << dendl;
}
o->extent_map.compress_extent_map(dirty_start, dirty_end - dirty_start);
o->extent_map.dirty_range(dirty_start, dirty_end - dirty_start);

View File

@ -968,7 +968,7 @@ public:
uint64_t min_alloc_size);
/// return a collection of extents to perform GC on
const vector<bluestore_pextent_t>& get_extents_to_collect() const {
const interval_set<uint64_t>& get_extents_to_collect() const {
return extents_to_collect;
}
GarbageCollector(CephContext* _cct) : cct(_cct) {}
@ -999,7 +999,7 @@ public:
///< specific write
///< protrusive extents that should be collected if GC takes place
vector<bluestore_pextent_t> extents_to_collect;
interval_set<uint64_t> extents_to_collect;
boost::optional<uint64_t > used_alloc_unit; ///< last processed allocation
///< unit when traversing
@ -1021,8 +1021,6 @@ public:
int64_t expected_for_release = 0; ///< alloc units currently used by
///< compressed blobs that might
///< gone after GC
uint64_t gc_start_offset = 0; ///starting offset for GC
uint64_t gc_end_offset = 0; ///ending offset for GC
protected:
void process_protrusive_extents(const BlueStore::ExtentMap& extent_map,
@ -2771,6 +2769,7 @@ private:
unsigned csum_order = 0; ///< target checksum chunk order
old_extent_map_t old_extents; ///< must deref these blobs
interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection
struct write_item {
uint64_t logical_offset; ///< write logical offset
@ -2889,7 +2888,6 @@ private:
int _do_gc(TransContext *txc,
CollectionRef& c,
OnodeRef o,
const GarbageCollector& gc,
const WriteContext& wctx,
uint64_t *dirty_start,
uint64_t *dirty_end);

View File

@ -1318,6 +1318,7 @@ TEST_P(StoreTestSpecificAUSize, BluestoreStatFSTest) {
return;
StartDeferred(65536);
SetVal(g_conf(), "bluestore_compression_mode", "force");
SetVal(g_conf(), "bluestore_max_blob_size", "524288");
// just a big number to disable gc
SetVal(g_conf(), "bluestore_gc_enable_total_threshold", "100000");
SetVal(g_conf(), "bluestore_fsck_on_umount", "true");
@ -7750,7 +7751,7 @@ TEST_P(StoreTestSpecificAUSize, BluestoreEnforceHWSettingsHdd) {
ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 1u);
}
}
TEST_P(StoreTestSpecificAUSize, BluestoreEnforceHWSettingsSsd) {
if (string(GetParam()) != "bluestore")
return;
@ -7783,6 +7784,81 @@ TEST_P(StoreTestSpecificAUSize, BluestoreEnforceHWSettingsSsd) {
ASSERT_EQ(logger->get(l_bluestore_write_big_blobs), 8u);
}
}
TEST_P(StoreTestSpecificAUSize, ReproNoBlobMultiTest) {
if(string(GetParam()) != "bluestore")
return;
SetVal(g_conf(), "bluestore_block_db_create", "true");
SetVal(g_conf(), "bluestore_block_db_size", "4294967296");
SetVal(g_conf(), "bluestore_block_size", "12884901888");
SetVal(g_conf(), "bluestore_max_blob_size", "524288");
g_conf().apply_changes(nullptr);
StartDeferred(65536);
int r;
coll_t cid;
ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
ghobject_t hoid2 = hoid;
hoid2.hobj.snap = 1;
auto ch = store->create_new_collection(cid);
{
ObjectStore::Transaction t;
t.create_collection(cid, 0);
cerr << "Creating collection " << cid << std::endl;
r = queue_transaction(store, ch, std::move(t));
ASSERT_EQ(r, 0);
}
{
bool exists = store->exists(ch, hoid);
ASSERT_TRUE(!exists);
ObjectStore::Transaction t;
t.touch(cid, hoid);
cerr << "Creating object " << hoid << std::endl;
r = queue_transaction(store, ch, std::move(t));
ASSERT_EQ(r, 0);
exists = store->exists(ch, hoid);
ASSERT_EQ(true, exists);
}
{
uint64_t offs = 0;
bufferlist bl;
const int size = 0x100;
bufferptr ap(size);
memset(ap.c_str(), 'a', size);
bl.append(ap);
int i = 0;
uint64_t blob_size = 524288;
uint64_t total = 0;
for (i = 0; i <= 512; i++) {
offs = 0 + i * size;
ObjectStore::Transaction t;
ghobject_t hoid2 = hoid;
hoid2.hobj.snap = i + 1;
while (offs < 128 * 1024 * 1024) {
t.write(cid, hoid, offs, ap.length(), bl);
offs += blob_size;
total += ap.length();
}
t.clone(cid, hoid, hoid2);
r = queue_transaction(store, ch, std::move(t));
ASSERT_EQ(r, 0);
}
cerr << "Total written = " << total << std::endl;
}
{
cerr << "Finalizing" << std::endl;
const PerfCounters* logger = store->get_perf_counters();
ASSERT_GE(logger->get(l_bluestore_gc_merged), 1024*1024*1024);
}
}
#endif // WITH_BLUESTORE

View File

@ -1241,8 +1241,8 @@ TEST(GarbageCollector, BasicTest)
saving = gc.estimate(300, 100, em, old_extents, 4096);
ASSERT_EQ(saving, 1);
auto& to_collect = gc.get_extents_to_collect();
ASSERT_EQ(to_collect.size(), 1u);
ASSERT_EQ(to_collect[0], bluestore_pextent_t(100,10) );
ASSERT_EQ(to_collect.num_intervals(), 1u);
ASSERT_EQ(*to_collect.begin(), std::make_pair(100ul, 10ul));
em.clear();
old_extents.clear();
@ -1311,11 +1311,11 @@ TEST(GarbageCollector, BasicTest)
saving = gc.estimate(0x30000, 0xf000, em, old_extents, 0x10000);
ASSERT_EQ(saving, 2);
auto& to_collect = gc.get_extents_to_collect();
ASSERT_EQ(to_collect.size(), 2u);
ASSERT_TRUE(to_collect[0] == bluestore_pextent_t(0x0,0x8000) ||
to_collect[1] == bluestore_pextent_t(0x0,0x8000));
ASSERT_TRUE(to_collect[0] == bluestore_pextent_t(0x3f000,0x1000) ||
to_collect[1] == bluestore_pextent_t(0x3f000,0x1000));
ASSERT_EQ(to_collect.num_intervals(), 2u);
ASSERT_TRUE((*to_collect.begin()) == std::make_pair(0x0ul ,0x8000ul) ||
*(++to_collect.begin()) == std::make_pair(0x0ul, 0x8000ul));
ASSERT_TRUE((*to_collect.begin()) == std::make_pair(0x3f000ul, 0x1000ul) ||
*(++to_collect.begin()) == std::make_pair(0x3f000ul, 0x1000ul));
em.clear();
old_extents.clear();
@ -1357,7 +1357,7 @@ TEST(GarbageCollector, BasicTest)
saving = gc.estimate(0x3000, 0x4000, em, old_extents, 0x1000);
ASSERT_EQ(saving, 0);
auto& to_collect = gc.get_extents_to_collect();
ASSERT_EQ(to_collect.size(), 0u);
ASSERT_EQ(to_collect.num_intervals(), 0u);
em.clear();
old_extents.clear();
}
@ -1432,11 +1432,11 @@ TEST(GarbageCollector, BasicTest)
saving = gc.estimate(0x30000, 0xf000, em, old_extents, 0x10000);
ASSERT_EQ(saving, 2);
auto& to_collect = gc.get_extents_to_collect();
ASSERT_EQ(to_collect.size(), 2u);
ASSERT_TRUE(to_collect[0] == bluestore_pextent_t(0x0,0x8000) ||
to_collect[1] == bluestore_pextent_t(0x0,0x8000));
ASSERT_TRUE(to_collect[0] == bluestore_pextent_t(0x3f000,0x1000) ||
to_collect[1] == bluestore_pextent_t(0x3f000,0x1000));
ASSERT_EQ(to_collect.num_intervals(), 2u);
ASSERT_TRUE(*to_collect.begin() == std::make_pair(0x0ul, 0x8000ul) ||
*(++to_collect.begin()) == std::make_pair(0x0ul, 0x8000ul));
ASSERT_TRUE(*to_collect.begin() == std::make_pair(0x3f000ul, 0x1000ul) ||
*(++to_collect.begin()) == std::make_pair(0x3f000ul, 0x1000ul));
em.clear();
old_extents.clear();