blk: bring MAP_HUGETLB-based buffer pool to KernelDevice.

The idea here is to introduce a pool of `mmap`-allocated,
fixed-size buffers that takes precedence over the
2 MB-aligned, THP-based mechanism. On the first attempt
to acquire a 4 MB buffer, KernelDevice mmaps
`bdev_read_preallocated_huge_buffer_num` (default 128)
memory regions with the MAP_HUGETLB flag. If this fails,
the entire process is aborted. When a buffer's lifetime
ends, its region is recycled through a lock-free queue
shared across the entire process.
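
For illustration only, the pattern boils down to the
standalone sketch below; it is not the KernelDevice code
(that follows in the diff), and the names (`fill_pool`,
`free_regions`) and sizes are illustrative:

```
#include <boost/lockfree/queue.hpp>
#include <sys/mman.h>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

constexpr std::size_t buf_size = 4 * 1024 * 1024;  // one 4 MB huge buffer
boost::lockfree::queue<void*> free_regions{128};    // process-wide recycle queue

// pre-map `num` hugepage-backed regions up front, aborting on failure,
// as described above
void fill_pool(std::size_t num) {
  while (num--) {
    void* region = ::mmap(nullptr, buf_size, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB,
                          -1, 0);
    if (region == MAP_FAILED) {
      std::perror("mmap(MAP_HUGETLB)");  // likely nr_hugepages is too low
      std::abort();
    }
    free_regions.push(region);
  }
}

int main() {
  fill_pool(4);
  void* buf;
  if (free_regions.pop(buf)) {  // acquire a buffer for a read...
    // ...use `buf` here...
    free_regions.push(buf);     // ...and recycle it instead of munmap()ing
  }
  return 0;
}
```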

Remember to allocate the appropriate number of huge pages
on the system first! For instance:

```
echo 256 | sudo tee /proc/sys/vm/nr_hugepages
```
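
Whether the pages actually got reserved can be checked the
usual way (standard Linux, nothing specific to this change):

```
grep HugePages_ /proc/meminfo
```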

This commit is based on (a cherry-pick, with changes, of)
commit 897a4932bee5cba3641c18619cccd0ee945bfcf8.

Signed-off-by: Radoslaw Zarzynski <rzarzyns@redhat.com>
Radoslaw Zarzynski 2021-11-04 20:50:17 +00:00
parent 67ce52f5f9
commit 64aae2b955
3 changed files with 109 additions and 8 deletions


@@ -20,7 +20,10 @@
#include <fcntl.h>
#include <sys/file.h>
#include <boost/lockfree/queue.hpp>
#include "KernelDevice.h"
#include "include/buffer_raw.h"
#include "include/intarith.h"
#include "include/types.h"
#include "include/compat.h"
@@ -1041,20 +1044,111 @@ int KernelDevice::discard(uint64_t offset, uint64_t len)
  return r;
}

template <std::size_t BufferSizeV>
struct ExplicitHugePagePool {
  using region_queue_t = boost::lockfree::queue<void*>;

  struct mmaped_buffer_raw : public buffer::raw {
    region_queue_t& region_q; // for recycling

    mmaped_buffer_raw(void* mmaped_region, region_queue_t& region_q)
      : raw(static_cast<char*>(mmaped_region), BufferSizeV),
        region_q(region_q) {
      // the `mmaped_region` has been passed to `raw` as the buffer's `data`
    }
    ~mmaped_buffer_raw() override {
      // don't delete nor munmap; recycle the region instead
      region_q.push(data);
    }
    raw* clone_empty() override {
      // the entire cloning facility is used solely by the dev-only MemDB.
      // see: https://github.com/ceph/ceph/pull/36282
      ceph_abort_msg("this should be never called on this path!");
    }
  };

  ExplicitHugePagePool(size_t pool_size)
    : region_q(pool_size) {
    while (pool_size--) {
      void* const mmaped_region = ::mmap(
        nullptr,
        BufferSizeV,
        PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB,
        -1,
        0);
      if (mmaped_region == MAP_FAILED) {
        ceph_abort("can't allocate huge buffer;"
                   " /proc/sys/vm/nr_hugepages misconfigured?");
      } else {
        region_q.push(mmaped_region);
      }
    }
  }
  ~ExplicitHugePagePool() {
    void* mmaped_region;
    while (region_q.pop(mmaped_region)) {
      ::munmap(mmaped_region, BufferSizeV);
    }
  }

  ceph::unique_leakable_ptr<buffer::raw> try_create() {
    if (void* mmaped_region; region_q.pop(mmaped_region)) {
      return ceph::unique_leakable_ptr<buffer::raw> {
        new mmaped_buffer_raw(mmaped_region, region_q)
      };
    } else {
      // oops, empty queue.
      return nullptr;
    }
  }

  bool empty_estimation() const {
    return region_q.empty();
  }

private:
  region_queue_t region_q;
};

#define LUCKY_BUFFER_SIZE 4 * 1024 * 1024

// create a buffer basing on a user-configurable alignment. it's intended
// to make our buffers THP-able.
static ceph::unique_leakable_ptr<buffer::raw> create_custom_aligned(
  CephContext* const cct,
  const size_t len)
ceph::unique_leakable_ptr<buffer::raw> KernelDevice::create_custom_aligned(
  const size_t len) const
{
  // just to preserve the logic of create_small_page_aligned().
  if (len < CEPH_PAGE_SIZE) {
    return ceph::buffer::create_small_page_aligned(len);
  } else {
    const size_t custom_alignment = cct->_conf->bdev_read_buffer_alignment;
    return ceph::buffer::create_aligned(len, custom_alignment);
  } else if (len == LUCKY_BUFFER_SIZE) {
    static ExplicitHugePagePool<LUCKY_BUFFER_SIZE> hp_pool{
      cct->_conf->bdev_read_preallocated_huge_buffer_num
    };
    if (auto lucky_raw = hp_pool.try_create(); lucky_raw) {
      dout(20) << __func__ << " allocated from huge pool"
               << " lucky_raw.data=" << (void*)lucky_raw->get_data()
               << " bdev_read_preallocated_huge_buffer_num="
               << cct->_conf->bdev_read_preallocated_huge_buffer_num
               << dendl;
      return lucky_raw;
    } else {
      // fallthrough due to empty buffer pool. this can happen also
      // when the configurable was explicitly set to 0.
      dout(20) << __func__ << " cannot allocate from huge pool"
               << " hp_pool.empty_estimation=" << hp_pool.empty_estimation()
               << " bdev_read_preallocated_huge_buffer_num="
               << cct->_conf->bdev_read_preallocated_huge_buffer_num
               << dendl;
    }
  }
  const size_t custom_alignment = cct->_conf->bdev_read_buffer_alignment;
  dout(20) << __func__ << " with the custom alignment;"
           << " len=" << len
           << " custom_alignment=" << custom_alignment
           << dendl;
  return ceph::buffer::create_aligned(len, custom_alignment);
}
int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
@@ -1070,7 +1164,7 @@ int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
  auto start1 = mono_clock::now();
  auto p = ceph::buffer::ptr_node::create(create_custom_aligned(cct, len));
  auto p = ceph::buffer::ptr_node::create(create_custom_aligned(len));
  int r = ::pread(choose_fd(buffered, WRITE_LIFE_NOT_SET),
                  p->c_str(), len, off);
  auto age = cct->_conf->bdev_debug_aio_log_age;
@@ -1122,7 +1216,7 @@ int KernelDevice::aio_read(
    ++ioc->num_pending;
    aio_t& aio = ioc->pending_aios.back();
    aio.bl.push_back(
      ceph::buffer::ptr_node::create(create_custom_aligned(cct, len)));
      ceph::buffer::ptr_node::create(create_custom_aligned(len)));
    aio.bl.prepare_iov(&aio.iov);
    aio.preadv(off, len);
    dout(30) << aio << dendl;


@@ -112,6 +112,8 @@ private:
  void _detect_vdo();
  int choose_fd(bool buffered, int write_hint) const;
  ceph::unique_leakable_ptr<buffer::raw> create_custom_aligned(size_t len) const;
public:
  KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);


@@ -3920,6 +3920,11 @@ options:
  level: advanced
  default: 4_K
  with_legacy: true
- name: bdev_read_preallocated_huge_buffer_num
  type: size
  level: advanced
  default: 128
  with_legacy: true
- name: bdev_debug_aio
  type: bool
  level: dev