mirror of
https://github.com/ceph/ceph
synced 2025-02-23 19:17:37 +00:00
blk: bring MAP_HUGETLB-based buffer pool to KernelDevice.
The idea here is to bring in a pool of `mmap`-allocated, constant-size buffers which takes precedence over the 2 MB-aligned, THP-based mechanism. On the first attempt to acquire a 4 MB buffer, KernelDevice mmaps `bdev_read_preallocated_huge_buffer_num` (default 128) memory regions using the MAP_HUGETLB option. If this fails, the entire process is aborted. When their lifetimes end, buffers are recycled through a lock-free queue shared across the entire process. Remember to allocate the appropriate number of huge pages in the system! For instance: ``` echo 256 | sudo tee /proc/sys/vm/nr_hugepages ``` This commit is based on / cherry-picks with changes 897a4932bee5cba3641c18619cccd0ee945bfcf8. Signed-off-by: Radoslaw Zarzynski <rzarzyns@redhat.com>
This commit is contained in:
parent
67ce52f5f9
commit
64aae2b955
@ -20,7 +20,10 @@
|
||||
#include <fcntl.h>
|
||||
#include <sys/file.h>
|
||||
|
||||
#include <boost/lockfree/queue.hpp>
|
||||
|
||||
#include "KernelDevice.h"
|
||||
#include "include/buffer_raw.h"
|
||||
#include "include/intarith.h"
|
||||
#include "include/types.h"
|
||||
#include "include/compat.h"
|
||||
@ -1041,20 +1044,111 @@ int KernelDevice::discard(uint64_t offset, uint64_t len)
|
||||
return r;
|
||||
}
|
||||
|
||||
template <std::size_t BufferSizeV>
|
||||
struct ExplicitHugePagePool {
|
||||
using region_queue_t = boost::lockfree::queue<void*>;
|
||||
|
||||
struct mmaped_buffer_raw : public buffer::raw {
|
||||
region_queue_t& region_q; // for recycling
|
||||
|
||||
mmaped_buffer_raw(void* mmaped_region, region_queue_t& region_q)
|
||||
: raw(static_cast<char*>(mmaped_region), BufferSizeV),
|
||||
region_q(region_q) {
|
||||
// the `mmaped_region` has been passed to `raw` as the buffer's `data`
|
||||
}
|
||||
~mmaped_buffer_raw() override {
|
||||
// don't delete nor unmmap; recycle the region instead
|
||||
region_q.push(data);
|
||||
}
|
||||
raw* clone_empty() override {
|
||||
// the entire cloning facility is used solely by the dev-only MemDB.
|
||||
// see: https://github.com/ceph/ceph/pull/36282
|
||||
ceph_abort_msg("this should be never called on this path!");
|
||||
}
|
||||
};
|
||||
|
||||
ExplicitHugePagePool(size_t pool_size)
|
||||
: region_q(pool_size) {
|
||||
while (pool_size--) {
|
||||
void* const mmaped_region = ::mmap(
|
||||
nullptr,
|
||||
BufferSizeV,
|
||||
PROT_READ | PROT_WRITE,
|
||||
MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB,
|
||||
-1,
|
||||
0);
|
||||
if (mmaped_region == MAP_FAILED) {
|
||||
ceph_abort("can't allocate huge buffer;"
|
||||
" /proc/sys/vm/nr_hugepages misconfigured?");
|
||||
} else {
|
||||
region_q.push(mmaped_region);
|
||||
}
|
||||
}
|
||||
}
|
||||
~ExplicitHugePagePool() {
|
||||
void* mmaped_region;
|
||||
while (region_q.pop(mmaped_region)) {
|
||||
::munmap(mmaped_region, BufferSizeV);
|
||||
}
|
||||
}
|
||||
|
||||
ceph::unique_leakable_ptr<buffer::raw> try_create() {
|
||||
if (void* mmaped_region; region_q.pop(mmaped_region)) {
|
||||
return ceph::unique_leakable_ptr<buffer::raw> {
|
||||
new mmaped_buffer_raw(mmaped_region, region_q)
|
||||
};
|
||||
} else {
|
||||
// oops, empty queue.
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
bool empty_estimation() const {
|
||||
return region_q.empty();
|
||||
}
|
||||
|
||||
private:
|
||||
region_queue_t region_q;
|
||||
};
|
||||
|
||||
|
||||
#define LUCKY_BUFFER_SIZE 4 * 1024 * 1024
|
||||
|
||||
// create a buffer basing on user-configurable. it's intended to make
|
||||
// our buffers THP-able.
|
||||
static ceph::unique_leakable_ptr<buffer::raw> create_custom_aligned(
|
||||
CephContext* const cct,
|
||||
const size_t len)
|
||||
ceph::unique_leakable_ptr<buffer::raw> KernelDevice::create_custom_aligned(
|
||||
const size_t len) const
|
||||
{
|
||||
// just to preserve the logic of create_small_page_aligned().
|
||||
if (len < CEPH_PAGE_SIZE) {
|
||||
return ceph::buffer::create_small_page_aligned(len);
|
||||
} else {
|
||||
const size_t custom_alignment = cct->_conf->bdev_read_buffer_alignment;
|
||||
return ceph::buffer::create_aligned(len, custom_alignment);
|
||||
} else if (len == LUCKY_BUFFER_SIZE) {
|
||||
static ExplicitHugePagePool<LUCKY_BUFFER_SIZE> hp_pool{
|
||||
cct->_conf->bdev_read_preallocated_huge_buffer_num
|
||||
};
|
||||
if (auto lucky_raw = hp_pool.try_create(); lucky_raw) {
|
||||
dout(20) << __func__ << " allocated from huge pool"
|
||||
<< " lucky_raw.data=" << (void*)lucky_raw->get_data()
|
||||
<< " bdev_read_preallocated_huge_buffer_num="
|
||||
<< cct->_conf->bdev_read_preallocated_huge_buffer_num
|
||||
<< dendl;
|
||||
return lucky_raw;
|
||||
} else {
|
||||
// fallthrough due to empty buffer pool. this can happen also
|
||||
// when the configurable was explicitly set to 0.
|
||||
dout(20) << __func__ << " cannot allocate from huge pool"
|
||||
<< " hp_pool.empty_estimation=" << hp_pool.empty_estimation()
|
||||
<< " bdev_read_preallocated_huge_buffer_num="
|
||||
<< cct->_conf->bdev_read_preallocated_huge_buffer_num
|
||||
<< dendl;
|
||||
}
|
||||
}
|
||||
const size_t custom_alignment = cct->_conf->bdev_read_buffer_alignment;
|
||||
dout(20) << __func__ << " with the custom alignment;"
|
||||
<< " len=" << len
|
||||
<< " custom_alignment=" << custom_alignment
|
||||
<< dendl;
|
||||
return ceph::buffer::create_aligned(len, custom_alignment);
|
||||
}
|
||||
|
||||
int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
|
||||
@ -1070,7 +1164,7 @@ int KernelDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
|
||||
|
||||
auto start1 = mono_clock::now();
|
||||
|
||||
auto p = ceph::buffer::ptr_node::create(create_custom_aligned(cct, len));
|
||||
auto p = ceph::buffer::ptr_node::create(create_custom_aligned(len));
|
||||
int r = ::pread(choose_fd(buffered, WRITE_LIFE_NOT_SET),
|
||||
p->c_str(), len, off);
|
||||
auto age = cct->_conf->bdev_debug_aio_log_age;
|
||||
@ -1122,7 +1216,7 @@ int KernelDevice::aio_read(
|
||||
++ioc->num_pending;
|
||||
aio_t& aio = ioc->pending_aios.back();
|
||||
aio.bl.push_back(
|
||||
ceph::buffer::ptr_node::create(create_custom_aligned(cct, len)));
|
||||
ceph::buffer::ptr_node::create(create_custom_aligned(len)));
|
||||
aio.bl.prepare_iov(&aio.iov);
|
||||
aio.preadv(off, len);
|
||||
dout(30) << aio << dendl;
|
||||
|
@ -112,6 +112,8 @@ private:
|
||||
void _detect_vdo();
|
||||
int choose_fd(bool buffered, int write_hint) const;
|
||||
|
||||
ceph::unique_leakable_ptr<buffer::raw> create_custom_aligned(size_t len) const;
|
||||
|
||||
public:
|
||||
KernelDevice(CephContext* cct, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv);
|
||||
|
||||
|
@ -3920,6 +3920,11 @@ options:
|
||||
level: advanced
|
||||
default: 4_K
|
||||
with_legacy: true
|
||||
- name: bdev_read_preallocated_huge_buffer_num
|
||||
type: size
|
||||
level: advanced
|
||||
default: 128
|
||||
with_legacy: true
|
||||
- name: bdev_debug_aio
|
||||
type: bool
|
||||
level: dev
|
||||
|
Loading…
Reference in New Issue
Block a user