Merge pull request #25608 from tchaikov/bluestore_fbsd

os/bluestore: support for FreeBSD

Reviewed-by: Willem Jan Withagen <wjw@digiware.nl>
Reviewed-by: Sage Weil <sage@redhat.com>
This commit is contained in:
Kefu Chai 2018-12-22 09:39:28 +08:00 committed by GitHub
commit fe89e0e96c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 145 additions and 25 deletions

View File

@ -258,8 +258,13 @@ endif()
option(WITH_BLUESTORE "Bluestore OSD backend" ON)
if(WITH_BLUESTORE)
find_package(aio)
set(HAVE_LIBAIO ${AIO_FOUND})
if(LINUX)
find_package(aio)
set(HAVE_LIBAIO ${AIO_FOUND})
elseif(FREEBSD)
# POSIX AIO is integrated into the FreeBSD kernel and exposed by libc.
set(HAVE_POSIXAIO ON)
endif()
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i686|amd64|x86_64|AMD64|aarch64")
@ -285,7 +290,7 @@ if(WITH_PMEM)
endif()
if(WITH_BLUESTORE)
if(NOT AIO_FOUND AND NOT WITH_SPDK AND NOT WITH_PMEM)
if(NOT AIO_FOUND AND NOT HAVE_POSIXAIO AND NOT WITH_SPDK AND NOT WITH_PMEM)
message(SEND_ERROR "WITH_BLUESTORE is ON, "
"but none of the bluestore backends is enabled. "
"Please install libaio, or enable WITH_SPDK or WITH_PMEM (experimental)")

View File

@ -44,7 +44,6 @@ fi
-D WITH_BABELTRACE=OFF \
-D WITH_SEASTAR=OFF \
-D WITH_BLKID=OFF \
-D WITH_BLUESTORE=OFF \
-D WITH_FUSE=ON \
-D WITH_KRBD=OFF \
-D WITH_XFS=OFF \

View File

@ -93,6 +93,9 @@
/* Defined if you have libaio */
#cmakedefine HAVE_LIBAIO
/* Defined if you have POSIX AIO */
#cmakedefine HAVE_POSIXAIO
/* Defined if OpenLDAP enabled */
#cmakedefine HAVE_OPENLDAP

View File

@ -35,7 +35,7 @@ if(WITH_BLUESTORE)
)
endif(WITH_BLUESTORE)
if(HAVE_LIBAIO)
if(HAVE_LIBAIO OR HAVE_POSIXAIO)
list(APPEND libos_srcs
bluestore/KernelDevice.cc
bluestore/aio.cc)

View File

@ -19,7 +19,7 @@
#include "BlockDevice.h"
#if defined(HAVE_LIBAIO)
#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
#include "KernelDevice.h"
#endif
@ -63,7 +63,7 @@ uint64_t IOContext::get_num_ios() const
// a configurable (with different hdd and ssd defaults), and add
// that to the bytes value.
uint64_t ios = 0;
#ifdef HAVE_LIBAIO
#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
for (auto& p : pending_aios) {
ios += p.iov.size();
}
@ -77,7 +77,7 @@ uint64_t IOContext::get_num_ios() const
void IOContext::release_running_aios()
{
ceph_assert(!num_running);
#ifdef HAVE_LIBAIO
#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
// release aio contexts (including pinned buffers).
running_aios.clear();
#endif
@ -120,7 +120,7 @@ BlockDevice *BlockDevice::create(CephContext* cct, const string& path,
return new PMEMDevice(cct, cb, cbpriv);
}
#endif
#if defined(HAVE_LIBAIO)
#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
if (type == "kernel") {
return new KernelDevice(cct, cb, cbpriv, d_cb, d_cbpriv);
}

View File

@ -29,8 +29,8 @@
#include "acconfig.h"
#include "common/ceph_mutex.h"
#ifdef HAVE_LIBAIO
#include "aio.h"
#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
#include "ceph_aio.h"
#endif
#include "include/ceph_assert.h"
#include "include/buffer.h"
@ -70,7 +70,7 @@ public:
std::atomic_int total_nseg = {0};
#endif
#ifdef HAVE_LIBAIO
#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
std::list<aio_t> pending_aios; ///< not yet submitted
std::list<aio_t> running_aios; ///< submitting or submitted
#endif

View File

@ -24,6 +24,9 @@
#include "include/stringify.h"
#include "common/blkdev.h"
#include "common/errno.h"
#if defined(__FreeBSD__)
#include "bsm/audit_errno.h"
#endif
#include "common/debug.h"
#include "common/align.h"
@ -451,9 +454,14 @@ static bool is_expected_ioerr(const int r)
{
// https://lxr.missinglinkelectronics.com/linux+v4.15/block/blk-core.c#L135
return (r == -EOPNOTSUPP || r == -ETIMEDOUT || r == -ENOSPC ||
r == -ENOLINK || r == -EREMOTEIO || r == -EBADE ||
r == -ENOLINK || r == -EREMOTEIO || r == -EAGAIN || r == -EIO ||
r == -ENODATA || r == -EILSEQ || r == -ENOMEM ||
r == -EAGAIN || r == -EREMCHG || r == -EIO);
#if defined(__linux__)
r == -EREMCHG || r == -EBADE
#elif defined(__FreeBSD__)
r == - BSM_ERRNO_EREMCHG || r == -BSM_ERRNO_EBADE
#endif
);
}
void KernelDevice::_aio_thread()
@ -729,6 +737,7 @@ int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered, int w
derr << __func__ << " pwritev error: " << cpp_strerror(r) << dendl;
return r;
}
#ifdef HAVE_SYNC_FILE_RANGE
if (buffered) {
// initiate IO (but do not wait)
r = ::sync_file_range(fd_buffereds[WRITE_LIFE_NOT_SET], off, len, SYNC_FILE_RANGE_WRITE);
@ -738,6 +747,7 @@ int KernelDevice::_sync_write(uint64_t off, bufferlist &bl, bool buffered, int w
return r;
}
}
#endif
io_since_flush.store(true);

View File

@ -22,7 +22,7 @@
#include "common/Thread.h"
#include "include/utime.h"
#include "aio.h"
#include "ceph_aio.h"
#include "BlockDevice.h"
class KernelDevice : public BlockDevice {

View File

@ -21,7 +21,7 @@
#include "os/fs/FS.h"
#include "include/interval_set.h"
#include "aio.h"
#include "ceph_aio.h"
#include "BlockDevice.h"
class PMEMDevice : public BlockDevice {

View File

@ -2,7 +2,7 @@
// vim: ts=8 sw=2 smarttab
#include <algorithm>
#include "aio.h"
#include "ceph_aio.h"
std::ostream& operator<<(std::ostream& os, const aio_t& aio)
{
@ -22,20 +22,37 @@ int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
// 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
int attempts = 16;
int delay = 125;
int r;
aio_iter cur = begin;
struct iocb *piocb[aios_size];
struct aio_t *piocb[aios_size];
int left = 0;
while (cur != end) {
cur->priv = priv;
*(piocb+left) = &cur->iocb;
*(piocb+left) = &(*cur);
++left;
++cur;
}
ceph_assert(aios_size >= left);
int done = 0;
while (left > 0) {
int r = io_submit(ctx, std::min(left, max_iodepth), piocb + done);
#if defined(HAVE_LIBAIO)
r = io_submit(ctx, std::min(left, max_iodepth), (struct iocb**)(piocb + done));
#elif defined(HAVE_POSIXAIO)
if (piocb[done]->n_aiocb == 1) {
// TODO: consider batching multiple reads together with lio_listio
piocb[done]->aio.aiocb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
piocb[done]->aio.aiocb.aio_sigevent.sigev_notify_kqueue = ctx;
piocb[done]->aio.aiocb.aio_sigevent.sigev_value.sival_ptr = piocb[done];
r = aio_read(&piocb[done]->aio.aiocb);
} else {
struct sigevent sev;
sev.sigev_notify = SIGEV_KEVENT;
sev.sigev_notify_kqueue = ctx;
sev.sigev_value.sival_ptr = piocb[done];
r = lio_listio(LIO_NOWAIT, &piocb[done]->aio.aiocbp, piocb[done]->n_aiocb, &sev);
}
#endif
if (r < 0) {
if (r == -EAGAIN && attempts-- > 0) {
usleep(delay);
@ -56,7 +73,11 @@ int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max)
{
io_event event[max];
#if defined(HAVE_LIBAIO)
io_event events[max];
#elif defined(HAVE_POSIXAIO)
struct kevent events[max];
#endif
struct timespec t = {
timeout_ms / 1000,
(timeout_ms % 1000) * 1000 * 1000
@ -64,12 +85,40 @@ int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max)
int r = 0;
do {
r = io_getevents(ctx, 1, max, event, &t);
#if defined(HAVE_LIBAIO)
r = io_getevents(ctx, 1, max, events, &t);
#elif defined(HAVE_POSIXAIO)
r = kevent(ctx, NULL, 0, events, max, &t);
if (r < 0)
r = -errno;
#endif
} while (r == -EINTR);
for (int i=0; i<r; ++i) {
paio[i] = (aio_t *)event[i].obj;
paio[i]->rval = event[i].res;
#if defined(HAVE_LIBAIO)
paio[i] = (aio_t *)events[i].obj;
paio[i]->rval = events[i].res;
#else
paio[i] = (aio_t*)events[i].udata;
if (paio[i]->n_aiocb == 1) {
paio[i]->rval = aio_return(&paio[i]->aio.aiocb);
} else {
// Emulate the return value of pwritev. I can't find any documentation
// for what the value of io_event.res is supposed to be. I'm going to
// assume that it's just like pwritev/preadv/pwrite/pread.
paio[i]->rval = 0;
for (int j = 0; j < paio[i]->n_aiocb; j++) {
int res = aio_return(&paio[i]->aio.aiocbp[j]);
if (res < 0) {
paio[i]->rval = res;
break;
} else {
paio[i]->rval += res;
}
}
free(paio[i]->aio.aiocbp);
}
#endif
}
return r;
}

View File

@ -2,7 +2,15 @@
// vim: ts=8 sw=2 smarttab
#pragma once
# include <libaio.h>
#include "acconfig.h"
#if defined(HAVE_LIBAIO)
#include <libaio.h>
#elif defined(HAVE_POSIXAIO)
#include <aio.h>
#include <sys/event.h>
#endif
#include <boost/intrusive/list.hpp>
#include <boost/container/small_vector.hpp>
@ -11,7 +19,16 @@
#include "include/types.h"
struct aio_t {
#if defined(HAVE_LIBAIO)
struct iocb iocb{}; // must be first element; see shenanigans in aio_queue_t
#elif defined(HAVE_POSIXAIO)
// static long aio_listio_max = -1;
union {
struct aiocb aiocb;
struct aiocb *aiocbp;
} aio;
int n_aiocb;
#endif
void *priv;
int fd;
boost::container::small_vector<iovec,4> iov;
@ -27,13 +44,34 @@ struct aio_t {
void pwritev(uint64_t _offset, uint64_t len) {
offset = _offset;
length = len;
#if defined(HAVE_LIBAIO)
io_prep_pwritev(&iocb, fd, &iov[0], iov.size(), offset);
#elif defined(HAVE_POSIXAIO)
n_aiocb = iov.size();
aio.aiocbp = (struct aiocb*)calloc(iov.size(), sizeof(struct aiocb));
for (int i = 0; i < iov.size(); i++) {
aio.aiocbp[i].aio_fildes = fd;
aio.aiocbp[i].aio_offset = offset;
aio.aiocbp[i].aio_buf = iov[i].iov_base;
aio.aiocbp[i].aio_nbytes = iov[i].iov_len;
aio.aiocbp[i].aio_lio_opcode = LIO_WRITE;
offset += iov[i].iov_len;
}
#endif
}
void pread(uint64_t _offset, uint64_t len) {
offset = _offset;
length = len;
bufferptr p = buffer::create_small_page_aligned(length);
#if defined(HAVE_LIBAIO)
io_prep_pread(&iocb, fd, p.c_str(), length, offset);
#elif defined(HAVE_POSIXAIO)
n_aiocb = 1;
aio.aiocb.aio_fildes = fd;
aio.aiocb.aio_buf = p.c_str();
aio.aiocb.aio_nbytes = length;
aio.aiocb.aio_offset = offset;
#endif
bl.append(std::move(p));
}
@ -53,7 +91,11 @@ typedef boost::intrusive::list<
struct aio_queue_t {
int max_iodepth;
#if defined(HAVE_LIBAIO)
io_context_t ctx;
#elif defined(HAVE_POSIXAIO)
int ctx;
#endif
typedef list<aio_t>::iterator aio_iter;
@ -67,6 +109,7 @@ struct aio_queue_t {
int init() {
ceph_assert(ctx == 0);
#if defined(HAVE_LIBAIO)
int r = io_setup(max_iodepth, &ctx);
if (r < 0) {
if (ctx) {
@ -75,10 +118,21 @@ struct aio_queue_t {
}
}
return r;
#elif defined(HAVE_POSIXAIO)
ctx = kqueue();
if (ctx < 0)
return -errno;
else
return 0;
#endif
}
void shutdown() {
if (ctx) {
#if defined(HAVE_LIBAIO)
int r = io_destroy(ctx);
#elif defined(HAVE_POSIXAIO)
int r = close(ctx);
#endif
ceph_assert(r == 0);
ctx = 0;
}