blk/kernel: add plugin system for devices with compression and move VDO support into plugin

The current VDO support implementation is buried inside common/blkdev.cc
and exposed through a simple interface used by KernelDevice. It is not easily
extensible and cannot easily be reused for other devices providing similar
capabilities. This patch adds a plugin system, modeled on the structure of the
erasure code plugin system, and moves the VDO support code into a VDO plugin.

Signed-off-by: Martin Ohmacht <mohmacht@us.ibm.com>
This commit is contained in:
Martin Ohmacht 2022-09-28 14:09:04 -04:00
parent 652bf75409
commit a6658c91bb
26 changed files with 868 additions and 221 deletions

View File

@ -233,6 +233,7 @@ BuildRequires: libaio-devel
BuildRequires: libblkid-devel >= 2.17
BuildRequires: cryptsetup-devel
BuildRequires: libcurl-devel
BuildRequires: libcap-devel
BuildRequires: libcap-ng-devel
BuildRequires: fmt-devel >= 6.2.1
BuildRequires: pkgconfig(libudev)
@ -1563,6 +1564,8 @@ rm -rf %{_vpath_builddir}
%dir %{_libdir}/ceph
%dir %{_libdir}/ceph/erasure-code
%{_libdir}/ceph/erasure-code/libec_*.so*
%dir %{_libdir}/ceph/extblkdev
%{_libdir}/ceph/extblkdev/libceph_*.so*
%dir %{_libdir}/ceph/compressor
%{_libdir}/ceph/compressor/libceph_*.so*
%{_unitdir}/ceph-crash.service

View File

@ -9,6 +9,7 @@ usr/bin/osdmaptool
usr/bin/ceph-kvstore-tool
usr/libexec/ceph/ceph_common.sh
usr/lib/ceph/erasure-code/*
usr/lib/ceph/extblkdev/*
usr/lib/rados-classes/*
usr/sbin/ceph-create-keys
usr/share/doc/ceph/sample.ceph.conf

1
debian/control vendored
View File

@ -36,6 +36,7 @@ Build-Depends: automake,
libcrypto++-dev <pkg.ceph.crimson>,
libcryptsetup-dev,
libcap-ng-dev,
libcap-dev,
libcunit1-dev,
libcurl4-openssl-dev,
libevent-dev,

View File

@ -433,6 +433,7 @@ add_subdirectory(common)
add_subdirectory(crush)
add_subdirectory(msg)
add_subdirectory(arch)
add_subdirectory(extblkdev)
set(ceph_common_objs
$<TARGET_OBJECTS:common-auth-objs>
@ -446,7 +447,7 @@ set(ceph_common_objs
$<TARGET_OBJECTS:common_mountcephfs_objs>
$<TARGET_OBJECTS:crush_objs>)
set(ceph_common_deps
json_spirit erasure_code arch crc32
json_spirit erasure_code extblkdev arch crc32
${LIB_RESOLV}
Boost::thread
Boost::system
@ -678,7 +679,7 @@ set(ceph_osd_srcs
ceph_osd.cc)
add_executable(ceph-osd ${ceph_osd_srcs})
add_dependencies(ceph-osd erasure_code_plugins)
add_dependencies(ceph-osd erasure_code_plugins extblkdev_plugins)
target_link_libraries(ceph-osd osd os global-static common
${ALLOC_LIBS}
${BLKID_LIBRARIES})

View File

@ -29,6 +29,7 @@
#include "acconfig.h"
#include "common/ceph_mutex.h"
#include "include/common_fwd.h"
#include "extblkdev/ExtBlkDevInterface.h"
#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO)
#include "aio/aio.h"
@ -237,8 +238,8 @@ public:
uint64_t get_optimal_io_size() const { return optimal_io_size; }
/// hook to provide utilization of thinly-provisioned device
virtual bool get_thin_utilization(uint64_t *total, uint64_t *avail) const {
return false;
virtual int get_ebd_state(ExtBlkDevState &state) const {
return -ENOENT;
}
virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) const = 0;

View File

@ -31,7 +31,7 @@ if(libblk_srcs)
endif()
if(HAVE_LIBAIO)
target_link_libraries(blk PUBLIC ${AIO_LIBRARIES})
target_link_libraries(blk PUBLIC ${AIO_LIBRARIES} extblkdev)
endif(HAVE_LIBAIO)
if(WITH_SPDK)

View File

@ -242,7 +242,12 @@ int KernelDevice::open(const string& p)
support_discard = blkdev_buffered.support_discard();
optimal_io_size = blkdev_buffered.get_optimal_io_size();
this->devname = devname;
_detect_vdo();
// check if any extended block device plugin recognizes this device
// detect_vdo has moved into the VDO plugin
int rc = extblkdev::detect_device(cct, devname, ebd_impl);
if (rc != 0) {
dout(20) << __func__ << " no plugin volume maps to " << devname << dendl;
}
}
}
@ -305,10 +310,7 @@ void KernelDevice::close()
_discard_stop();
_pre_close();
if (vdo_fd >= 0) {
VOID_TEMP_FAILURE_RETRY(::close(vdo_fd));
vdo_fd = -1;
}
extblkdev::release_device(ebd_impl);
for (int i = 0; i < WRITE_LIFE_MAX; i++) {
assert(fd_directs[i] >= 0);
@ -335,11 +337,10 @@ int KernelDevice::collect_metadata(const string& prefix, map<string,string> *pm)
} else {
(*pm)[prefix + "type"] = "ssd";
}
if (vdo_fd >= 0) {
(*pm)[prefix + "vdo"] = "true";
uint64_t total, avail;
get_vdo_utilization(vdo_fd, &total, &avail);
(*pm)[prefix + "vdo_physical_size"] = stringify(total);
// if compression device detected, collect meta data for device
// VDO specific meta data has moved into VDO plugin
if (ebd_impl) {
ebd_impl->collect_metadata(prefix, pm);
}
{
@ -407,24 +408,14 @@ int KernelDevice::collect_metadata(const string& prefix, map<string,string> *pm)
return 0;
}
void KernelDevice::_detect_vdo()
int KernelDevice::get_ebd_state(ExtBlkDevState &state) const
{
vdo_fd = get_vdo_stats_handle(devname.c_str(), &vdo_name);
if (vdo_fd >= 0) {
dout(1) << __func__ << " VDO volume " << vdo_name
<< " maps to " << devname << dendl;
} else {
dout(20) << __func__ << " no VDO volume maps to " << devname << dendl;
// use compression driver plugin to determine physical size and availability
// VDO specific get_thin_utilization has moved into VDO plugin
if (ebd_impl) {
return ebd_impl->get_state(state);
}
return;
}
bool KernelDevice::get_thin_utilization(uint64_t *total, uint64_t *avail) const
{
if (vdo_fd < 0) {
return false;
}
return get_vdo_utilization(vdo_fd, total, avail);
return -ENOENT;
}
int KernelDevice::choose_fd(bool buffered, int write_hint) const

View File

@ -24,6 +24,7 @@
#include "aio/aio.h"
#include "BlockDevice.h"
#include "extblkdev/ExtBlkDevPlugin.h"
#define RW_IO_MAX (INT_MAX & CEPH_PAGE_MASK)
@ -35,8 +36,7 @@ private:
bool enable_wrt = true;
bool aio, dio;
int vdo_fd = -1; ///< fd for vdo sysfs directory
std::string vdo_name;
ExtBlkDevInterfaceRef ebd_impl; // structure for retrieving compression state from extended block device
std::string devname; ///< kernel dev name (/sys/block/$devname), if any
@ -109,7 +109,6 @@ private:
void debug_aio_link(aio_t& aio);
void debug_aio_unlink(aio_t& aio);
void _detect_vdo();
int choose_fd(bool buffered, int write_hint) const;
ceph::unique_leakable_ptr<buffer::raw> create_custom_aligned(size_t len, IOContext* ioc) const;
@ -130,7 +129,7 @@ public:
}
int get_devices(std::set<std::string> *ls) const override;
bool get_thin_utilization(uint64_t *total, uint64_t *avail) const override;
int get_ebd_state(ExtBlkDevState &state) const override;
int read(uint64_t off, uint64_t len, ceph::buffer::list *pbl,
IOContext *ioc,

View File

@ -26,6 +26,7 @@
#include "mon/MonClient.h"
#include "include/ceph_features.h"
#include "common/config.h"
#include "extblkdev/ExtBlkDevPlugin.h"
#include "mon/MonMap.h"
@ -472,6 +473,14 @@ flushjournal_out:
forker.exit(0);
}
{
int r = extblkdev::preload(g_ceph_context);
if (r < 0) {
derr << "Failed preloading extblkdev plugins, error code: " << r << dendl;
forker.exit(1);
}
}
string magic;
uuid_d cluster_fsid, osd_fsid;
ceph_release_t require_osd_release = ceph_release_t::unknown;

View File

@ -338,95 +338,6 @@ void get_raw_devices(const std::string& in,
}
}
// Look for a /dev/mapper entry whose symlink target is "../<devname>" and,
// if one is found, open /sys/kvdo/<entry>/statistics.
//
// On success the entry name is stored in *vdo_name and the open fd is
// returned; otherwise -1 is returned and *vdo_name is left untouched.
int _get_vdo_stats_handle(const char *devname, std::string *vdo_name)
{
  // we need to go from the raw devname (e.g., dm-4) to the VDO volume name;
  // the mapper symlinks are the link between the two
  const std::string wanted_target = std::string("../") + devname;

  DIR *mapper_dir = ::opendir("/dev/mapper");
  if (mapper_dir == nullptr) {
    return -1;
  }

  int stats_fd = -1;
  for (struct dirent *entry = ::readdir(mapper_dir);
       entry != nullptr;
       entry = ::readdir(mapper_dir)) {
    // skip ".", ".." and hidden entries
    if (entry->d_name[0] == '.') {
      continue;
    }
    char path[4096];
    char link_target[4096];
    snprintf(path, sizeof(path), "/dev/mapper/%s", entry->d_name);
    const int len = readlink(path, link_target, sizeof(link_target));
    // not a symlink, or no room left for the terminating NUL
    if (len < 0 || len >= (int)sizeof(link_target)) {
      continue;
    }
    link_target[len] = 0;
    if (wanted_target != link_target) {
      continue;
    }
    snprintf(path, sizeof(path), "/sys/kvdo/%s/statistics", entry->d_name);
    stats_fd = ::open(path, O_RDONLY|O_CLOEXEC);
    if (stats_fd < 0) {
      // mapper entry matches but exposes no VDO statistics; keep scanning
      continue;
    }
    *vdo_name = entry->d_name;
    break;
  }
  closedir(mapper_dir);
  return stats_fd;
}
// Find the VDO statistics directory for devname, descending into the
// constituent devices of device-mapper ("dm-*") devices when devname itself
// is not a VDO volume. Returns an open fd (and sets *vdo_name) or -1.
int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
{
  std::set<std::string> pending;
  pending.insert(devname);
  while (!pending.empty()) {
    auto first = pending.begin();
    std::string current = *first;
    pending.erase(first);

    const int stats_fd = _get_vdo_stats_handle(current.c_str(), vdo_name);
    if (stats_fd >= 0) {
      // yay, it's vdo
      return stats_fd;
    }
    // ok, see if there are constituent devices
    if (current.compare(0, 3, "dm-") == 0) {
      get_dm_parents(current, &pending);
    }
  }
  return -1;
}
// Read the file `property` relative to the directory fd `vdo_fd` and parse
// its leading text as a decimal integer with atoll().
// Returns 0 when the file cannot be opened or nothing could be read.
int64_t get_vdo_stat(int vdo_fd, const char *property)
{
  const int stat_fd = ::openat(vdo_fd, property, O_RDONLY|O_CLOEXEC);
  if (stat_fd < 0) {
    return 0;
  }

  char text[1024];
  // leave one byte free for the terminating NUL
  const int nread = ::read(stat_fd, text, sizeof(text) - 1);
  int64_t value = 0;
  if (nread > 0) {
    text[nread] = 0;
    value = atoll(text);
  }
  TEMP_FAILURE_RETRY(::close(stat_fd));
  return value;
}
// Compute total and available physical size from the VDO statistics found
// under directory fd `fd`.
// Returns false (leaving *total and *avail untouched) when any of the four
// statistics reads as zero; otherwise fills both outputs and returns true.
bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
{
  const int64_t blk_size = get_vdo_stat(fd, "block_size");
  const int64_t phys_blocks = get_vdo_stat(fd, "physical_blocks");
  const int64_t overhead_used = get_vdo_stat(fd, "overhead_blocks_used");
  const int64_t data_used = get_vdo_stat(fd, "data_blocks_used");

  const bool all_present = blk_size != 0
    && phys_blocks != 0
    && overhead_used != 0
    && data_used != 0;
  if (!all_present) {
    return false;
  }

  *total = blk_size * phys_blocks;
  *avail = blk_size * (phys_blocks - overhead_used - data_used);
  return true;
}
std::string _decode_model_enc(const std::string& in)
{
auto v = boost::replace_all_copy(in, "\\x20", " ");
@ -908,21 +819,6 @@ void get_raw_devices(const std::string& in,
{
}
int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
{
return -1;
}
int64_t get_vdo_stat(int fd, const char *property)
{
return 0;
}
bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
{
return false;
}
std::string get_device_id(const std::string& devname,
std::string *err)
{
@ -1083,21 +979,6 @@ void get_raw_devices(const std::string& in,
{
}
int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
{
return -1;
}
int64_t get_vdo_stat(int fd, const char *property)
{
return 0;
}
bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
{
return false;
}
std::string get_device_id(const std::string& devname,
std::string *err)
{
@ -1237,21 +1118,6 @@ void get_raw_devices(const std::string& in,
{
}
int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
{
return -1;
}
int64_t get_vdo_stat(int fd, const char *property)
{
return 0;
}
bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
{
return false;
}
std::string get_device_id(const std::string& devname,
std::string *err)
{

View File

@ -36,12 +36,6 @@ extern int block_device_get_metrics(const std::string& devname, int timeout,
extern void get_raw_devices(const std::string& in,
std::set<std::string> *ls);
// for VDO
/// return an op fd for the sysfs stats dir, if this is a VDO device
extern int get_vdo_stats_handle(const char *devname, std::string *vdo_name);
extern int64_t get_vdo_stat(int fd, const char *property);
extern bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail);
class BlkDev {
public:
BlkDev(int fd);

View File

@ -27,6 +27,16 @@ options:
- osd_numa_auto_affinity
flags:
- startup
- name: set_keepcaps
type: bool
level: advanced
desc: set the keepcaps flag before changing UID, preserving the permitted capability set
long_desc: When ceph switches from root to the ceph uid, all capabilities in all sets are erased. If
a component that is capability aware needs a specific capability, the keepcaps flag maintains
the permitted capability set, allowing the capabilities in the effective set to be activated as needed.
default: false
flags:
- startup
- name: osd_smart_report_timeout
type: uint
level: advanced
@ -1267,6 +1277,13 @@ options:
default: 512
fmt_desc: The maximum number of objects per backfill scan.
with_legacy: true
- name: osd_extblkdev_plugins
type: str
level: advanced
desc: extended block device plugins to load, provide compression feedback at runtime
default: vdo
flags:
- startup
# minimum number of peers
- name: osd_heartbeat_min_peers
type: int

View File

@ -94,21 +94,6 @@ void get_raw_devices(const std::string& in,
{
}
int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
{
return -1;
}
int64_t get_vdo_stat(int fd, const char *property)
{
return 0;
}
bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
{
return false;
}
std::string get_device_id(const std::string& devname,
std::string *err)
{

View File

@ -0,0 +1,14 @@
## extended block device plugins
set(extblkdev_plugin_dir ${CEPH_INSTALL_PKGLIBDIR}/extblkdev)
add_subdirectory(vdo)
add_library(extblkdev STATIC ExtBlkDevPlugin.cc)
if(NOT WIN32)
target_link_libraries(extblkdev cap)
endif()
add_custom_target(extblkdev_plugins DEPENDS
ceph_ebd_vdo)

View File

@ -0,0 +1,141 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph distributed storage system
*
* (C) Copyright IBM Corporation 2022
* Author: Martin Ohmacht <mohmacht@us.ibm.com>
*
* Based on the file ceph/src/erasure-code/ErasureCodeInterface.h
* Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
* Author: Loic Dachary <loic@dachary.org>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*/
#ifndef CEPH_EXT_BLK_DEV_INTERFACE_H
#define CEPH_EXT_BLK_DEV_INTERFACE_H
/*! @file ExtBlkDevInterface.h
@brief Interface provided by extended block device plugins
Block devices with vendor-specific capabilities rely on plugins implementing
**ExtBlkDevInterface** to provide access to their capabilities.
Methods returning an **int** return **0** on success and a
negative value on error.
*/
#include <string>
#include <map>
#include <ostream>
#include <memory>
#ifdef __linux__
#include <sys/capability.h>
#else
typedef void *cap_t;
#endif
#include "common/PluginRegistry.h"
namespace ceph {
/// Capacity snapshot reported by an extended block device plugin.
///
/// Holds four counters (logical/physical x total/available), all of which
/// start at zero. The VDO plugin in this change fills them with
/// block_size * block-count products, so the unit is presumably bytes.
class ExtBlkDevState {
  uint64_t logical_total = 0;
  uint64_t logical_avail = 0;
  uint64_t physical_total = 0;
  uint64_t physical_avail = 0;
public:
  // Accessors are const so the state can be inspected through a const
  // reference or a const object.
  uint64_t get_logical_total() const { return logical_total; }
  uint64_t get_logical_avail() const { return logical_avail; }
  uint64_t get_physical_total() const { return physical_total; }
  uint64_t get_physical_avail() const { return physical_avail; }

  void set_logical_total(uint64_t alogical_total) { logical_total = alogical_total; }
  void set_logical_avail(uint64_t alogical_avail) { logical_avail = alogical_avail; }
  void set_physical_total(uint64_t aphysical_total) { physical_total = aphysical_total; }
  void set_physical_avail(uint64_t aphysical_avail) { physical_avail = aphysical_avail; }
};
/**
* Abstract per-device interface implemented by extended block device plugins.
*
* An instance represents one detected device; all methods are pure virtual.
*/
class ExtBlkDevInterface {
public:
virtual ~ExtBlkDevInterface() {}
/**
* Initialize the instance if device logdevname is supported
*
* @param [in] logdevname name of device to check for support by this plugin
* @return 0 on success or a negative errno on error.
*/
virtual int init(const std::string& logdevname) = 0;
/**
* Return the name of the underlying device detected by the **init** method
*
* @return the name of the underlying device
*/
virtual const std::string& get_devname() const = 0;
/**
* Provide status of underlying physical storage after compression
*
* @param [out] state current state of the underlying device
* @return 0 on success or a negative errno on error.
*/
virtual int get_state(ExtBlkDevState& state) = 0;
/**
* Populate property map with meta data of device.
*
* @param [in] prefix prefix to be prepended to the map keys added by this
*             method (the VDO plugin, for example, adds prefix + "vdo")
* @param [in,out] pm property map of the device, to be extended by attributes detected by this plugin
* @return 0 on success or a negative errno on error.
*/
virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm) = 0;
};
typedef std::shared_ptr<ExtBlkDevInterface> ExtBlkDevInterfaceRef;
/**
* Base class of extended block device plugins.
*
* A plugin reports which capabilities it needs and acts as a factory for
* ExtBlkDevInterface instances.
*/
class ExtBlkDevPlugin : public Plugin {
public:
explicit ExtBlkDevPlugin(CephContext *cct) : Plugin(cct) {}
virtual ~ExtBlkDevPlugin() {}
/**
* Indicate plugin-required capabilities in permitted set
*
* If a plugin requires a capability to be active in the
* permitted set when invoked, it must indicate so by setting
* the required flags in the cap_t structure passed into this method.
* The cap_t structure is empty when passed into the method, and only the
* method's modifications to the permitted set are used by ceph.
* The plugin must elevate the capabilities into the effective
* set at a later point when needed during the invocation of its
* other methods, and is responsible for restoring the effective set
* before returning from the method.
*
* @param [out] caps capability set indicating the necessary capabilities
* @return 0 on success; any non-zero value is treated as an error by the
*         caller that merges the plugins' capability sets
*/
virtual int get_required_cap_set(cap_t caps) = 0;
/**
* Factory method, creating ExtBlkDev instances
*
* @param [in] logdevname name of logical device, may be composed of physical devices
* @param [out] ext_blk_dev object created on successful device support detection
* @return 0 on success or a negative errno on error.
*/
virtual int factory(const std::string& logdevname,
ExtBlkDevInterfaceRef& ext_blk_dev) = 0;
};
}
#endif

View File

@ -0,0 +1,268 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph distributed storage system
*
* (C) Copyright IBM Corporation 2022
* Author: Martin Ohmacht <mohmacht@us.ibm.com>
*
* Based on the file ceph/src/erasure-code/ErasureCodePlugin.cc
* Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
* Copyright (C) 2014 Red Hat <contact@redhat.com>
*
* Author: Loic Dachary <loic@dachary.org>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*/
#include <errno.h>
#include "ceph_ver.h"
#include "ExtBlkDevPlugin.h"
#include "common/errno.h"
#include "include/dlfcn_compat.h"
#include "include/str_list.h"
#include "include/ceph_assert.h"
#include "common/ceph_context.h"
#include "common/debug.h"
#define dout_subsys ceph_subsys_bdev
#define dout_context cct
using namespace std;
namespace ceph {
namespace extblkdev {
#ifdef __linux__
// Ask every loaded extblkdev plugin which capabilities it requires and set
// each CAP_PERMITTED bit a plugin reports in merge_caps as well.
// Returns 0 on success, -errno when a libcap call fails, -ENOENT when a
// registered "extblkdev" plugin is not an ExtBlkDevPlugin, or the non-zero
// code returned by a plugin's get_required_cap_set().
int get_required_caps(CephContext *cct, cap_t &merge_caps)
{
  // scratch set that is handed to one plugin at a time
  cap_t scratch = cap_init();
  if (scratch == nullptr) {
    return -errno;
  }
  auto free_scratch_on_return = make_scope_guard([&] {
    cap_free(scratch);
  });

  auto registry = cct->get_plugin_registry();
  std::lock_guard l(registry->lock);
  // did we preload any extblkdev type plugins?
  auto ebd_plugins = registry->plugins.find("extblkdev");
  if (ebd_plugins == registry->plugins.end()) {
    return 0;
  }

  for (auto& entry : ebd_plugins->second) {
    // every plugin starts from an empty set
    if (cap_clear(scratch) < 0) {
      return -errno;
    }
    auto plugin = dynamic_cast<ExtBlkDevPlugin*>(entry.second);
    if (plugin == nullptr) {
      derr << __func__ << " Is not an extblkdev plugin: " << entry.first << dendl;
      return -ENOENT;
    }
    // let the plugin populate the set with its required caps
    const int rc = plugin->get_required_cap_set(scratch);
    if (rc != 0) {
      return rc;
    }
    // copy every permitted bit the plugin raised into the merged set
    for (int cap = 0; cap <= CAP_LAST_CAP; ++cap) {
      cap_flag_value_t flag;
      if (cap_get_flag(scratch, cap, CAP_PERMITTED, &flag) < 0) {
        return -errno;
      }
      if (flag == CAP_CLEAR) {
        continue;
      }
      cap_value_t one[1] = { cap };
      if (cap_set_flag(merge_caps, CAP_PERMITTED, 1, one, CAP_SET) < 0) {
        return -errno;
      }
    }
  }
  return 0;
}
// Trim away all capabilities of this process that are not explicitly set in
// merge_caps.
//
// For every capability whose CAP_PERMITTED flag is clear in merge_caps but
// set in the process capability set, both the CAP_PERMITTED and the
// CAP_EFFECTIVE flag are cleared; the reduced set is applied with
// cap_set_proc() only if at least one flag changed.
//
// Returns 0 on success or -errno of the failing libcap call.
int trim_caps(CephContext *cct, cap_t &merge_caps)
{
  cap_t proc_caps = nullptr;
  auto close_caps_on_return = make_scope_guard([&] {
    if (proc_caps != nullptr) {
      cap_free(proc_caps);
    }
  });
  bool changed = false;
  // get process capability set
  proc_caps = cap_get_proc();
  if (proc_caps == nullptr) {
    // snapshot errno before logging: the logging path may overwrite it
    const int err = errno;
    dout(1) << " cap_get_proc failed with errno: " << err << dendl;
    return -err;
  }
  {
    char *cap_str = cap_to_text(proc_caps, 0);
    if (cap_str != nullptr){
      dout(10) << " cap_get_proc yields: " << cap_str << dendl;
      cap_free(cap_str);
    }
  }
  // iterate over capabilities
  for (int i = 0; i <= CAP_LAST_CAP; ++i) {
    cap_flag_value_t val;
    if (cap_get_flag(merge_caps, i, CAP_PERMITTED, &val) < 0) {
      return -errno;
    }
    if (val == CAP_CLEAR) {
      if (cap_get_flag(proc_caps, i, CAP_PERMITTED, &val) < 0) {
        return -errno;
      }
      if (val != CAP_CLEAR) {
        // if bit clear in merged set, but set in process set, clear in process set
        changed = true;
        cap_value_t arr[1];
        arr[0] = i;
        if (cap_set_flag(proc_caps, CAP_PERMITTED, 1, arr, CAP_CLEAR) < 0) {
          return -errno;
        }
        if (cap_set_flag(proc_caps, CAP_EFFECTIVE, 1, arr, CAP_CLEAR) < 0) {
          return -errno;
        }
      }
    }
  }
  // apply reduced capability set to process
  if (changed) {
    char *cap_str = cap_to_text(proc_caps, 0);
    if (cap_str != nullptr){
      dout(10) << " new caps for cap_set_proc: " << cap_str << dendl;
      cap_free(cap_str);
    }
    if (cap_set_proc(proc_caps) < 0) {
      // snapshot errno before logging: the logging path may overwrite it
      const int err = errno;
      dout(1) << " cap_set_proc failed with errno: " << err << dendl;
      return -err;
    }
  }
  return 0;
}
// Collect the capabilities required by the loaded plugins and drop every
// other capability from this process.
// Returns 0 on success, -errno if the capability set cannot be allocated, or
// the error reported by get_required_caps() / trim_caps().
int limit_caps(CephContext *cct)
{
  // merged set of everything the plugins asked for
  cap_t required = cap_init();
  if (required == nullptr) {
    return -errno;
  }
  auto free_required_on_return = make_scope_guard([&] {
    cap_free(required);
  });

  const int rc = get_required_caps(cct, required);
  return rc != 0 ? rc : trim_caps(cct, required);
}
#endif
// Load every plugin named in the "osd_extblkdev_plugins" option into the
// plugin registry (each as type "extblkdev", name "ebd_<plugin>").
// On Linux, when not running with effective uid 0, the process capabilities
// are afterwards limited to what the loaded plugins require.
// Returns 0 on success, or the first non-zero code from the registry load or
// from limit_caps().
int preload(CephContext *cct)
{
  const std::string configured =
    cct->_conf.get_val<std::string>("osd_extblkdev_plugins");
  dout(10) << "starting preload of extblkdev plugins: " << configured << dendl;

  std::list<std::string> names;
  get_str_list(configured, names);

  auto registry = cct->get_plugin_registry();
  {
    std::lock_guard l(registry->lock);
    for (const auto& plugin_name : names) {
      dout(10) << "starting load of extblkdev plugin: " << plugin_name << dendl;
      const int rc = registry->load("extblkdev", std::string("ebd_") + plugin_name);
      if (rc != 0) {
        derr << __func__ << " failed preloading extblkdev plugin: " << plugin_name << dendl;
        return rc;
      }
      dout(10) << "successful load of extblkdev plugin: " << plugin_name << dendl;
    }
  }
#ifdef __linux__
  // if we are still running as root, we do not need to trim capabilities
  // as we are intended to use the privileges
  return geteuid() == 0 ? 0 : limit_caps(cct);
#else
  return 0;
#endif
}
// Offer logdevname to each loaded extblkdev plugin in turn; the first plugin
// whose factory() returns 0 provides ebd_impl.
// Returns 0 on success, -ENOENT when no extblkdev plugins are registered or
// a registered entry is not an ExtBlkDevPlugin, and otherwise the code
// returned by the last factory() that was tried.
int detect_device(CephContext *cct,
                  const std::string &logdevname,
                  ExtBlkDevInterfaceRef& ebd_impl)
{
  auto registry = cct->get_plugin_registry();
  std::lock_guard l(registry->lock);
  auto ebd_plugins = registry->plugins.find("extblkdev");
  if (ebd_plugins == registry->plugins.end()) {
    return -ENOENT;
  }

  int rc = -ENOENT;
  for (auto& entry : ebd_plugins->second) {
    dout(10) << __func__ << " Trying to detect block device " << logdevname
             << " using plugin " << entry.first << dendl;
    auto plugin = dynamic_cast<ExtBlkDevPlugin*>(entry.second);
    if (plugin == nullptr) {
      derr << __func__ << " Is not an extblkdev plugin: " << entry.first << dendl;
      return -ENOENT;
    }
    rc = plugin->factory(logdevname, ebd_impl);
    if (rc != 0) {
      continue;
    }
    // first plugin that recognizes the device wins
    dout(1) << __func__ << " using plugin " << entry.first << ", " << "volume " << ebd_impl->get_devname()
            << " maps to " << logdevname << dendl;
    return 0;
  }

  dout(10) << __func__ << " no plugin volume maps to " << logdevname << dendl;
  return rc;
}
// Drop the caller's reference to the device object; always returns 0.
int release_device(ExtBlkDevInterfaceRef& ebd_impl)
{
  // reset() on an empty reference is a no-op, so no emptiness test is needed
  ebd_impl.reset();
  return 0;
}
}
}

View File

@ -0,0 +1,38 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph distributed storage system
*
* (C) Copyright IBM Corporation 2022
* Author: Martin Ohmacht <mohmacht@us.ibm.com>
*
* Based on the file ceph/src/erasure-code/ErasureCodePlugin.h
* Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
* Copyright (C) 2014 Red Hat <contact@redhat.com>
*
* Author: Loic Dachary <loic@dachary.org>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*/
#ifndef CEPH_EXT_BLK_DEV_PLUGIN_H
#define CEPH_EXT_BLK_DEV_PLUGIN_H
#include "ExtBlkDevInterface.h"
namespace ceph {
namespace extblkdev {
// Preload the set of extblkdev plugins defined in the configuration.
// Returns 0 on success, a non-zero error code otherwise.
int preload(CephContext *cct);
// Scan the loaded extblkdev plugins for one that supports logdevname.
// On success (return 0) ebd_impl refers to the object created by the plugin.
int detect_device(CephContext *cct,
const std::string &logdevname,
ExtBlkDevInterfaceRef& ebd_impl);
// Release the device object referenced by ebd_impl.
int release_device(ExtBlkDevInterfaceRef& ebd_impl);
}
}
#endif

View File

@ -0,0 +1,9 @@
# vdo plugin
set(vdo_srcs
ExtBlkDevPluginVdo.cc
ExtBlkDevVdo.cc
)
add_library(ceph_ebd_vdo SHARED ${vdo_srcs})
install(TARGETS ceph_ebd_vdo DESTINATION ${extblkdev_plugin_dir})

View File

@ -0,0 +1,59 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* (C) Copyright IBM Corporation 2022
* Author: Martin Ohmacht <mohmacht@us.ibm.com>
*
* Based on the file src/erasure-code/clay/ErasureCodePluginClay.cc
* Copyright (C) 2018 Indian Institute of Science <office.ece@iisc.ac.in>
*
* Author: Myna Vajha <mynaramana@gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*/
#include "ceph_ver.h"
#include "ExtBlkDevPluginVdo.h"
#include "common/ceph_context.h"
// This plugin does not require any capabilities to be set:
// caps is left unmodified and 0 (success) is returned.
int ExtBlkDevPluginVdo::get_required_cap_set(cap_t caps)
{
return 0;
}
int ExtBlkDevPluginVdo::factory(const std::string& logdevname,
ceph::ExtBlkDevInterfaceRef& ext_blk_dev)
{
auto vdo = new ExtBlkDevVdo(cct);
int r = vdo->init(logdevname);
if (r != 0) {
delete vdo;
return r;
}
ext_blk_dev.reset(vdo);
return 0;
};
// Plugin entry point: returns the version string (CEPH_GIT_NICE_VER) this
// plugin was built with.
const char *__ceph_plugin_version() { return CEPH_GIT_NICE_VER; }
// Plugin entry point: create the VDO plugin object and register it with the
// plugin registry of cct under the given type and name.
//
// @return the result of the registry's add(); 0 on success
int __ceph_plugin_init(CephContext *cct,
                       const std::string& type,
                       const std::string& name)
{
  // A plain `new` throws on allocation failure rather than returning null,
  // so no null check is needed. The unique_ptr keeps ownership until the
  // registry has accepted the plugin.
  auto plg = std::make_unique<ExtBlkDevPluginVdo>(cct);
  const int rc = cct->get_plugin_registry()->add(type, name, plg.get());
  if (rc == 0) {
    // the plugin was accepted by the registry; as in the original code, it
    // is no longer deleted here
    plg.release();
  }
  return rc;
}

View File

@ -0,0 +1,34 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph distributed storage system
*
* (C) Copyright IBM Corporation 2022
* Author: Martin Ohmacht <mohmacht@us.ibm.com>
*
* Based on the file src/erasure-code/clay/ErasureCodePluginClay.h
* Copyright (C) 2018 Indian Institute of Science <office.ece@iisc.ac.in>
*
* Author: Myna Vajha <mynaramana@gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*/
#ifndef CEPH_EXT_BLK_DEV_PLUGIN_VDO_H
#define CEPH_EXT_BLK_DEV_PLUGIN_VDO_H
#include "ExtBlkDevVdo.h"
// Extended block device plugin for VDO volumes.
class ExtBlkDevPluginVdo : public ceph::ExtBlkDevPlugin {
public:
explicit ExtBlkDevPluginVdo(CephContext *cct) : ExtBlkDevPlugin(cct) {}
// Report the capabilities this plugin needs in the permitted set.
int get_required_cap_set(cap_t caps) override;
// Create a device object for logdevname; ext_blk_dev receives it on success.
int factory(const std::string& logdevname,
ceph::ExtBlkDevInterfaceRef& ext_blk_dev) override;
};
#endif

View File

@ -0,0 +1,156 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* (C) Copyright IBM Corporation 2022
* Author: Martin Ohmacht <mohmacht@us.ibm.com>
*
* Based on the file ceph/src/common/blkdev.cc
* Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#include "ExtBlkDevVdo.h"
#include "common/blkdev.h"
#include "include/stringify.h"
#include <errno.h>
#include "common/debug.h"
#define dout_subsys ceph_subsys_bdev
#define dout_context cct
#undef dout_prefix
#define dout_prefix *_dout << "vdo(" << this << ") "
int ExtBlkDevVdo::_get_vdo_stats_handle(const std::string& devname)
{
int rc = -ENOENT;
dout(10) << __func__ << " VDO init checking device: " << devname << dendl;
// we need to go from the raw devname (e.g., dm-4) to the VDO volume name.
// currently the best way seems to be to look at /dev/mapper/* ...
std::string expect = std::string("../") + devname; // expected symlink target
DIR *dir = ::opendir("/dev/mapper");
if (!dir) {
return -errno;
}
struct dirent *de = nullptr;
while ((de = ::readdir(dir))) {
if (de->d_name[0] == '.')
continue;
char fn[4096], target[4096];
snprintf(fn, sizeof(fn), "/dev/mapper/%s", de->d_name);
int r = readlink(fn, target, sizeof(target));
if (r < 0 || r >= (int)sizeof(target))
continue;
target[r] = 0;
if (expect == target) {
snprintf(fn, sizeof(fn), "/sys/kvdo/%s/statistics", de->d_name);
int vdo_fd = ::open(fn, O_RDONLY|O_CLOEXEC);
if (vdo_fd >= 0) {
name = de->d_name;
vdo_dir_fd = vdo_fd;
rc = 0;
break;
}
}
}
closedir(dir);
return rc;
}
// Search logdevname and, for device-mapper ("dm-*") devices, their
// constituent devices for a VDO volume.
// Returns 0 as soon as one is found, -ENOENT when the search is exhausted.
int ExtBlkDevVdo::get_vdo_stats_handle()
{
  std::set<std::string> pending;
  pending.insert(logdevname);
  while (!pending.empty()) {
    auto first = pending.begin();
    std::string current = *first;
    pending.erase(first);

    if (_get_vdo_stats_handle(current) == 0) {
      // yay, it's vdo
      return 0;
    }
    // ok, see if there are constituent devices
    if (current.compare(0, 3, "dm-") == 0) {
      get_dm_parents(current, &pending);
    }
  }
  return -ENOENT;
}
// Read the statistics file `property` relative to vdo_dir_fd and parse its
// leading text as a decimal integer with atoll().
// Returns 0 when the file cannot be opened or nothing could be read.
int64_t ExtBlkDevVdo::get_vdo_stat(const char *property)
{
  const int stat_fd = ::openat(vdo_dir_fd, property, O_RDONLY|O_CLOEXEC);
  if (stat_fd < 0) {
    return 0;
  }

  char text[1024];
  // leave one byte free for the terminating NUL
  const int nread = ::read(stat_fd, text, sizeof(text) - 1);
  int64_t value = 0;
  if (nread > 0) {
    text[nread] = 0;
    value = atoll(text);
  }
  VOID_TEMP_FAILURE_RETRY(::close(stat_fd));
  return value;
}
// Remember the logical device name and try to locate the VDO statistics
// directory for it.
// Returns the result of get_vdo_stats_handle(): 0 if a VDO volume was
// found, -ENOENT otherwise.
int ExtBlkDevVdo::init(const std::string& alogdevname)
{
logdevname = alogdevname;
// get directory handle for VDO metadata
return get_vdo_stats_handle();
}
// Fill `state` from the VDO statistics files.
// Returns -1 (after logging the values read) when block_size,
// physical_blocks, overhead_blocks_used, data_blocks_used or logical_blocks
// reads as zero; otherwise stores block_size-scaled totals and available
// amounts in `state` and returns 0.
int ExtBlkDevVdo::get_state(ceph::ExtBlkDevState& state)
{
  const int64_t blk_size = get_vdo_stat("block_size");
  const int64_t phys_blocks = get_vdo_stat("physical_blocks");
  const int64_t overhead_used = get_vdo_stat("overhead_blocks_used");
  const int64_t data_used = get_vdo_stat("data_blocks_used");
  const int64_t log_blocks = get_vdo_stat("logical_blocks");
  // not part of the zero check below: zero is accepted for this one
  const int64_t log_used = get_vdo_stat("logical_blocks_used");

  const bool all_present = blk_size != 0
    && phys_blocks != 0
    && overhead_used != 0
    && data_used != 0
    && log_blocks != 0;
  if (!all_present) {
    dout(1) << __func__ << " VDO sysfs provided zero value for at least one statistic: " << dendl;
    dout(1) << __func__ << " VDO block_size: " << blk_size << dendl;
    dout(1) << __func__ << " VDO physical_blocks: " << phys_blocks << dendl;
    dout(1) << __func__ << " VDO overhead_blocks_used: " << overhead_used << dendl;
    dout(1) << __func__ << " VDO data_blocks_used: " << data_used << dendl;
    dout(1) << __func__ << " VDO logical_blocks: " << log_blocks << dendl;
    return -1;
  }

  const int64_t phys_free = phys_blocks - overhead_used - data_used;
  const int64_t log_free = log_blocks - log_used;
  state.set_logical_total(blk_size * log_blocks);
  state.set_logical_avail(blk_size * log_free);
  state.set_physical_total(blk_size * phys_blocks);
  state.set_physical_avail(blk_size * phys_free);
  return 0;
}
int ExtBlkDevVdo::collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm)
{
ceph::ExtBlkDevState state;
int rc = get_state(state);
if(rc != 0){
return rc;
}
(*pm)[prefix + "vdo"] = "true";
(*pm)[prefix + "vdo_physical_size"] = stringify(state.get_physical_total());
return 0;
}

View File

@ -0,0 +1,52 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* (C) Copyright IBM Corporation 2022
* Author: Martin Ohmacht <mohmacht@us.ibm.com>
*
* Based on the file ceph/src/common/blkdev.cc
* Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
*
* And also based on the file src/erasure-code/clay/ErasureCodeClay.h
* Copyright (C) 2018 Indian Institute of Science <office.ece@iisc.ac.in>
*
* Author: Myna Vajha <mynaramana@gmail.com>
*
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
#ifndef CEPH_EXT_BLK_DEV_VDO_H
#define CEPH_EXT_BLK_DEV_VDO_H
#include "extblkdev/ExtBlkDevInterface.h"
#include "include/compat.h"
class ExtBlkDevVdo final : public ceph::ExtBlkDevInterface
{
int vdo_dir_fd = -1; ///< fd for vdo sysfs directory
std::string name; // name of the underlying vdo device
std::string logdevname; // name of the top level logical device
CephContext *cct;
public:
explicit ExtBlkDevVdo(CephContext *cct) : cct(cct) {}
~ExtBlkDevVdo(){
if(vdo_dir_fd >= 0)
VOID_TEMP_FAILURE_RETRY(::close(vdo_dir_fd));
}
int _get_vdo_stats_handle(const std::string& devname);
int get_vdo_stats_handle();
int64_t get_vdo_stat(const char *property);
virtual int init(const std::string& logdevname);
virtual const std::string& get_devname() const {return name;}
virtual int get_state(ceph::ExtBlkDevState& state);
virtual int collect_metadata(const std::string& prefix, std::map<std::string,std::string> *pm);
};
#endif

View File

@ -22,6 +22,7 @@
#include "common/signal.h"
#include "common/version.h"
#include "erasure-code/ErasureCodePlugin.h"
#include "extblkdev/ExtBlkDevPlugin.h"
#include "global/global_context.h"
#include "global/global_init.h"
#include "global/pidfile.h"
@ -317,6 +318,13 @@ global_init(const std::map<std::string,std::string> *defaults,
<< std::endl;
exit(1);
}
#if defined(HAVE_SYS_PRCTL_H)
if (g_conf().get_val<bool>("set_keepcaps")) {
if (prctl(PR_SET_KEEPCAPS, 1) == -1) {
cerr << "warning: unable to set keepcaps flag: " << cpp_strerror(errno) << std::endl;
}
}
#endif
if (setuid(uid) != 0) {
cerr << "unable to setuid " << uid << ": " << cpp_strerror(errno)
<< std::endl;

View File

@ -10421,15 +10421,16 @@ void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
- buf->omap_allocated;
}
uint64_t thin_total, thin_avail;
if (bdev->get_thin_utilization(&thin_total, &thin_avail)) {
buf->total += thin_total;
ExtBlkDevState ebd_state;
int rc = bdev->get_ebd_state(ebd_state);
if (rc == 0) {
buf->total += ebd_state.get_physical_total();
// we are limited by both the size of the virtual device and the
// underlying physical device.
bfree = std::min(bfree, thin_avail);
bfree = std::min(bfree, ebd_state.get_physical_avail());
buf->allocated = thin_total - thin_avail;
buf->allocated = ebd_state.get_physical_total() - ebd_state.get_physical_avail();;
} else {
buf->total += bdev->get_size();
}

View File

@ -731,10 +731,10 @@ void FileStore::collect_metadata(map<string,string> *pm)
(*pm)["backend_filestore_dev_node"] = string(dev_node);
devname = dev_node;
}
if (rc == 0 && vdo_fd >= 0) {
(*pm)["vdo"] = "true";
(*pm)["vdo_physical_size"] =
stringify(4096 * get_vdo_stat(vdo_fd, "physical_blocks"));
// if compression device detected, collect meta data for device
// VDO specific meta data has moved into VDO plugin
if (rc == 0 && ebd_impl) {
ebd_impl->collect_metadata("", pm);
}
if (journal) {
journal->collect_metadata(pm);
@ -778,12 +778,19 @@ int FileStore::statfs(struct store_statfs_t *buf0, osd_alert_list_t* alerts)
buf0->omap_allocated += object_map->get_db()->get_estimated_size(kv_usage);
}
uint64_t thin_total, thin_avail;
if (get_vdo_utilization(vdo_fd, &thin_total, &thin_avail)) {
buf0->total = thin_total;
bfree = std::min(bfree, thin_avail);
buf0->allocated = thin_total - thin_avail;
buf0->data_stored = bfree;
if (ebd_impl) {
ExtBlkDevState state;
int rc = ebd_impl->get_state(state);
if (rc == 0){
buf0->total = state.get_physical_total();
bfree = std::min(bfree, state.get_physical_avail());
buf0->allocated = state.get_physical_total() - state.get_physical_avail();
buf0->data_stored = bfree;
} else {
buf0->total = buf.f_blocks * buf.f_bsize;
buf0->allocated = bfree;
buf0->data_stored = bfree;
}
} else {
buf0->total = buf.f_blocks * buf.f_bsize;
buf0->allocated = bfree;
@ -1287,16 +1294,11 @@ int FileStore::_detect_fs()
return r;
}
// vdo
{
char dev_node[PATH_MAX];
if (int rc = BlkDev{fsid_fd}.wholedisk(dev_node, PATH_MAX); rc == 0) {
vdo_fd = get_vdo_stats_handle(dev_node, &vdo_name);
if (vdo_fd >= 0) {
dout(0) << __func__ << " VDO volume " << vdo_name << " for " << dev_node
<< dendl;
}
}
// check if any extended block device plugin recognizes this device
// detect_vdo has moved into the VDO plugin
int rc = extblkdev::detect_device(cct, devname, ebd_impl);
if (rc != 0) {
dout(20) << __func__ << " no plugin volume maps to " << devname << dendl;
}
// test xattrs
@ -2092,10 +2094,7 @@ int FileStore::umount()
(*it)->stop();
}
if (vdo_fd >= 0) {
VOID_TEMP_FAILURE_RETRY(::close(vdo_fd));
vdo_fd = -1;
}
extblkdev::release_device(ebd_impl);
if (fsid_fd >= 0) {
VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
fsid_fd = -1;

View File

@ -47,6 +47,7 @@
#include "WBThrottle.h"
#include "include/uuid.h"
#include "extblkdev/ExtBlkDevPlugin.h"
#if defined(__linux__)
# ifndef BTRFS_SUPER_MAGIC
@ -162,8 +163,7 @@ private:
std::string devname;
int vdo_fd = -1;
std::string vdo_name;
ExtBlkDevInterfaceRef ebd_impl; // structure for retrieving compression state from extended block device
deque<uint64_t> snaps;