From 02f5543c8df9e2a175ce5d9a96773b641e212c1c Mon Sep 17 00:00:00 2001 From: Milind Changire Date: Thu, 27 Jun 2019 19:05:26 +0530 Subject: [PATCH 01/18] libradosstriper: add function to read into char* Signed-off-by: Milind Changire Signed-off-by: Patrick Donnelly --- src/include/radosstriper/libradosstriper.hpp | 1 + src/libradosstriper/libradosstriper.cc | 22 ++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/include/radosstriper/libradosstriper.hpp b/src/include/radosstriper/libradosstriper.hpp index fb790b0d7ef..064c7d2c3d8 100644 --- a/src/include/radosstriper/libradosstriper.hpp +++ b/src/include/radosstriper/libradosstriper.hpp @@ -168,6 +168,7 @@ namespace libradosstriper * synchronously read from the striped object at the specified offset. */ int read(const std::string& soid, ceph::bufferlist* pbl, size_t len, uint64_t off); + int read(const std::string& soid, char *buf, size_t len, uint64_t off); /** * asynchronously read from the striped object at the specified offset. diff --git a/src/libradosstriper/libradosstriper.cc b/src/libradosstriper/libradosstriper.cc index e98dfc17935..7f23a380550 100644 --- a/src/libradosstriper/libradosstriper.cc +++ b/src/libradosstriper/libradosstriper.cc @@ -254,6 +254,28 @@ int libradosstriper::RadosStriper::read(const std::string& soid, return rados_striper_impl->read(soid, bl, len, off); } +int libradosstriper::RadosStriper::read(const std::string& soid, + char *buf, + size_t len, + uint64_t off) +{ + bufferlist bl; + bufferptr bp = buffer::create_static(len, buf); + + bl.push_back(bp); + + int ret = rados_striper_impl->read(soid, &bl, len, off); + + if (ret >= 0) { + if (bl.length() > len) + return -ERANGE; + if (!bl.is_provided_buffer(buf)) + bl.begin().copy(bl.length(), buf); + ret = bl.length(); // hrm :/ + } + return ret; +} + int libradosstriper::RadosStriper::aio_read(const std::string& soid, librados::AioCompletion *c, bufferlist* bl, From 1bc229381af97d684c86a0625d5ab9699889ada4 Mon Sep 17 00:00:00 2001 From: Milind Changire Date: Tue, 16 Jul 2019 11:41:25 +0530 Subject: [PATCH 02/18] libcephsqlite: sqlite interface to RADOS This library provides a SQLite front-end to the RADOS objects. This effort will help alleviate the restriction on number of key-value pairs that can be stored in an object. This interface is a generic one without any constraint on the database schema either. Library clients can enforce any schema and use SQLite API to store data in the database backed by RADOS Objects. Signed-off-by: Milind Changire --- CMakeLists.txt | 1 + src/CMakeLists.txt | 14 + src/libcephsqlite.cc | 927 +++++++++++++++++++++++++++++++++ src/libcephsqlite.h | 24 + src/test/test_libcephsqlite.cc | 433 +++++++++++++++ 5 files changed, 1399 insertions(+) create mode 100644 src/libcephsqlite.cc create mode 100644 src/libcephsqlite.h create mode 100644 src/test/test_libcephsqlite.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index f3512d60ce4..4e41b367ff3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -280,6 +280,7 @@ endif(WITH_QATZIP) # needs mds and? XXX option(WITH_LIBCEPHFS "libcephfs client library" ON) +option(WITH_LIBCEPHSQLITE "libcephsqlite client library" ON) # key-value store option(WITH_KVS "Key value store is here" ON) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bb139e25f4c..cd387786d8c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -803,6 +803,20 @@ if(WITH_LIBCEPHFS) endif() endif(WITH_LIBCEPHFS) +if(WITH_LIBCEPHSQLITE) + set(libcephsqlite_srcs libcephsqlite.cc) + add_library(cephsqlite ${CEPH_SHARED} ${libcephsqlite_srcs}) + target_link_libraries(cephsqlite PRIVATE client ceph-common + ${CRYPTO_LIBS} ${EXTRALIBS}) + if(ENABLE_SHARED) + set_target_properties(cephsqlite PROPERTIES + OUTPUT_NAME cephsqlite + VERSION 1.0.0 + SOVERSION 1) + endif(ENABLE_SHARED) + install(TARGETS cephsqlite DESTINATION ${CMAKE_INSTALL_LIBDIR}) +endif(WITH_LIBCEPHSQLITE) + if(WITH_FUSE) set(ceph_fuse_srcs ceph_fuse.cc diff --git a/src/libcephsqlite.cc b/src/libcephsqlite.cc new file mode 100644 index 00000000000..f2a52b30f6f --- /dev/null +++ b/src/libcephsqlite.cc @@ -0,0 +1,927 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "include/rados/librados.hpp" +#include "include/radosstriper/libradosstriper.hpp" +#include "common/ceph_mutex.h" + +static int _cephsqlite3_Close(sqlite3_file *pFile); +static int _cephsqlite3_Read(sqlite3_file *pFile, void *zBuf, int iAmt, sqlite_int64 iOfst); +static int _cephsqlite3_Write(sqlite3_file *pFile, const void *zBuf, int iAmt, sqlite_int64 iOfst); +static int _cephsqlite3_Truncate(sqlite3_file *pFile, sqlite_int64 size); +static int _cephsqlite3_Sync(sqlite3_file *pFile, int flags); +static int _cephsqlite3_FileSize(sqlite3_file *pFile, sqlite_int64 *pSize); +static int _cephsqlite3_Lock(sqlite3_file *pFile, int eLock); +static int _cephsqlite3_Unlock(sqlite3_file *pFile, int eLock); +static int _cephsqlite3_CheckReservedLock(sqlite3_file *pFile, int *pResOut); +static int _cephsqlite3_FileControl(sqlite3_file *pFile, int op, void *pArg); +static int _cephsqlite3_SectorSize(sqlite3_file *pFile); +static int _cephsqlite3_DeviceCharacteristics(sqlite3_file *pFile); + +static int _cephsqlite3_Open(sqlite3_vfs *pVfs, const char *zName, sqlite3_file *pFile, int flags, int *pOutFlags); +static int _cephsqlite3_Delete(sqlite3_vfs *pVfs, const char *zPath, int dirSync); +static int _cephsqlite3_Access(sqlite3_vfs *pVfs, const char *zPath, int flags, int *pResOut); +static int _cephsqlite3_FullPathname(sqlite3_vfs *pVfs, const char *zPath, int nPathOut, char *zPathOut); +static void *_cephsqlite3_DlOpen(sqlite3_vfs *pVfs, const char *zPath); +static void _cephsqlite3_DlError(sqlite3_vfs *pVfs, int nByte, char *zErrMsg); +static void (*_cephsqlite3_DlSym(sqlite3_vfs *pVfs, void *pH, const char *z))(void); +static void _cephsqlite3_DlClose(sqlite3_vfs *pVfs, void *pHandle); +static int _cephsqlite3_Randomness(sqlite3_vfs *pVfs, int nByte, char *zByte); +static int _cephsqlite3_Sleep(sqlite3_vfs *pVfs, int nMicro); +static int _cephsqlite3_CurrentTime(sqlite3_vfs *pVfs, double *pTime); + +static sqlite3_vfs *_cephsqlite3__vfs(void); +static librados::IoCtx *get_io_ctx(const char *zPath); +static libradosstriper::RadosStriper *get_radosstriper(const char *zPath); +static std::string get_db_name(const char *zName); +static int get_lock_type(const std::string &db_name, const std::string &file_name); +static void set_lock_type(const std::string &db_name, const std::string &file_name, int eLock); +static std::string get_lock_file_name(const std::string &db_name, int eLock); +static int create_lock_files(librados::IoCtx *io_ctx, const std::string &db_name, bool must_create); +static int remove_lock_files(librados::IoCtx *io_ctx, const std::string &db_file); +static int lock_file_in_rados(librados::IoCtx *io_ctx, const std::string &lock_file_name, const std::string &lock_name, bool exclusive); + + +struct CephVFSContext { + librados::IoCtx *io_ctx = nullptr; + libradosstriper::RadosStriper *rs = nullptr; + /* file name -> lock type map (SQLITE_LOCK_NONE, etc.) + * file names include main as well as temporary files: eg. *-journal, *-wal, etc.) + */ + std::map lock_info; +}; + +struct CephFile { + sqlite3_file base; + const char *name = nullptr; + int file_open_flags = 0; +}; + +// map to hold pointers to vfs contexts for various databases +static ceph::mutex vfs_context_map_mutex; // ("vfs_context_map_mutex"); +static std::map vfs_context_map; + +static const std::string lock_name = "cephsqlite3_vfs_lock"; +static const std::string emptystr = ""; + + +/* +** Close a file. +*/ +static +int _cephsqlite3_Close(sqlite3_file *pFile) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + + CephFile *p = (CephFile*)pFile; + + // if only we are closing the main database + if (get_db_name(p->name) == p->name) { + CephVFSContext *cc = nullptr; + + vfs_context_map_mutex.lock(); + cc = vfs_context_map[std::string(p->name)]; + if (cc) + vfs_context_map.erase(std::string(p->name)); + vfs_context_map_mutex.unlock(); + + if (cc) { + delete cc->rs; + if (cc->io_ctx) { + cc->io_ctx->close(); + delete cc->io_ctx; + } + delete cc; + } + } + + return SQLITE_OK; +} + +/* +** Read data from a file. +*/ +static +int _cephsqlite3_Read(sqlite3_file *pFile, void *zBuf, int iAmt, sqlite_int64 iOfst) +{ + CephFile *p = (CephFile*)pFile; + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << "read(" << p->name << ", " << std::dec << iAmt << ", " << std::dec << iOfst << ")" << std::endl; + libradosstriper::RadosStriper *rs = get_radosstriper(get_db_name(p->name).c_str()); + + /* ceph::bufferlist buffer pointers are all char* */ + char *b = static_cast(zBuf); + if (rs->read(p->name, b, iAmt, iOfst) < 0) { + // int e = errno; + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::errno:" << e << std::endl; + return SQLITE_IOERR_READ; + } + + return SQLITE_OK; +} + +/* +** Write data to a crash-file. +*/ +static +int _cephsqlite3_Write(sqlite3_file *pFile, const void *zBuf, int iAmt, sqlite_int64 iOfst) +{ + CephFile *p = (CephFile*)pFile; + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << "write(" << p->name << ", " << std::dec << iAmt << ", " << std::dec << iOfst << ")" << std::endl; + libradosstriper::RadosStriper *rs = get_radosstriper(get_db_name(p->name).c_str()); + + /* ceph::bufferlist buffer pointers are all char* */ + char *b = reinterpret_cast(const_cast(zBuf)); + ceph::bufferlist bl = ceph::bufferlist::static_from_mem(b, iAmt); + + if (rs->write(p->name, bl, iAmt, iOfst) < 0) + return SQLITE_IOERR; + + return SQLITE_OK; +} + +/* +** Truncate a file. This is a no-op for this VFS (see header comments at +** the top of the file). +*/ +static +int _cephsqlite3_Truncate(sqlite3_file *pFile, sqlite_int64 size) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + CephFile *p = (CephFile*)pFile; + libradosstriper::RadosStriper *rs = get_radosstriper(get_db_name(p->name).c_str()); + + if (rs->trunc(p->name, size) != 0) + return SQLITE_IOERR; + + return SQLITE_OK; +} + +/* +** Sync the contents of the file to the persistent media. +*/ +static +int _cephsqlite3_Sync(sqlite3_file *pFile, int flags) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + return SQLITE_OK; +} + + +/* +** Write the size of the file in bytes to *pSize. +*/ +static +int _cephsqlite3_FileSize(sqlite3_file *pFile, sqlite_int64 *pSize) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + CephFile *p = (CephFile*)pFile; + libradosstriper::RadosStriper *rs = get_radosstriper(get_db_name(p->name).c_str()); + + uint64_t size = 0; + time_t mtime = 0; + int rc = rs->stat(p->name, &size, &mtime); + + *pSize = 0; + + if (rc == 0) + *pSize = (sqlite_int64)size; + + return SQLITE_OK; +} + +#define CASE(x) case x: return #x + +static +const char *lock_type_str(int eLock) +{ + switch (eLock) { + CASE(SQLITE_LOCK_NONE); + CASE(SQLITE_LOCK_SHARED); + CASE(SQLITE_LOCK_RESERVED); + CASE(SQLITE_LOCK_PENDING); + CASE(SQLITE_LOCK_EXCLUSIVE); + } + return "UNKNOWN"; +} + +static +std::string get_lock_file_name(const std::string &db_name, int eLock) +{ + std::string lock_file_name = db_name; + lock_file_name += "-"; + lock_file_name += lock_type_str(eLock); + return lock_file_name; +} + +std::string get_cookie() +{ + std::stringstream ss; + + ss << std::hex << std::setfill('0') << std::setw(16) << "0x" << pthread_self(); + + return ss.str(); +} + +/* +** Locking functions. The xLock() and xUnlock() methods are both no-ops. +** The xCheckReservedLock() always indicates that no other process holds +** a reserved lock on the database file. This ensures that if a hot-journal +** file is found in the file-system it is rolled back. +*/ +static +int _cephsqlite3_Lock(sqlite3_file *pFile, int eLock) +{ + CephFile *p = (CephFile *)pFile; + + assert(p != NULL); + + std::string db_name = get_db_name(p->name); + std::string db_file = p->name; + /* Make sure the locking sequence is correct. + ** (1) We never move from unlocked to anything higher than shared lock. + ** (2) SQLite never explicitly requests a pendig lock. + ** (3) A shared lock is always held when a reserve lock is requested. + */ + int curr_lock_type = get_lock_type(db_name, db_file); + + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::eLock " << p->name << " from:" << lock_type_str(curr_lock_type) << " to:" << lock_type_str(eLock) << std::endl; + + assert(curr_lock_type != SQLITE_LOCK_NONE || eLock == SQLITE_LOCK_SHARED); + assert(eLock != SQLITE_LOCK_PENDING); + assert(eLock != SQLITE_LOCK_RESERVED || curr_lock_type == SQLITE_LOCK_SHARED); + + + if (curr_lock_type == eLock) + return SQLITE_OK; + + librados::IoCtx *io_ctx = get_io_ctx(db_name.c_str()); + assert(io_ctx != nullptr); + + std::string lock_file_name; + + if (curr_lock_type == SQLITE_LOCK_NONE) { + if (eLock == SQLITE_LOCK_SHARED) { + lock_file_name = get_lock_file_name(db_file, SQLITE_LOCK_SHARED); + if (lock_file_in_rados(io_ctx, lock_file_name, lock_name, false) != 0) + return SQLITE_BUSY; + set_lock_type(db_name, db_file, SQLITE_LOCK_SHARED); + return SQLITE_OK; + } + return SQLITE_IOERR_RDLOCK; + } else if (curr_lock_type == SQLITE_LOCK_SHARED) { + lock_file_name = get_lock_file_name(db_file, SQLITE_LOCK_SHARED /*SQLITE_LOCK_EXCLUSIVE*/); + + // unlock in shared mode and lock in exclusive mode + if (io_ctx->unlock(lock_file_name, lock_name, get_cookie()) == 0) { + if (lock_file_in_rados(io_ctx, lock_file_name, lock_name, true) != 0) { + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::returning SQLITE_BUSY" << std::endl; + return SQLITE_BUSY; + } + set_lock_type(db_name, db_file, eLock); + return SQLITE_OK; + } + } else if (eLock == SQLITE_LOCK_RESERVED || eLock == SQLITE_LOCK_PENDING || eLock == SQLITE_LOCK_EXCLUSIVE) { + set_lock_type(db_name, db_file, eLock); + return SQLITE_OK; + } + + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::returning SQLITE_IOERR_LOCK" << std::endl; + return SQLITE_IOERR_LOCK; +} + +static +int _cephsqlite3_Unlock(sqlite3_file *pFile, int eLock) +{ + CephFile *p = (CephFile *)pFile; + + assert(p != NULL); + + + std::string db_name = get_db_name(p->name); + std::string db_file = p->name; + + librados::IoCtx *io_ctx = get_io_ctx(db_name.c_str()); + assert(io_ctx != nullptr); + + int curr_lock_type = get_lock_type(db_name, db_file); + + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::eLock " << p->name << " from:" << lock_type_str(curr_lock_type) << " to:" << lock_type_str(eLock) << std::endl; + + if (eLock == curr_lock_type) + return SQLITE_OK; + + assert(eLock < curr_lock_type); + + std::string lock_file_name = get_lock_file_name(db_file, SQLITE_LOCK_SHARED); + if (eLock <= SQLITE_LOCK_SHARED) { + if (io_ctx->unlock(lock_file_name, lock_name, get_cookie()) != 0) + return SQLITE_IOERR_UNLOCK; + } + + if (eLock == SQLITE_LOCK_SHARED) { + if (lock_file_in_rados(io_ctx, lock_file_name, lock_name, false) != 0) + return SQLITE_BUSY; + } + + set_lock_type(db_name, db_file, eLock); + + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << ":: returning SQLITE_OK" << std::endl; + return SQLITE_OK; +} + +static +int _cephsqlite3_CheckReservedLock(sqlite3_file *pFile, int *pResOut) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + CephFile *p = (CephFile *)pFile; + + assert(p != NULL); + + std::string db_name = get_db_name(p->name); + std::string db_file = p->name; + // librados::IoCtx *io_ctx = get_io_ctx(db_name.c_str()); + // assert(io_ctx != nullptr); + int curr_lock_type = get_lock_type(db_name, db_file); + + *pResOut = (curr_lock_type > SQLITE_LOCK_SHARED); + return SQLITE_OK; +} + +/* +** No xFileControl() verbs are implemented by this VFS. +*/ +static +int _cephsqlite3_FileControl(sqlite3_file *pFile, int op, void *pArg) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + return SQLITE_NOTFOUND; +} + +/* +** The xSectorSize() and xDeviceCharacteristics() methods. These two +** may return special values allowing SQLite to optimize file-system +** access to some extent. But it is also safe to simply return 0. +*/ +static +int _cephsqlite3_SectorSize(sqlite3_file *pFile) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + return 4096; +} + +static +int _cephsqlite3_DeviceCharacteristics(sqlite3_file *pFile) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + return 0; +} + +/* +** Open a file handle. +*/ +static +int _cephsqlite3_Open( + sqlite3_vfs *pVfs, /* VFS */ + const char *zName, /* File to open, or 0 for a temp file */ + sqlite3_file *pFile, /* Pointer to DemoFile struct to populate */ + int flags, /* Input SQLITE_OPEN_XXX flags */ + int *pOutFlags /* Output SQLITE_OPEN_XXX flags (or NULL) */ +) +{ + static const sqlite3_io_methods _cephsqlite3_io = { + 1, /* iVersion */ + _cephsqlite3_Close, /* xClose */ + _cephsqlite3_Read, /* xRead */ + _cephsqlite3_Write, /* xWrite */ + _cephsqlite3_Truncate, /* xTruncate */ + _cephsqlite3_Sync, /* xSync */ + _cephsqlite3_FileSize, /* xFileSize */ + _cephsqlite3_Lock, /* xLock */ + _cephsqlite3_Unlock, /* xUnlock */ + _cephsqlite3_CheckReservedLock, /* xCheckReservedLock */ + _cephsqlite3_FileControl, /* xFileControl */ + _cephsqlite3_SectorSize, /* xSectorSize */ + _cephsqlite3_DeviceCharacteristics /* xDeviceCharacteristics */ + }; + + CephFile *p = (CephFile*)pFile; /* Populate this structure */ + + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << "zName:" << zName << std::endl; + + if ((zName == 0) || (strncmp(zName, ":memory:", 8) == 0)) { + /* we are not going to create temporary files */ + return SQLITE_IOERR; + } + + + p->name = zName; /* save the file name */ + p->file_open_flags = flags; + libradosstriper::RadosStriper* rs = get_radosstriper(get_db_name(zName).c_str()); + + // std::cerr << "rs:" << std::hex << rs << std::endl; + + assert(rs != nullptr); + + /* exclusive create + * radosstriper doesn't have a create()! ... since the data is striped, + * radosstriper can't decide beforehand which of the stripe files it needs + * to create + * so here, we just force create the first stripe by writing to offset zero + * so that the rest of the interface functions work as expected + */ + if (flags & SQLITE_OPEN_CREATE) { + // std::cerr << __FUNCTION__ << "::creating database" << std::endl; + char dummy[4096] = {0,}; + ceph::bufferlist bl = ceph::bufferlist::static_from_mem(dummy, sizeof(dummy)); + if (rs->write(p->name, bl, sizeof(dummy), 0) != 0) { + // int e = errno; + // std::cerr << __FUNCTION__ << "::error: during write():errno(" << e << ")" << std::endl; + return SQLITE_IOERR_WRITE; + } + if (rs->trunc(p->name, 0) < 0) { + // int e = errno; + // std::cerr << __FUNCTION__ << "::error: during trunc():errno(" << e << ")" << std::endl; + return SQLITE_IOERR_TRUNCATE; + } + + // we also create the sentinel file which would be for locking operations + librados::IoCtx *io_ctx = get_io_ctx(get_db_name(zName).c_str()); + + if (create_lock_files(io_ctx, std::string(zName), true) != SQLITE_OK) + return SQLITE_ERROR; + } + + if (pOutFlags) { + *pOutFlags = flags; + } + p->base.pMethods = &_cephsqlite3_io; + return SQLITE_OK; +} + +/* +** Delete the file identified by argument zPath. If the dirSync parameter +** is non-zero, then ensure the file-system modification to delete the +** file has been synced to disk before returning. +*/ +static +int _cephsqlite3_Delete(sqlite3_vfs *pVfs, const char *zPath, int dirSync) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + libradosstriper::RadosStriper *rs = get_radosstriper(get_db_name(zPath).c_str()); + librados::IoCtx *io_ctx = get_io_ctx(get_db_name(zPath).c_str()); + + int ret = rs->remove(zPath); + if (ret == 0) { + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::deleted " << zPath << std::endl; + return SQLITE_OK; + } + + remove_lock_files(io_ctx, std::string(zPath)); + + // int e = errno; + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::error deleting " << zPath << " ret:" << ret << ", errno:" << e << std::endl; + return SQLITE_IOERR_DELETE; +} + +/* +** Query the file-system to see if the named file exists, is readable or +** is both readable and writable. +*/ +static +int _cephsqlite3_Access(sqlite3_vfs *pVfs, const char *zPath, int flags, int *pResOut) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << ":zPath:" << zPath << std::endl; + + assert(flags == SQLITE_ACCESS_EXISTS || /* access(zPath, F_OK) */ + flags == SQLITE_ACCESS_READ || /* access(zPath, R_OK) */ + flags == SQLITE_ACCESS_READWRITE /* access(zPath, R_OK|W_OK) */ + ); + + libradosstriper::RadosStriper *rs = get_radosstriper(get_db_name(zPath).c_str()); + + uint64_t size = 0; + time_t mtime = 0; + int rc = rs->stat(zPath, &size, &mtime); + + *pResOut = (rc == 0); + + return SQLITE_OK; +} + +/* +** Argument zPath points to a nul-terminated string containing a file path. +** If zPath is an absolute path, then it is copied as is into the output +** buffer. Otherwise, if it is a relative path, then the equivalent full +** path is written to the output buffer. +** +** This function assumes that paths are UNIX style. Specifically, that: +** +** 1. Path components are separated by a '/'. and +** 2. Full paths begin with a '/' character. +*/ +static +int _cephsqlite3_FullPathname( + sqlite3_vfs *pVfs, /* VFS */ + const char *zPath, /* Input path (possibly a relative path) */ + int nPathOut, /* Size of output buffer in bytes */ + char *zPathOut /* Pointer to output buffer */ +) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << ":zPath:" << zPath << std::endl; + + /* There are no directories to be searched for RADOS obects. + * They are always at the root. + * So, just copy the path if starting with '/' or prefix the path with a '/' + * and copy to output. + */ + if (strlen(zPath) >= (unsigned long)nPathOut) + return SQLITE_ERROR; + + memcpy(zPathOut, zPath, strlen(zPath) + 1); + zPathOut[strlen(zPath)] = '\0'; + + return SQLITE_OK; +} + +/* +** The following four VFS methods: +** +** xDlOpen +** xDlError +** xDlSym +** xDlClose +** +** are supposed to implement the functionality needed by SQLite to load +** extensions compiled as shared objects. This VFS does not support +** this functionality, so the following functions are no-ops. +*/ +static +void *_cephsqlite3_DlOpen(sqlite3_vfs *pVfs, const char *zPath) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + return 0; +} + +static +void _cephsqlite3_DlError(sqlite3_vfs *pVfs, int nByte, char *zErrMsg) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + snprintf(zErrMsg, nByte, "Loadable extensions are not supported"); + zErrMsg[nByte-1] = '\0'; +} + +static +void (*_cephsqlite3_DlSym(sqlite3_vfs *pVfs, void *pH, const char *z))(void) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + return 0; +} + +static +void _cephsqlite3_DlClose(sqlite3_vfs *pVfs, void *pHandle) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + return; +} + +/* +** Parameter zByte points to a buffer nByte bytes in size. Populate this +** buffer with pseudo-random data. +*/ +static +int _cephsqlite3_Randomness(sqlite3_vfs *pVfs, int nByte, char *zByte) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + int fd = open("/dev/urandom", O_RDONLY); + if (fd >= 0) { + if (read(fd, zByte, nByte) == nByte) { + close(fd); + return SQLITE_OK; + } + close(fd); + } + return SQLITE_ERROR; +} + +/* +** Sleep for at least nMicro microseconds. Return the (approximate) number +** of microseconds slept for. +*/ +static +int _cephsqlite3_Sleep(sqlite3_vfs *pVfs, int nMicro) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + sleep(nMicro / 1000000); + usleep(nMicro % 1000000); + return nMicro; +} + +/* +** Set *pTime to the current UTC time expressed as a Julian day. Return +** SQLITE_OK if successful, or an error code otherwise. +** +** http://en.wikipedia.org/wiki/Julian_day +** +** This implementation is not very good. The current time is rounded to +** an integer number of seconds. Also, assuming time_t is a signed 32-bit +** value, it will stop working some time in the year 2038 AD (the so-called +** "year 2038" problem that afflicts systems that store time this way). +*/ +static +int _cephsqlite3_CurrentTime(sqlite3_vfs *pVfs, double *pTime) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + time_t t = time(0); + *pTime = t/86400.0 + 2440587.5; + return SQLITE_OK; +} + +static +sqlite3_vfs *_cephsqlite3__vfs(void) +{ + static sqlite3_vfs _cephsqlite3_vfs = { + 1, /* iVersion */ + sizeof(CephFile), /* szOsFile */ + PATH_MAX, /* mxPathname */ + 0, /* pNext */ + "cephsqlite3", /* zName */ + 0, /* pAppData */ + _cephsqlite3_Open, /* xOpen */ + _cephsqlite3_Delete, /* xDelete */ + _cephsqlite3_Access, /* xAccess */ + _cephsqlite3_FullPathname, /* xFullPathname */ + _cephsqlite3_DlOpen, /* xDlOpen */ + _cephsqlite3_DlError, /* xDlError */ + _cephsqlite3_DlSym, /* xDlSym */ + _cephsqlite3_DlClose, /* xDlClose */ + _cephsqlite3_Randomness, /* xRandomness */ + _cephsqlite3_Sleep, /* xSleep */ + _cephsqlite3_CurrentTime, /* xCurrentTime */ + }; + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + return &_cephsqlite3_vfs; +} + +static +void __attribute__ ((constructor)) _cephsqlite3__vfs_register() +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + sqlite3_vfs_register(_cephsqlite3__vfs(), 1); +} + +static +void __attribute__ ((destructor)) _cephsqlite3__vfs_unregister() +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << std::endl; + sqlite3_vfs_unregister(_cephsqlite3__vfs()); +} + +extern "C" +sqlite3 *ceph_sqlite3_open( + librados::Rados &cluster, + const char *dbname, /* eg. "sql" instead of "sql.db" */ + const char *rados_namespace, + int ceph_pool_id, + bool must_create +) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::must_create:" << must_create << std::endl; + int ret = -1; + /* FIXME + * how long should the io_ctx and cluster objects exist for radosstriper + * to be functional ? + */ + librados::IoCtx *io_ctx = new librados::IoCtx; + libradosstriper::RadosStriper *rs = new libradosstriper::RadosStriper; + + ret = cluster.ioctx_create2(ceph_pool_id, *io_ctx); + if (ret < 0) { + /* unable to create the IO Context */ + return NULL; + } + + io_ctx->set_namespace(rados_namespace); + + ret = libradosstriper::RadosStriper::striper_create(*io_ctx, rs); + if (ret < 0) { + /* unable to create the striper */ + return NULL; + } + + uint64_t alignment = 0; + ret = io_ctx->pool_required_alignment2(&alignment); + if (ret < 0) { + /* no alignment retrieved */ + return NULL; + } + + rs->set_object_layout_stripe_unit(alignment); + + sqlite3 *db = NULL; + const int db_open_flags = SQLITE_OPEN_NOMUTEX | /* single client access */ + SQLITE_OPEN_PRIVATECACHE | + SQLITE_OPEN_READWRITE | + SQLITE_OPEN_URI | + (must_create ? SQLITE_OPEN_CREATE : 0); + + std::stringstream ss; + + /* pass the address of the RadosStriper C++ object in the URI so that the VFS + * methods can access it + */ + std::string mode = (must_create ? "rwc" : "rw"); + /* create a URI based file name */ + ss << "file:" << dbname << ".db?mode=" << mode << "&cache=private&vfs=cephsqlite3"; + std::string filename = ss.str(); + + // (void) pthread_once(&ceph_vfs_context_key_once, make_key); + CephVFSContext *cc = NULL; + vfs_context_map_mutex.lock(); + if (vfs_context_map[std::string(dbname) + ".db"] == nullptr) { + cc = new CephVFSContext; + cc->io_ctx = io_ctx; + cc->rs = rs; + vfs_context_map[std::string(dbname) + ".db"] = cc; + } + vfs_context_map_mutex.unlock(); + + ret = sqlite3_open_v2(filename.c_str(), &db, db_open_flags, "cephsqlite3"); + if (ret < 0) { + /* db creation failed */ + return NULL; + } + + return db; +} + +extern "C" +void ceph_sqlite3_set_db_params( + const char *dbname, /* eg. "sql" instead of "sql.db" */ + int stripe_count, + int obj_size +) +{ + vfs_context_map_mutex.lock(); + CephVFSContext *cc = vfs_context_map[std::string(dbname) + ".db"]; + + if (cc) { + libradosstriper::RadosStriper *rs = cc->rs; + + if (rs) { + rs->set_object_layout_stripe_count(stripe_count); + rs->set_object_layout_object_size(obj_size); + } + } + vfs_context_map_mutex.unlock(); +} + +static +int create_lock_files(librados::IoCtx *io_ctx, const std::string &db_file, bool must_create) +{ + int ret = io_ctx->create(get_lock_file_name(db_file, SQLITE_LOCK_SHARED), must_create); + if (ret < 0 && ret != -EEXIST) { + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::error: while creating:" << get_lock_file_name(db_file, SQLITE_LOCK_SHARED) << std::endl; + return SQLITE_ERROR; + } + + ret = io_ctx->create(get_lock_file_name(db_file, SQLITE_LOCK_RESERVED), must_create); + if (ret < 0 && ret != -EEXIST) { + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::error: while creating:" << get_lock_file_name(db_file, SQLITE_LOCK_RESERVED) << std::endl; + return SQLITE_ERROR; + } + + ret = io_ctx->create(get_lock_file_name(db_file, SQLITE_LOCK_PENDING), must_create); + if (ret < 0 && ret != -EEXIST) { + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::error: while creating:" << get_lock_file_name(db_file, SQLITE_LOCK_PENDING) << std::endl; + return SQLITE_ERROR; + } + + ret = io_ctx->create(get_lock_file_name(db_file, SQLITE_LOCK_EXCLUSIVE), must_create); + if (ret < 0 && ret != -EEXIST) { + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::error: while creating:" << get_lock_file_name(db_file, SQLITE_LOCK_EXCLUSIVE) << std::endl; + return SQLITE_ERROR; + } + + return SQLITE_OK; +} + +static +int remove_lock_files(librados::IoCtx *io_ctx, const std::string &db_file) +{ + io_ctx->remove(get_lock_file_name(db_file, SQLITE_LOCK_SHARED)); + io_ctx->remove(get_lock_file_name(db_file, SQLITE_LOCK_RESERVED)); + io_ctx->remove(get_lock_file_name(db_file, SQLITE_LOCK_PENDING)); + io_ctx->remove(get_lock_file_name(db_file, SQLITE_LOCK_EXCLUSIVE)); + + return SQLITE_OK; +} + +/* input is the full URI file name*/ +static +librados::IoCtx *get_io_ctx(const char *zPath) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << "zPath:" << zPath << std::endl; + + librados::IoCtx *io_ctx = nullptr; + + vfs_context_map_mutex.lock(); + if (vfs_context_map[std::string(zPath)]) + io_ctx = vfs_context_map[std::string(zPath)]->io_ctx; + vfs_context_map_mutex.unlock(); + + return io_ctx; +} + +/* input is the full URI file name*/ +static +libradosstriper::RadosStriper *get_radosstriper(const char *zPath) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << "zPath:" << zPath << std::endl; + + libradosstriper::RadosStriper *rs = nullptr; + + vfs_context_map_mutex.lock(); + if (vfs_context_map[std::string(zPath)]) + rs = vfs_context_map[std::string(zPath)]->rs; + vfs_context_map_mutex.unlock(); + + return rs; +} + +static +std::string get_db_name(const char *zName) +{ + // std::cerr << std::hex << pthread_self() << ":" << __FUNCTION__ << "::" << "zName:" << zName << std::endl; + /* eg. "accounts.db-journal" + * so filename should be extracted as "accounts.db" + * NOTE main database file name will always have a .db extension + * see ceph_sqlite3_open() + */ + const char *e = strstr(zName, ".db-"); + if (!e) + return std::string(zName); + + e += 3; // point to the '-' after .db + + return std::string(zName, e - zName); +} + +static +int get_lock_type(const std::string &db_name, const std::string &file_name) +{ + int eLock = SQLITE_LOCK_NONE; + + vfs_context_map_mutex.lock(); + CephVFSContext *cc = vfs_context_map[db_name]; + assert(cc); + + eLock = cc->lock_info[file_name]; + vfs_context_map_mutex.unlock(); + + return eLock; +} + +static +void set_lock_type(const std::string &db_name, const std::string &file_name, int eLock) +{ + vfs_context_map_mutex.lock(); + CephVFSContext *cc = vfs_context_map[db_name]; + assert(cc != NULL); + + cc->lock_info[file_name] = eLock; + vfs_context_map_mutex.unlock(); +} + +static +int lock_file_in_rados(librados::IoCtx *io_ctx, const std::string &lock_file_name, const std::string &lock_name, bool exclusive) +{ + int ret = -1; + int retries = 20; + + while (retries--) { + if (exclusive) + ret = io_ctx->lock_exclusive(lock_file_name, lock_name, get_cookie(), emptystr, NULL, 0); + else + ret = io_ctx->lock_shared(lock_file_name, lock_name, get_cookie(), emptystr, emptystr, NULL, 0); + + if (ret == 0) + return ret; + + if (ret == -EBUSY) + usleep(1000); // sleep for 1 milisecond + } + return ret; +} diff --git a/src/libcephsqlite.h b/src/libcephsqlite.h new file mode 100644 index 00000000000..d4514097b1d --- /dev/null +++ b/src/libcephsqlite.h @@ -0,0 +1,24 @@ +#ifndef LIBCEPHSQLITE_H +#define LIBCEPHSQLITE_H + +#include "rados/librados.hpp" + +struct sqlite3; + +extern "C" +sqlite3 *ceph_sqlite3_open( + librados::Rados &cluster, + const char *dbname, /* eg. "sql" instead of "sql.db" */ + const char *rados_namespace, + int ceph_pool_id, + bool must_create +); + +extern "C" +void ceph_sqlite3_set_db_params( + const char *dbname, /* eg. "sql" instead of "sql.db" */ + int stripe_count, + int obj_size +); + +#endif diff --git a/src/test/test_libcephsqlite.cc b/src/test/test_libcephsqlite.cc new file mode 100644 index 00000000000..7a40dc9fe2f --- /dev/null +++ b/src/test/test_libcephsqlite.cc @@ -0,0 +1,433 @@ +/* g++ -I ../src -I ../src/include/ -L ./lib/ -lcephsqlite -lsqlite3 -lrados -lradosstriper -o test_libsqlite ../src/test/test_libcephsqlite.cc + */ +#include + +#include "rados/librados.hpp" +#include "radosstriper/libradosstriper.hpp" +#include "libcephsqlite.h" + +#include +#include +#include + +#include + +void create_db(sqlite3 *db); +void insert_db(sqlite3 *db, const char *file_name); +long int count_db(sqlite3 *db); +void list_db(sqlite3 *db); +void delete_db(sqlite3 *db); +const char *sqlite_error_str(int err); +const char *sqlite_exterror_str(int err); + + +int main(int argc, char **argv) +{ + int ret = -1; + librados::Rados cluster; + + ret = cluster.init2("client.admin", "ceph", 0); + if( ret < 0) + { + std::cerr << "Couldn't init cluster "<< ret << std::endl; + } + + // make sure ceph.conf is in /etc/ceph/ and is world readable + ret = cluster.conf_read_file("ceph.conf"); + if( ret < 0) + { + std::cerr << "Couldn't read conf file "<< ret << std::endl; + } + + ret = cluster.connect(); + if(ret < 0) + { + std::cerr << "Couldn't connect to cluster "<< ret << std::endl; + } + else + { + std::cout << "Connected to Cluster"<< std::endl; + } + + std::string cmd = (argv[1] ? argv[1] : ""); + int pool_id = 2; // for cephfs.a.data + + std::cout << "cmd:" << cmd << std::endl; + sqlite3 *db = ceph_sqlite3_open(cluster, "rados-db-test", "DATABASE", pool_id, (cmd == "create")); + + ceph_sqlite3_set_db_params("rados-db-test", 3, (1<<22)); + + if (!db) { + std::cerr << "error: while initializing database" << std::endl; + return 1; + } + + if (cmd == "create") { + create_db(db); + } else if (cmd == "insert") { + insert_db(db, argv[2]); /* argv[2] contains file name of data file */ + } else if (cmd == "count") { + std::cout << "total rows: " << count_db(db) << std::endl; + } else if (cmd == "list") { + list_db(db); + } else if (cmd == "delete") { + delete_db(db); + } else { + std::cout << "Usage:" << std::endl; + std::cout << "\t" << argv[0] << " {create|insert |count|list|delete}" << std::endl; + } + + sqlite3_close(db); + cluster.shutdown(); + return 0; +} + +inline +void dump_error(const char *func_name, int line, const char *errmsg, const char *sqlite_errmsg) +{ + std::cerr << func_name << ":" << std::dec << line << ":" << errmsg << ":" << sqlite_errmsg << std::endl; +} + +void create_db(sqlite3 *db) +{ + const char *ddl1 = + "CREATE TABLE IF NOT EXISTS t1(fname TEXT PRIMARY KEY NOT NULL, sha256sum TEXT NOT NULL)"; + const char *ddl2 = + "CREATE UNIQUE INDEX t1fname ON t1(fname);"; + sqlite3_stmt *stmt = NULL; + const char *unused = NULL; + + int ret = sqlite3_prepare_v3(db, ddl1, strlen(ddl1), 0x0, &stmt, &unused); + std::cerr << __FUNCTION__ << ": prepare:0x" << std::hex << ret << "(" << sqlite_error_str(ret) << ")" << std::endl; + if (ret != SQLITE_OK) { + dump_error(__FUNCTION__, __LINE__, "error: when preparing", sqlite3_errmsg(db)); + goto out; + } + + dump_error(__FUNCTION__, __LINE__, "stepping", ""); + ret = sqlite3_step(stmt); + std::cerr << __FUNCTION__ << ": step:0x" << std::hex << ret << "(" << sqlite_error_str(ret) << ")" << std::endl; + if (ret != SQLITE_DONE) { + dump_error(__FUNCTION__, __LINE__, "error: when stepping", sqlite3_errmsg(db)); + goto out; + } + +#if 0 + sqlite3_finalize(stmt); + + ret = sqlite3_prepare_v3(db, ddl2, strlen(ddl2), 0x0, &stmt, &unused); + std::cerr << __FUNCTION__ << ": prepare:0x" << std::hex << ret << "(" << sqlite_error_str(ret) << ")" << std::endl; + if (ret != SQLITE_OK) { + dump_error(__FUNCTION__, __LINE__, "error: when preparing", sqlite3_errmsg(db)); + goto out; + } + + ret = sqlite3_step(stmt); + std::cerr << __FUNCTION__ << ": step:0x" << std::hex << ret << "(" << sqlite_error_str(ret) << ")" << std::endl; + if (ret != SQLITE_DONE) { + dump_error(__FUNCTION__, __LINE__, "error: when stepping", sqlite3_errmsg(db)); + } +#endif +out: + sqlite3_finalize(stmt); +} + +void insert_db(sqlite3 *db, const char *file_name) +{ + int row = 0; + std::string col_fname; + std::string col_sha256sum; + + std::ifstream is(file_name); + + const char *dml = "INSERT INTO t1(fname, sha256sum) VALUES(?,?);"; + sqlite3_stmt *stmt = NULL; + const char *unused = NULL; + + while (is >> col_sha256sum >> col_fname) { + if (sqlite3_prepare_v3(db, dml, strlen(dml), 0x0, &stmt, &unused) != SQLITE_OK) { + dump_error(__FUNCTION__, __LINE__, "error: while preparing", sqlite3_errmsg(db)); + goto out; + } + + if (sqlite3_bind_text(stmt, 1, col_fname.c_str(), strlen(col_fname.c_str()), NULL) != SQLITE_OK) { + std::stringstream ss; + + ss << "error: while attempting to sqlite3_bind_text(col_fname) for row " << row; + dump_error(__FUNCTION__, __LINE__, ss.str().c_str(), sqlite3_errmsg(db)); + goto out; + } + if (sqlite3_bind_text(stmt, 2, col_sha256sum.c_str(), strlen(col_sha256sum.c_str()), NULL) != SQLITE_OK) { + std::stringstream ss; + + ss << "error: while attempting to sqlite3_bind_text(col_sha256sum) for row " << row; + dump_error(__FUNCTION__, __LINE__, ss.str().c_str(), sqlite3_errmsg(db)); + goto out; + } + + int retries = 20; + int ret = -1; + while ((ret = sqlite3_step(stmt)) != SQLITE_DONE) { + usleep(1000); + retries--; + } + if (ret != SQLITE_DONE) { + std::stringstream ss; + + ss << "error:0x" << std::hex << ret << " while attempting to sqlite3_step() for row " << row; + dump_error(__FUNCTION__, __LINE__, ss.str().c_str(), sqlite3_errmsg(db)); + goto out; + } + if (sqlite3_reset(stmt) != SQLITE_OK) { + std::stringstream ss; + + ss << "error: while resetting for row " << row; + dump_error(__FUNCTION__, __LINE__, ss.str().c_str(), sqlite3_errmsg(db)); + goto out; + } + sqlite3_finalize(stmt); + ++row; + std::cerr << "inserted row: " << row << std::endl; + } +out: + return; +} + +long int count_db(sqlite3 *db) +{ + long ret = -1; + const char *dml = "SELECT COUNT(*) FROM t1;"; + sqlite3_stmt *stmt = NULL; + const char *unused = NULL; + + if (sqlite3_prepare_v3(db, dml, strlen(dml), 0x0, &stmt, &unused) != SQLITE_OK) { + dump_error(__FUNCTION__, __LINE__, "error: when preparing", sqlite3_errmsg(db)); + goto out; + } + + if (sqlite3_step(stmt) != SQLITE_ROW) { + std::stringstream ss; + + ss << "error: while stepping"; + dump_error(__FUNCTION__, __LINE__, ss.str().c_str(), sqlite3_errmsg(db)); + goto out; + } + ret = sqlite3_column_int64(stmt, 0); +out: + sqlite3_finalize(stmt); + return ret; +} + +void list_db(sqlite3 *db) +{ + const char *dml = "SELECT * FROM t1;"; + sqlite3_stmt *stmt = NULL; + const char *unused = NULL; + + if (sqlite3_prepare_v3(db, dml, strlen(dml), 0x0, &stmt, &unused) != SQLITE_OK) { + dump_error(__FUNCTION__, __LINE__, "error: when preparing", sqlite3_errmsg(db)); + goto out; + } + + while (sqlite3_step(stmt) == SQLITE_ROW) { + std::cout << sqlite3_column_text(stmt, 1) << " " << sqlite3_column_text(stmt, 0) << std::endl; + } +out: + sqlite3_finalize(stmt); +} + +void delete_db(sqlite3 *db) +{ + const char *dml = "DELETE FROM t1; DROP INDEX t1fname; DROP TABLE t1;"; + sqlite3_stmt *stmt = NULL; + const char *unused = NULL; + + if (sqlite3_prepare_v3(db, dml, strlen(dml), 0x0, &stmt, &unused) != SQLITE_OK) { + dump_error(__FUNCTION__, __LINE__, "error: when preparing", sqlite3_errmsg(db)); + goto out; + } + + if (sqlite3_step(stmt) != SQLITE_DONE) { + dump_error(__FUNCTION__, __LINE__, "error: while deleting table", sqlite3_errmsg(db)); + } +out: + sqlite3_finalize(stmt); +} + +#define CASE(x) case x: return #x + +const char *sqlite_error_str(int err) +{ + switch (err & 0xff) { + CASE(SQLITE_OK); + CASE(SQLITE_ERROR); + CASE(SQLITE_INTERNAL); + CASE(SQLITE_PERM); + CASE(SQLITE_ABORT); + CASE(SQLITE_BUSY); + CASE(SQLITE_LOCKED); + CASE(SQLITE_NOMEM); + CASE(SQLITE_READONLY); + CASE(SQLITE_INTERRUPT); + CASE(SQLITE_IOERR); + CASE(SQLITE_CORRUPT); + CASE(SQLITE_NOTFOUND); + CASE(SQLITE_FULL); + CASE(SQLITE_CANTOPEN); + CASE(SQLITE_PROTOCOL); + CASE(SQLITE_EMPTY); + CASE(SQLITE_SCHEMA); + CASE(SQLITE_TOOBIG); + CASE(SQLITE_CONSTRAINT); + CASE(SQLITE_MISMATCH); + CASE(SQLITE_MISUSE); + CASE(SQLITE_NOLFS); + CASE(SQLITE_AUTH); + CASE(SQLITE_FORMAT); + CASE(SQLITE_RANGE); + CASE(SQLITE_NOTADB); + CASE(SQLITE_NOTICE); + CASE(SQLITE_WARNING); + CASE(SQLITE_ROW); + CASE(SQLITE_DONE); + } + return "NULL"; +} + +const char *sqlite_exterror_str(int err) +{ + switch (err & 0xff) { + case SQLITE_ERROR: + switch (err) { + CASE(SQLITE_ERROR_MISSING_COLLSEQ); + CASE(SQLITE_ERROR_RETRY); + CASE(SQLITE_ERROR_SNAPSHOT); + } + break; + + case SQLITE_IOERR: + switch (err) { + CASE(SQLITE_IOERR_READ); + CASE(SQLITE_IOERR_SHORT_READ); + CASE(SQLITE_IOERR_WRITE); + CASE(SQLITE_IOERR_FSYNC); + CASE(SQLITE_IOERR_DIR_FSYNC); + CASE(SQLITE_IOERR_TRUNCATE); + CASE(SQLITE_IOERR_FSTAT); + CASE(SQLITE_IOERR_UNLOCK); + CASE(SQLITE_IOERR_RDLOCK); + CASE(SQLITE_IOERR_DELETE); + CASE(SQLITE_IOERR_BLOCKED); + CASE(SQLITE_IOERR_NOMEM); + CASE(SQLITE_IOERR_ACCESS); + CASE(SQLITE_IOERR_CHECKRESERVEDLOCK); + CASE(SQLITE_IOERR_LOCK); + CASE(SQLITE_IOERR_CLOSE); + CASE(SQLITE_IOERR_DIR_CLOSE); + CASE(SQLITE_IOERR_SHMOPEN); + CASE(SQLITE_IOERR_SHMSIZE); + CASE(SQLITE_IOERR_SHMLOCK); + CASE(SQLITE_IOERR_SHMMAP); + CASE(SQLITE_IOERR_SEEK); + CASE(SQLITE_IOERR_DELETE_NOENT); + CASE(SQLITE_IOERR_MMAP); + CASE(SQLITE_IOERR_GETTEMPPATH); + CASE(SQLITE_IOERR_CONVPATH); + CASE(SQLITE_IOERR_VNODE); + CASE(SQLITE_IOERR_AUTH); + CASE(SQLITE_IOERR_BEGIN_ATOMIC); + CASE(SQLITE_IOERR_COMMIT_ATOMIC); + CASE(SQLITE_IOERR_ROLLBACK_ATOMIC); + } + break; + + case SQLITE_LOCKED: + switch (err) { + CASE(SQLITE_LOCKED_SHAREDCACHE); + CASE(SQLITE_LOCKED_VTAB); + } + break; + + case SQLITE_BUSY: + switch (err) { + CASE(SQLITE_BUSY_RECOVERY); + CASE(SQLITE_BUSY_SNAPSHOT); + } + break; + + case SQLITE_CANTOPEN: + switch (err) { + CASE(SQLITE_CANTOPEN_NOTEMPDIR); + CASE(SQLITE_CANTOPEN_ISDIR); + CASE(SQLITE_CANTOPEN_FULLPATH); + CASE(SQLITE_CANTOPEN_CONVPATH); + CASE(SQLITE_CANTOPEN_DIRTYWAL); + } + break; + + case SQLITE_CORRUPT: + switch (err) { + CASE(SQLITE_CORRUPT_VTAB); + CASE(SQLITE_CORRUPT_SEQUENCE); + } + break; + + case SQLITE_READONLY: + switch (err) { + CASE(SQLITE_READONLY_RECOVERY); + CASE(SQLITE_READONLY_CANTLOCK); + CASE(SQLITE_READONLY_ROLLBACK); + CASE(SQLITE_READONLY_DBMOVED); + CASE(SQLITE_READONLY_CANTINIT); + CASE(SQLITE_READONLY_DIRECTORY); + } + break; + + case SQLITE_ABORT: + switch (err) { + CASE(SQLITE_ABORT_ROLLBACK); + } + break; + + case SQLITE_CONSTRAINT: + switch (err) { + CASE(SQLITE_CONSTRAINT_CHECK); + CASE(SQLITE_CONSTRAINT_COMMITHOOK); + CASE(SQLITE_CONSTRAINT_FOREIGNKEY); + CASE(SQLITE_CONSTRAINT_FUNCTION); + CASE(SQLITE_CONSTRAINT_NOTNULL); + CASE(SQLITE_CONSTRAINT_PRIMARYKEY); + CASE(SQLITE_CONSTRAINT_TRIGGER); + CASE(SQLITE_CONSTRAINT_UNIQUE); + CASE(SQLITE_CONSTRAINT_VTAB); + CASE(SQLITE_CONSTRAINT_ROWID); + } + break; + + case SQLITE_NOTICE: + switch (err) { + CASE(SQLITE_NOTICE_RECOVER_WAL); + CASE(SQLITE_NOTICE_RECOVER_ROLLBACK); + } + break; + + case SQLITE_WARNING: + switch (err) { + CASE(SQLITE_WARNING_AUTOINDEX); + } + break; + + case SQLITE_AUTH: + switch (err) { + CASE(SQLITE_AUTH_USER); + } + break; + + case SQLITE_OK: + switch (err) { + CASE(SQLITE_OK_LOAD_PERMANENTLY); + } + break; + } + return "EXTNULL"; +} From eb52c869d5a3672a85b9dec25b9e372e57239bcf Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 23 Jul 2019 14:43:50 -0700 Subject: [PATCH 03/18] cmake: improve build inst for cephsqlite Notably, find SQLite and build the test code. Signed-off-by: Patrick Donnelly --- CMakeLists.txt | 9 ++++++++- cmake/modules/FindSQLite3.cmake | 10 ++++++++++ src/CMakeLists.txt | 4 ++-- src/test/CMakeLists.txt | 5 +++++ 4 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 cmake/modules/FindSQLite3.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e41b367ff3..1c44b7fc05d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -280,7 +280,14 @@ endif(WITH_QATZIP) # needs mds and? XXX option(WITH_LIBCEPHFS "libcephfs client library" ON) -option(WITH_LIBCEPHSQLITE "libcephsqlite client library" ON) + +find_package(SQLite3) +if(SQLITE3_FOUND) + option(WITH_LIBCEPHSQLITE "libcephsqlite client library" ON) +else() + message(WARNING "disabling WITH_LIBCEPHSQLITE, which depends on SQLite3") + set(WITH_LIBCEPHSQLITE OFF) +endif() # key-value store option(WITH_KVS "Key value store is here" ON) diff --git a/cmake/modules/FindSQLite3.cmake b/cmake/modules/FindSQLite3.cmake new file mode 100644 index 00000000000..3d96d66524e --- /dev/null +++ b/cmake/modules/FindSQLite3.cmake @@ -0,0 +1,10 @@ +find_path(SQLITE3_INCLUDE_DIR NAMES sqlite3.h PATHS /usr/include /usr/local/include) +find_library(SQLITE3_LIBRARY NAMES sqlite3) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(SQLITE3 DEFAULT_MSG SQLITE3_LIBRARY SQLITE3_INCLUDE_DIR) + +if(SQLITE3_FOUND) + set(SQLITE3_LIBRARIES ${SQLITE3_LIBRARY}) + set(SQLITE3_INCLUDE_DIRS ${SQLITE3_INCLUDE_DIR}) +endif() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cd387786d8c..dc5de2c6c1a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -806,8 +806,8 @@ endif(WITH_LIBCEPHFS) if(WITH_LIBCEPHSQLITE) set(libcephsqlite_srcs libcephsqlite.cc) add_library(cephsqlite ${CEPH_SHARED} ${libcephsqlite_srcs}) - target_link_libraries(cephsqlite PRIVATE client ceph-common - ${CRYPTO_LIBS} ${EXTRALIBS}) + set_target_properties(cephsqlite PROPERTIES COMPILE_FLAGS "-I${SQLITE3_INCLUDE_DIRS}") + target_link_libraries(cephsqlite radosstriper librados ${SQLITE3_LIBRARIES} ${EXTRALIBS}) if(ENABLE_SHARED) set_target_properties(cephsqlite PROPERTIES OUTPUT_NAME cephsqlite diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 599747073a1..b8a7e5f7c79 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -961,4 +961,9 @@ add_ceph_unittest(unittest_any) add_executable(unittest_weighted_shuffle test_weighted_shuffle.cc) add_ceph_unittest(unittest_weighted_shuffle) +if(WITH_LIBCEPHSQLITE) + add_executable(test_libcephsqlite test_libcephsqlite.cc) + target_link_libraries(test_libcephsqlite cephsqlite radosstriper librados) +endif(WITH_LIBCEPHSQLITE) + #make check ends here From bdb4443c34a04a4ab579499cca18dcfbe545972d Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 23 Jul 2019 14:45:07 -0700 Subject: [PATCH 04/18] cephsqlite: fix compiler errors Signed-off-by: Patrick Donnelly --- src/libcephsqlite.cc | 2 +- src/test/test_libcephsqlite.cc | 36 ++++++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/libcephsqlite.cc b/src/libcephsqlite.cc index f2a52b30f6f..cb9788d2787 100644 --- a/src/libcephsqlite.cc +++ b/src/libcephsqlite.cc @@ -71,7 +71,7 @@ struct CephFile { }; // map to hold pointers to vfs contexts for various databases -static ceph::mutex vfs_context_map_mutex; // ("vfs_context_map_mutex"); +static ceph::mutex vfs_context_map_mutex("vfs_context_map_mutex"); static std::map vfs_context_map; static const std::string lock_name = "cephsqlite3_vfs_lock"; diff --git a/src/test/test_libcephsqlite.cc b/src/test/test_libcephsqlite.cc index 7a40dc9fe2f..989d1b0c24c 100644 --- a/src/test/test_libcephsqlite.cc +++ b/src/test/test_libcephsqlite.cc @@ -97,7 +97,7 @@ void create_db(sqlite3 *db) sqlite3_stmt *stmt = NULL; const char *unused = NULL; - int ret = sqlite3_prepare_v3(db, ddl1, strlen(ddl1), 0x0, &stmt, &unused); + int ret = sqlite3_prepare_v2(db, ddl1, strlen(ddl1), &stmt, &unused); std::cerr << __FUNCTION__ << ": prepare:0x" << std::hex << ret << "(" << sqlite_error_str(ret) << ")" << std::endl; if (ret != SQLITE_OK) { dump_error(__FUNCTION__, __LINE__, "error: when preparing", sqlite3_errmsg(db)); @@ -115,7 +115,7 @@ void create_db(sqlite3 *db) #if 0 sqlite3_finalize(stmt); - ret = sqlite3_prepare_v3(db, ddl2, strlen(ddl2), 0x0, &stmt, &unused); + ret = sqlite3_prepare_v2(db, ddl2, strlen(ddl2), &stmt, &unused); std::cerr << __FUNCTION__ << ": prepare:0x" << std::hex << ret << "(" << sqlite_error_str(ret) << ")" << std::endl; if (ret != SQLITE_OK) { dump_error(__FUNCTION__, __LINE__, "error: when preparing", sqlite3_errmsg(db)); @@ -145,7 +145,7 @@ void insert_db(sqlite3 *db, const char *file_name) const char *unused = NULL; while (is >> col_sha256sum >> col_fname) { - if (sqlite3_prepare_v3(db, dml, strlen(dml), 0x0, &stmt, &unused) != SQLITE_OK) { + if (sqlite3_prepare_v2(db, dml, strlen(dml), &stmt, &unused) != SQLITE_OK) { dump_error(__FUNCTION__, __LINE__, "error: while preparing", sqlite3_errmsg(db)); goto out; } @@ -200,7 +200,7 @@ long int count_db(sqlite3 *db) sqlite3_stmt *stmt = NULL; const char *unused = NULL; - if (sqlite3_prepare_v3(db, dml, strlen(dml), 0x0, &stmt, &unused) != SQLITE_OK) { + if (sqlite3_prepare_v2(db, dml, strlen(dml), &stmt, &unused) != SQLITE_OK) { dump_error(__FUNCTION__, __LINE__, "error: when preparing", sqlite3_errmsg(db)); goto out; } @@ -224,7 +224,7 @@ void list_db(sqlite3 *db) sqlite3_stmt *stmt = NULL; const char *unused = NULL; - if (sqlite3_prepare_v3(db, dml, strlen(dml), 0x0, &stmt, &unused) != SQLITE_OK) { + if (sqlite3_prepare_v2(db, dml, strlen(dml), &stmt, &unused) != SQLITE_OK) { dump_error(__FUNCTION__, __LINE__, "error: when preparing", sqlite3_errmsg(db)); goto out; } @@ -242,7 +242,7 @@ void delete_db(sqlite3 *db) sqlite3_stmt *stmt = NULL; const char *unused = NULL; - if (sqlite3_prepare_v3(db, dml, strlen(dml), 0x0, &stmt, &unused) != SQLITE_OK) { + if (sqlite3_prepare_v2(db, dml, strlen(dml), &stmt, &unused) != SQLITE_OK) { dump_error(__FUNCTION__, __LINE__, "error: when preparing", sqlite3_errmsg(db)); goto out; } @@ -299,9 +299,15 @@ const char *sqlite_exterror_str(int err) switch (err & 0xff) { case SQLITE_ERROR: switch (err) { +#ifdef SQLITE_ERROR_MISSING_COLLSEQ CASE(SQLITE_ERROR_MISSING_COLLSEQ); +#endif +#ifdef SQLITE_ERROR_RETRY CASE(SQLITE_ERROR_RETRY); +#endif +#ifdef SQLITE_ERROR_SNAPSHOT CASE(SQLITE_ERROR_SNAPSHOT); +#endif } break; @@ -335,16 +341,24 @@ const char *sqlite_exterror_str(int err) CASE(SQLITE_IOERR_CONVPATH); CASE(SQLITE_IOERR_VNODE); CASE(SQLITE_IOERR_AUTH); +#ifdef SQLITE_IOERR_BEGIN_ATOMIC CASE(SQLITE_IOERR_BEGIN_ATOMIC); +#endif +#ifdef SQLITE_IOERR_COMMIT_ATOMIC CASE(SQLITE_IOERR_COMMIT_ATOMIC); +#endif +#ifdef SQLITE_IOERR_ROLLBACK_ATOMIC CASE(SQLITE_IOERR_ROLLBACK_ATOMIC); +#endif } break; case SQLITE_LOCKED: switch (err) { CASE(SQLITE_LOCKED_SHAREDCACHE); +#ifdef SQLITE_LOCKED_VTAB CASE(SQLITE_LOCKED_VTAB); +#endif } break; @@ -361,14 +375,18 @@ const char *sqlite_exterror_str(int err) CASE(SQLITE_CANTOPEN_ISDIR); CASE(SQLITE_CANTOPEN_FULLPATH); CASE(SQLITE_CANTOPEN_CONVPATH); +#ifdef SQLITE_CANTOPEN_DIRTYWAL CASE(SQLITE_CANTOPEN_DIRTYWAL); +#endif } break; case SQLITE_CORRUPT: switch (err) { CASE(SQLITE_CORRUPT_VTAB); +#ifdef SQLITE_CORRUPT_SEQUENCE CASE(SQLITE_CORRUPT_SEQUENCE); +#endif } break; @@ -378,8 +396,12 @@ const char *sqlite_exterror_str(int err) CASE(SQLITE_READONLY_CANTLOCK); CASE(SQLITE_READONLY_ROLLBACK); CASE(SQLITE_READONLY_DBMOVED); +#ifdef SQLITE_READONLY_CANTINIT CASE(SQLITE_READONLY_CANTINIT); +#endif +#ifdef SQLITE_READONLY_DIRECTORY CASE(SQLITE_READONLY_DIRECTORY); +#endif } break; @@ -425,7 +447,9 @@ const char *sqlite_exterror_str(int err) case SQLITE_OK: switch (err) { +#ifdef SQLITE_OK_LOAD_PERMANENTLY CASE(SQLITE_OK_LOAD_PERMANENTLY); +#endif } break; } From 8dbf5de84164c7991cc87b6fc57fb7f4847308f7 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 23 Jul 2019 14:46:03 -0700 Subject: [PATCH 05/18] test_libcephsqlite: test random inserts Signed-off-by: Patrick Donnelly --- src/test/test_libcephsqlite.cc | 55 ++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/src/test/test_libcephsqlite.cc b/src/test/test_libcephsqlite.cc index 989d1b0c24c..248a9682f14 100644 --- a/src/test/test_libcephsqlite.cc +++ b/src/test/test_libcephsqlite.cc @@ -12,7 +12,32 @@ #include +#define sqlcatchcode(S, code) \ +do {\ + rc = S;\ + if (rc != code) {\ + if (rc == SQLITE_BUSY) {\ + rc = EAGAIN;\ + } else {\ + std::cerr << "[" << __FILE__ << ":" << __LINE__ << "]"\ + << " sqlite3 error: " << rc << " `" << sqlite3_errstr(rc)\ + << "': " << sqlite3_errmsg(db) << std::endl;\ + if (rc == SQLITE_CONSTRAINT) {\ + rc = EINVAL;\ + } else {\ + rc = EIO;\ + }\ + }\ + sqlite3_finalize(stmt);\ + stmt = NULL;\ + goto out;\ + }\ +} while (0) + +#define sqlcatch(S) sqlcatchcode(S, 0) + void create_db(sqlite3 *db); +void insert_rand(sqlite3 *db, uint64_t count, uint64_t size); void insert_db(sqlite3 *db, const char *file_name); long int count_db(sqlite3 *db); void list_db(sqlite3 *db); @@ -64,6 +89,8 @@ int main(int argc, char **argv) if (cmd == "create") { create_db(db); + } else if (cmd == "insert_rand") { + insert_rand(db, strtoul(argv[2], NULL, 10), strtoul(argv[3], NULL, 10)); /* argv[2] contains file name of data file */ } else if (cmd == "insert") { insert_db(db, argv[2]); /* argv[2] contains file name of data file */ } else if (cmd == "count") { @@ -132,6 +159,34 @@ out: sqlite3_finalize(stmt); } +void insert_rand(sqlite3 *db, uint64_t count, uint64_t size) +{ + static const char SQL[] = + "CREATE TABLE IF NOT EXISTS rand(text BLOB NOT NULL);" + "INSERT INTO rand VALUES (randomblob(?));" + ; + + sqlite3_stmt *stmt = NULL; + const char *current = SQL; + int rc; + + sqlcatch(sqlite3_prepare_v2(db, current, -1, &stmt, ¤t)); + sqlcatchcode(sqlite3_step(stmt), SQLITE_DONE); + sqlcatch(sqlite3_finalize(stmt)); + + sqlcatch(sqlite3_prepare_v2(db, current, -1, &stmt, ¤t)); + sqlcatch(sqlite3_bind_int64(stmt, 1, (sqlite3_int64)size)); + for (uint64_t i = 0; i < count; i++) { + sqlcatchcode(sqlite3_step(stmt), SQLITE_DONE); + std::cout << "last row inserted: " << sqlite3_last_insert_rowid(db) << std::endl; + } + sqlcatch(sqlite3_finalize(stmt)); + +out: + (void)0; +} + + void insert_db(sqlite3 *db, const char *file_name) { int row = 0; From d0092746c7d1d234da9a9ba8fd6e3822d98897c6 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Sat, 13 Mar 2021 12:53:50 -0800 Subject: [PATCH 06/18] Revert "libradosstriper: add function to read into char*" This reverts commit f7494cc1288dd4ba075975c110170e485c3e211b. This change is no longer needed. Signed-off-by: Patrick Donnelly --- src/include/radosstriper/libradosstriper.hpp | 1 - src/libradosstriper/libradosstriper.cc | 22 -------------------- 2 files changed, 23 deletions(-) diff --git a/src/include/radosstriper/libradosstriper.hpp b/src/include/radosstriper/libradosstriper.hpp index 064c7d2c3d8..fb790b0d7ef 100644 --- a/src/include/radosstriper/libradosstriper.hpp +++ b/src/include/radosstriper/libradosstriper.hpp @@ -168,7 +168,6 @@ namespace libradosstriper * synchronously read from the striped object at the specified offset. */ int read(const std::string& soid, ceph::bufferlist* pbl, size_t len, uint64_t off); - int read(const std::string& soid, char *buf, size_t len, uint64_t off); /** * asynchronously read from the striped object at the specified offset. diff --git a/src/libradosstriper/libradosstriper.cc b/src/libradosstriper/libradosstriper.cc index 7f23a380550..e98dfc17935 100644 --- a/src/libradosstriper/libradosstriper.cc +++ b/src/libradosstriper/libradosstriper.cc @@ -254,28 +254,6 @@ int libradosstriper::RadosStriper::read(const std::string& soid, return rados_striper_impl->read(soid, bl, len, off); } -int libradosstriper::RadosStriper::read(const std::string& soid, - char *buf, - size_t len, - uint64_t off) -{ - bufferlist bl; - bufferptr bp = buffer::create_static(len, buf); - - bl.push_back(bp); - - int ret = rados_striper_impl->read(soid, &bl, len, off); - - if (ret >= 0) { - if (bl.length() > len) - return -ERANGE; - if (!bl.is_provided_buffer(buf)) - bl.begin().copy(bl.length(), buf); - ret = bl.length(); // hrm :/ - } - return ret; -} - int libradosstriper::RadosStriper::aio_read(const std::string& soid, librados::AioCompletion *c, bufferlist* bl, From b158ee04f9e75b9a9c9d4f46e297b80319689754 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 19 Feb 2021 19:18:25 -0800 Subject: [PATCH 07/18] common: add timeval conversion for durations Signed-off-by: Patrick Donnelly --- src/common/ceph_time.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/common/ceph_time.h b/src/common/ceph_time.h index 2f99188e439..92048029274 100644 --- a/src/common/ceph_time.h +++ b/src/common/ceph_time.h @@ -64,6 +64,16 @@ typedef int64_t signed_rep; // differences between now and a time point in the past. typedef std::chrono::duration signedspan; +template +struct timeval to_timeval(Duration d) { + struct timeval tv; + auto sec = std::chrono::duration_cast(d); + tv.tv_sec = sec.count(); + auto usec = std::chrono::duration_cast(d-sec); + tv.tv_usec = usec.count(); + return tv; +} + // We define our own clocks so we can have our choice of all time // sources supported by the operating system. With the standard // library the resolution and cost are unspecified. (For example, From d3d683427d98b62d9b9974e455243bebe5119b53 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Sun, 21 Feb 2021 19:19:25 -0800 Subject: [PATCH 08/18] librados: define must renew lock flag This flag already exists in cls_lock but was not made externally available via librados. Additionally, internally cls_lock refers to the _RENEW flag as _MAY_RENEW, add an alias for librados to match. Signed-off-by: Patrick Donnelly --- src/include/rados/librados.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h index 5df8f99fe11..96de0661bb0 100644 --- a/src/include/rados/librados.h +++ b/src/include/rados/librados.h @@ -55,7 +55,9 @@ extern "C" { /* RADOS lock flags * They are also defined in cls_lock_types.h. Keep them in sync! */ -#define LIBRADOS_LOCK_FLAG_RENEW 0x1 +#define LIBRADOS_LOCK_FLAG_RENEW (1u<<0) +#define LIBRADOS_LOCK_FLAG_MAY_RENEW LIBRADOS_LOCK_FLAG_RENEW +#define LIBRADOS_LOCK_FLAG_MUST_RENEW (1u<<1) /* * Constants for rados_write_op_create(). From 108f486afe772dff771137a13b33887e5bc2a55e Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Sat, 13 Mar 2021 09:41:34 -0800 Subject: [PATCH 09/18] mon: define simple-rados-client-with-blocklist profile A mon profile that grants the ability for rados clients to blocklist others (similar to rbd). Signed-off-by: Patrick Donnelly --- doc/rados/operations/user-management.rst | 7 +++++++ src/mon/MonCap.cc | 11 +++++++++++ 2 files changed, 18 insertions(+) diff --git a/doc/rados/operations/user-management.rst b/doc/rados/operations/user-management.rst index 2ea49a88c53..4e487a18f9a 100644 --- a/doc/rados/operations/user-management.rst +++ b/doc/rados/operations/user-management.rst @@ -295,6 +295,13 @@ The following entries describe valid capability profiles: :Description: Gives a user read-only permissions for monitor, OSD, and PG data. Intended for use by direct librados client applications. +``profile simple-rados-client-with-blocklist`` (Monitor only) + +:Description: Gives a user read-only permissions for monitor, OSD, and PG data. + Intended for use by direct librados client applications. Also + includes permission to add blocklist entries to build HA + applications. + ``profile fs-client`` (Monitor only) :Description: Gives a user read-only permissions for monitor, OSD, PG, and MDS diff --git a/src/mon/MonCap.cc b/src/mon/MonCap.cc index d6055283271..f96deacd8e5 100644 --- a/src/mon/MonCap.cc +++ b/src/mon/MonCap.cc @@ -290,6 +290,17 @@ void MonCapGrant::expand_profile(const EntityName& name) const profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); profile_grants.push_back(MonCapGrant("pg", MON_CAP_R)); } + if (profile == "simple-rados-client-with-blocklist") { + profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("pg", MON_CAP_R)); + profile_grants.push_back(MonCapGrant("osd blocklist")); + profile_grants.back().command_args["blocklistop"] = StringConstraint( + StringConstraint::MATCH_TYPE_EQUAL, "add"); + profile_grants.back().command_args["addr"] = StringConstraint( + StringConstraint::MATCH_TYPE_REGEX, "^[^/]+/[0-9]+$"); + + } if (boost::starts_with(profile, "rbd")) { profile_grants.push_back(MonCapGrant("mon", MON_CAP_R)); profile_grants.push_back(MonCapGrant("osd", MON_CAP_R)); From 9d82cc629dff505403a5040595eb0773384992e8 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 28 Jan 2021 15:04:01 -0800 Subject: [PATCH 10/18] SimpleRADOSStriper: add new minimal async striper This was developed because the two other striper implementations were unsuitable for libcephsqlite: - libradosstriper: while the async APIs exist, its current protocol requires synchronously locking an object for every write/read whether that operation is async or not. For this reason, it's too far too slow for latency sensitive applications. - osdc/Filer: this requires the object name to be an inode number. It also comes with other overhead burden which is not necessary for libcephsqlite including caching/buffering. SimpleRADOSStriper aims to be a minimalistic heavily asynchronous striper. One way it achieves this is through the use of exclusive locks to protect access to the striped objects. Most metadata updates are deferred until the striped file is unlocked, flushed, (or closed). All reads/writes are asynchronous (but a read implicitly gathers async striped reads for each op). Writes are not buffered. Reads are not cached. There is no readahead. SimpleRADOSStriper aims to be compatible with the rados binary --striper option for extracting files out of RADOS but it should not be used otherwise. Signed-off-by: Patrick Donnelly --- src/SimpleRADOSStriper.cc | 758 ++++++++++++++++++++++++++++++++++++++ src/SimpleRADOSStriper.h | 137 +++++++ 2 files changed, 895 insertions(+) create mode 100644 src/SimpleRADOSStriper.cc create mode 100644 src/SimpleRADOSStriper.h diff --git a/src/SimpleRADOSStriper.cc b/src/SimpleRADOSStriper.cc new file mode 100644 index 00000000000..1c2b96c2973 --- /dev/null +++ b/src/SimpleRADOSStriper.cc @@ -0,0 +1,758 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2021 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License version 2.1, as published by + * the Free Software Foundation. See file COPYING. + * + */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include "include/ceph_assert.h" +#include "include/rados/librados.hpp" + +#include "cls/lock/cls_lock_client.h" + +#include "common/ceph_argparse.h" +#include "common/ceph_mutex.h" +#include "common/common_init.h" +#include "common/config.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/version.h" + +#include "SimpleRADOSStriper.h" + +using ceph::bufferlist; + +#define dout_subsys ceph_subsys_client +#undef dout_prefix +#define dout_prefix *_dout << "client." << ioctx.get_instance_id() << ": SimpleRADOSStriper: " << __func__ << ": " << oid << ": " +#define d(lvl) ldout((CephContext*)ioctx.cct(), (lvl)) + +enum { + P_FIRST = 0xe0000, + P_UPDATE_METADATA, + P_UPDATE_ALLOCATED, + P_UPDATE_SIZE, + P_UPDATE_VERSION, + P_SHRINK, + P_SHRINK_BYTES, + P_LOCK, + P_UNLOCK, + P_LAST, +}; + +int SimpleRADOSStriper::config_logger(CephContext* cct, std::string_view name, std::shared_ptr* l) +{ + PerfCountersBuilder plb(cct, name.data(), P_FIRST, P_LAST); + plb.add_u64_counter(P_UPDATE_METADATA, "update_metadata", "Number of metadata updates"); + plb.add_u64_counter(P_UPDATE_ALLOCATED, "update_allocated", "Number of allocated updates"); + plb.add_u64_counter(P_UPDATE_SIZE, "update_size", "Number of size updates"); + plb.add_u64_counter(P_UPDATE_VERSION, "update_version", "Number of version updates"); + plb.add_u64_counter(P_SHRINK, "shrink", "Number of allocation shrinks"); + plb.add_u64_counter(P_SHRINK_BYTES, "shrink_bytes", "Bytes shrunk"); + plb.add_u64_counter(P_LOCK, "lock", "Number of locks"); + plb.add_u64_counter(P_UNLOCK, "unlock", "Number of unlocks"); + l->reset(plb.create_perf_counters()); + return 0; +} + +SimpleRADOSStriper::~SimpleRADOSStriper() +{ + if (lock_keeper.joinable()) { + shutdown = true; + lock_keeper_cvar.notify_all(); + lock_keeper.join(); + } + + if (ioctx.is_valid()) { + d(5) << dendl; + + if (is_locked()) { + unlock(); + } + } +} + +SimpleRADOSStriper::extent SimpleRADOSStriper::get_next_extent(uint64_t off, size_t len) const +{ + extent e; + { + uint64_t stripe = (off>>object_size); + CachedStackStringStream css; + *css << oid; + *css << "."; + *css << std::setw(16) << std::setfill('0') << std::hex << stripe; + e.soid = css->str(); + } + e.off = off & ((1<(len, (1<wait_for_complete(); rc < 0) { + d(5) << " update failed: " << cpp_strerror(rc) << dendl; + return rc; + } + } + updates.clear(); + + return 0; +} + +int SimpleRADOSStriper::flush() +{ + d(5) << dendl; + + if (blocklisted.load()) { + return -EBLOCKLISTED; + } + + if (size_dirty) { + if (int rc = set_metadata(size, true); rc < 0) { + return rc; + } + } + + if (int rc = wait_for_updates(); rc < 0) { + return rc; + } + + return 0; +} + +int SimpleRADOSStriper::stat(uint64_t* s) +{ + d(5) << dendl; + + if (blocklisted.load()) { + return -EBLOCKLISTED; + } + + *s = size; + return 0; +} + +int SimpleRADOSStriper::create() +{ + d(5) << dendl; + + if (blocklisted.load()) { + return -EBLOCKLISTED; + } + + auto ext = get_first_extent(); + auto op = librados::ObjectWriteOperation(); + /* exclusive create ensures we do none of these setxattrs happen if it fails */ + op.create(1); + op.setxattr(XATTR_VERSION, uint2bl(0)); + op.setxattr(XATTR_EXCL, bufferlist()); + op.setxattr(XATTR_SIZE, uint2bl(0)); + op.setxattr(XATTR_ALLOCATED, uint2bl(0)); + op.setxattr(XATTR_LAYOUT_STRIPE_UNIT, uint2bl(1)); + op.setxattr(XATTR_LAYOUT_STRIPE_COUNT, uint2bl(1)); + op.setxattr(XATTR_LAYOUT_OBJECT_SIZE, uint2bl(1< removes; + + ceph_assert(a <= allocated); + uint64_t prune = std::max(a, (1u << object_size)); /* never delete first extent here */ + uint64_t len = allocated - prune; + const uint64_t bytes_removed = len; + uint64_t offset = prune; + while (len > 0) { + auto ext = get_next_extent(offset, len); + auto aiocp = aiocompletionptr(librados::Rados::aio_create_completion()); + if (int rc = ioctx.aio_remove(ext.soid, aiocp.get()); rc < 0) { + d(5) << " aio_remove failed: " << cpp_strerror(rc) << dendl; + return rc; + } + removes.emplace_back(std::move(aiocp)); + len -= ext.len; + offset += ext.len; + } + + for (auto& aiocp : removes) { + if (int rc = aiocp->wait_for_complete(); rc < 0 && rc != -ENOENT) { + d(5) << " aio_remove failed: " << cpp_strerror(rc) << dendl; + return rc; + } + } + + auto ext = get_first_extent(); + auto op = librados::ObjectWriteOperation(); + auto aiocp = aiocompletionptr(librados::Rados::aio_create_completion()); + op.setxattr(XATTR_ALLOCATED, uint2bl(a)); + d(15) << " updating allocated to " << a << dendl; + op.setxattr(XATTR_VERSION, uint2bl(version+1)); + d(15) << " updating version to " << (version+1) << dendl; + if (int rc = ioctx.aio_operate(ext.soid, aiocp.get(), &op); rc < 0) { + d(5) << " update failed: " << cpp_strerror(rc) << dendl; + return rc; + } + /* we need to wait so we don't have dangling extents */ + d(10) << " waiting for allocated update" << dendl; + if (int rc = aiocp->wait_for_complete(); rc < 0) { + d(1) << " update failure: " << cpp_strerror(rc) << dendl; + return rc; + } + if (logger) { + logger->inc(P_UPDATE_METADATA); + logger->inc(P_UPDATE_ALLOCATED); + logger->inc(P_UPDATE_VERSION); + logger->inc(P_SHRINK); + logger->inc(P_SHRINK_BYTES, bytes_removed); + } + + version += 1; + allocated = a; + return 0; +} + +int SimpleRADOSStriper::maybe_shrink_alloc() +{ + d(15) << dendl; + + if (size == 0) { + if (allocated > 0) { + d(10) << "allocation shrink to 0" << dendl; + return shrink_alloc(0); + } else { + return 0; + } + } + + uint64_t mask = (1< new_allocated && ((allocated-new_allocated) > min_growth)) { + d(10) << "allocation shrink to " << new_allocated << dendl; + return shrink_alloc(new_allocated); + } + + return 0; +} + +bufferlist SimpleRADOSStriper::str2bl(std::string_view sv) +{ + bufferlist bl; + bl.append(sv); + return bl; +} + +bufferlist SimpleRADOSStriper::uint2bl(uint64_t v) +{ + CachedStackStringStream css; + *css << std::dec << std::setw(16) << std::setfill('0') << v; + bufferlist bl; + bl.append(css->strv()); + return bl; +} + +int SimpleRADOSStriper::set_metadata(uint64_t new_size, bool update_size) +{ + d(10) << " new_size: " << new_size + << " update_size: " << update_size + << " allocated: " << allocated + << " size: " << size + << " version: " << version + << dendl; + + bool do_op = false; + auto new_allocated = allocated; + auto ext = get_first_extent(); + auto op = librados::ObjectWriteOperation(); + if (new_size > allocated) { + uint64_t mask = (1<inc(P_UPDATE_ALLOCATED); + d(15) << " updating allocated to " << new_allocated << dendl; + } + if (update_size) { + op.setxattr(XATTR_SIZE, uint2bl(new_size)); + do_op = true; + if (logger) logger->inc(P_UPDATE_SIZE); + d(15) << " updating size to " << new_size << dendl; + } + if (do_op) { + if (logger) logger->inc(P_UPDATE_METADATA); + if (logger) logger->inc(P_UPDATE_VERSION); + op.setxattr(XATTR_VERSION, uint2bl(version+1)); + d(15) << " updating version to " << (version+1) << dendl; + auto aiocp = aiocompletionptr(librados::Rados::aio_create_completion()); + if (int rc = ioctx.aio_operate(ext.soid, aiocp.get(), &op); rc < 0) { + d(1) << " update failure: " << cpp_strerror(rc) << dendl; + return rc; + } + version += 1; + if (allocated != new_allocated) { + /* we need to wait so we don't have dangling extents */ + d(10) << "waiting for allocated update" << dendl; + if (int rc = aiocp->wait_for_complete(); rc < 0) { + d(1) << " update failure: " << cpp_strerror(rc) << dendl; + return rc; + } + aiocp.reset(); + allocated = new_allocated; + } + if (aiocp) { + updates.emplace_back(std::move(aiocp)); + } + if (update_size) { + size = new_size; + size_dirty = false; + return maybe_shrink_alloc(); + } + } + return 0; +} + +ssize_t SimpleRADOSStriper::write(const void* data, size_t len, uint64_t off) +{ + d(5) << off << "~" << len << dendl; + + if (blocklisted.load()) { + return -EBLOCKLISTED; + } + + if (allocated < (len+off)) { + if (int rc = set_metadata(len+off, false); rc < 0) { + return rc; + } + } + + size_t w = 0; + while ((len-w) > 0) { + auto ext = get_next_extent(off+w, len-w); + auto aiocp = aiocompletionptr(librados::Rados::aio_create_completion()); + bufferlist bl; + bl.append((const char*)data+w, ext.len); + if (int rc = ioctx.aio_write(ext.soid, aiocp.get(), bl, ext.len, ext.off); rc < 0) { + break; + } + updates.emplace_back(std::move(aiocp)); + w += ext.len; + } + + if (size < (len+off)) { + size = len+off; + size_dirty = true; + d(10) << " dirty size: " << size << dendl; + } + + return (ssize_t)w; +} + +ssize_t SimpleRADOSStriper::read(void* data, size_t len, uint64_t off) +{ + d(5) << off << "~" << len << dendl; + + if (blocklisted.load()) { + return -EBLOCKLISTED; + } + + size_t r = 0; + std::vector> reads; + while ((len-r) > 0) { + auto ext = get_next_extent(off+r, len-r); + auto& [bl, aiocp] = reads.emplace_back(); + aiocp = aiocompletionptr(librados::Rados::aio_create_completion()); + if (int rc = ioctx.aio_read(ext.soid, aiocp.get(), &bl, ext.len, ext.off); rc < 0) { + d(1) << " read failure: " << cpp_strerror(rc) << dendl; + return rc; + } + r += ext.len; + } + + r = 0; + for (auto& [bl, aiocp] : reads) { + if (int rc = aiocp->wait_for_complete(); rc < 0) { + d(1) << " read failure: " << cpp_strerror(rc) << dendl; + return rc; + } + bl.begin().copy(bl.length(), ((char*)data)+r); + r += bl.length(); + } + ceph_assert(r <= len); + + return r; +} + +int SimpleRADOSStriper::print_lockers(std::ostream& out) +{ + int exclusive; + std::string tag; + std::list lockers; + auto ext = get_first_extent(); + if (int rc = ioctx.list_lockers(ext.soid, biglock, &exclusive, &tag, &lockers); rc < 0) { + d(1) << " list_lockers failure: " << cpp_strerror(rc) << dendl; + return rc; + } + if (lockers.empty()) { + out << " lockers none"; + } else { + out << " lockers exclusive=" << exclusive << " tag=" << tag << " lockers=["; + bool first = true; + for (const auto& l : lockers) { + if (!first) out << ","; + out << l.client << ":" << l.cookie << ":" << l.address; + } + out << "]"; + } + return 0; +} + +/* Do lock renewal in a separate thread: while it's unlikely sqlite chews on + * something for multiple seconds without calling into the VFS (where we could + * initiate a lock renewal), it's not impossible with complex queries. Also, we + * want to allow "PRAGMA locking_mode = exclusive" where the application may + * not use the sqlite3 database connection for an indeterminate amount of time. + */ +void SimpleRADOSStriper::lock_keeper_main(void) +{ + d(20) << dendl; + const auto ext = get_first_extent(); + while (!shutdown) { + d(20) << "tick" << dendl; + std::unique_lock lock(lock_keeper_mutex); + auto now = clock::now(); + auto since = now-last_renewal; + + if (since >= lock_keeper_interval && locked) { + d(10) << "renewing lock" << dendl; + auto tv = ceph::to_timeval(lock_keeper_timeout); + int rc = ioctx.lock_exclusive(ext.soid, biglock, cookie.to_string(), lockdesc, &tv, LIBRADOS_LOCK_FLAG_MUST_RENEW); + if (rc) { + /* If lock renewal fails, we cannot continue the application. Return + * -EBLOCKLISTED for all calls into the striper for this instance, even + * if we're not actually blocklisted. + */ + d(-1) << "lock renewal failed: " << cpp_strerror(rc) << dendl; + blocklisted = true; + break; + } + last_renewal = clock::now(); + } + + lock_keeper_cvar.wait_for(lock, lock_keeper_interval); + } +} + +int SimpleRADOSStriper::recover_lock() +{ + d(5) << "attempting to recover lock" << dendl; + + std::string addrs; + const auto ext = get_first_extent(); + + { + auto tv = ceph::to_timeval(lock_keeper_timeout); + if (int rc = ioctx.lock_exclusive(ext.soid, biglock, cookie.to_string(), lockdesc, &tv, 0); rc < 0) { + return rc; + } + locked = true; + last_renewal = clock::now(); + } + + d(5) << "acquired lock, fetching last owner" << dendl; + + { + bufferlist bl_excl; + if (int rc = ioctx.getxattr(ext.soid, XATTR_EXCL, bl_excl); rc < 0) { + if (rc == -ENOENT) { + /* someone removed it? ok... */ + goto setowner; + } else { + d(-1) << "could not recover exclusive locker" << dendl; + locked = false; /* it will drop eventually */ + return -EIO; + } + } + addrs = bl_excl.to_str(); + } + + if (addrs.empty()) { + d(5) << "someone else cleaned up" << dendl; + goto setowner; + } else { + d(5) << "exclusive lock holder was " << addrs << dendl; + } + + if (blocklist_the_dead) { + entity_addrvec_t addrv; + addrv.parse(addrs.c_str()); + auto R = librados::Rados(ioctx); + auto b = "blocklist"sv; +retry: + for (auto& a : addrv.v) { + CachedStackStringStream css; + *css << "{\"prefix\":\"osd " << b << "\", \"" << b << "op\":\"add\","; + *css << "\"addr\":\""; + *css << a; + *css << "\"}"; + std::vector cmd = {css->str()}; + d(5) << "sending blocklist command: " << cmd << dendl; + std::string out; + if (int rc = R.mon_command(css->str(), bufferlist(), nullptr, &out); rc < 0) { + if (rc == -EINVAL && b == "blocklist"sv) { + b = "blacklist"sv; + goto retry; + } + d(-1) << "Cannot proceed with recovery because I have failed to blocklist the old client: " << cpp_strerror(rc) << ", out = " << out << dendl; + locked = false; /* it will drop eventually */ + return -EIO; + } + } + /* Ensure our osd_op requests have the latest epoch. */ + R.wait_for_latest_osdmap(); + } + +setowner: + d(5) << "setting new owner to myself, " << myaddrs << dendl; + { + auto myaddrbl = str2bl(myaddrs); + if (int rc = ioctx.setxattr(ext.soid, XATTR_EXCL, myaddrbl); rc < 0) { + d(-1) << "could not set lock owner" << dendl; + locked = false; /* it will drop eventually */ + return -EIO; + } + } + return 0; +} + +int SimpleRADOSStriper::lock(uint64_t timeoutms) +{ + /* XXX: timeoutms is unused */ + d(5) << "timeout=" << timeoutms << dendl; + + if (blocklisted.load()) { + return -EBLOCKLISTED; + } + + std::scoped_lock lock(lock_keeper_mutex); + + ceph_assert(!is_locked()); + + /* We're going to be very lazy here in implementation: only exclusive locks + * are allowed. That even ensures a single reader. + */ + uint64_t slept = 0; + + auto ext = get_first_extent(); + while (true) { + /* The general fast path in one compound operation: obtain the lock, + * confirm the past locker cleaned up after themselves (set XATTR_EXCL to + * ""), then finally set XATTR_EXCL to our address vector as the new + * exclusive locker. + */ + + auto op = librados::ObjectWriteOperation(); + auto tv = ceph::to_timeval(lock_keeper_timeout); + utime_t duration; + duration.set_from_timeval(&tv); + rados::cls::lock::lock(&op, biglock, ClsLockType::EXCLUSIVE, cookie.to_string(), "", lockdesc, duration, 0); + op.cmpxattr(XATTR_EXCL, LIBRADOS_CMPXATTR_OP_EQ, bufferlist()); + op.setxattr(XATTR_EXCL, str2bl(myaddrs)); + int rc = ioctx.operate(ext.soid, &op); + if (rc == 0) { + locked = true; + last_renewal = clock::now(); + break; + } else if (rc == -EBUSY) { + if ((slept % 500000) == 0) { + d(-1) << "waiting for locks: "; + print_lockers(*_dout); + *_dout << dendl; + } + usleep(5000); + slept += 5000; + continue; + } else if (rc == -ECANCELED) { + /* CMPXATTR failed, a locker didn't cleanup. Try to recover! */ + if (rc = recover_lock(); rc < 0) { + if (rc == -EBUSY) { + continue; /* try again */ + } + return rc; + } + break; + } else { + d(-1) << " lock failed: " << cpp_strerror(rc) << dendl; + return rc; + } + } + + if (!lock_keeper.joinable()) { + lock_keeper = std::thread(&SimpleRADOSStriper::lock_keeper_main, this); + } + + if (int rc = open(); rc < 0) { + d(5) << " open failed: " << cpp_strerror(rc) << dendl; + return rc; + } + + d(5) << " = 0" << dendl; + if (logger) { + logger->inc(P_LOCK); + } + + return 0; +} + +int SimpleRADOSStriper::unlock() +{ + d(5) << dendl; + + if (blocklisted.load()) { + return -EBLOCKLISTED; + } + + std::scoped_lock lock(lock_keeper_mutex); + + ceph_assert(is_locked()); + + /* wait for flush of metadata */ + if (int rc = flush(); rc < 0) { + return rc; + } + + const auto ext = get_first_extent(); + auto op = librados::ObjectWriteOperation(); + op.cmpxattr(XATTR_EXCL, LIBRADOS_CMPXATTR_OP_EQ, str2bl(myaddrs)); + op.setxattr(XATTR_EXCL, bufferlist()); + rados::cls::lock::unlock(&op, biglock, cookie.to_string()); + if (int rc = ioctx.operate(ext.soid, &op); rc < 0) { + d(-1) << " unlock failed: " << cpp_strerror(rc) << dendl; + return rc; + } + locked = false; + + d(5) << " = 0" << dendl; + if (logger) { + logger->inc(P_UNLOCK); + } + + return 0; +} diff --git a/src/SimpleRADOSStriper.h b/src/SimpleRADOSStriper.h new file mode 100644 index 00000000000..2472eb7dda9 --- /dev/null +++ b/src/SimpleRADOSStriper.h @@ -0,0 +1,137 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2021 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License version 2.1, as published by + * the Free Software Foundation. See file COPYING. + * + */ + +#ifndef _SIMPLERADOSSTRIPER_H +#define _SIMPLERADOSSTRIPER_H + +#include +#include + +#include "include/buffer.h" +#include "include/rados/librados.hpp" +#include "include/uuid.h" +#include "include/types.h" + +#include "common/ceph_time.h" +#include "common/perf_counters.h" + +class SimpleRADOSStriper +{ +public: + using aiocompletionptr = std::unique_ptr; + using clock = ceph::coarse_mono_clock; + using time = ceph::coarse_mono_time; + + static inline const uint64_t object_size = 22; /* power of 2 */ + static inline const uint64_t min_growth = (1<<27); /* 128 MB */ + static int config_logger(CephContext* cct, std::string_view name, std::shared_ptr* l); + + SimpleRADOSStriper() = default; + SimpleRADOSStriper(librados::IoCtx _ioctx, std::string _oid) + : ioctx(std::move(_ioctx)) + , oid(std::move(_oid)) + { + cookie.generate_random(); + auto r = librados::Rados(ioctx); + myaddrs = r.get_addrs(); + } + SimpleRADOSStriper(const SimpleRADOSStriper&) = delete; + SimpleRADOSStriper& operator=(const SimpleRADOSStriper&) = delete; + SimpleRADOSStriper& operator=(SimpleRADOSStriper&&) = delete; + SimpleRADOSStriper(SimpleRADOSStriper&&) = delete; + ~SimpleRADOSStriper(); + + int create(); + int open(); + int remove(); + int stat(uint64_t* size); + ssize_t write(const void* data, size_t len, uint64_t off); + ssize_t read(void* data, size_t len, uint64_t off); + int truncate(size_t size); + int flush(); + int lock(uint64_t timeoutms); + int unlock(); + int is_locked() const { + return locked; + } + int print_lockers(std::ostream& out); + void set_logger(std::shared_ptr l) { + logger = std::move(l); + } + void set_lock_interval(std::chrono::milliseconds t) { + lock_keeper_interval = t; + } + void set_lock_timeout(std::chrono::milliseconds t) { + lock_keeper_timeout = t; + } + void set_blocklist_the_dead(bool b) { + blocklist_the_dead = b; + } + +protected: + struct extent { + std::string soid; + size_t len; + size_t off; + }; + + ceph::bufferlist str2bl(std::string_view sv); + ceph::bufferlist uint2bl(uint64_t v); + int set_metadata(uint64_t new_size, bool update_size); + int shrink_alloc(uint64_t a); + int maybe_shrink_alloc(); + int wait_for_updates(); + int recover_lock(); + extent get_next_extent(uint64_t off, size_t len) const; + extent get_first_extent() const { + return get_next_extent(0, 0); + } + +private: + static inline const char XATTR_EXCL[] = "striper.excl"; + static inline const char XATTR_SIZE[] = "striper.size"; + static inline const char XATTR_ALLOCATED[] = "striper.allocated"; + static inline const char XATTR_VERSION[] = "striper.version"; + static inline const char XATTR_LAYOUT_STRIPE_UNIT[] = "striper.layout.stripe_unit"; + static inline const char XATTR_LAYOUT_STRIPE_COUNT[] = "striper.layout.stripe_count"; + static inline const char XATTR_LAYOUT_OBJECT_SIZE[] = "striper.layout.object_size"; + static inline const std::string biglock = "striper.lock"; + static inline const std::string lockdesc = "SimpleRADOSStriper"; + + void lock_keeper_main(); + + librados::IoCtx ioctx; + std::shared_ptr logger; + std::string oid; + std::thread lock_keeper; + std::condition_variable lock_keeper_cvar; + std::mutex lock_keeper_mutex; + time last_renewal = time::min(); + std::chrono::milliseconds lock_keeper_interval = 2000ms; + std::chrono::milliseconds lock_keeper_timeout = 30000ms; + std::atomic blocklisted = false; + bool shutdown = false; + version_t version = 0; + std::string exclusive_holder; + uint64_t size = 0; + uint64_t allocated = 0; + uuid_d cookie{}; + bool locked = false; + bool size_dirty = false; + bool blocklist_the_dead = true; + std::vector updates; + std::string myaddrs; +}; + +#endif /* _SIMPLERADOSSTRIPER_H */ From 458c6e7c12a6030d07cad2f1d132a43ec1c97fb4 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Sat, 13 Mar 2021 13:39:50 -0800 Subject: [PATCH 11/18] SimpleRADOSStriper: wait for finished aios after write So the "aios" (better named) vector doesn't grow infinitely. Signed-off-by: Patrick Donnelly --- src/SimpleRADOSStriper.cc | 42 ++++++++++++++++++++++++++------------- src/SimpleRADOSStriper.h | 6 ++++-- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/SimpleRADOSStriper.cc b/src/SimpleRADOSStriper.cc index 1c2b96c2973..736e8769d43 100644 --- a/src/SimpleRADOSStriper.cc +++ b/src/SimpleRADOSStriper.cc @@ -120,7 +120,8 @@ int SimpleRADOSStriper::remove() return -EBLOCKLISTED; } - if (int rc = wait_for_updates(); rc < 0) { + if (int rc = wait_for_aios(true); rc < 0) { + aios_failure = 0; return rc; } @@ -155,19 +156,29 @@ int SimpleRADOSStriper::truncate(uint64_t size) return 0; } -int SimpleRADOSStriper::wait_for_updates() +int SimpleRADOSStriper::wait_for_aios(bool block) { - d(10) << dendl; - - for (auto& aiocp : updates) { - if (int rc = aiocp->wait_for_complete(); rc < 0) { - d(5) << " update failed: " << cpp_strerror(rc) << dendl; - return rc; + while (!aios.empty()) { + auto& aiocp = aios.front(); + int rc; + if (block) { + rc = aiocp->wait_for_complete(); + } else { + if (aiocp->is_complete()) { + rc = aiocp->get_return_value(); + } else { + return 0; + } } + if (rc) { + d(5) << " aio failed: " << cpp_strerror(rc) << dendl; + if (aios_failure == 0) { + aios_failure = rc; + } + } + aios.pop(); } - updates.clear(); - - return 0; + return aios_failure; } int SimpleRADOSStriper::flush() @@ -184,7 +195,8 @@ int SimpleRADOSStriper::flush() } } - if (int rc = wait_for_updates(); rc < 0) { + if (int rc = wait_for_aios(true); rc < 0) { + aios_failure = 0; return rc; } @@ -418,7 +430,7 @@ int SimpleRADOSStriper::set_metadata(uint64_t new_size, bool update_size) allocated = new_allocated; } if (aiocp) { - updates.emplace_back(std::move(aiocp)); + aios.emplace(std::move(aiocp)); } if (update_size) { size = new_size; @@ -452,10 +464,12 @@ ssize_t SimpleRADOSStriper::write(const void* data, size_t len, uint64_t off) if (int rc = ioctx.aio_write(ext.soid, aiocp.get(), bl, ext.len, ext.off); rc < 0) { break; } - updates.emplace_back(std::move(aiocp)); + aios.emplace(std::move(aiocp)); w += ext.len; } + wait_for_aios(false); // clean up finished completions + if (size < (len+off)) { size = len+off; size_dirty = true; diff --git a/src/SimpleRADOSStriper.h b/src/SimpleRADOSStriper.h index 2472eb7dda9..2e7984870f9 100644 --- a/src/SimpleRADOSStriper.h +++ b/src/SimpleRADOSStriper.h @@ -15,6 +15,7 @@ #ifndef _SIMPLERADOSSTRIPER_H #define _SIMPLERADOSSTRIPER_H +#include #include #include @@ -91,7 +92,7 @@ protected: int set_metadata(uint64_t new_size, bool update_size); int shrink_alloc(uint64_t a); int maybe_shrink_alloc(); - int wait_for_updates(); + int wait_for_aios(bool block); int recover_lock(); extent get_next_extent(uint64_t off, size_t len) const; extent get_first_extent() const { @@ -130,7 +131,8 @@ private: bool locked = false; bool size_dirty = false; bool blocklist_the_dead = true; - std::vector updates; + std::queue aios; + int aios_failure = 0; std::string myaddrs; }; From f7fa3b50c1e87ad14b16024ec68812195480b102 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 28 Jan 2021 15:11:50 -0800 Subject: [PATCH 12/18] libcephsqlite: rework architecture and backend This is a complete rewrite on top of SimpleRADOSStriper as well as the API. The VFS is now a loadable extension as well. Fixes: https://tracker.ceph.com/issues/40609 Signed-off-by: Patrick Donnelly --- CMakeLists.txt | 9 +- cmake/modules/FindSQLite3.cmake | 14 +- src/CMakeLists.txt | 13 +- src/common/options.cc | 22 + src/common/subsys.h | 1 + src/include/CMakeLists.txt | 4 + src/include/libcephsqlite.h | 67 ++ src/libcephsqlite.cc | 1547 ++++++++++++++----------------- src/libcephsqlite.h | 24 - 9 files changed, 829 insertions(+), 872 deletions(-) create mode 100644 src/include/libcephsqlite.h delete mode 100644 src/libcephsqlite.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c44b7fc05d..d571b4b8d3a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -281,12 +281,9 @@ endif(WITH_QATZIP) # needs mds and? XXX option(WITH_LIBCEPHFS "libcephfs client library" ON) -find_package(SQLite3) -if(SQLITE3_FOUND) - option(WITH_LIBCEPHSQLITE "libcephsqlite client library" ON) -else() - message(WARNING "disabling WITH_LIBCEPHSQLITE, which depends on SQLite3") - set(WITH_LIBCEPHSQLITE OFF) +option(WITH_LIBCEPHSQLITE "libcephsqlite client library" ON) +if(WITH_LIBCEPHSQLITE) + find_package(SQLite3 REQUIRED) endif() # key-value store diff --git a/cmake/modules/FindSQLite3.cmake b/cmake/modules/FindSQLite3.cmake index 3d96d66524e..33e54d9e0c8 100644 --- a/cmake/modules/FindSQLite3.cmake +++ b/cmake/modules/FindSQLite3.cmake @@ -1,10 +1,12 @@ -find_path(SQLITE3_INCLUDE_DIR NAMES sqlite3.h PATHS /usr/include /usr/local/include) -find_library(SQLITE3_LIBRARY NAMES sqlite3) +find_path(SQLite3_INCLUDE_DIR NAMES sqlite3.h) +find_library(SQLite3_LIBRARY NAMES sqlite3 sqlite) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(SQLITE3 DEFAULT_MSG SQLITE3_LIBRARY SQLITE3_INCLUDE_DIR) +find_package_handle_standard_args(SQLite3 DEFAULT_MSG SQLite3_LIBRARY SQLite3_INCLUDE_DIR) -if(SQLITE3_FOUND) - set(SQLITE3_LIBRARIES ${SQLITE3_LIBRARY}) - set(SQLITE3_INCLUDE_DIRS ${SQLITE3_INCLUDE_DIR}) +if(NOT TARGET SQLite3::SQLite3) + add_library(SQLite3::SQLite3 UNKNOWN IMPORTED) + set_target_properties(SQLite3::SQLite3 PROPERTIES + IMPORTED_LOCATION "${SQLite3_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${SQLite3_INCLUDE_DIR}") endif() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dc5de2c6c1a..f31e888228a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -804,16 +804,9 @@ if(WITH_LIBCEPHFS) endif(WITH_LIBCEPHFS) if(WITH_LIBCEPHSQLITE) - set(libcephsqlite_srcs libcephsqlite.cc) - add_library(cephsqlite ${CEPH_SHARED} ${libcephsqlite_srcs}) - set_target_properties(cephsqlite PROPERTIES COMPILE_FLAGS "-I${SQLITE3_INCLUDE_DIRS}") - target_link_libraries(cephsqlite radosstriper librados ${SQLITE3_LIBRARIES} ${EXTRALIBS}) - if(ENABLE_SHARED) - set_target_properties(cephsqlite PROPERTIES - OUTPUT_NAME cephsqlite - VERSION 1.0.0 - SOVERSION 1) - endif(ENABLE_SHARED) + set(cephsqlite_srcs libcephsqlite.cc SimpleRADOSStriper.cc) + add_library(cephsqlite ${CEPH_SHARED} ${cephsqlite_srcs}) + target_link_libraries(cephsqlite PRIVATE cls_lock_client librados ceph-common SQLite3::SQLite3 ${EXTRALIBS}) install(TARGETS cephsqlite DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif(WITH_LIBCEPHSQLITE) diff --git a/src/common/options.cc b/src/common/options.cc index 87ff8789d64..aa1a097f280 100644 --- a/src/common/options.cc +++ b/src/common/options.cc @@ -5594,6 +5594,28 @@ std::vector