From f1113512d154683dd069409d3f2e11cf93fb334e Mon Sep 17 00:00:00 2001 From: changtao Date: Fri, 27 Nov 2015 21:54:27 +0800 Subject: [PATCH 1/4] FileStore: Use pwritev instead of writev Signed-off-by: Tao Chang --- src/common/buffer.cc | 53 +++++++++++++++++++++++++++++++++++ src/include/buffer.h | 1 + src/os/filestore/FileStore.cc | 19 +------------ src/test/bufferlist.cc | 18 ++++++++++++ 4 files changed, 73 insertions(+), 18 deletions(-) diff --git a/src/common/buffer.cc b/src/common/buffer.cc index e2de936d8c4..f3c6f6dfe62 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -2007,6 +2007,59 @@ int buffer::list::write_fd(int fd) const return 0; } +int buffer::list::write_fd(int fd, uint64_t offset) const +{ + // use writev! + iovec iov[IOV_MAX]; + int iovlen = 0; + ssize_t bytes = 0; + + std::list::const_iterator p = _buffers.begin(); + while (p != _buffers.end()) { + if (p->length() > 0) { + iov[iovlen].iov_base = (void *)p->c_str(); + iov[iovlen].iov_len = p->length(); + bytes += p->length(); + iovlen++; + } + ++p; + + if (iovlen == IOV_MAX-1 || + p == _buffers.end()) { + iovec *start = iov; + int num = iovlen; + ssize_t wrote; + retry: + wrote = ::pwritev(fd, start, num, offset); + if (wrote < 0) { + int err = errno; + if (err == EINTR) + goto retry; + return -err; + } + offset += wrote; + if (wrote < bytes) { + // partial write, recover! 
+ while ((size_t)wrote >= start[0].iov_len) { + wrote -= start[0].iov_len; + bytes -= start[0].iov_len; + start++; + num--; + } + if (wrote > 0) { + start[0].iov_len -= wrote; + start[0].iov_base = (char *)start[0].iov_base + wrote; + bytes -= wrote; + } + goto retry; + } + iovlen = 0; + bytes = 0; + } + } + return 0; +} + void buffer::list::prepare_iov(std::vector *piov) const { piov->resize(_buffers.size()); diff --git a/src/include/buffer.h b/src/include/buffer.h index 5a8b05f5125..428f0e3fcac 100644 --- a/src/include/buffer.h +++ b/src/include/buffer.h @@ -528,6 +528,7 @@ namespace buffer CEPH_BUFFER_API { int read_fd_zero_copy(int fd, size_t len); int write_file(const char *fn, int mode=0644); int write_fd(int fd) const; + int write_fd(int fd, uint64_t offset) const; int write_fd_zero_copy(int fd) const; void prepare_iov(std::vector *piov) const; uint32_t crc32c(uint32_t crc) const; diff --git a/src/os/filestore/FileStore.cc b/src/os/filestore/FileStore.cc index 5cb73e7b01a..89d728e34a7 100644 --- a/src/os/filestore/FileStore.cc +++ b/src/os/filestore/FileStore.cc @@ -3188,8 +3188,6 @@ int FileStore::_write(coll_t cid, const ghobject_t& oid, dout(15) << "write " << cid << "/" << oid << " " << offset << "~" << len << dendl; int r; - int64_t actual; - FDRef fd; r = lfn_open(cid, oid, true, &fd); if (r < 0) { @@ -3199,23 +3197,8 @@ int FileStore::_write(coll_t cid, const ghobject_t& oid, goto out; } - // seek - actual = ::lseek64(**fd, offset, SEEK_SET); - if (actual < 0) { - r = -errno; - dout(0) << "write lseek64 to " << offset << " failed: " << cpp_strerror(r) << dendl; - lfn_close(fd); - goto out; - } - if (actual != (int64_t)offset) { - dout(0) << "write lseek64 to " << offset << " gave bad offset " << actual << dendl; - r = -EIO; - lfn_close(fd); - goto out; - } - // write - r = bl.write_fd(**fd); + r = bl.write_fd(**fd, offset); if (r == 0) r = bl.length(); diff --git a/src/test/bufferlist.cc b/src/test/bufferlist.cc index 77ca3ec2750..06e10074877 100644 
--- a/src/test/bufferlist.cc +++ b/src/test/bufferlist.cc @@ -2143,6 +2143,24 @@ TEST(BufferList, write_fd) { ::unlink(FILENAME); } +TEST(BufferList, write_fd_offset) { + ::unlink(FILENAME); + int fd = ::open(FILENAME, O_WRONLY|O_CREAT|O_TRUNC, 0600); + bufferlist bl; + for (unsigned i = 0; i < IOV_MAX * 2; i++) { + bufferptr ptr("A", 1); + bl.push_back(ptr); + } + uint64_t offset = 200; + EXPECT_EQ(0, bl.write_fd(fd, offset)); + ::close(fd); + struct stat st; + memset(&st, 0, sizeof(st)); + ::stat(FILENAME, &st); + EXPECT_EQ(IOV_MAX * 2 + offset, st.st_size); + ::unlink(FILENAME); +} + TEST(BufferList, crc32c) { bufferlist bl; __u32 crc = 0; From 75a9ee20617d33513fabb433493b9c06015728b8 Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Mon, 25 Jan 2016 21:45:36 +0800 Subject: [PATCH 2/4] buffer: refactor write_fd to make it more readable Signed-off-by: Haomai Wang --- src/common/buffer.cc | 83 +++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/src/common/buffer.cc b/src/common/buffer.cc index f3c6f6dfe62..0cea23f9ba3 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -1952,6 +1952,36 @@ int buffer::list::write_file(const char *fn, int mode) return 0; } +static int do_writev(int fd, struct iovec *vec, uint64_t offset, unsigned veclen, unsigned bytes) +{ + while (bytes > 0) { + ssize_t r = ::pwritev(fd, vec, veclen, offset); + if (r < 0) { + if (errno == EINTR) + continue; + return -errno; + } + + bytes -= r; + offset += r; + if (bytes == 0) break; + + while (r > 0) { + if (vec[0].iov_len <= (size_t)r) { + // drain this whole item + r -= vec[0].iov_len; + ++vec; + --veclen; + } else { + vec[0].iov_base = (char *)vec[0].iov_base + r; + vec[0].iov_len -= r; + break; + } + } + } + return 0; +} + int buffer::list::write_fd(int fd) const { if (can_zero_copy()) @@ -2009,53 +2039,28 @@ int buffer::list::write_fd(int fd) const int buffer::list::write_fd(int fd, uint64_t offset) const { - // use writev! 
iovec iov[IOV_MAX]; - int iovlen = 0; - ssize_t bytes = 0; std::list::const_iterator p = _buffers.begin(); - while (p != _buffers.end()) { - if (p->length() > 0) { + uint64_t left_pbrs = _buffers.size(); + while (left_pbrs) { + ssize_t bytes = 0; + unsigned iovlen = 0; + uint64_t size = MIN(left_pbrs, IOV_MAX); + left_pbrs -= size; + while (size > 0) { iov[iovlen].iov_base = (void *)p->c_str(); iov[iovlen].iov_len = p->length(); - bytes += p->length(); iovlen++; + bytes += p->length(); + ++p; + size--; } - ++p; - if (iovlen == IOV_MAX-1 || - p == _buffers.end()) { - iovec *start = iov; - int num = iovlen; - ssize_t wrote; - retry: - wrote = ::pwritev(fd, start, num, offset); - if (wrote < 0) { - int err = errno; - if (err == EINTR) - goto retry; - return -err; - } - offset += wrote; - if (wrote < bytes) { - // partial write, recover! - while ((size_t)wrote >= start[0].iov_len) { - wrote -= start[0].iov_len; - bytes -= start[0].iov_len; - start++; - num--; - } - if (wrote > 0) { - start[0].iov_len -= wrote; - start[0].iov_base = (char *)start[0].iov_base + wrote; - bytes -= wrote; - } - goto retry; - } - iovlen = 0; - bytes = 0; - } + int r = do_writev(fd, iov, offset, iovlen, bytes); + if (r < 0) + return r; + offset += bytes; } return 0; } From aa7b5056ba031107c7270d6173aab2ed0c6e484c Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Mon, 25 Jan 2016 23:21:29 +0800 Subject: [PATCH 3/4] cmake: add pwritev detection (HAVE_PWRITEV) Signed-off-by: Haomai Wang --- CMakeLists.txt | 1 + src/include/config-h.in.cmake | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b1bd607a36..8b8dbed8f66 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,7 @@ CHECK_FUNCTION_EXISTS(posix_fallocate HAVE_POSIX_FALLOCATE) CHECK_FUNCTION_EXISTS(syncfs HAVE_SYS_SYNCFS) CHECK_FUNCTION_EXISTS(sync_file_range HAVE_SYNC_FILE_RANGE) CHECK_FUNCTION_EXISTS(mallinfo HAVE_MALLINFO) +CHECK_FUNCTION_EXISTS(pwritev HAVE_PWRITEV) 
CHECK_INCLUDE_FILES("arpa/inet.h" HAVE_ARPA_INET_H) CHECK_INCLUDE_FILES("boost/random/discrete_distribution.hpp" HAVE_BOOST_RANDOM_DISCRETE_DISTRIBUTION) CHECK_INCLUDE_FILES("dirent.h" HAVE_DIRENT_H) diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake index 89d7792ee8a..ceedb6e9dc6 100644 --- a/src/include/config-h.in.cmake +++ b/src/include/config-h.in.cmake @@ -123,6 +123,9 @@ /* Define to 1 if you have the `syncfs' function. */ #cmakedefine HAVE_SYNCFS 1 +/* Define to 1 if you have the `pwritev' function. */ +#cmakedefine HAVE_PWRITEV 1 + /* sync_file_range(2) is supported */ #cmakedefine HAVE_SYNC_FILE_RANGE From b1e27a59c76a39e9cdcc85f3ed712ca74f45d2f0 Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Mon, 25 Jan 2016 23:18:58 +0800 Subject: [PATCH 4/4] configure.ac: add pwritev check Signed-off-by: Haomai Wang --- configure.ac | 1 + src/common/buffer.cc | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 57d73e23601..908319b1a29 100644 --- a/configure.ac +++ b/configure.ac @@ -1065,6 +1065,7 @@ AC_CHECK_HEADERS([sys/prctl.h]) AC_CHECK_FUNCS([prctl]) AC_CHECK_FUNCS([pipe2]) AC_CHECK_FUNCS([posix_fadvise]) +AC_CHECK_FUNCS([pwritev], AC_DEFINE([HAVE_PWRITEV], [1], [we have pwritev])) AC_MSG_CHECKING([for fdatasync]) AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ diff --git a/src/common/buffer.cc b/src/common/buffer.cc index 0cea23f9ba3..ee50c0efabf 100644 --- a/src/common/buffer.cc +++ b/src/common/buffer.cc @@ -1954,8 +1954,18 @@ int buffer::list::write_file(const char *fn, int mode) static int do_writev(int fd, struct iovec *vec, uint64_t offset, unsigned veclen, unsigned bytes) { + ssize_t r = 0; while (bytes > 0) { - ssize_t r = ::pwritev(fd, vec, veclen, offset); +#ifdef HAVE_PWRITEV + r = ::pwritev(fd, vec, veclen, offset); +#else + r = ::lseek64(fd, offset, SEEK_SET); + if (r != offset) { + r = -errno; + return r; + } + r = ::writev(fd, vec, veclen); +#endif if (r < 0) { if (errno 
== EINTR) continue;