From 4a277dfdb020651506ffd47cc8337bcfe90d191e Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 25 Oct 2022 10:24:55 -0400 Subject: [PATCH 1/4] log: reorg header To follow code style guidelines. Signed-off-by: Patrick Donnelly --- src/log/Log.h | 97 ++++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/src/log/Log.h b/src/log/Log.h index 2d7afa87322..51ed58d9289 100644 --- a/src/log/Log.h +++ b/src/log/Log.h @@ -29,8 +29,58 @@ class SubsystemMap; class Log : private Thread { +public: + using Thread::is_started; + + Log(const SubsystemMap *s); + ~Log() override; + + void set_flush_on_exit(); + + void set_coarse_timestamps(bool coarse); + void set_max_new(std::size_t n); + void set_max_recent(std::size_t n); + void set_log_file(std::string_view fn); + void reopen_log_file(); + void chown_log_file(uid_t uid, gid_t gid); + void set_log_stderr_prefix(std::string_view p); + + void flush(); + + void dump_recent(); + + void set_syslog_level(int log, int crash); + void set_stderr_level(int log, int crash); + void set_graylog_level(int log, int crash); + + void start_graylog(const std::string& host, + const uuid_d& fsid); + void stop_graylog(); + + void set_journald_level(int log, int crash); + + void start_journald_logger(); + void stop_journald_logger(); + + std::shared_ptr graylog() { return m_graylog; } + + void submit_entry(Entry&& e); + + void start(); + void stop(); + + /// true if the log lock is held by our thread + bool is_inside_log_lock(); + + /// induce a segv on the next log event + void inject_segv(); + void reset_segv(); + protected: using EntryVector = std::vector; + + virtual void _flush(EntryVector& q, bool crash); + private: using EntryRing = boost::circular_buffer; @@ -83,55 +133,8 @@ private: void _log_safe_write(std::string_view sv); void _flush_logbuf(); void _log_message(std::string_view s, bool crash); -protected: - virtual void _flush(EntryVector& q, bool crash); -public: - using Thread::is_started; - Log(const SubsystemMap *s); - ~Log() override; - - void set_flush_on_exit(); - - void set_coarse_timestamps(bool coarse); - void set_max_new(std::size_t n); - void set_max_recent(std::size_t n); - void set_log_file(std::string_view fn); - void reopen_log_file(); - void chown_log_file(uid_t uid, gid_t gid); - void set_log_stderr_prefix(std::string_view p); - - void flush(); - - void dump_recent(); - - void set_syslog_level(int log, int crash); - void set_stderr_level(int log, int crash); - void set_graylog_level(int log, int crash); - - void start_graylog(const std::string& host, - const uuid_d& fsid); - void stop_graylog(); - - void set_journald_level(int log, int crash); - - void start_journald_logger(); - void stop_journald_logger(); - - std::shared_ptr graylog() { return m_graylog; } - - void submit_entry(Entry&& e); - - void start(); - void stop(); - - /// true if the log lock is held by our thread - bool is_inside_log_lock(); - - /// induce a segv on the next log event - void inject_segv(); - void reset_segv(); }; } From 61396ab91a266c217b618b4da139ad9858bd60ef Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 25 Oct 2022 10:26:36 -0400 Subject: [PATCH 2/4] log: invalidate m_fd on close Signed-off-by: Patrick Donnelly --- src/log/Log.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/log/Log.cc b/src/log/Log.cc index da645ce8b40..78bedf03cfa 100644 --- a/src/log/Log.cc +++ b/src/log/Log.cc @@ -58,8 +58,10 @@ Log::~Log() } ceph_assert(!is_started()); - if (m_fd >= 0) + if (m_fd >= 0) { VOID_TEMP_FAILURE_RETRY(::close(m_fd)); + m_fd = -1; + } } @@ -116,8 +118,10 @@ void Log::reopen_log_file() return; } m_flush_mutex_holder = pthread_self(); - if (m_fd >= 0) + if (m_fd >= 0) { VOID_TEMP_FAILURE_RETRY(::close(m_fd)); + m_fd = -1; + } if (m_log_file.length()) { m_fd = ::open(m_log_file.c_str(), O_CREAT|O_WRONLY|O_APPEND|O_CLOEXEC, 0644); if (m_fd >= 0 && (m_uid || m_gid)) { @@ -127,8 +131,6 @@ void Log::reopen_log_file() << std::endl; } } - } else { - m_fd = -1; } m_flush_mutex_holder = 0; } From 2551130bcc12c55ddaa879b4f5b77c81890b51b2 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 25 Oct 2022 11:42:10 -0400 Subject: [PATCH 3/4] log: use non-blocking atomic writes to stderr fifos Buggy container runtimes are known to do partial reads on container's stderr and emit torn logs [1]. This is due to write(2) allowing partial data transfer. One way to avoid this when stderr is a FIFO (which is the case for container runtimes) is to change the file flags to set O_NONBLOCK which alters the behavior of write(2) when the write size is less than PIPE_BUF. This makes a write guaranteed to be atomic. The small complexity added is to poll on stderr in this case whenever a write(2) fails with EAGAIN. This does not block the log flush thread in any new way. It would always block when the stderr FIFO was full. [1] https://github.com/containers/conmon/issues/242 Fixes: cef04bc4ed0a3b1c1e9873fbbdd023b3bdf8af03 Fixes: https://tracker.ceph.com/issues/57923 Related: https://bugzilla.redhat.com/show_bug.cgi?id=1996599 Related: https://bugzilla.redhat.com/show_bug.cgi?id=2108228 Signed-off-by: Patrick Donnelly --- src/log/Log.cc | 100 ++++++++++++++++++++++++++++++++++++++++++++++++- src/log/Log.h | 9 +++++ 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/src/log/Log.cc b/src/log/Log.cc index 78bedf03cfa..967bcd5dcb7 100644 --- a/src/log/Log.cc +++ b/src/log/Log.cc @@ -18,10 +18,14 @@ #include "LogClock.h" #include "SubsystemMap.h" +#include + #include #include +#include #include +#include #include #include @@ -49,6 +53,7 @@ Log::Log(const SubsystemMap *s) m_recent(DEFAULT_MAX_RECENT) { m_log_buf.reserve(MAX_LOG_BUF); + _configure_stderr(); } Log::~Log() @@ -64,6 +69,36 @@ Log::~Log() } } +void Log::_configure_stderr() +{ + struct stat info; + if (int rc = fstat(m_fd_stderr, &info); rc == -1) { + std::cerr << "failed to stat stderr: " << cpp_strerror(errno) << std::endl; + return; + } + + if (S_ISFIFO(info.st_mode)) { + /* Set O_NONBLOCK on FIFO stderr file. We want to ensure atomic debug log + * writes so they do not get partially read by e.g. buggy container + * runtimes. See also IEEE Std 1003.1-2017 and Log::_log_stderr below. + */ + int flags = fcntl(m_fd_stderr, F_GETFL); + if (flags == -1) { + std::cerr << "failed to get fcntl flags for stderr: " << cpp_strerror(errno) << std::endl; + return; + } + if (!(flags & O_NONBLOCK)) { + flags |= O_NONBLOCK; + flags = fcntl(m_fd_stderr, F_SETFL, flags); + if (flags == -1) { + std::cerr << "failed to set fcntl flags for stderr: " << cpp_strerror(errno) << std::endl; + return; + } + } + do_stderr_poll = true; + } +} + /// void Log::set_coarse_timestamps(bool coarse) { @@ -260,6 +295,69 @@ void Log::_log_safe_write(std::string_view sv) } } +void Log::set_stderr_fd(int fd) +{ + m_fd_stderr = fd; + _configure_stderr(); +} + +void Log::_log_stderr(std::string_view strv) +{ + if (do_stderr_poll) { + auto& prefix = m_log_stderr_prefix; + size_t const len = prefix.size() + strv.size(); + boost::container::small_vector buf; + buf.resize(len+1, '\0'); + memcpy(buf.data(), prefix.c_str(), prefix.size()); + memcpy(buf.data()+prefix.size(), strv.data(), strv.size()); + + char const* const start = buf.data(); + char const* current = start; + while ((size_t)(current-start) < len) { + auto chunk = std::min(PIPE_BUF, len-(ssize_t)(current-start)); + while (1) { + ssize_t rc = write(m_fd_stderr, current, chunk); + if (rc == chunk) { + current += chunk; + break; + } else if (rc > 0) { + /* According to IEEE Std 1003.1-2017, this cannot happen: + * + * Write requests to a pipe or FIFO shall be handled in the same way as a regular file with the following exceptions: + * ... + * If the O_NONBLOCK flag is set ... + * ... + * A write request for {PIPE_BUF} or fewer bytes shall have the + * following effect: if there is sufficient space available in + * the pipe, write() shall transfer all the data and return the + * number of bytes requested. Otherwise, write() shall transfer + * no data and return -1 with errno set to [EAGAIN]. + * + * In any case, handle misbehavior gracefully by incrementing current. + */ + current += rc; + break; + } else if (rc == -1) { + if (errno == EAGAIN) { + struct pollfd pfd[1]; + pfd[0].fd = m_fd_stderr; + pfd[0].events = POLLOUT; + poll(pfd, 1, -1); + /* ignore errors / success, just retry the write */ + } else if (errno == EINTR) { + continue; + } else { + /* some other kind of error, no point logging if stderr writes fail */ + return; + } + } + } + } + } else { + fmt::print(std::cerr, "{}{}", m_log_stderr_prefix, strv); + } +} + void Log::_flush_logbuf() { if (m_log_buf.size()) { @@ -319,7 +417,7 @@ void Log::_flush(EntryVector& t, bool crash) pos[used++] = '\n'; if (do_stderr) { - fmt::print(std::cerr, "{}{}", m_log_stderr_prefix, std::string_view(pos, used)); + _log_stderr(std::string_view(pos, used)); } if (do_fd) { diff --git a/src/log/Log.h b/src/log/Log.h index 51ed58d9289..3a60937af55 100644 --- a/src/log/Log.h +++ b/src/log/Log.h @@ -18,6 +18,8 @@ #include "log/Entry.h" +#include + struct uuid_d; namespace ceph { @@ -44,6 +46,7 @@ public: void reopen_log_file(); void chown_log_file(uid_t uid, gid_t gid); void set_log_stderr_prefix(std::string_view p); + void set_stderr_fd(int fd); void flush(); @@ -108,6 +111,8 @@ private: uid_t m_uid = 0; gid_t m_gid = 0; + int m_fd_stderr = STDERR_FILENO; + int m_fd_last_error = 0; ///< last error we say writing to fd (if any) int m_syslog_log = -2, m_syslog_crash = -2; @@ -116,6 +121,7 @@ private: int m_journald_log = -3, m_journald_crash = -3; std::string m_log_stderr_prefix; + bool do_stderr_poll = false; std::shared_ptr m_graylog; std::unique_ptr m_journald; @@ -133,6 +139,9 @@ private: void _log_safe_write(std::string_view sv); void _flush_logbuf(); void _log_message(std::string_view s, bool crash); + void _configure_stderr(); + void _log_stderr(std::string_view strv); + }; From 48323ac07728a6246136993c1ce9219aec9a0f67 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Thu, 27 Oct 2022 11:09:21 -0400 Subject: [PATCH 4/4] log: add tests for stderr writes to fifos Signed-off-by: Patrick Donnelly --- src/log/test.cc | 123 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/src/log/test.cc b/src/log/test.cc index f1ea107518e..08ba5bff6af 100644 --- a/src/log/test.cc +++ b/src/log/test.cc @@ -10,6 +10,10 @@ #include "global/global_context.h" #include "common/dout.h" +#include + +#include + using namespace std; using namespace ceph::logging; @@ -166,6 +170,125 @@ TEST(Log, ManyGather) log.stop(); } +static void readpipe(int fd, int verify) +{ + while (1) { + /* Use larger buffer on receiver as Linux will allow pipes buffers to + * exceed PIPE_BUF. We can't avoid tearing due to small read buffers from + * the Ceph side. + */ + + char buf[65536] = ""; + int rc = read(fd, buf, (sizeof buf) - 1); + if (rc == 0) { + _exit(0); + } else if (rc == -1) { + _exit(1); + } else if (rc > 0) { + if (verify) { + char* p = strrchr(buf, '\n'); + /* verify no torn writes */ + if (p == NULL) { + _exit(2); + } else if (p[1] != '\0') { + write(2, buf, strlen(buf)); + _exit(3); + } + } + } else _exit(100); + usleep(500); + } +} + +TEST(Log, StderrPipeAtomic) +{ + int pfd[2] = {-1, -1}; + int rc = pipe(pfd); + ASSERT_EQ(rc, 0); + pid_t pid = fork(); + if (pid == 0) { + close(pfd[1]); + readpipe(pfd[0], 1); + } else if (pid == (pid_t)-1) { + ASSERT_EQ(0, 1); + } + close(pfd[0]); + + SubsystemMap subs; + subs.set_log_level(1, 20); + subs.set_gather_level(1, 10); + Log log(&subs); + log.start(); + log.set_log_file(""); + log.reopen_log_file(); + log.set_stderr_fd(pfd[1]); + log.set_stderr_level(1, 20); + /* -128 for prefix space */ + for (int i = 0; i < PIPE_BUF-128; i++) { + MutableEntry e(1, 1); + auto& s = e.get_ostream(); + for (int j = 0; j < i; j++) { + char c = 'a'; + c += (j % 26); + s << c; + } + log.submit_entry(std::move(e)); + } + log.flush(); + log.stop(); + close(pfd[1]); + int status; + pid_t waited = waitpid(pid, &status, 0); + ASSERT_EQ(pid, waited); + ASSERT_NE(WIFEXITED(status), 0); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + +TEST(Log, StderrPipeBig) +{ + int pfd[2] = {-1, -1}; + int rc = pipe(pfd); + ASSERT_EQ(rc, 0); + pid_t pid = fork(); + if (pid == 0) { + /* no verification as some reads will be torn due to size > PIPE_BUF */ + close(pfd[1]); + readpipe(pfd[0], 0); + } else if (pid == (pid_t)-1) { + ASSERT_EQ(0, 1); + } + close(pfd[0]); + + SubsystemMap subs; + subs.set_log_level(1, 20); + subs.set_gather_level(1, 10); + Log log(&subs); + log.start(); + log.set_log_file(""); + log.reopen_log_file(); + log.set_stderr_fd(pfd[1]); + log.set_stderr_level(1, 20); + /* -128 for prefix space */ + for (int i = 0; i < PIPE_BUF*2; i++) { + MutableEntry e(1, 1); + auto& s = e.get_ostream(); + for (int j = 0; j < i; j++) { + char c = 'a'; + c += (j % 26); + s << c; + } + log.submit_entry(std::move(e)); + } + log.flush(); + log.stop(); + close(pfd[1]); + int status; + pid_t waited = waitpid(pid, &status, 0); + ASSERT_EQ(pid, waited); + ASSERT_NE(WIFEXITED(status), 0); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + void do_segv() { SubsystemMap subs;