support more OSes in generic-fp

We still support only x86, arm and riscv, but NetBSD and FreeBSD now work
too, as does OS X.
This commit is contained in:
Aliaksey Kandratsenka 2023-07-02 19:43:44 -04:00
parent 4b78ffd03c
commit d9b178695f
2 changed files with 142 additions and 27 deletions

View File

@ -120,9 +120,8 @@ struct GetStackImplementation {
#define HAVE_GST_x86
#endif // i386 || x86_64
// Sadly, different OSes have very different mcontexts even for
// identical hardware arch. So keep it linux-only for now.
#if defined(__GNUC__) && __linux__ && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || defined(__riscv))
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || defined(__riscv) || defined(__arm__))
// NOTE: legacy 32-bit arm works fine with recent clangs, but is broken in gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92172
#define STACKTRACE_INL_HEADER "stacktrace_generic_fp-inl.h"
#define GST_SUFFIX generic_fp
#include "stacktrace_impl_setup-inl.h"

View File

@ -44,10 +44,29 @@
// This is only used on OS-es with mmap support.
#include <sys/mman.h>
// Set this to a non-zero value to disable "probing" of addresses before
// they are read. This makes backtracing less safe, but faster.
#ifndef TCMALLOC_UNSAFE_GENERIC_FP_STACKTRACE
#define TCMALLOC_UNSAFE_GENERIC_FP_STACKTRACE 0
#if HAVE_SYS_SYSCALL_H
#include <sys/syscall.h>
#endif
#if defined(PC_FROM_UCONTEXT) && (HAVE_SYS_UCONTEXT_H || HAVE_UCONTEXT_H)
#include "getpc.h"
#define HAVE_GETPC 1
#endif
// Our Autoconf setup enables -fno-omit-frame-pointer, but let's still
// ask for it just in case.
//
// Note: clang doesn't know about the optimize attribute. But clang (and
// gcc too, apparently) automagically forces generation of a frame
// pointer whenever __builtin_frame_address is used.
//
// ENABLE_FP_ATTRIBUTE expands to the optimize("no-omit-frame-pointer")
// function attribute when the compiler reports support for it, and to
// nothing otherwise.
#if defined(__GNUC__) && defined(__has_attribute)
// __has_attribute is tested in its own nested #if so that it is only
// evaluated when the outer condition found it defined.
#if __has_attribute(optimize)
#define ENABLE_FP_ATTRIBUTE __attribute__((optimize("no-omit-frame-pointer")))
#endif
#endif
// Fallback: make the macro expand to nothing so it can be used
// unconditionally.
#ifndef ENABLE_FP_ATTRIBUTE
#define ENABLE_FP_ATTRIBUTE
#endif
namespace {
@ -93,12 +112,25 @@ static bool CheckPageIsReadable(void* ptr, void* checked_ptr) {
return true;
}
return (msync(reinterpret_cast<void*>(addr), pagesize, MS_ASYNC) == 0);
int rc;
#if __FreeBSD__ && defined(SYS_msync)
// FreeBSD needs this. Our first stacktrace capture happens early,
// when apparently their threading facility isn't ready yet, and msync
// as well as a few other "trivial" calls crash. So we invoke the
// syscall directly instead of going through the msync() wrapper.
rc = syscall(SYS_msync, reinterpret_cast<void*>(addr), pagesize, MS_ASYNC);
#else
rc = msync(reinterpret_cast<void*>(addr), pagesize, MS_ASYNC);
#endif
return (rc == 0);
}
template <bool UnsafeAccesses, bool WithSizes>
ATTRIBUTE_NOINLINE // forces architectures with link register to save it
ENABLE_FP_ATTRIBUTE
int capture(void **result, int max_depth, int skip_count,
void* initial_frame, void* const * initial_pc) {
void* initial_frame, void* const * initial_pc,
int *sizes) {
int i = 0;
max_depth += skip_count;
@ -118,11 +150,21 @@ int capture(void **result, int max_depth, int skip_count,
constexpr uintptr_t kTooSmallAddr = 16 << 10;
constexpr uintptr_t kFrameSizeThreshold = 128 << 10;
#ifdef __arm__
// Note: (32-bit, legacy) arm support is not entirely functional
// w.r.t. frame-pointer-based backtracing. Only recent clangs
// generate the "right" frame pointer setup, and only with
// --enable-frame-pointers. Current gcc's are hopeless (somewhat
// older gcc's, circa gcc 6 or so, did something that looks right,
// but recent ones do not).
constexpr uintptr_t kAlignment = 4;
#else
// This is still simplistic. Here we're targeting x86, aarch64 and
// riscv. They all have 16-byte stack alignment (even 32-bit
// riscv). This can be made more elaborate as we consider more
// architectures.
constexpr uintptr_t kAlignment = 16;
#endif
uintptr_t initial_frame_addr = reinterpret_cast<uintptr_t>(initial_frame);
if (((initial_frame_addr + sizeof(frame)) & (kAlignment - 1)) != 0) {
@ -132,11 +174,14 @@ int capture(void **result, int max_depth, int skip_count,
return i;
}
frame* prev_f = nullptr;
// Note, we assume here that this function's frame pointer is not
// bogus, which is true if this code is built with
// -fno-omit-frame-pointer.
frame* prev_f = reinterpret_cast<frame*>(__builtin_frame_address(0));
frame *f = adjust_fp(reinterpret_cast<frame*>(initial_frame));
while (i < max_depth) {
if (!TCMALLOC_UNSAFE_GENERIC_FP_STACKTRACE
if (!UnsafeAccesses
&& !CheckPageIsReadable(&f->parent, prev_f)) {
break;
}
@ -147,6 +192,9 @@ int capture(void **result, int max_depth, int skip_count,
}
if (i >= skip_count) {
if (WithSizes) {
sizes[i - skip_count] = reinterpret_cast<uintptr_t>(prev_f) - reinterpret_cast<uintptr_t>(f);
}
result[i - skip_count] = pc;
}
@ -158,6 +206,7 @@ int capture(void **result, int max_depth, int skip_count,
if (parent_frame_addr < kTooSmallAddr) {
break;
}
// The stack grows towards smaller addresses, so if we didn't see the
// frame address increase (going from child to parent), it is a bad
// frame. We also test if the frame is too big since that is another
@ -175,6 +224,9 @@ int capture(void **result, int max_depth, int skip_count,
f = adjust_fp(reinterpret_cast<frame*>(parent_frame_addr));
}
if (WithSizes && i > 0 && skip_count == 0) {
sizes[0] = 0;
}
return i - skip_count;
}
@ -198,9 +250,24 @@ int capture(void **result, int max_depth, int skip_count,
// int skip_count: how many stack pointers to skip before storing in result
// void* ucp: a ucontext_t* (GetStack{Trace,Frames}WithContext only)
// Set this to a non-zero value to disable "probing" of addresses before
// they are read. This makes backtracing less safe, but faster.
#ifndef TCMALLOC_UNSAFE_GENERIC_FP_STACKTRACE
#define TCMALLOC_UNSAFE_GENERIC_FP_STACKTRACE 0
#endif
ENABLE_FP_ATTRIBUTE
static int GET_STACK_TRACE_OR_FRAMES {
if (max_depth == 0) {
return 0;
}
#if IS_STACK_FRAMES
constexpr bool WithSizes = true;
memset(sizes, 0, sizeof(*sizes) * max_depth);
#else
constexpr bool WithSizes = false;
int * const sizes = nullptr;
#endif
// one for this function
@ -208,30 +275,79 @@ static int GET_STACK_TRACE_OR_FRAMES {
void* const * initial_pc = nullptr;
void* initial_frame = __builtin_frame_address(0);
int n;
#if IS_WITH_CONTEXT
#if IS_WITH_CONTEXT && (HAVE_SYS_UCONTEXT_H || HAVE_UCONTEXT_H)
if (ucp) {
auto uc = static_cast<const ucontext_t*>(ucp);
#ifdef __riscv
initial_pc = reinterpret_cast<void* const *>(&uc->uc_mcontext.__gregs[REG_PC]);
initial_frame = reinterpret_cast<void*>(uc->uc_mcontext.__gregs[REG_S0]);
#elif __aarch64__
initial_pc = reinterpret_cast<void* const *>(&uc->uc_mcontext.pc);
initial_frame = reinterpret_cast<void*>(uc->uc_mcontext.regs[29]);
#elif __i386__
initial_pc = reinterpret_cast<void* const *>(&uc->uc_mcontext.gregs[REG_EIP]);
initial_frame = reinterpret_cast<void*>(uc->uc_mcontext.gregs[REG_EBP]);
// We have to resort to a macro since different architectures have
// different concrete types for those args.
#define SETUP_FRAME(pc_ptr, frame_addr) \
do { \
initial_pc = reinterpret_cast<void* const *>(pc_ptr); \
initial_frame = reinterpret_cast<void*>(frame_addr); \
} while (false)
#if __linux__ && __riscv
SETUP_FRAME(&uc->uc_mcontext.__gregs[REG_PC], uc->uc_mcontext.__gregs[REG_S0]);
#elif __linux__ && __aarch64__
SETUP_FRAME(&uc->uc_mcontext.pc, uc->uc_mcontext.regs[29]);
#elif __linux__ && __i386__
SETUP_FRAME(&uc->uc_mcontext.gregs[REG_EIP], uc->uc_mcontext.gregs[REG_EBP]);
#elif __linux__ && __x86_64__
SETUP_FRAME(&uc->uc_mcontext.gregs[REG_RIP], uc->uc_mcontext.gregs[REG_RBP]);
#elif __FreeBSD__ && __x86_64__
SETUP_FRAME(&uc->uc_mcontext.mc_rip, uc->uc_mcontext.mc_rbp);
#elif __FreeBSD__ && __i386__
SETUP_FRAME(&uc->uc_mcontext.mc_eip, uc->uc_mcontext.mc_ebp);
#elif __NetBSD__
// NetBSD has those portable defines. Nice!
SETUP_FRAME(&_UC_MACHINE_PC(uc), _UC_MACHINE_FP(uc));
#elif defined(HAVE_GETPC)
// If we're dealing with an architecture that doesn't belong to one
// of the cases above, we still have plenty more cases supported by
// the pc_from_ucontext facility we have for the cpu profiler. We'll
// get the top-most instruction pointer from the context, and the
// rest will be grabbed by frame pointer unwinding (with skipping
// active).
//
// It is a bit of a guess, but it works for x86 (it makes the
// stacktrace_unittest ucontext test pass). The main idea is that
// the skip count we have will skip just past the 'sigreturn'
// trampoline or whatever the OS has. Those tend to be built without
// frame pointers, which causes the last "skipping" step to skip
// past the frame we need. Also, this is how our CPU profiler is
// built: it always places "pc from ucontext" first and then, if
// necessary, deduplicates it from the backtrace.
result[0] = GetPC(*uc);
#else
initial_pc = reinterpret_cast<void* const *>(&uc->uc_mcontext.gregs[REG_RIP]);
initial_frame = reinterpret_cast<void*>(uc->uc_mcontext.gregs[REG_RBP]);
ucp = nullptr;
#endif
#undef SETUP_FRAME
}
#else
void * const ucp = nullptr;
#endif // IS_WITH_CONTEXT
int n = stacktrace_generic_fp::capture(result, max_depth, skip_count,
initial_frame, initial_pc);
constexpr bool UnsafeAccesses = (TCMALLOC_UNSAFE_GENERIC_FP_STACKTRACE != 0);
if (ucp && !initial_pc) {
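// We're dealing with an architecture that doesn't have proper ucontext
// integration: result[0] already holds the pc obtained via GetPC, so
// the frame-pointer walk fills in the entries after it.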
n = stacktrace_generic_fp::capture<UnsafeAccesses, WithSizes>(
result + 1, max_depth - 1, skip_count,
initial_frame, initial_pc, sizes);
n++;
} else {
n = stacktrace_generic_fp::capture<UnsafeAccesses, WithSizes>(
result, max_depth, skip_count,
initial_frame, initial_pc, sizes);
}
if (n > 0) {
// make sure we don't tail-call capture
(void)*(const_cast<void * volatile *>(result));
}
// make sure we don't tail-call capture
(void)*(const_cast<void * volatile *>(result));
return n;
}