[PROF-15201] fix(profiler): close two TOCTOU races between SIGPROF handler and JFR lifecycle by r1viollet · Pull Request #614 · DataDog/java-profiler · GitHub
Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions ddprof-lib/src/main/cpp/ctimer.h
46 changes: 45 additions & 1 deletion ddprof-lib/src/main/cpp/ctimer_linux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <time.h>
#include <unistd.h>

#ifndef SIGEV_THREAD_ID
Expand All @@ -49,6 +50,7 @@ int CTimer::_max_timers = 0;
int *CTimer::_timers = NULL;
CStack CTimer::_cstack;
bool CTimer::_enabled = false;
alignas(64) int CTimer::_inflight = 0;
int CTimer::_signal;

int CTimer::registerThread(int tid) {
Expand Down Expand Up @@ -176,10 +178,47 @@ Error CTimer::start(Arguments &args) {
return Error::OK;
}

// 200 ms: long enough for any legitimate recordSample() call to finish,
// short enough to avoid a perceptible hang if a handler is somehow stuck.
static const long DRAIN_TIMEOUT_NS = 200000000L;

bool CTimer::drainInflight() {
if (__atomic_load_n(&_inflight, __ATOMIC_ACQUIRE) == 0) {
return true; // fast path: nothing in flight
}

struct timespec deadline;
clock_gettime(CLOCK_MONOTONIC, &deadline);
deadline.tv_nsec += DRAIN_TIMEOUT_NS;
if (deadline.tv_nsec >= 1000000000L) {
deadline.tv_sec += 1;
deadline.tv_nsec -= 1000000000L;
}

while (__atomic_load_n(&_inflight, __ATOMIC_ACQUIRE) > 0) {
struct timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
if (now.tv_sec > deadline.tv_sec ||
(now.tv_sec == deadline.tv_sec && now.tv_nsec >= deadline.tv_nsec)) {
int remaining = __atomic_load_n(&_inflight, __ATOMIC_ACQUIRE);
Log::error("CTimer::drainInflight: timed out after %ldms waiting for "
"%d in-flight SIGPROF handler(s). Skipping JFR teardown to "
"prevent use-after-free. This indicates a stuck signal handler.",
DRAIN_TIMEOUT_NS / 1000000L, remaining);
return false; // timeout: handlers still in-flight
}
sched_yield();
}
return true; // success: all handlers drained
}

void CTimer::stop() {
for (int i = 0; i < _max_timers; i++) {
unregisterThread(i);
}
// Note: drainInflight() is called by Profiler::stop() after all engines
// have stopped, not here. Profiler needs the return value to decide whether
// to proceed with JFR teardown.
}

Error CTimerJvmti::check(Arguments &args) {
Expand Down Expand Up @@ -210,6 +249,8 @@ void CTimerJvmti::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) {
}
Counters::increment(CTIMER_SIGNAL_OWN);

InflightGuard inflight;

CriticalSection cs;
if (!cs.entered()) {
return;
Expand Down Expand Up @@ -261,6 +302,8 @@ void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) {
}
Counters::increment(CTIMER_SIGNAL_OWN);

InflightGuard inflight;

// Atomically try to enter critical section - prevents all reentrancy races
CriticalSection cs;
if (!cs.entered()) {
Expand All @@ -270,8 +313,9 @@ void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) {
int saved_errno = errno;
// we want to ensure memory order because of the possibility the instance gets
// cleared
if (!__atomic_load_n(&_enabled, __ATOMIC_ACQUIRE))
if (!__atomic_load_n(&_enabled, __ATOMIC_ACQUIRE)) {
return;
}
int tid = 0;
ProfiledThread *current = ProfiledThread::currentSignalSafe();
assert(current == nullptr || !current->isDeepCrashHandler());
Expand Down
34 changes: 34 additions & 0 deletions ddprof-lib/src/main/cpp/guards.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
#include "common.h"
#include "os.h"
#include "thread.h"
#ifdef __linux__
#include "ctimer.h"
#endif

// Signal-context tracking — backed by ProfiledThread::_signal_depth; see
// the comment block in guards.h for the rationale (initial-exec TLS was
Expand Down Expand Up @@ -114,3 +117,34 @@ CriticalSection::~CriticalSection() {
uint32_t CriticalSection::hash_tid(int tid) {
return static_cast<uint32_t>(tid * KNUTH_MULTIPLICATIVE_CONSTANT);
}

// InflightGuard implementation — Linux-specific CTimer race mitigation
// Uses CTimer::_inflight counter which is placed on its own cache line (alignas(64))
// to avoid false sharing with _enabled. While the counter is still globally updated,
// the separate cache line means:
// - _enabled remains read-only on its cache line (fast, no bouncing)
// - _inflight writes don't invalidate the _enabled cache line

#ifdef __linux__

InflightGuard::InflightGuard() : _active(true) {
// Directly access CTimer's static _inflight counter with ACQUIRE semantics
// so drainInflight()'s ACQUIRE load will see all writes before this increment.
__atomic_fetch_add(&CTimer::_inflight, 1, __ATOMIC_ACQUIRE);
}

InflightGuard::~InflightGuard() {
if (_active) {
// RELEASE semantics: all signal handler writes become visible to
// drainInflight() before the counter decrements.
__atomic_fetch_sub(&CTimer::_inflight, 1, __ATOMIC_RELEASE);
}
}

#else

// No-op implementation for non-Linux platforms where CTimer doesn't exist
InflightGuard::InflightGuard() : _active(false) {}
InflightGuard::~InflightGuard() {}

#endif
33 changes: 33 additions & 0 deletions ddprof-lib/src/main/cpp/guards.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,39 @@ class SignalHandlerScope {
void signalHandlerUnwindAfterLongjmp();
#define SIGNAL_HANDLER_UNWIND_AFTER_LONGJMP() signalHandlerUnwindAfterLongjmp()

/**
* RAII guard for CTimer signal handler in-flight tracking.
*
* Increments CTimer::_inflight on construction and decrements on destruction,
* ensuring the counter is always balanced even if the handler returns early.
* CTimer::_inflight is cache-line-aligned (alignas(64)) to avoid false sharing
* with _enabled, minimizing cache line bouncing.
*
* This closes the TOCTOU race between disableEngines() and _jfr.stop():
* CTimer::drainInflight() spins until _inflight reaches zero, guaranteeing
* that _jfr.stop() only runs once all handlers have fully exited.
*
* Usage (at the start of CTimer signal handlers):
* InflightGuard inflight;
* // All return paths automatically decrement the counter
*
* The guard is a no-op on non-Linux platforms where CTimer is unavailable.
*/
class InflightGuard {
public:
InflightGuard();
~InflightGuard();

// Non-copyable, non-movable
InflightGuard(const InflightGuard&) = delete;
InflightGuard& operator=(const InflightGuard&) = delete;
InflightGuard(InflightGuard&&) = delete;
InflightGuard& operator=(InflightGuard&&) = delete;

private:
bool _active;
};

/**
* Race-free critical section using atomic compare-and-swap.
*
Expand Down
32 changes: 28 additions & 4 deletions ddprof-lib/src/main/cpp/profiler.cpp
Loading