diff --git a/.github/dependabot.yml b/.github/dependabot.yml index fc7cecae8..799e64238 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -12,6 +12,14 @@ updates: - "no-release-notes" - "no-review" ignore: + # JUnit 5.10+ dropped Java 8 support; 5.12+ dropped Java 11; 6.x requires Java 17. + # CI targets on Java 8 and 11 (HotSpot and J9) run the Gradle test worker on the + # test JDK itself (the profiler attaches to its own process), so the JUnit Platform + # classes must be loadable by Java 8/11. Keep the entire JUnit stack at 5.9.x / 1.9.x. + - dependency-name: "org.junit.jupiter:*" + versions: [">=5.10.0"] + - dependency-name: "org.junit.platform:*" + versions: [">=1.10.0"] - dependency-name: "org.junit-pioneer:junit-pioneer" versions: [">=2.0.0"] groups: diff --git a/.github/workflows/test_workflow.yml b/.github/workflows/test_workflow.yml index fcda543c4..62287dbd6 100644 --- a/.github/workflows/test_workflow.yml +++ b/.github/workflows/test_workflow.yml @@ -164,6 +164,13 @@ jobs: with: name: (test-reports) test-linux-glibc-amd64 (${{ matrix.java_version }}, ${{ matrix.config }}) path: test-reports + - name: Upload signal-safety violation log + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + if: failure() + with: + name: signal-safety-violation-glibc-${{ matrix.java_version }}-${{ matrix.config }}-amd64 + path: /tmp/signal-safety-violation.txt + if-no-files-found: ignore test-linux-musl-amd64: needs: [cache-jdks, filter-musl-configs] @@ -283,6 +290,13 @@ jobs: with: name: (test-reports) test-linux-musl-amd64 (${{ matrix.java_version }}, ${{ matrix.config }}) path: test-reports + - name: Upload signal-safety violation log + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + if: failure() + with: + name: signal-safety-violation-musl-${{ matrix.java_version }}-${{ matrix.config }}-amd64 + path: /tmp/signal-safety-violation.txt + if-no-files-found: ignore test-linux-glibc-aarch64: needs: cache-jdks @@ -443,6 +457,13 @@ jobs: with: name: (test-reports) test-linux-glibc-aarch64 (${{ matrix.java_version }}, ${{ matrix.config }}) path: test-reports + - name: Upload signal-safety violation log + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + if: failure() + with: + name: signal-safety-violation-glibc-${{ matrix.java_version }}-${{ matrix.config }}-aarch64 + path: /tmp/signal-safety-violation.txt + if-no-files-found: ignore test-linux-musl-aarch64: needs: [cache-jdks, filter-musl-configs] @@ -540,3 +561,10 @@ jobs: with: name: (test-reports) test-linux-musl-aarch64 (${{ matrix.java_version }}, ${{ matrix.config }}) path: test-reports + - name: Upload signal-safety violation log + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + if: failure() + with: + name: signal-safety-violation-musl-${{ matrix.java_version }}-${{ matrix.config }}-aarch64 + path: /tmp/signal-safety-violation.txt + if-no-files-found: ignore diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestPlugin.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestPlugin.kt index e7d593630..793ff75b1 100644 --- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestPlugin.kt +++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestPlugin.kt @@ -188,12 +188,46 @@ class GtestPlugin : Plugin { val compiler = findCompiler(project) val includeFiles = extension.includes.plus(project.files(getGtestIncludes(extension))) - // Create per-config aggregation task + // Create per-config aggregation task (compile + link + run) val gtestConfigTask = project.tasks.register("gtest${config.capitalizedName()}") { group = "verification" description = "Run all Google Tests for the ${config.name} build of the library" } + // Per-config build-only aggregation task (compile + link, no run). + // Useful in CI where binaries are executed directly without going + // through Gradle's logging infrastructure. + val buildGtestConfigTask = project.tasks.register("buildGtest${config.capitalizedName()}") { + group = "build" + description = "Compile and link all Google Tests for the ${config.name} build (no run)" + } + + // Compile all library sources ONCE for this config. Each test + // binary only compiles its own test file and links against these + // shared objects, reducing compilations from O(n_tests × n_sources) + // to O(n_sources + n_tests). + val sharedBuilder = GtestTaskBuilder(project, extension, config) + .withCompiler(compiler) + .withIncludes(includeFiles) + .onlyIfGtest(hasGtest) + val sharedCompilerArgs = sharedBuilder.sharedCompilerArgs() + val sharedLibCompileTask = project.tasks.register( + "compileGtestLibrary${config.capitalizedName()}", + com.datadoghq.native.tasks.NativeCompileTask::class.java + ) { + onlyIf { hasGtest && !sharedBuilder.skipConditions() } + group = "build" + description = "Compile shared library sources for ${config.name} gtest binaries" + + this.compiler.set(compiler) + this.compilerArgs.set(sharedCompilerArgs) + sources.from(project.fileTree(extension.mainSourceDir.get()) { include("**/*.cpp") }) + includes.from(includeFiles) + objectFileDir.set(project.file( + "${project.layout.buildDirectory.get()}/obj/gtest/${config.name}/lib" + )) + } + // Discover and create tasks for each test file using builder val testDir = extension.testSourceDir.get().asFile if (!testDir.exists()) { @@ -206,11 +240,16 @@ class GtestPlugin : Plugin { .forTest(testFile) .withCompiler(compiler) .withIncludes(includeFiles) + .withSharedLibObjects(sharedLibCompileTask) .onlyIfGtest(hasGtest) .build() gtestConfigTask.configure { dependsOn(executeTask) } gtestAll.configure { dependsOn(executeTask) } + // buildGtest depends on the link task, not the run task + buildGtestConfigTask.configure { + dependsOn("linkGtest${config.capitalizedName()}_${testFile.nameWithoutExtension}") + } } } diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt index cc4e80770..a56de4ae7 100644 --- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt +++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt @@ -37,6 +37,7 @@ class GtestTaskBuilder( private lateinit var compiler: String private lateinit var includeFiles: FileCollection private var hasGtest: Boolean = true + private var sharedLibCompileTask: TaskProvider? = null private val configName: String get() = config.capitalizedName() @@ -73,6 +74,23 @@ class GtestTaskBuilder( return this } + /** + * Provide the shared library compile task whose objects are linked into + * every test binary. Allows the library sources to be compiled once + * instead of once per test file. + */ + fun withSharedLibObjects(task: TaskProvider): GtestTaskBuilder { + sharedLibCompileTask = task + return this + } + + /** + * Returns the compiler args used for compiling library and test sources. + * Exposed so GtestPlugin can configure the shared library compile task + * with identical flags without duplicating the adjustment logic. + */ + fun sharedCompilerArgs(): List = adjustCompilerArgs() + /** * Build all tasks (compile, link, execute) and return the execute task provider. */ @@ -94,10 +112,16 @@ class GtestTaskBuilder( this.compiler.set(this@GtestTaskBuilder.compiler) this.compilerArgs.set(compilerArgs) - sources.from( - project.fileTree(extension.mainSourceDir.get()) { include("**/*.cpp") }, - testFile - ) + // When a shared library compile task is provided, library sources + // are compiled once there. Only compile the test file itself here. + if (sharedLibCompileTask != null) { + sources.from(testFile) + } else { + sources.from( + project.fileTree(extension.mainSourceDir.get()) { include("**/*.cpp") }, + testFile + ) + } includes.from(includeFiles) objectFileDir.set(objDir) } @@ -117,6 +141,10 @@ class GtestTaskBuilder( linker.set(compiler) this.linkerArgs.set(linkerArgs) objectFiles.from(project.fileTree(objDir) { include("*.o") }) + sharedLibCompileTask?.let { sharedTask -> + dependsOn(sharedTask) + objectFiles.from(sharedTask.map { it.objectFileDir.get().asFileTree.matching { include("*.o") } }) + } outputFile.set(binary) // Add gtest library paths @@ -171,7 +199,7 @@ class GtestTaskBuilder( } } - private fun skipConditions(): Boolean { + fun skipConditions(): Boolean { return project.hasProperty("skip-tests") || project.hasProperty("skip-native") || project.hasProperty("skip-gtest") diff --git a/ddprof-lib/src/main/cpp/ctimer_linux.cpp b/ddprof-lib/src/main/cpp/ctimer_linux.cpp index 2815164df..e0d2b8b0c 100644 --- a/ddprof-lib/src/main/cpp/ctimer_linux.cpp +++ b/ddprof-lib/src/main/cpp/ctimer_linux.cpp @@ -202,6 +202,7 @@ Error CTimerJvmti::start(Arguments &args) { } void CTimerJvmti::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { + SIGNAL_HANDLER_GUARD(); if (!OS::shouldProcessSignal(siginfo, SI_TIMER, SignalCookie::cpu())) { Counters::increment(CTIMER_SIGNAL_FOREIGN); OS::forwardForeignSignal(signo, siginfo, ucontext); @@ -248,6 +249,7 @@ void CTimerJvmti::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { } void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { + SIGNAL_HANDLER_GUARD(); // Reject signals that did not originate from our timer_create timers. // This guards against Go's process-wide setitimer(ITIMER_PROF) and other // foreign SIGPROF sources that would otherwise drive our handler onto diff --git a/ddprof-lib/src/main/cpp/dictionary.cpp b/ddprof-lib/src/main/cpp/dictionary.cpp index c1ca2860d..29db2eef5 100644 --- a/ddprof-lib/src/main/cpp/dictionary.cpp +++ b/ddprof-lib/src/main/cpp/dictionary.cpp @@ -17,6 +17,7 @@ #include "dictionary.h" #include "arch.h" #include "counters.h" +#include "signalSafety.h" #include #include #include @@ -41,6 +42,7 @@ Dictionary::~Dictionary() { } void Dictionary::clear() { + DEBUG_ASSERT_NOT_IN_SIGNAL(); clear(_table, _id); memset(_table, 0, sizeof(DictTable)); _table->base_index = _base_index = 1; @@ -88,6 +90,15 @@ unsigned int Dictionary::lookup(const char *key, size_t length) { unsigned int Dictionary::lookup(const char *key, size_t length, bool for_insert, unsigned int sentinel) { + // The insert path mallocs (allocateKey) and may calloc a DictTable — + // both AS-unsafe. Read-only lookups (for_insert == false, used by + // check() and bounded_lookup at capacity) only touch already-allocated + // memory and are AS-safe. Assert here rather than in the overloads + // so bounded_lookup's runtime-decided for_insert is also covered. + if (for_insert) { + DEBUG_ASSERT_NOT_IN_SIGNAL(); + } + DictTable *table = _table; unsigned int h = hash(key, length); diff --git a/ddprof-lib/src/main/cpp/flightRecorder.cpp b/ddprof-lib/src/main/cpp/flightRecorder.cpp index 98e8d0a87..789b9debc 100644 --- a/ddprof-lib/src/main/cpp/flightRecorder.cpp +++ b/ddprof-lib/src/main/cpp/flightRecorder.cpp @@ -19,6 +19,7 @@ #include "jniHelper.h" #include "os.h" #include "profiler.h" +#include "signalSafety.h" #include "rustDemangler.h" #include "safeAccess.h" #include "spinLock.h" @@ -1411,6 +1412,7 @@ void Recording::writeMethods(Buffer *buf, Lookup *lookup) { } void Recording::writeClasses(Buffer *buf, Lookup *lookup) { + DEBUG_ASSERT_NOT_IN_SIGNAL(); std::map classes; // Hold classMapSharedGuard() for the full function. The const char* pointers // stored in classes point into dictionary row storage; clear() frees that @@ -1720,10 +1722,11 @@ void Recording::recordCpuLoad(Buffer *buf, float proc_user, float proc_system, // assumption is that we hold the lock (with lock_index) void Recording::addThread(int lock_index, int tid) { int active = _active_index.load(std::memory_order_acquire); - _thread_ids[lock_index][active].insert(tid); + _thread_ids[lock_index][active].insert(tid); // ThreadIdTable::insert is signal-safe (atomics only) } Error FlightRecorder::start(Arguments &args, bool reset) { + DEBUG_ASSERT_NOT_IN_SIGNAL(); ExclusiveLockGuard locker(&_rec_lock); const char *file = args.file(); if (file == NULL || file[0] == 0) { @@ -1751,6 +1754,7 @@ Error FlightRecorder::newRecording(bool reset) { } void FlightRecorder::stop() { + DEBUG_ASSERT_NOT_IN_SIGNAL(); ExclusiveLockGuard locker(&_rec_lock); Recording* rec = _rec; if (rec != nullptr) { @@ -1761,6 +1765,7 @@ void FlightRecorder::stop() { } Error FlightRecorder::dump(const char *filename, const int length) { + DEBUG_ASSERT_NOT_IN_SIGNAL(); assert(length >= 0); ExclusiveLockGuard locker(&_rec_lock); Recording* rec = _rec; @@ -1781,6 +1786,7 @@ Error FlightRecorder::dump(const char *filename, const int length) { } void FlightRecorder::flush() { + DEBUG_ASSERT_NOT_IN_SIGNAL(); ExclusiveLockGuard locker(&_rec_lock); Recording* rec = _rec; if (rec != nullptr) { @@ -1845,6 +1851,7 @@ void FlightRecorder::recordQueueTime(int lock_index, int tid, void FlightRecorder::recordDatadogSetting(int lock_index, int length, const char *name, const char *value, const char *unit) { + DEBUG_ASSERT_NOT_IN_SIGNAL(); OptionalSharedLockGuard locker(&_rec_lock); if (locker.ownsLock()) { Recording* rec = _rec; @@ -1856,6 +1863,7 @@ void FlightRecorder::recordDatadogSetting(int lock_index, int length, } void FlightRecorder::recordHeapUsage(int lock_index, long value, bool live) { + DEBUG_ASSERT_NOT_IN_SIGNAL(); OptionalSharedLockGuard locker(&_rec_lock); if (locker.ownsLock()) { Recording* rec = _rec; diff --git a/ddprof-lib/src/main/cpp/guards.cpp b/ddprof-lib/src/main/cpp/guards.cpp index cc46e64ac..1bfc0b695 100644 --- a/ddprof-lib/src/main/cpp/guards.cpp +++ b/ddprof-lib/src/main/cpp/guards.cpp @@ -19,6 +19,58 @@ #include "os.h" #include "thread.h" +// Signal-context tracking — backed by ProfiledThread::_signal_depth; see +// the comment block in guards.h for the rationale (initial-exec TLS was +// rejected because of the static TLS surplus on Graal). + +int getInSignalDepth() { + ProfiledThread *pt = ProfiledThread::currentSignalSafe(); + return pt != nullptr ? static_cast(pt->signalDepth()) : 0; +} + +bool isInTrackedSignalContext() { + ProfiledThread *pt = ProfiledThread::currentSignalSafe(); + // null ProfiledThread = no thread context; the SignalHandlerScope + // never ran, so we have no positive evidence of a signal frame. + // See header comment for the rationale of returning false here. + return pt != nullptr && pt->signalDepth() != 0; +} + +SignalHandlerScope::SignalHandlerScope() : _active(true) { + ProfiledThread *pt = ProfiledThread::currentSignalSafe(); + if (pt != nullptr) { + pt->enterSignalScope(); + } else { + // No thread context: nothing to update; mark inactive so destructor + // and release() are no-ops. + _active = false; + } +} + +SignalHandlerScope::~SignalHandlerScope() { + if (!_active) return; + ProfiledThread *pt = ProfiledThread::currentSignalSafe(); + if (pt != nullptr) { + pt->exitSignalScope(); + } +} + +void SignalHandlerScope::release() { + if (!_active) return; + ProfiledThread *pt = ProfiledThread::currentSignalSafe(); + if (pt != nullptr) { + pt->exitSignalScope(); + } + _active = false; +} + +void signalHandlerUnwindAfterLongjmp() { + ProfiledThread *pt = ProfiledThread::currentSignalSafe(); + if (pt != nullptr) { + pt->exitSignalScope(); + } +} + // Static bitmap storage for fallback cases uint64_t CriticalSection::_fallback_bitmap[CriticalSection::FALLBACK_BITMAP_WORDS] = {}; diff --git a/ddprof-lib/src/main/cpp/guards.h b/ddprof-lib/src/main/cpp/guards.h index a5e4b0793..4addd8d39 100644 --- a/ddprof-lib/src/main/cpp/guards.h +++ b/ddprof-lib/src/main/cpp/guards.h @@ -24,6 +24,84 @@ class ProfiledThread; +// --------------------------------------------------------------------------- +// Signal-context depth tracking — always on. +// +// Profiler::dlopen_hook is the production caller — it queries +// isInTrackedSignalContext() to decide between the synchronous refresh() +// path and the deferred markDirty() path. The debug-only +// DEBUG_ASSERT_NOT_IN_SIGNAL() macro in signalSafety.h asserts on the +// same counter. +// +// Storage: the depth lives in ProfiledThread::_signal_depth. An earlier +// design used a thread_local int, but on Graal aarch64 the lazy DTV slot +// allocation triggered malloc inside our signal handler and deadlocked +// against the JVMCI compiler holding the heap lock. initial-exec fixed +// the malloc but tripped the static TLS surplus and broke dlopen on +// Graal. ProfiledThread is already AS-safe-accessible via +// pthread_getspecific (POSIX guarantees it does not allocate; returns +// nullptr when unset). +// +// When ProfiledThread is null on a thread we don't yet have a thread +// context — uninstrumented JVM-internal threads (VM Thread, JIT, GC) fall +// into this bucket too, and they can receive signals. The +// SignalHandlerScope guard is a no-op on those threads (nothing to +// update), so isInTrackedSignalContext() returns false: production code +// prefers synchronous refresh() on null-PT threads because (a) those +// threads regularly call dlopen during normal JVM operation, and (b) +// wasmtime's broken sigaction patching depends on switchLibraryTrap +// running work inline. The residual risk — an uninstrumented thread +// calling dlopen from inside a foreign signal handler — is small in +// practice: prewarmUnwinder() closes the known libgcc_s lazy-load case +// and mainstream JVM signal handlers are AS-safe by design. +// +// DEBUG_ASSERT_NOT_IN_SIGNAL likewise skips its check when ProfiledThread +// is null so well-behaved non-signal code on uninstrumented threads +// doesn't trip a false abort. +// --------------------------------------------------------------------------- + +// Returns the signal-handler depth for the calling thread, or 0 if the +// thread has no ProfiledThread yet. Intended for tests and diagnostic +// code; production callers should use isInTrackedSignalContext(). +int getInSignalDepth(); + +// Returns true only when we have positively tracked entering one of our +// installed signal handlers on this thread (depth > 0 on a non-null +// ProfiledThread). null ProfiledThread → false, matching the +// SignalHandlerScope semantics (the guard is a no-op there). +// Used by Profiler::dlopen_hook to gate the deferred-refresh branch. +bool isInTrackedSignalContext(); + +// Internal RAII type — do not instantiate directly; use the macros below. +class SignalHandlerScope { +public: + SignalHandlerScope(); + ~SignalHandlerScope(); + void release(); + SignalHandlerScope(const SignalHandlerScope&) = delete; + SignalHandlerScope& operator=(const SignalHandlerScope&) = delete; +private: + bool _active; +}; + +// Declare a scope guard local that increments the depth on entry and +// decrements on scope exit. Use as the very first statement in every +// installed signal handler. +#define SIGNAL_HANDLER_GUARD() SignalHandlerScope _signal_handler_scope + +// Manually release the most recent SIGNAL_HANDLER_GUARD() before chaining to +// another handler that may longjmp through us (e.g. J9's SIGSEGV null-pointer +// check handler). After release(), depth has already been decremented; the +// destructor becomes a no-op. +#define SIGNAL_HANDLER_GUARD_RELEASE() _signal_handler_scope.release() + +// Compensate for a longjmp that bypassed a SignalHandlerScope's destructor. +// Call at the setjmp landing point AFTER a known longjmp originated from +// within a signal handler frame (e.g. HotSpot's checkFault → longjmp recovery +// in walkVM). +void signalHandlerUnwindAfterLongjmp(); +#define SIGNAL_HANDLER_UNWIND_AFTER_LONGJMP() signalHandlerUnwindAfterLongjmp() + /** * Race-free critical section using atomic compare-and-swap. * diff --git a/ddprof-lib/src/main/cpp/hotspot/hotspotSupport.cpp b/ddprof-lib/src/main/cpp/hotspot/hotspotSupport.cpp index cc6c6eb60..b63b66e1d 100644 --- a/ddprof-lib/src/main/cpp/hotspot/hotspotSupport.cpp +++ b/ddprof-lib/src/main/cpp/hotspot/hotspotSupport.cpp @@ -13,6 +13,7 @@ #include "hotspot/vmStructs.inline.h" #include "jvmSupport.h" #include "profiler.h" +#include "guards.h" #include "stackWalker.inline.h" #include "frames.h" @@ -187,6 +188,9 @@ __attribute__((no_sanitize("address"))) int HotspotSupport::walkVM(void* ucontex profiled_thread->setCrashProtectionActive(true); } if (setjmp(crash_protection_ctx) != 0) { + // checkFault() does a longjmp from inside segvHandler, bypassing + // segvHandler's SignalHandlerScope destructor. Compensate. + SIGNAL_HANDLER_UNWIND_AFTER_LONGJMP(); if (profiled_thread != nullptr) { profiled_thread->setCrashProtectionActive(false); } diff --git a/ddprof-lib/src/main/cpp/itimer.cpp b/ddprof-lib/src/main/cpp/itimer.cpp index a69634ffe..f041e1885 100644 --- a/ddprof-lib/src/main/cpp/itimer.cpp +++ b/ddprof-lib/src/main/cpp/itimer.cpp @@ -31,6 +31,7 @@ long ITimer::_interval; CStack ITimer::_cstack; void ITimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { + SIGNAL_HANDLER_GUARD(); // NOTE: ITimer uses setitimer(ITIMER_PROF) which delivers signals with // si_code==SI_KERNEL — no sival payload is available. The signal-origin // check implemented in CTimer/WallClock cannot be applied here. ITimer @@ -102,6 +103,7 @@ volatile bool ITimerJvmti::_enabled = false; long ITimerJvmti::_interval = 0; void ITimerJvmti::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { + SIGNAL_HANDLER_GUARD(); CriticalSection cs; if (!cs.entered()) { return; diff --git a/ddprof-lib/src/main/cpp/libraries.cpp b/ddprof-lib/src/main/cpp/libraries.cpp index 15863fcd1..420e3b273 100644 --- a/ddprof-lib/src/main/cpp/libraries.cpp +++ b/ddprof-lib/src/main/cpp/libraries.cpp @@ -5,10 +5,19 @@ #include "libraries.h" #include "libraryPatcher.h" #include "log.h" +#include "mallocTracer.h" +#include "os.h" #include "symbols.h" #include "symbols_linux.h" #include "vmEntry.h" +// Cadence for the background refresher thread. Bounds the window during +// which a library lazily loaded from signal context (and therefore unable +// to call refresh() synchronously) is unresolvable by the stack walker. +// 500 ms is short enough that frame resolution gaps are barely observable +// in typical sampling, and the refresher only wakes once per tick (cheap). +static constexpr u64 REFRESH_INTERVAL_NS = 500ULL * 1'000'000ULL; + void Libraries::mangle(const char *name, char *buf, size_t size) { char *buf_end = buf + size; strcpy(buf, "_ZN"); @@ -41,6 +50,79 @@ void Libraries::updateSymbols(bool kernel_symbols) { LibraryPatcher::patch_libraries(); } +void Libraries::refresh() { + // Clear the flag before scanning so any concurrent markDirty() between + // now and the end of this call re-arms us for the next tick. All + // downstream operations are idempotent (parseLibraries tracks + // _parsed_inodes, patch_sigaction checks _sigaction_entries, + // installHooks uses monotonic _patched_libs, updateBuildIds tracks + // _build_id_processed), so redundant invocations are cheap. + _dirty.store(false, std::memory_order_release); + updateSymbols(false); + LibraryPatcher::patch_sigaction(); + MallocTracer::installHooks(); + if (_remote_symbolication) { + updateBuildIds(); + } +} + +void *Libraries::refresherLoop(void *arg) { + Libraries *self = static_cast(arg); + + // Block profiling signals BEFORE publishing our TID. Otherwise a + // SIGPROF / SIGVTALRM armed for this thread (e.g. a stale per-thread + // timer from a previous lifecycle, or an in-flight signal queued during + // pthread_create) could fire on us between TID-publish and sigmask, and + // run the full stack-walk path here — pure overhead, and in debug + // builds with DDPROF_FORCE_STACKWALK_CRASH it inflates the SIGSEGV + // recovery count enough to starve the test work thread and break + // vmStackwalkerCrashRecoveryTest. SIGIO (WAKEUP_SIGNAL) is left + // unmasked because stopRefresher() relies on it to interrupt sleep. + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGPROF); + sigaddset(&mask, SIGVTALRM); + pthread_sigmask(SIG_BLOCK, &mask, nullptr); + + // Publish our TID so sampler thread-list enumerations can skip us. + self->_refresher_tid.store(OS::threadId(), std::memory_order_release); + + while (self->_refresher_running.load(std::memory_order_acquire)) { + // Absolute-deadline sleep that resumes across EINTR (SIGCHLD, debugger + // SIGSTOP/SIGCONT, etc.) and wakes early when stopRefresher() flips + // _refresher_running false and sends SIGIO. See OS::sleepWhile. + OS::sleepWhile(REFRESH_INTERVAL_NS, self->_refresher_running); + if (!self->_refresher_running.load(std::memory_order_acquire)) { + break; + } + if (self->_dirty.load(std::memory_order_acquire)) { + self->refresh(); + } + } + return nullptr; +} + +void Libraries::startRefresher() { + if (_refresher_running.exchange(true, std::memory_order_acq_rel)) { + return; // already running + } + if (pthread_create(&_refresher_thread, nullptr, refresherLoop, this) != 0) { + _refresher_running.store(false, std::memory_order_release); + Log::warn("Unable to start Libraries refresher thread"); + } +} + +void Libraries::stopRefresher() { + if (!_refresher_running.exchange(false, std::memory_order_acq_rel)) { + return; // not running + } + pthread_kill(_refresher_thread, WAKEUP_SIGNAL); + pthread_join(_refresher_thread, nullptr); + // Clear the published TID so a later sampler doesn't skip an unrelated + // thread that may have inherited the same TID after we exited. + _refresher_tid.store(-1, std::memory_order_release); +} + // Platform-specific implementation of updateBuildIds() is in libraries_linux.cpp (Linux) // or stub implementation for other platforms diff --git a/ddprof-lib/src/main/cpp/libraries.h b/ddprof-lib/src/main/cpp/libraries.h index e58ceaa02..18b59c963 100644 --- a/ddprof-lib/src/main/cpp/libraries.h +++ b/ddprof-lib/src/main/cpp/libraries.h @@ -1,16 +1,59 @@ #ifndef _LIBRARIES_H #define _LIBRARIES_H +#include +#include + #include "codeCache.h" class Libraries { private: CodeCacheArray _native_libs; CodeCache _runtime_stubs; + bool _remote_symbolication; // set via setRemoteSymbolication() + + // Pending-refresh flag set by dlopen_hook when it cannot call refresh() + // synchronously (signal context). Polled by the refresher thread. + std::atomic _dirty; + + // Background refresher thread: periodically (every REFRESH_INTERVAL_NS) + // checks _dirty and runs refresh() outside signal context, narrowing the + // window during which newly-loaded libraries are unresolvable. + pthread_t _refresher_thread; + std::atomic _refresher_running; + std::atomic _refresher_tid; // captured from OS::threadId() on entry + static void *refresherLoop(void *arg); static void mangle(const char *name, char *buf, size_t size); public: - Libraries() : _native_libs(), _runtime_stubs("runtime stubs") {} + Libraries() : _native_libs(), _runtime_stubs("runtime stubs"), + _remote_symbolication(false), _dirty(false), + _refresher_thread(), _refresher_running(false), + _refresher_tid(-1) {} + + void setRemoteSymbolication(bool enabled) { _remote_symbolication = enabled; } + + // Refresh symbol tables and reinstall hooks/patches for any libraries + // loaded since the last refresh. Idempotent and cheap when no new + // libraries have been loaded (parseLibraries tracks _parsed_inodes). + // Clears the dirty flag. Must be called from non-signal context: + // updateSymbols acquires a Mutex and reads /proc/self/maps. + void refresh(); + + // Async-signal-safe: just sets a flag. The refresher thread will pick + // up the change on its next tick. + void markDirty() { _dirty.store(true, std::memory_order_release); } + + // Start/stop the background refresher thread. Called from + // Profiler::start/stop. + void startRefresher(); + void stopRefresher(); + + // TID of the refresher thread once it has captured its own ID, or -1 if + // the thread is not currently running. Used by sampler thread-list + // enumeration to skip this profiler-internal thread. + int refresherTid() const { return _refresher_tid.load(std::memory_order_acquire); } + void updateSymbols(bool kernel_symbols); void updateBuildIds(); // Extract build-ids for all loaded libraries const void *resolveSymbol(const char *name); diff --git a/ddprof-lib/src/main/cpp/mutex.cpp b/ddprof-lib/src/main/cpp/mutex.cpp index 85228dbf6..8b9c92b64 100644 --- a/ddprof-lib/src/main/cpp/mutex.cpp +++ b/ddprof-lib/src/main/cpp/mutex.cpp @@ -4,6 +4,7 @@ */ #include "mutex.h" +#include "signalSafety.h" Mutex::Mutex() { @@ -14,6 +15,7 @@ Mutex::Mutex() { } void Mutex::lock() { + DEBUG_ASSERT_NOT_IN_SIGNAL(); pthread_mutex_lock(&_mutex); } diff --git a/ddprof-lib/src/main/cpp/os.h b/ddprof-lib/src/main/cpp/os.h index 77d19aba3..6d50420ec 100644 --- a/ddprof-lib/src/main/cpp/os.h +++ b/ddprof-lib/src/main/cpp/os.h @@ -7,6 +7,7 @@ #ifndef _OS_H #define _OS_H +#include #include #include #include @@ -104,7 +105,22 @@ class OS { static u64 micros(); static u64 processStartTime(); static void sleep(u64 nanos); - static void uninterruptibleSleep(u64 nanos, volatile bool* flag); + + // Sleep for up to max_nanos, restarting against an absolute monotonic + // deadline on EINTR so unrelated signals (SIGCHLD, debugger + // SIGSTOP/SIGCONT, etc.) do not shorten the wait. Returns early when + // keep_sleeping becomes false — used by background threads (e.g. the + // Libraries refresher) to respond to a stop request without waiting + // out the full interval. Pair with pthread_kill(thread, SOMESIG) + + // keep_sleeping=false in the caller's stop path: the signal triggers + // EINTR, the loop re-checks the flag, and the function returns. + // + // On Linux this uses clock_nanosleep with TIMER_ABSTIME + CLOCK_MONOTONIC + // so the deadline is absolute and arithmetic-free. On macOS we fall + // back to nanosleep with a recomputed remainder (clock_nanosleep with + // TIMER_ABSTIME is unavailable there). + static void sleepWhile(u64 max_nanos, std::atomic& keep_sleeping); + static u64 overrun(siginfo_t* siginfo); static u64 hton64(u64 x); diff --git a/ddprof-lib/src/main/cpp/os_linux.cpp b/ddprof-lib/src/main/cpp/os_linux.cpp index 807ccea81..20329091f 100644 --- a/ddprof-lib/src/main/cpp/os_linux.cpp +++ b/ddprof-lib/src/main/cpp/os_linux.cpp @@ -171,11 +171,18 @@ void OS::sleep(u64 nanos) { nanosleep(&ts, NULL); } -void OS::uninterruptibleSleep(u64 nanos, volatile bool* flag) { - // Workaround nanosleep bug: https://man7.org/linux/man-pages/man2/nanosleep.2.html#BUGS - u64 deadline = OS::nanotime() + nanos; - struct timespec ts = {(time_t)(deadline / 1000000000), (long)(deadline % 1000000000)}; - while (*flag && clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &ts, &ts) == EINTR); +void OS::sleepWhile(u64 max_nanos, std::atomic& keep_sleeping) { + // Compute the deadline once and reuse it across EINTR retries so + // unrelated signals don't shorten the wait. + u64 deadline = OS::nanotime() + max_nanos; + struct timespec ts; + ts.tv_sec = (time_t)(deadline / 1000000000ULL); + ts.tv_nsec = (long)(deadline % 1000000000ULL); + while (keep_sleeping.load(std::memory_order_acquire) && + clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &ts, nullptr) == EINTR) { + // Re-issue against the same absolute deadline. EINTR absorbs + // SIGCHLD, SIGSTOP/CONT, etc. without arithmetic on remaining time. + } } u64 OS::overrun(siginfo_t* siginfo) { diff --git a/ddprof-lib/src/main/cpp/os_macos.cpp b/ddprof-lib/src/main/cpp/os_macos.cpp index a6872b695..0aee02704 100644 --- a/ddprof-lib/src/main/cpp/os_macos.cpp +++ b/ddprof-lib/src/main/cpp/os_macos.cpp @@ -127,9 +127,20 @@ void OS::sleep(u64 nanos) { nanosleep(&ts, NULL); } -void OS::uninterruptibleSleep(u64 nanos, volatile bool* flag) { - struct timespec ts = {(time_t)(nanos / 1000000000), (long)(nanos % 1000000000)}; - while (*flag && nanosleep(&ts, &ts) < 0 && errno == EINTR); +void OS::sleepWhile(u64 max_nanos, std::atomic& keep_sleeping) { + // macOS does not expose clock_nanosleep(TIMER_ABSTIME). Recompute the + // remaining interval against OS::nanotime() each iteration so spurious + // wake-ups don't shorten the wait, mirroring the Linux semantics. + u64 deadline = OS::nanotime() + max_nanos; + while (keep_sleeping.load(std::memory_order_acquire)) { + u64 now = OS::nanotime(); + if (now >= deadline) { + return; + } + u64 remaining = deadline - now; + struct timespec ts = {(time_t)(remaining / 1000000000ULL), (long)(remaining % 1000000000ULL)}; + nanosleep(&ts, nullptr); + } } u64 OS::overrun(siginfo_t* siginfo) { diff --git a/ddprof-lib/src/main/cpp/perfEvents_linux.cpp b/ddprof-lib/src/main/cpp/perfEvents_linux.cpp index 2edc05f46..85ec23d5e 100644 --- a/ddprof-lib/src/main/cpp/perfEvents_linux.cpp +++ b/ddprof-lib/src/main/cpp/perfEvents_linux.cpp @@ -730,6 +730,7 @@ u64 PerfEvents::readCounter(siginfo_t *siginfo, void *ucontext) { } void PerfEvents::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) { + SIGNAL_HANDLER_GUARD(); if (siginfo->si_code <= 0) { // Looks like an external signal; don't treat as a profiling event return; diff --git a/ddprof-lib/src/main/cpp/profiler.cpp b/ddprof-lib/src/main/cpp/profiler.cpp index a328026b8..0bb66b89b 100644 --- a/ddprof-lib/src/main/cpp/profiler.cpp +++ b/ddprof-lib/src/main/cpp/profiler.cpp @@ -748,20 +748,54 @@ void Profiler::writeHeapUsage(long value, bool live) { _locks[lock_index].unlock(); } +void Profiler::prewarmUnwinder() { +#ifdef __linux__ + // J9 on aarch64 (and other JVMs) lazily loads libgcc_s.so.1 from its DWARF + // unwinder during stack walks. When that happens inside a signal handler + // frame, our dlopen_hook fires from signal context and tries to refresh the + // library list — Mutex::lock and malloc on a signal stack. By forcing the + // load here, before any signal handler is installed, subsequent calls find + // libgcc_s already mapped and the lazy-load path never runs. + // + // The handle is intentionally leaked: keeping the refcount > 0 prevents the + // library from being unmapped for the remainder of the process lifetime. + // + // SONAME note: "libgcc_s.so.1" is hardcoded deliberately. Referencing + // _Unwind_Backtrace from C++ would normally let the linker resolve the + // SONAME implicitly, but our release build uses -static-libgcc + // (ConfigurationPresets.kt: "-static-libgcc"), which embeds the unwinder + // into libjavaProfiler.so and removes libgcc_s.so from our NEEDED entries + // — so a symbol reference would not trigger the shared-library load. + // dlopen by SONAME is the only mechanism that works under static-libgcc. + // libgcc_s.so.1 has been the stable SONAME since 2002; a bump would + // constitute a glibc/GCC C++ ABI break and is treated as a fixed contract. + (void)dlopen("libgcc_s.so.1", RTLD_LAZY | RTLD_GLOBAL); +#endif +} + void *Profiler::dlopen_hook(const char *filename, int flags) { void *result = dlopen(filename, flags); if (result != NULL) { - // Static function of Profiler -> can not use the instance variable _libs - // Since Libraries is a singleton, this does not matter - Libraries::instance()->updateSymbols(false); - // Patch sigaction in newly loaded libraries - LibraryPatcher::patch_sigaction(); - MallocTracer::installHooks(); - // Extract build-ids for newly loaded libraries if remote symbolication is enabled - Profiler* profiler = instance(); - if (profiler != nullptr && profiler->_remote_symbolication) { - Libraries::instance()->updateBuildIds(); + if (!isInTrackedSignalContext()) { + // Either confirmed non-signal context, or no ProfiledThread on this + // thread (uninstrumented JVM internals — VM Thread, JIT, GC). We + // prefer synchronous refresh for the null-PT case too because (a) + // those threads call dlopen synchronously during normal JVM + // operation, and (b) wasmtime's broken sigaction patching depends + // on switchLibraryTrap running its work inline (the original reason + // for the trap). The residual risk — an uninstrumented thread + // calling dlopen from inside a foreign signal handler — is small: + // prewarmUnwinder() closes the known libgcc_s lazy-load case, and + // mainstream JVM signal handlers are AS-safe by design. + Libraries::instance()->refresh(); + } else { + // Confirmed signal context (one of our SignalHandlerScopes is on + // the stack). refresh() must NOT run here — parseLibraries + // acquires a Mutex and calls malloc, both AS-unsafe. Mark the + // library set dirty; the Libraries refresher thread picks it up + // within REFRESH_INTERVAL_NS (500 ms). + Libraries::instance()->markDirty(); } } @@ -801,10 +835,25 @@ void Profiler::disableEngines() { } void Profiler::segvHandler(int signo, siginfo_t *siginfo, void *ucontext) { + // J9 installs a SIGSEGV handler that uses siglongjmp() to recover from + // null-pointer-check faults during normal Java execution. When we chain to + // it, that longjmp unwinds past our stack frame and skips the RAII + // destructor, permanently leaking depth on the thread. Release the guard + // before chaining so depth is correct whether the chained handler returns + // or longjmps. + // + // Sanitizer-coverage note: this also means depth == 0 inside the chained + // handler, so DEBUG_ASSERT_NOT_IN_SIGNAL() will NOT fire for AS-unsafe + // code reachable from a chained handler that returns normally. This is + // the lesser of two evils — leaking depth on longjmp would silently + // break the production deferred-refresh gate, while the sanitizer gap + // is bounded to third-party signal handler code we don't own. + SIGNAL_HANDLER_GUARD(); if (crashHandlerInternal(signo, siginfo, ucontext)) { - return; // Handled + return; // Handled — destructor decrements depth } - // Not handled, chain to next handler + SIGNAL_HANDLER_GUARD_RELEASE(); + // Not handled, chain to next handler (may longjmp; never return through us) SigAction chain = OS::getSegvChainTarget(); if (chain != nullptr) { chain(signo, siginfo, ucontext); @@ -814,9 +863,13 @@ void Profiler::segvHandler(int signo, siginfo_t *siginfo, void *ucontext) { } void Profiler::busHandler(int signo, siginfo_t *siginfo, void *ucontext) { + // See segvHandler: release before chaining in case the chained handler + // longjmps through us. + SIGNAL_HANDLER_GUARD(); if (crashHandlerInternal(signo, siginfo, ucontext)) { - return; // Handled + return; // Handled — destructor decrements depth } + SIGNAL_HANDLER_GUARD_RELEASE(); // Not handled, chain to next handler SigAction chain = OS::getBusChainTarget(); if (chain != nullptr) { @@ -1127,6 +1180,10 @@ Error Profiler::start(Arguments &args, bool reset) { return Error("Profiler already started"); } + // Force libgcc_s to load now (idempotent dlopen) so the JVM's DWARF + // unwinder cannot lazy-load it later from signal context. + prewarmUnwinder(); + Error error = checkJvmCapabilities(); if (error) { return error; @@ -1134,6 +1191,7 @@ Error Profiler::start(Arguments &args, bool reset) { _omit_stacktraces = args._lightweight; _remote_symbolication = args._remote_symbolication; + _libs->setRemoteSymbolication(_remote_symbolication); _event_mask = ((args._event != NULL && strcmp(args._event, EVENT_NOOP) != 0) ? EM_CPU : 0) | @@ -1266,6 +1324,11 @@ Error Profiler::start(Arguments &args, bool reset) { enableEngines(); + // Refresher must be running before the trap fires: dlopen_hook's + // signal-context branch only marks dirty and relies on the refresher + // to call refresh() within REFRESH_INTERVAL_NS (500 ms). + _libs->startRefresher(); + // Always enable library trap to catch wasmtime loading and patch its broken sigaction switchLibraryTrap(true); @@ -1279,6 +1342,7 @@ Error Profiler::start(Arguments &args, bool reset) { if (error) { disableEngines(); switchLibraryTrap(false); + _libs->stopRefresher(); return error; } @@ -1321,6 +1385,7 @@ Error Profiler::start(Arguments &args, bool reset) { // nativemem is the only requested mode: propagate the real error disableEngines(); switchLibraryTrap(false); + _libs->stopRefresher(); lockAll(); _jfr.stop(); unlockAll(); @@ -1350,6 +1415,7 @@ Error Profiler::start(Arguments &args, bool reset) { // no engine was activated; perform cleanup disableEngines(); switchLibraryTrap(false); + _libs->stopRefresher(); lockAll(); _jfr.stop(); @@ -1377,7 +1443,11 @@ Error Profiler::stop() { _cpu_engine->stop(); switchLibraryTrap(false); + // Stop the refresher before the final refresh() so it doesn't race the + // teardown. startRefresher/stopRefresher are idempotent. + _libs->stopRefresher(); switchThreadEvents(JVMTI_DISABLE); + Libraries::instance()->refresh(); updateJavaThreadNames(); updateNativeThreadNames(); @@ -1480,6 +1550,7 @@ Error Profiler::flushJfr() { return Error("Profiler is not active"); } + Libraries::instance()->refresh(); updateJavaThreadNames(); updateNativeThreadNames(); @@ -1502,6 +1573,7 @@ Error Profiler::dump(const char *path, const int length) { // by the live objects LivenessTracker::instance()->flush(thread_ids); + Libraries::instance()->refresh(); updateJavaThreadNames(); updateNativeThreadNames(); diff --git a/ddprof-lib/src/main/cpp/profiler.h b/ddprof-lib/src/main/cpp/profiler.h index d2bf1065d..62b0b7749 100644 --- a/ddprof-lib/src/main/cpp/profiler.h +++ b/ddprof-lib/src/main/cpp/profiler.h @@ -122,6 +122,7 @@ class alignas(alignof(SpinLock)) Profiler { void **_dlopen_entry; static void *dlopen_hook(const char *filename, int flags); void switchLibraryTrap(bool enable); + static void prewarmUnwinder(); void enableEngines(); void disableEngines(); diff --git a/ddprof-lib/src/main/cpp/signalSafety.cpp b/ddprof-lib/src/main/cpp/signalSafety.cpp new file mode 100644 index 000000000..0f8dc3038 --- /dev/null +++ b/ddprof-lib/src/main/cpp/signalSafety.cpp @@ -0,0 +1,20 @@ +/* + * Copyright 2026, Datadog, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Header-only module — see signalSafety.h for DEBUG_ASSERT_NOT_IN_SIGNAL(). +// The signal-context depth counter (used by both the assertion and the +// production AS-safe deferred path) lives in guards.{h,cpp}. +#include "signalSafety.h" diff --git a/ddprof-lib/src/main/cpp/signalSafety.h b/ddprof-lib/src/main/cpp/signalSafety.h new file mode 100644 index 000000000..e322fdcbb --- /dev/null +++ b/ddprof-lib/src/main/cpp/signalSafety.h @@ -0,0 +1,88 @@ +/* + * Copyright 2026, Datadog, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _SIGNAL_SAFETY_H +#define _SIGNAL_SAFETY_H + +#include "guards.h" // isInSignalContext, SIGNAL_HANDLER_GUARD, ... +#include "thread.h" // ProfiledThread::currentSignalSafe + +// Detect ASAN using compiler-provided macros so the ASAN_ENABLED guard below +// works in every TU that includes this header, independent of include order. +#ifdef __has_feature +# if __has_feature(address_sanitizer) +# ifndef ASAN_ENABLED +# define ASAN_ENABLED 1 +# endif +# endif +#endif +#ifdef __SANITIZE_ADDRESS__ +# ifndef ASAN_ENABLED +# define ASAN_ENABLED 1 +# endif +#endif + +// Debug-only AS-safety assertion. Aborts with a file:line diagnostic when +// invoked from inside a signal handler in debug / ASAN builds; compiles to a +// no-op in release builds (NDEBUG). +// +// The depth counter itself lives in ProfiledThread::_signal_depth (see +// guards.h for the rationale). The check is skipped when ProfiledThread +// is null — uninstrumented threads (VM Thread, JIT, GC) have no thread +// context, so the assertion has no way to know whether they're really in +// a signal frame. Treating "unknown" as a violation would produce false +// positives every time AS-unsafe code legitimately ran on such a thread. +// +// write(2) is POSIX async-signal-safe. abort() generates a core dump and +// triggers ASAN's stack-trace symbolization, making it far more debuggable +// than _exit(1). The macro is only active in debug/ASAN builds where we +// intentionally trade AS-safety of the abort path for diagnosability. +#define _SIGNAL_SAFETY_STR(x) #x +#define _SIGNAL_SAFETY_TOSTR(x) _SIGNAL_SAFETY_STR(x) + +#if !defined(NDEBUG) || defined(ASAN_ENABLED) +#include // write, STDERR_FILENO, open, close +#include // abort +#include // O_WRONLY, O_CREAT, O_APPEND + +// Path for the diagnostic file — picked up as a CI artifact on failure. +#define _SIGNAL_SAFETY_DIAG_FILE "/tmp/signal-safety-violation.txt" + +#define DEBUG_ASSERT_NOT_IN_SIGNAL() \ + do { \ + ProfiledThread *_pt_for_assert = ProfiledThread::currentSignalSafe(); \ + /* Skip when no thread context — see comment above. */ \ + if (_pt_for_assert != nullptr && _pt_for_assert->signalDepth() != 0) { \ + static const char _msg[] = \ + "[java-profiler] AS-safety violation at " \ + __FILE__ ":" _SIGNAL_SAFETY_TOSTR(__LINE__) \ + ": async-signal-unsafe call made from signal handler context\n"; \ + (void)write(STDERR_FILENO, _msg, sizeof(_msg) - 1); \ + /* Also write to a file so CI can capture it regardless of output routing. */ \ + int _diag_fd = open(_SIGNAL_SAFETY_DIAG_FILE, \ + O_WRONLY | O_CREAT | O_APPEND, 0644); \ + if (_diag_fd >= 0) { \ + (void)write(_diag_fd, _msg, sizeof(_msg) - 1); \ + close(_diag_fd); \ + } \ + abort(); \ + } \ + } while (0) +#else +#define DEBUG_ASSERT_NOT_IN_SIGNAL() ((void)0) +#endif + +#endif // _SIGNAL_SAFETY_H diff --git a/ddprof-lib/src/main/cpp/thread.h b/ddprof-lib/src/main/cpp/thread.h index 4428aab94..e31c18bba 100644 --- a/ddprof-lib/src/main/cpp/thread.h +++ b/ddprof-lib/src/main/cpp/thread.h @@ -55,6 +55,7 @@ class ProfiledThread : public ThreadLocalData { u32 _misc_flags; int _filter_slot_id; // Slot ID for thread filtering uint8_t _init_window; // Countdown for JVM thread init race window (PROF-13072) + uint8_t _signal_depth; // Nested signal-handler depth (see SignalHandlerScope) UnwindFailures _unwind_failures; bool _otel_ctx_initialized; bool _crash_protection_active; @@ -73,6 +74,7 @@ class ProfiledThread : public ThreadLocalData { ProfiledThread(int tid) : ThreadLocalData(), _pc(0), _sp(0), _span_id(0), _crash_depth(0), _tid(tid), _cpu_epoch(0), _wall_epoch(0), _call_trace_id(0), _recording_epoch(0), _misc_flags(0), _filter_slot_id(-1), _init_window(0), + _signal_depth(0), _otel_ctx_initialized(false), _crash_protection_active(false), _otel_ctx_record{}, _otel_tag_encodings{}, _otel_local_root_span_id(0) {}; @@ -168,6 +170,14 @@ class ProfiledThread : public ThreadLocalData { return _crash_depth > CRASH_HANDLER_NESTING_LIMIT; } + // Signal-handler depth counter used by SignalHandlerScope (guards.h). All + // access happens on the owning thread (signal handlers are delivered to the + // thread that's interrupted), so plain reads/writes are AS-safe — no locks, + // no malloc, no syscalls. See guards.h for the public API. + inline uint8_t signalDepth() const { return _signal_depth; } + inline void enterSignalScope() { ++_signal_depth; } + inline void exitSignalScope() { if (_signal_depth > 0) --_signal_depth; } + UnwindFailures* unwindFailures(bool reset = true) { if (reset) { _unwind_failures.clear(); diff --git a/ddprof-lib/src/main/cpp/vmEntry.cpp b/ddprof-lib/src/main/cpp/vmEntry.cpp index 3323cb8f8..aa5a30889 100644 --- a/ddprof-lib/src/main/cpp/vmEntry.cpp +++ b/ddprof-lib/src/main/cpp/vmEntry.cpp @@ -21,6 +21,7 @@ #include "hotspot/jitCodeCache.h" #include #include +#include "guards.h" #include #include @@ -56,6 +57,7 @@ AsyncGetCallTrace VM::_asyncGetCallTrace; JVM_GetManagement VM::_getManagement; static void wakeupHandler(int signo) { + SIGNAL_HANDLER_GUARD(); // Dummy handler for interrupting syscalls } diff --git a/ddprof-lib/src/main/cpp/wallClock.cpp b/ddprof-lib/src/main/cpp/wallClock.cpp index a19263f3a..ce655f5b1 100644 --- a/ddprof-lib/src/main/cpp/wallClock.cpp +++ b/ddprof-lib/src/main/cpp/wallClock.cpp @@ -54,6 +54,7 @@ bool BaseWallClock::inSyscall(void *ucontext) { void WallClockASGCT::sharedSignalHandler(int signo, siginfo_t *siginfo, void *ucontext) { + SIGNAL_HANDLER_GUARD(); // Reject any SIGVTALRM that did not originate from our rt_tgsigqueueinfo // send. Defends against stray in-process tgkill / external sigqueue that // would otherwise drive our wallclock sampling path. @@ -177,14 +178,18 @@ void WallClockASGCT::timerLoop() { auto collectThreads = [&](std::vector& tids) { // Get thread IDs from the filter if it's enabled // Otherwise list all threads in the system + const int refresher_tid = Libraries::instance()->refresherTid(); if (Profiler::instance()->threadFilter()->enabled()) { Profiler::instance()->threadFilter()->collect(tids); } else { ThreadList *thread_list = OS::listThreads(); while (thread_list->hasNext()) { int tid = thread_list->next(); - // Don't include the current thread - if (tid != OS::threadId()) { + // Don't include the current thread, nor the Libraries refresher + // thread (profiler-internal — masking SIGVTALRM there is not + // enough; we also want to avoid the kill() round-trip and any + // pending-signal accumulation). + if (tid != OS::threadId() && tid != refresher_tid) { tids.push_back(tid); } } @@ -228,6 +233,7 @@ void WallClockASGCT::timerLoop() { void WallClockJvmti::sharedSignalHandler(int signo, siginfo_t *siginfo, void *ucontext) { + SIGNAL_HANDLER_GUARD(); WallClockJvmti *engine = reinterpret_cast(Profiler::instance()->wallEngine()); if (signo == SIGVTALRM) { @@ -283,13 +289,16 @@ void WallClockJvmti::initialize(Arguments &args) { void WallClockJvmti::timerLoop() { auto collectThreads = [&](std::vector &tids) { + const int refresher_tid = Libraries::instance()->refresherTid(); if (Profiler::instance()->threadFilter()->enabled()) { Profiler::instance()->threadFilter()->collect(tids); } else { ThreadList *thread_list = OS::listThreads(); while (thread_list->hasNext()) { int tid = thread_list->next(); - if (tid != OS::threadId()) { + // Exclude the wallclock timer thread itself and the Libraries + // refresher (profiler-internal). + if (tid != OS::threadId() && tid != refresher_tid) { tids.push_back(tid); } } diff --git a/ddprof-lib/src/test/cpp/signalSafety_ut.cpp b/ddprof-lib/src/test/cpp/signalSafety_ut.cpp new file mode 100644 index 000000000..a451e7439 --- /dev/null +++ b/ddprof-lib/src/test/cpp/signalSafety_ut.cpp @@ -0,0 +1,144 @@ +/* + * Copyright 2026, Datadog, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "signalSafety.h" +#include "thread.h" +#include + +class SignalSafetyTest : public ::testing::Test { +protected: + void SetUp() override { + // SignalHandlerScope reads/writes ProfiledThread::_signal_depth — the + // tests need a thread context to exist on the gtest thread, otherwise + // every scope is a no-op (which is the intended production behavior + // on uninstrumented threads, but not what these unit tests assert). + ProfiledThread::initCurrentThread(); + } + + void TearDown() override { + // Catch leaked depth from a failed test so subsequent tests start clean. + ASSERT_EQ(0, getInSignalDepth()) << "depth not zero after test — check for leaked SignalHandlerScope"; + ProfiledThread::release(); + } +}; + +TEST_F(SignalSafetyTest, DepthStartsAtZero) { + EXPECT_EQ(0, getInSignalDepth()); +} + +TEST_F(SignalSafetyTest, DepthSymmetry) { + EXPECT_EQ(0, getInSignalDepth()); + { + SignalHandlerScope scope; + EXPECT_EQ(1, getInSignalDepth()); + } + EXPECT_EQ(0, getInSignalDepth()); +} + +TEST_F(SignalSafetyTest, NestedDepth) { + EXPECT_EQ(0, getInSignalDepth()); + { + SignalHandlerScope outer; + EXPECT_EQ(1, getInSignalDepth()); + { + SignalHandlerScope inner; + EXPECT_EQ(2, getInSignalDepth()); + } + EXPECT_EQ(1, getInSignalDepth()); + } + EXPECT_EQ(0, getInSignalDepth()); +} + +TEST_F(SignalSafetyTest, TrackedSignalContextRequiresScope) { + EXPECT_FALSE(isInTrackedSignalContext()); + { + SignalHandlerScope scope; + EXPECT_TRUE(isInTrackedSignalContext()); + } + EXPECT_FALSE(isInTrackedSignalContext()); +} + +// SIGNAL_HANDLER_GUARD_RELEASE path: explicit early release inside a scope, +// destructor must not double-decrement. +TEST_F(SignalSafetyTest, ReleaseDecrementsAndDestructorIsNoOp) { + EXPECT_EQ(0, getInSignalDepth()); + { + SignalHandlerScope scope; + EXPECT_EQ(1, getInSignalDepth()); + scope.release(); + EXPECT_EQ(0, getInSignalDepth()); + // Destructor runs at end of scope and must NOT decrement again. + } + EXPECT_EQ(0, getInSignalDepth()); +} + +// SIGNAL_HANDLER_UNWIND_AFTER_LONGJMP path: simulate a longjmp that bypasses +// SignalHandlerScope's destructor. signalHandlerUnwindAfterLongjmp() must +// decrement the depth even when no live scope object is around. +// +// Sequence (longjmp simulated by an extra block-scope guard that we don't +// destroy): +// outer scope ctor → depth=1 +// inner scope ctor → depth=2 +// imagine longjmp: inner scope's dtor is skipped +// signalHandlerUnwindAfterLongjmp() at landing → depth=1 +// outer scope dtor → depth=0 +TEST_F(SignalSafetyTest, SignalHandlerUnwindAfterLongjmpDecrementsOnce) { + EXPECT_EQ(0, getInSignalDepth()); + { + SignalHandlerScope outer; + EXPECT_EQ(1, getInSignalDepth()); + + // Heap-allocate the inner scope so we can drop it without running its + // destructor — emulating a longjmp that bypasses the C++ stack unwind. + SignalHandlerScope *leaked_inner = new SignalHandlerScope(); + EXPECT_EQ(2, getInSignalDepth()); + + // Pretend the longjmp landed here. Compensate for the bypassed dtor. + SIGNAL_HANDLER_UNWIND_AFTER_LONGJMP(); + EXPECT_EQ(1, getInSignalDepth()); + + // Outer scope dtor runs next and brings depth back to 0. + // Free leaked_inner without calling its destructor (placement-new + // wasn't used so a plain operator delete would invoke the destructor; + // route the storage through ::operator delete to skip dtor). + ::operator delete(leaked_inner); + } + EXPECT_EQ(0, getInSignalDepth()); +} + +// Safety property: signalHandlerUnwindAfterLongjmp() saturates at zero; +// double calls do not underflow. +TEST_F(SignalSafetyTest, SignalHandlerUnwindAfterLongjmpSaturatesAtZero) { + EXPECT_EQ(0, getInSignalDepth()); + signalHandlerUnwindAfterLongjmp(); + signalHandlerUnwindAfterLongjmp(); + EXPECT_EQ(0, getInSignalDepth()); +} + +TEST(SignalSafetyTestNoContext, NullProfiledThreadIsNotTrackedSignal) { + // isInTrackedSignalContext() returns false on null because the + // SignalHandlerScope never ran — used by Profiler::dlopen_hook so + // uninstrumented JVM threads doing a normal dlopen take the fast + // (synchronous refresh) path instead of deferring. + EXPECT_FALSE(isInTrackedSignalContext()); +} + +TEST(SignalSafetyTestNoContext, NullProfiledThreadDepthIsZero) { + // Mutation guard: getInSignalDepth() must return 0 on null PT, not a + // sentinel that happens to equal zero today. + EXPECT_EQ(0, getInSignalDepth()); +} diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index f6fc91356..5d8380722 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -6,8 +6,8 @@ kotlin = "1.9.22" spotless = "8.5.1" # Testing -junit = "6.1.0" -junit-platform = "6.1.0" # JUnit Platform version corresponding to JUnit Jupiter 5.9.2 +junit = "5.9.2" +junit-platform = "1.9.2" # JUnit Platform version corresponding to JUnit Jupiter 5.9.2 junit-pioneer = "1.9.1" slf4j = "2.0.18" diff --git a/utils/run-docker-tests.sh b/utils/run-docker-tests.sh index fb1107385..a29f95192 100755 --- a/utils/run-docker-tests.sh +++ b/utils/run-docker-tests.sh @@ -8,7 +8,7 @@ # # Usage: ./utils/run-docker-tests.sh [options] # --libc=glibc|musl (default: glibc) -# --jdk=8|11|17|21|25|8-j9|11-j9|17-j9|21-j9 (default: 21) +# --jdk=8|11|17|21|25|8-j9|11-j9|17-j9|21-j9|17-graal|21-graal|25-graal (default: 21) # --arch=x64|aarch64 (default: auto-detect) # --config=debug|release|asan|tsan (default: debug) # --tests="TestPattern" (optional, specific test to run) @@ -94,6 +94,25 @@ get_glibc_jdk_url() { esac } +# JDK Download URLs (Oracle GraalVM) +# Versions match the SDKMAN-installed builds used in CI (java_setup.sh + +# cache_java.yml: JAVA__GRAAL_VERSION). Using Oracle's stable archive +# URLs (not "latest") so the docker images match the CI configuration. +get_graal_jdk_url() { + local version=$1 + local arch=$2 + # Oracle GraalVM archive arch labels: x64 / aarch64 (same as our $arch) + case "$version-$arch" in + 17-x64) echo "https://download.oracle.com/graalvm/17/archive/graalvm-jdk-17.0.12_linux-x64_bin.tar.gz" ;; + 17-aarch64) echo "https://download.oracle.com/graalvm/17/archive/graalvm-jdk-17.0.12_linux-aarch64_bin.tar.gz" ;; + 21-x64) echo "https://download.oracle.com/graalvm/21/archive/graalvm-jdk-21.0.4_linux-x64_bin.tar.gz" ;; + 21-aarch64) echo "https://download.oracle.com/graalvm/21/archive/graalvm-jdk-21.0.4_linux-aarch64_bin.tar.gz" ;; + 25-x64) echo "https://download.oracle.com/graalvm/25/archive/graalvm-jdk-25_linux-x64_bin.tar.gz" ;; + 25-aarch64) echo "https://download.oracle.com/graalvm/25/archive/graalvm-jdk-25_linux-aarch64_bin.tar.gz" ;; + *) echo "" ;; + esac +} + # JDK Download URLs (IBM Semeru OpenJ9) get_j9_jdk_url() { local version=$1 @@ -205,6 +224,12 @@ if [[ "$JDK_VARIANT" == "j9" ]]; then exit 1 fi JDK_URL=$(get_j9_jdk_url "$JDK_BASE_VERSION" "$ARCH") +elif [[ "$JDK_VARIANT" == "graal" ]]; then + if [[ "$LIBC" == "musl" ]]; then + echo "Error: GraalVM is not available for musl libc" + exit 1 + fi + JDK_URL=$(get_graal_jdk_url "$JDK_BASE_VERSION" "$ARCH") elif [[ "$LIBC" == "musl" ]]; then JDK_URL=$(get_musl_jdk_url "$JDK_BASE_VERSION" "$ARCH") else @@ -212,7 +237,7 @@ else fi if [[ -z "$JDK_URL" ]]; then - echo "Error: --jdk must be one of: 8, 11, 17, 21, 25, 8-j9, 11-j9, 17-j9, 21-j9" + echo "Error: --jdk must be one of: 8, 11, 17, 21, 25, 8-j9, 11-j9, 17-j9, 21-j9, 17-graal, 21-graal, 25-graal" exit 1 fi