Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ jobs:
if: needs.check-for-pr.outputs.skip != 'true'
outputs:
configurations: ${{ steps.compute.outputs.configurations }}
run_fuzz: ${{ steps.compute.outputs.run_fuzz }}
steps:
- name: Debounce label events
if: github.event.action == 'labeled'
Expand Down Expand Up @@ -155,6 +156,13 @@ jobs:
if echo "$labels" | grep -Fq "test:tsan"; then
configs="$configs"',"tsan"'
fi
if echo "$labels" | grep -Fq "test:fuzz"; then
echo "run_fuzz=true" >> $GITHUB_OUTPUT
else
echo "run_fuzz=false" >> $GITHUB_OUTPUT
fi
else
echo "run_fuzz=false" >> $GITHUB_OUTPUT
fi

configs="$configs]"
Expand Down Expand Up @@ -194,3 +202,37 @@ jobs:
body-file: test-summary.md
comment-id: ci-test-results

# Opt-in fuzz job: runs only when the PR check was not skipped and the
# compute-configurations job reported run_fuzz == 'true'.
fuzz:
needs: [check-for-pr, compute-configurations]
if: needs.check-for-pr.outputs.skip != 'true' && needs.compute-configurations.outputs.run_fuzz == 'true'
runs-on: ubuntu-latest
# A failure of this job is tolerated and does not fail the workflow run.
continue-on-error: true
# Upper bound on the job's wall-clock time.
timeout-minutes: 30
steps:
# Actions are pinned to full commit SHAs; the trailing comment records the corresponding release tag.
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
# Cache of downloaded Gradle distributions, keyed on the wrapper properties file.
- name: Cache Gradle Wrapper Binaries
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ~/.gradle/wrapper/dists
key: gradle-wrapper-${{ runner.os }}-${{ hashFiles('gradle/wrapper/gradle-wrapper.properties') }}
restore-keys: |
gradle-wrapper-${{ runner.os }}-
# Cache of ~/.gradle/caches, keyed on all Gradle build scripts and the wrapper properties.
- name: Cache Gradle User Home
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ~/.gradle/caches
key: gradle-caches-${{ runner.os }}-${{ hashFiles('**/*.gradle*', '**/gradle-wrapper.properties') }}
restore-keys: |
gradle-caches-${{ runner.os }}-
# Installs clang from the distribution packages.
- name: Setup OS
run: |
sudo apt-get update
sudo apt-get install -y clang
# NOTE(review): fuzz-duration=120 is presumably seconds per target; confirm against the fuzz Gradle task.
- name: Fuzz
run: ./gradlew :ddprof-lib:fuzz:fuzz -Pfuzz-duration=120 --no-daemon
# Runs only when an earlier step in this job failed; uploads the fuzz-crashes build directory.
- name: Upload crash artifacts
if: failure()
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: fuzz-crashes
path: ddprof-lib/fuzz/build/fuzz-crashes/
32 changes: 32 additions & 0 deletions .github/workflows/nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,38 @@ jobs:
# C++ gtests (ASan + TSan) run on every PR via native-sanitizer-tests in ci.yml.
# Skip them here so the nightly focuses on Java functional tests under ASan.
skip_gtest: true
# Nightly fuzz job: has no `if:` or `needs:` gate, so it is not conditioned on other jobs.
fuzz:
runs-on: ubuntu-latest
# A failure of this job is tolerated and does not fail the workflow run.
continue-on-error: true
# Upper bound on the job's wall-clock time.
timeout-minutes: 30
steps:
# Actions are pinned to full commit SHAs; the trailing comment records the corresponding release tag.
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
# Cache of downloaded Gradle distributions, keyed on the wrapper properties file.
- name: Cache Gradle Wrapper Binaries
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ~/.gradle/wrapper/dists
key: gradle-wrapper-${{ runner.os }}-${{ hashFiles('gradle/wrapper/gradle-wrapper.properties') }}
restore-keys: |
gradle-wrapper-${{ runner.os }}-
# Cache of ~/.gradle/caches, keyed on all Gradle build scripts and the wrapper properties.
- name: Cache Gradle User Home
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: ~/.gradle/caches
key: gradle-caches-${{ runner.os }}-${{ hashFiles('**/*.gradle*', '**/gradle-wrapper.properties') }}
restore-keys: |
gradle-caches-${{ runner.os }}-
# Installs clang from the distribution packages.
- name: Setup OS
run: |
sudo apt-get update
sudo apt-get install -y clang
# NOTE(review): fuzz-duration=120 is presumably seconds per target; confirm against the fuzz Gradle task.
- name: Fuzz
run: ./gradlew :ddprof-lib:fuzz:fuzz -Pfuzz-duration=120 --no-daemon
# Runs only when an earlier step in this job failed; uploads the fuzz-crashes build directory.
- name: Upload crash artifacts
if: failure()
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: fuzz-crashes
path: ddprof-lib/fuzz/build/fuzz-crashes/
report-failures:
runs-on: ubuntu-latest
needs: run-test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ class FuzzTargetsPlugin : Plugin<Project> {
val includeFiles = buildIncludePaths(project, extension, homebrewLLVM)

// Build compiler/linker args
val compilerArgs = buildFuzzCompilerArgs()
val compilerArgs = buildFuzzCompilerArgs(project)
val linkerArgs = buildFuzzLinkerArgs(homebrewLLVM, clangResourceDir, project.logger)

val fuzzSourceDir = extension.fuzzSourceDir.get().asFile
Expand Down Expand Up @@ -194,15 +194,17 @@ class FuzzTargetsPlugin : Plugin<Project> {
return includes
}

private fun buildFuzzCompilerArgs(): List<String> {
private fun buildFuzzCompilerArgs(project: Project): List<String> {
val version = project.version.toString()
val args = mutableListOf(
"-O1",
"-g",
"-fno-omit-frame-pointer",
"-fsanitize=fuzzer,address,undefined",
"-fvisibility=hidden",
"-std=c++17",
"-DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION"
"-DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION",
"-DPROFILER_VERSION=\"$version\""
)
if (PlatformUtils.currentPlatform == Platform.LINUX && PlatformUtils.isMusl()) {
args.add("-D__musl__")
Expand Down
226 changes: 1 addition & 225 deletions ddprof-lib/src/main/cpp/callTraceStorage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,240 +6,16 @@

#include "callTraceStorage.h"
#include "counters.h"
#include "log.h"
#include "os.h"
#include "common.h"
#include "thread.h"
#include "vmEntry.h" // For BCI_ERROR constant
#include "arch.h" // For LP64_ONLY macro and COMMA macro
#include "guards.h" // For table swap critical sections
#include "primeProbing.h"
#include "thread.h"
#include <string.h>
#include <atomic>
#include <time.h>

// RefCountGuard static members
// refcount_slots: per-slot state; each entry carries a reference count and the
// table pointer (active_table) that the slot currently protects.
RefCountSlot RefCountGuard::refcount_slots[RefCountGuard::MAX_THREADS];
// slot_owners: thread id that has claimed the slot with the same index;
// 0 marks a free slot (it is the value the claiming compare-exchange expects).
int RefCountGuard::slot_owners[RefCountGuard::MAX_THREADS];


// RefCountGuard implementation
int RefCountGuard::getThreadRefCountSlot() {
// Signal-safe collision resolution: use OS::threadId() with semi-random prime step probing
ProfiledThread* thrd = ProfiledThread::currentSignalSafe();
int tid = thrd != nullptr ? thrd->tid() : OS::threadId();

// Semi-random prime step probing to eliminate secondary clustering
HashProbe probe(static_cast<u64>(tid), MAX_THREADS);

int slot = probe.slot();
for (int i = 0; i < MAX_PROBE_DISTANCE; i++) {
// Try to claim this slot atomically
int expected = 0; // Empty slot (no thread ID)
if (__atomic_compare_exchange_n(&slot_owners[slot], &expected, tid, false, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) {
// Successfully claimed the slot
return slot;
}

// Check if we already own this slot (for reentrant calls)
if (__atomic_load_n(&slot_owners[slot], __ATOMIC_ACQUIRE) == tid) {
return slot;
}

// Move to next slot using probe
if (probe.hasNext()) {
slot = probe.next();
}
}

// All probing attempts failed - return -1 to indicate failure
return -1;
}

/**
 * Acquires a per-thread slot and marks `resource` as in use by this thread.
 *
 * If no slot can be obtained the guard is left inactive (_active == false,
 * _my_slot == -1) and offers no protection.
 *
 * Publication order matters: the table pointer is stored first and the count
 * is incremented second. A scanner that reads the count first therefore sees
 * the slot as inactive (count still 0) until both writes have been issued.
 *
 * NOTE(review): when getThreadRefCountSlot() hands back a slot this thread
 * already owns, the pointer store below overwrites the value written by the
 * outer guard - confirm nested guards always refer to the same table.
 */
RefCountGuard::RefCountGuard(CallTraceHashTable* resource) : _active(true), _my_slot(-1) {
    _my_slot = getThreadRefCountSlot();
    if (_my_slot == -1) {
        // No slot available: this guard stays inert.
        _active = false;
        return;
    }

    RefCountSlot& mine = refcount_slots[_my_slot];
    // Step 1: publish which table is being protected.
    __atomic_store_n(&mine.active_table, resource, __ATOMIC_RELEASE);
    // Step 2: activate the slot by making its count non-zero.
    __atomic_fetch_add(&mine.count, 1, __ATOMIC_RELEASE);
}

/**
 * Releases the protection taken by the constructor.
 *
 * Inactive guards (failed slot allocation or moved-from) do nothing.
 * Otherwise the writes mirror the constructor in reverse: the count is
 * decremented first, then the table pointer is cleared, and finally slot
 * ownership is given up by writing 0 to slot_owners.
 *
 * NOTE(review): the pointer is cleared and the slot released unconditionally,
 * even if the decremented count is still non-zero - confirm that guards are
 * never nested on the same thread, or that this is acceptable when they are.
 */
RefCountGuard::~RefCountGuard() {
    if (!_active || _my_slot < 0) {
        return;
    }

    RefCountSlot& mine = refcount_slots[_my_slot];
    // Step 1: drop the count so scanners that check the count first skip the slot.
    __atomic_fetch_sub(&mine.count, 1, __ATOMIC_RELEASE);
    // Step 2: clear the now-unused table pointer.
    __atomic_store_n(&mine.active_table, nullptr, __ATOMIC_RELEASE);
    // Step 3: hand the slot back so any thread may claim it.
    __atomic_store_n(&slot_owners[_my_slot], 0, __ATOMIC_RELEASE);
}

/**
 * Move constructor: takes over the slot held by `other`.
 *
 * No atomics are touched; the slot's count and pointer stay as they are and
 * only the responsibility for releasing them moves to this object. `other`
 * is deactivated so its destructor becomes a no-op (its _my_slot value is
 * left unchanged).
 */
RefCountGuard::RefCountGuard(RefCountGuard&& other) noexcept
    : _active(other._active),
      _my_slot(other._my_slot) {
    other._active = false;
}

/**
 * Move assignment: releases whatever this guard currently protects, then
 * takes over the slot held by `other`.
 *
 * Self-assignment is a no-op. The release sequence is the same as in the
 * destructor (decrement count, clear pointer, free slot ownership).
 * `other` is deactivated afterwards so it will not release the slot again.
 */
RefCountGuard& RefCountGuard::operator=(RefCountGuard&& other) noexcept {
    if (this == &other) {
        return *this;
    }

    const bool holdsSlot = _active && _my_slot >= 0;
    if (holdsSlot) {
        // Same ordering as the destructor: count, then pointer, then owner.
        RefCountSlot& mine = refcount_slots[_my_slot];
        __atomic_fetch_sub(&mine.count, 1, __ATOMIC_RELEASE);
        __atomic_store_n(&mine.active_table, nullptr, __ATOMIC_RELEASE);
        __atomic_store_n(&slot_owners[_my_slot], 0, __ATOMIC_RELEASE);
    }

    // Adopt the source guard's state and neutralise the source.
    _active = other._active;
    _my_slot = other._my_slot;
    other._active = false;

    return *this;
}

/**
 * Waits until no active refcount slot points at `table_to_delete`.
 *
 * A slot counts as active only when its count is non-zero; the count is read
 * before the pointer, matching the guard protocol (constructor: pointer then
 * count; destructor: count then pointer). A guard that has stored its pointer
 * but not yet incremented its count is therefore treated as inactive.
 * NOTE(review): the original design notes say a revalidation step in put()
 * (not visible here) covers that window - confirm before relying on it.
 *
 * The wait is bounded: 100 scans separated by spinPause(), then up to 5000
 * scans separated by a 100 microsecond nanosleep(). If the table is still
 * referenced after that, the function returns anyway without reporting it.
 */
void RefCountGuard::waitForRefCountToClear(CallTraceHashTable* table_to_delete) {
    // Single pass over all slots; true when some active slot protects the table.
    auto tableStillProtected = [table_to_delete]() -> bool {
        for (int idx = 0; idx < MAX_THREADS; ++idx) {
            // Count first: a zero count means the slot is inactive.
            uint32_t refs = __atomic_load_n(&refcount_slots[idx].count, __ATOMIC_ACQUIRE);
            if (refs == 0) {
                continue;
            }

            // Active slot: see which table it is protecting.
            CallTraceHashTable* guarded = __atomic_load_n(&refcount_slots[idx].active_table, __ATOMIC_ACQUIRE);
            if (guarded == table_to_delete) {
                return true;
            }
        }
        return false;
    };

    // Phase 1: short busy-wait for the common case of a brief hold.
    const int kSpinAttempts = 100;
    for (int attempt = 0; attempt < kSpinAttempts; ++attempt) {
        if (!tableStillProtected()) {
            return;
        }
        spinPause();
    }

    // Phase 2: back off with short sleeps for a holder that is taking longer.
    // NOTE(review): the original comment calls nanosleep async-signal-safe;
    // verify that against the platform's documented list.
    const int kSleepAttempts = 5000;
    struct timespec backoff = {0, 100000};  // 100 microseconds
    for (int attempt = 0; attempt < kSleepAttempts; ++attempt) {
        if (!tableStillProtected()) {
            return;
        }
        nanosleep(&backoff, nullptr);
    }

    // Wait budget exhausted with the table still referenced; the function
    // returns without signalling this to the caller.
}

/**
 * Waits until every refcount slot has a zero count, regardless of which
 * table each slot refers to.
 *
 * The wait is bounded: 100 scans separated by spinPause(), then up to 5000
 * scans separated by a 100 microsecond nanosleep(). If some count is still
 * non-zero after that, the function returns anyway without reporting it.
 */
void RefCountGuard::waitForAllRefCountsToClear() {
    // Single pass over all slots; true when any slot has a non-zero count.
    auto anySlotActive = []() -> bool {
        for (int idx = 0; idx < MAX_THREADS; ++idx) {
            uint32_t refs = __atomic_load_n(&refcount_slots[idx].count, __ATOMIC_ACQUIRE);
            if (refs > 0) {
                return true;
            }
        }
        return false;
    };

    // Phase 1: short busy-wait.
    const int kSpinAttempts = 100;
    for (int attempt = 0; attempt < kSpinAttempts; ++attempt) {
        if (!anySlotActive()) {
            return;
        }
        spinPause();
    }

    // Phase 2: back off with short sleeps.
    const int kSleepAttempts = 5000;
    struct timespec backoff = {0, 100000};  // 100 microseconds
    for (int attempt = 0; attempt < kSleepAttempts; ++attempt) {
        if (!anySlotActive()) {
            return;
        }
        nanosleep(&backoff, nullptr);
    }

    // Wait budget exhausted with at least one slot still active; the function
    // returns without signalling this to the caller.
}


// Reserved trace id constant. NOTE(review): its consumers are outside this
// view - presumably it stands in for traces that could not be stored; confirm.
static const u64 OVERFLOW_TRACE_ID = 0x7fffffffffffffffULL; // Max 64-bit signed value

Expand Down
Loading
Loading