Skip to content

Commit 661df25

Browse files
authored
gh-149584: Fix excessive overhead in the Tachyon profiler regarding the cache behavior (#149649)
Use exact remote reads for interpreter state, thread state, and interpreter frame structs instead of pulling full remote pages into the profiler page cache. This matches the core change from #149585. The profiler clears the page cache between samples, so live entries are always packed at the front. Track the live count and only clear/search that prefix instead of scanning all 1024 slots on the hot path. Use the frame cache to predict the next thread state and top frame address, then batch interpreter/thread/frame reads with process_vm_readv when profiling a Linux target. Reuse prefetched frame buffers in the frame walker when the prediction is valid. Cache the last FrameInfo tuple per code object/instruction offset, reuse cached thread id objects, and append cached parent frames directly on full frame-cache hits. This cuts Python allocation churn in the steady-state profiler path.
1 parent 06a2830 commit 661df25

12 files changed

Lines changed: 738 additions & 126 deletions

File tree

Lib/profiling/sampling/sample.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,33 @@ def _print_unwinder_stats(self):
327327
print(f" Hits: {code_hits:n} ({ANSIColors.GREEN}{fmt(code_hits_pct)}%{ANSIColors.RESET})")
328328
print(f" Misses: {code_misses:n} ({ANSIColors.RED}{fmt(code_misses_pct)}%{ANSIColors.RESET})")
329329

330+
batched_attempts = stats.get('batched_read_attempts', 0)
331+
batched_successes = stats.get('batched_read_successes', 0)
332+
batched_misses = stats.get('batched_read_misses', 0)
333+
segments_requested = stats.get('batched_read_segments_requested', 0)
334+
segments_completed = stats.get('batched_read_segments_completed', 0)
335+
if batched_attempts > 0:
336+
batched_success_rate = stats.get('batched_read_success_rate', 0.0)
337+
batched_miss_rate = 100.0 - batched_success_rate
338+
segment_completion_rate = stats.get(
339+
'batched_read_segment_completion_rate', 0.0
340+
)
341+
342+
print(f" {ANSIColors.CYAN}Batched Reads:{ANSIColors.RESET}")
343+
print(f" Attempts: {batched_attempts:n}")
344+
print(
345+
f" Successes: {batched_successes:n} "
346+
f"({ANSIColors.GREEN}{fmt(batched_success_rate)}%{ANSIColors.RESET})"
347+
)
348+
print(
349+
f" Misses: {batched_misses:n} "
350+
f"({ANSIColors.RED}{fmt(batched_miss_rate)}%{ANSIColors.RESET})"
351+
)
352+
print(
353+
f" Segments read: {segments_completed:n}/{segments_requested:n} "
354+
f"({ANSIColors.GREEN}{fmt(segment_completion_rate)}%{ANSIColors.RESET})"
355+
)
356+
330357
# Memory operations
331358
memory_reads = stats.get('memory_reads', 0)
332359
memory_bytes = stats.get('memory_bytes_read', 0)

Lib/test/test_external_inspection.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3767,6 +3767,13 @@ def test_get_stats(self):
37673767
"frames_read_from_cache",
37683768
"frames_read_from_memory",
37693769
"frame_cache_hit_rate",
3770+
"batched_read_attempts",
3771+
"batched_read_successes",
3772+
"batched_read_misses",
3773+
"batched_read_segments_requested",
3774+
"batched_read_segments_completed",
3775+
"batched_read_success_rate",
3776+
"batched_read_segment_completion_rate",
37703777
]
37713778
for key in expected_keys:
37723779
self.assertIn(key, stats)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix excessive overhead in the Tachyon profiler when inspecting a remote
2+
process by avoiding repeated remote page-cache scans, batching predicted
3+
remote reads, and reusing cached profiler result objects. Patch by Pablo
4+
Galindo and Maurycy Pawłowski-Wieroński.

Modules/_remote_debugging/_remote_debugging.h

Lines changed: 79 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ extern "C" {
3030
#include "internal/pycore_llist.h" // struct llist_node
3131
#include "internal/pycore_long.h" // _PyLong_GetZero
3232
#include "internal/pycore_pyerrors.h" // _PyErr_FormatFromCause
33+
#include "internal/pycore_pyhash.h" // _Py_HashPointerRaw
3334
#include "internal/pycore_stackref.h" // Py_TAG_BITS
3435
#include "../../Python/remote_debug.h"
3536

@@ -215,6 +216,8 @@ typedef struct {
215216
PyObject *file_name;
216217
int first_lineno;
217218
PyObject *linetable; // bytes
219+
PyObject *last_frame_info;
220+
ptrdiff_t last_addrq;
218221
uintptr_t addr_code_adaptive;
219222
} CachedCodeMetadata;
220223

@@ -224,11 +227,41 @@ typedef struct {
224227

225228
typedef struct {
226229
uint64_t thread_id; // 0 = empty slot
230+
uintptr_t thread_state_addr;
227231
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
228232
Py_ssize_t num_addrs;
233+
PyObject *thread_id_obj; // owned reference, NULL if empty
229234
PyObject *frame_list; // owned reference, NULL if empty
230235
} FrameCacheEntry;
231236

237+
#define INTERPRETER_THREAD_CACHE_SIZE 32
238+
#if (INTERPRETER_THREAD_CACHE_SIZE & (INTERPRETER_THREAD_CACHE_SIZE - 1)) != 0
239+
# error "INTERPRETER_THREAD_CACHE_SIZE must be a power of two"
240+
#endif
241+
242+
// The two per-interpreter L2 caches below are split into per-field tables so
243+
// that a writer rebinding one slot cannot leave stale data in a field owned by
244+
// the other when the slot is reused across interpreters.
245+
typedef struct {
246+
uintptr_t interpreter_addr;
247+
uintptr_t thread_state_addr;
248+
} InterpreterTstateCacheEntry;
249+
typedef struct {
250+
uintptr_t interpreter_addr;
251+
uint64_t code_object_generation;
252+
} InterpreterGenerationCacheEntry;
253+
254+
// Carries already-read thread state and/or frame buffers across helpers so the
255+
// downstream callee can skip a remote read. Address fields are caller-supplied
256+
// inputs; buffer pointers (tstate, frame) are NULL unless a prior batched read
257+
// successfully populated them.
258+
typedef struct {
259+
const char *tstate;
260+
uintptr_t tstate_addr;
261+
const char *frame;
262+
uintptr_t frame_addr;
263+
} RemoteReadPrefetch;
264+
232265
/* Statistics for profiling performance analysis */
233266
typedef struct {
234267
uint64_t total_samples; // Total number of get_stack_trace calls
@@ -242,14 +275,44 @@ typedef struct {
242275
uint64_t code_object_cache_hits; // Code object cache hits
243276
uint64_t code_object_cache_misses; // Code object cache misses
244277
uint64_t stale_cache_invalidations; // Times stale entries were cleared
278+
uint64_t batched_read_attempts; // Batched remote-read attempts
279+
uint64_t batched_read_successes; // Attempts that read all requested segments
280+
uint64_t batched_read_misses; // Attempts that fell back or partially read
281+
uint64_t batched_read_segments_requested; // Segments requested by batched reads
282+
uint64_t batched_read_segments_completed; // Segments completed by batched reads
245283
} UnwinderStats;
246284

285+
#if defined(__GNUC__) || defined(__clang__)
286+
# define REMOTE_DEBUG_UNLIKELY(value) __builtin_expect(!!(value), 0)
287+
#else
288+
# define REMOTE_DEBUG_UNLIKELY(value) (value)
289+
#endif
290+
247291
/* Stats tracking macros - no-op when stats collection is disabled */
248292
#define STATS_INC(unwinder, field) \
249-
do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0)
293+
do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field++; } while(0)
250294

251295
#define STATS_ADD(unwinder, field, val) \
252-
do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0)
296+
do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field += (val); } while(0)
297+
298+
#if HAVE_PROCESS_VM_READV
299+
# define STATS_BATCHED_READ(unwinder, requested, completed) \
300+
do { \
301+
if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) { \
302+
(unwinder)->stats.batched_read_attempts++; \
303+
(unwinder)->stats.batched_read_segments_requested += (uint64_t)(requested); \
304+
(unwinder)->stats.batched_read_segments_completed += (uint64_t)(completed); \
305+
if ((completed) == (requested)) { \
306+
(unwinder)->stats.batched_read_successes++; \
307+
} \
308+
else { \
309+
(unwinder)->stats.batched_read_misses++; \
310+
} \
311+
} \
312+
} while(0)
313+
#else
314+
# define STATS_BATCHED_READ(unwinder, requested, completed) ((void)0)
315+
#endif
253316

254317
typedef struct {
255318
PyTypeObject *RemoteDebugging_Type;
@@ -290,7 +353,6 @@ typedef struct {
290353
struct _Py_AsyncioModuleDebugOffsets async_debug_offsets;
291354
uintptr_t interpreter_addr;
292355
uintptr_t tstate_addr;
293-
uint64_t code_object_generation;
294356
_Py_hashtable_t *code_object_cache;
295357
int debug;
296358
int only_active_thread;
@@ -302,9 +364,17 @@ typedef struct {
302364
int cache_frames;
303365
int collect_stats; // whether to collect statistics
304366
uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale
367+
// L1 single-entry shortcut over cached_tstates[]: most workloads sample one
368+
// interpreter, so check these pairs before hashing into the table below.
369+
uintptr_t cached_tstate_interpreter_addr;
370+
uintptr_t cached_tstate_addr;
371+
uintptr_t cached_generation_interpreter_addr;
372+
uint64_t cached_code_object_generation;
305373
RemoteDebuggingState *cached_state;
306374
FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries
307375
UnwinderStats stats; // statistics for performance analysis
376+
InterpreterTstateCacheEntry cached_tstates[INTERPRETER_THREAD_CACHE_SIZE];
377+
InterpreterGenerationCacheEntry cached_generations[INTERPRETER_THREAD_CACHE_SIZE];
308378
#ifdef Py_GIL_DISABLED
309379
uint32_t tlbc_generation;
310380
_Py_hashtable_t *tlbc_cache;
@@ -361,11 +431,13 @@ typedef struct {
361431
typedef struct {
362432
/* Inputs */
363433
uintptr_t frame_addr; // Starting frame address
434+
uintptr_t thread_state_addr; // Owning thread state address
364435
uintptr_t base_frame_addr; // Sentinel at bottom (for validation)
365436
uintptr_t gc_frame; // GC frame address (0 if not tracking)
366437
uintptr_t last_profiled_frame; // Last cached frame (0 if no cache)
367438
StackChunkList *chunks; // Pre-copied stack chunks
368439
int skip_first_frame; // Skip frame_addr itself (continue from its caller)
440+
RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers
369441

370442
/* Outputs */
371443
PyObject *frame_info; // List to append FrameInfo objects
@@ -548,6 +620,7 @@ extern int process_frame_chain(
548620
extern int frame_cache_init(RemoteUnwinderObject *unwinder);
549621
extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
550622
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
623+
extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
551624
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
552625
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
553626
extern int frame_cache_lookup_and_extend(
@@ -566,6 +639,7 @@ extern int frame_cache_store(
566639
PyObject *frame_list,
567640
const uintptr_t *addrs,
568641
Py_ssize_t num_addrs,
642+
uintptr_t thread_state_addr,
569643
uintptr_t base_frame_addr,
570644
uintptr_t last_frame_visited);
571645

@@ -605,7 +679,8 @@ extern PyObject* unwind_stack_for_thread(
605679
uintptr_t *current_tstate,
606680
uintptr_t gil_holder_tstate,
607681
uintptr_t gc_frame,
608-
uintptr_t main_thread_tstate
682+
uintptr_t main_thread_tstate,
683+
const RemoteReadPrefetch *prefetch
609684
);
610685

611686
/* Thread stopping functions (for blocking mode) */

Modules/_remote_debugging/clinic/module.c.h

Lines changed: 8 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Modules/_remote_debugging/code_objects.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,8 @@ parse_code_object(RemoteUnwinderObject *unwinder,
405405
meta->func_name = func;
406406
meta->file_name = file;
407407
meta->linetable = linetable;
408+
meta->last_frame_info = NULL;
409+
meta->last_addrq = -1;
408410
meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno);
409411
meta->addr_code_adaptive = real_address + (uintptr_t)unwinder->debug_offsets.code_object.co_code_adaptive;
410412

@@ -482,6 +484,12 @@ parse_code_object(RemoteUnwinderObject *unwinder,
482484
addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive;
483485
#endif
484486
; // Empty statement to avoid C23 extension warning
487+
488+
if (!unwinder->opcodes && meta->last_frame_info != NULL && meta->last_addrq == addrq) {
489+
*result = Py_NewRef(meta->last_frame_info);
490+
return 0;
491+
}
492+
485493
LocationInfo info = {0};
486494
bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable),
487495
PyBytes_GET_SIZE(meta->linetable),
@@ -529,6 +537,11 @@ parse_code_object(RemoteUnwinderObject *unwinder,
529537
goto error;
530538
}
531539

540+
if (!unwinder->opcodes) {
541+
Py_XSETREF(meta->last_frame_info, Py_NewRef(tuple));
542+
meta->last_addrq = addrq;
543+
}
544+
532545
*result = tuple;
533546
return 0;
534547

Modules/_remote_debugging/frame_cache.c

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ frame_cache_cleanup(RemoteUnwinderObject *unwinder)
3030
return;
3131
}
3232
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
33+
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
3334
Py_CLEAR(unwinder->frame_cache[i].frame_list);
3435
}
3536
PyMem_Free(unwinder->frame_cache);
@@ -53,6 +54,21 @@ frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
5354
return NULL;
5455
}
5556

57+
FrameCacheEntry *
58+
frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr)
59+
{
60+
if (!unwinder->frame_cache || tstate_addr == 0) {
61+
return NULL;
62+
}
63+
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
64+
if (unwinder->frame_cache[i].thread_state_addr == tstate_addr) {
65+
assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES);
66+
return &unwinder->frame_cache[i];
67+
}
68+
}
69+
return NULL;
70+
}
71+
5672
// Allocate a cache slot for a thread
5773
// Returns NULL if cache is full (graceful degradation)
5874
static FrameCacheEntry *
@@ -127,8 +143,10 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
127143
}
128144
if (!found) {
129145
// Clear this entry
146+
Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
130147
Py_CLEAR(unwinder->frame_cache[i].frame_list);
131148
unwinder->frame_cache[i].thread_id = 0;
149+
unwinder->frame_cache[i].thread_state_addr = 0;
132150
unwinder->frame_cache[i].num_addrs = 0;
133151
STATS_INC(unwinder, stale_cache_invalidations);
134152
}
@@ -216,6 +234,7 @@ frame_cache_store(
216234
PyObject *frame_list,
217235
const uintptr_t *addrs,
218236
Py_ssize_t num_addrs,
237+
uintptr_t thread_state_addr,
219238
uintptr_t base_frame_addr,
220239
uintptr_t last_frame_visited)
221240
{
@@ -257,6 +276,13 @@ frame_cache_store(
257276
return -1;
258277
}
259278
entry->thread_id = thread_id;
279+
entry->thread_state_addr = thread_state_addr;
280+
if (entry->thread_id_obj == NULL) {
281+
entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
282+
if (entry->thread_id_obj == NULL) {
283+
return -1;
284+
}
285+
}
260286
memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
261287
entry->num_addrs = num_addrs;
262288
assert(entry->num_addrs == num_addrs);

0 commit comments

Comments
 (0)