Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
786ed88
chore: add more debug logs
not-matthias May 27, 2026
0df7999
chore: add more logs
not-matthias May 27, 2026
a5687f9
chore(callgrind): log obj_skip HIT when fn matches skip list
not-matthias May 27, 2026
81689f2
chore(callgrind): dump per-fn skip state for python objects at dump time
not-matthias May 27, 2026
a6e53bb
chore(callgrind): log new_fn_node creation with fn and obj name
not-matthias May 27, 2026
48472d5
fix(callgrind): check obj-skip on every BB entry, not only jk_Call
not-matthias May 27, 2026
731dc7d
test(callgrind): add C reproducer for runtime obj-skip cxt==0 leak
not-matthias May 27, 2026
bcfb4c9
fix(callgrind): drop BBCCs whose top context fn is skip-flagged
not-matthias May 27, 2026
ece679d
Revert "fix(callgrind): drop BBCCs whose top context fn is skip-flagged"
not-matthias May 27, 2026
3934032
chore(callgrind): log underflow resets and instrument_state transitions
not-matthias May 27, 2026
f243797
test(callgrind): C reproducer for cascading underflow obj-skip leak
not-matthias May 27, 2026
4a7c1d5
fix(callgrind): aggregate (cxt==0) and underflow leaks under a sentin…
not-matthias May 27, 2026
6fbd12b
Revert "fix(callgrind): aggregate (cxt==0) and underflow leaks under …
not-matthias May 27, 2026
568c255
fix(callgrind): seed shadow call stack from native stack at START
not-matthias May 27, 2026
dfe44cc
test(callgrind): wire underflow + py3.14 reproducers, strengthen post…
not-matthias May 27, 2026
9e62d5b
fix(callgrind/tests): link runtime_obj_skip libs via LDADD for --as-n…
not-matthias May 27, 2026
939d007
fix(callgrind/tests): normalize py314 prereq exit code for dash
not-matthias May 27, 2026
caf4703
fix(callgrind): skip frame 0 when seeding shadow call stack at START
not-matthias May 28, 2026
e31aab9
fix(callgrind): restore cxt on ret for seeded shadow-stack entries
not-matthias May 28, 2026
72e5f60
fix(callgrind): don't seed cxt for anonymous (JIT) frames
not-matthias May 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 57 additions & 17 deletions callgrind/bbcc.c
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,7 @@ static void resize_bbcc_hash(void)
}


static __inline
BBCC** new_recursion(int size)
BBCC** CLG_(new_recursion)(int size)
{
BBCC** bbccs;
int i;
Expand Down Expand Up @@ -313,8 +312,7 @@ BBCC* new_bbcc(BB* bb)
* Recursion level doesn't need to be set as this is not included
* in the hash key: Only BBCCs with rec level 0 are in hashes.
*/
static
void insert_bbcc_into_hash(BBCC* bbcc)
void CLG_(insert_bbcc_into_hash)(BBCC* bbcc)
{
UInt idx;

Expand Down Expand Up @@ -389,10 +387,10 @@ static BBCC* clone_bbcc(BBCC* orig, Context* cxt, Int rec_index)

bbcc->rec_index = 0;
bbcc->cxt = cxt;
bbcc->rec_array = new_recursion(cxt->fn[0]->separate_recursions);
bbcc->rec_array = CLG_(new_recursion)(cxt->fn[0]->separate_recursions);
bbcc->rec_array[0] = bbcc;

insert_bbcc_into_hash(bbcc);
CLG_(insert_bbcc_into_hash)(bbcc);
}
else {
if (CLG_(clo).separate_threads)
Expand Down Expand Up @@ -513,16 +511,21 @@ static void handleUnderflow(BB* bb)
CLG_(current_fn_stack).top--;
CLG_(current_state).cxt = 0;
caller = CLG_(get_fn_node)(bb);
VG_(message)(Vg_UserMsg,
"underflow reset: cxt=0, BB=%#lx, fn-about-to-push='%s' "
"obj='%s' skip=%d\n",
bb_addr(bb), caller->name,
caller->file->obj->name, caller->skip);
CLG_(push_cxt)( caller );

if (!seen_before) {
/* set rec array for source BBCC: this is at rec level 1 */
source_bbcc->rec_array = new_recursion(caller->separate_recursions);
source_bbcc->rec_array = CLG_(new_recursion)(caller->separate_recursions);
source_bbcc->rec_array[0] = source_bbcc;

CLG_ASSERT(source_bbcc->cxt == 0);
source_bbcc->cxt = CLG_(current_state).cxt;
insert_bbcc_into_hash(source_bbcc);
CLG_(insert_bbcc_into_hash)(source_bbcc);
}
CLG_ASSERT(CLG_(current_state).bbcc);

Expand Down Expand Up @@ -725,20 +728,40 @@ void CLG_(setup_bbcc)(BB* bb)
}
}

if (jmpkind == jk_Call) {
/* Check obj-skip on every BB entry, not only jk_Call.
* The interpreter / perf trampoline can enter functions via jk_Jump
* or fall-through; if we only checked on jk_Call, skip would never
* latch for those fns and they'd leak into the dump. */
{
fn_node* node = CLG_(get_fn_node)(bb);
skip = node->skip;
if (!skip && !node->obj_skip_checked){
HChar* obj_name = node->file->obj->name;
// VG_(printf)(" %s\n", obj_name);
Int cmp_results[CLG_(clo).objs_to_skip_count];
for (int i=0; i<CLG_(clo).objs_to_skip_count; i++) {
// VG_(printf)(" %s\n", CLG_(clo).objs_to_skip[i]);
if (VG_(strcmp)(obj_name, CLG_(clo).objs_to_skip[i]) == 0) {
cmp_results[i] = VG_(strcmp)(obj_name, CLG_(clo).objs_to_skip[i]);
if (cmp_results[i] == 0) {
node->skip = True;
skip = True;
break;
}
}
if (skip) {
VG_(message)(Vg_UserMsg,
"obj_skip HIT: fn='%s' obj='%s' jmpkind=%d\n",
node->name, obj_name, (int)jmpkind);
}
if (!skip && CLG_(clo).objs_to_skip_count > 0) {
VG_(message)(Vg_UserMsg,
"obj_skip miss: fn='%s' obj='%s' (len=%lu, %d entries) jmpkind=%d\n",
node->name, obj_name,
VG_(strlen)(obj_name), CLG_(clo).objs_to_skip_count,
(int)jmpkind);
for (int i=0; i<CLG_(clo).objs_to_skip_count; i++)
VG_(message)(Vg_UserMsg, " vs [%d] strcmp=%d (len=%lu) '%s'\n",
i, cmp_results[i],
VG_(strlen)(CLG_(clo).objs_to_skip[i]),
CLG_(clo).objs_to_skip[i]);
Comment on lines +753 to +763
}
node->obj_skip_checked = True;
}
}
Expand Down Expand Up @@ -794,9 +817,26 @@ void CLG_(setup_bbcc)(BB* bb)
}
}

/* Change new context if needed, taking delayed_push into account */
/* Change new context if needed, taking delayed_push into account.
*
* The `cxt == 0` clause used to fire regardless of skip, which meant
* that on the first BB after instrumentation start / call-stack
* underflow, a skipped libpython fn would still be pushed as the new
* top context and appear as its own fn= block in the dump.
*
* Now: if the fn is skip, we still push it (otherwise the assert at
* the end of this block fires when fn_stack is empty), but emit a
* diagnostic so we can measure how often the leak happens. */
if ((delayed_push && !skip) || (CLG_(current_state).cxt == 0)) {
CLG_(push_cxt)(CLG_(get_fn_node)(bb));
fn_node* push_fn = CLG_(get_fn_node)(bb);
if (skip && CLG_(current_state).cxt == 0) {
VG_(message)(Vg_UserMsg,
"push_cxt FORCED for skipped fn (cxt==0): fn='%s' obj='%s' jmpkind=%d delayed_push=%d\n",
push_fn->name,
push_fn->file->obj->name,
(int)jmpkind, (int)delayed_push);
}
CLG_(push_cxt)(push_fn);
}
CLG_ASSERT(CLG_(current_fn_stack).top > CLG_(current_fn_stack).bottom);

Expand All @@ -807,10 +847,10 @@ void CLG_(setup_bbcc)(BB* bb)

bbcc->cxt = CLG_(current_state).cxt;
bbcc->rec_array =
new_recursion((*CLG_(current_fn_stack).top)->separate_recursions);
CLG_(new_recursion)((*CLG_(current_fn_stack).top)->separate_recursions);
bbcc->rec_array[0] = bbcc;

insert_bbcc_into_hash(bbcc);
CLG_(insert_bbcc_into_hash)(bbcc);
}
else {
/* get BBCC with current context */
Expand Down
122 changes: 122 additions & 0 deletions callgrind/callstack.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
*/

#include "global.h"
#include "pub_tool_stacktrace.h"

/*------------------------------------------------------------*/
/*--- Call stack, operations ---*/
Expand Down Expand Up @@ -361,6 +362,19 @@ void CLG_(pop_call_stack)(void)

if (depth == 0) function_left(to_fn);
}
else if (lower_entry->cxt != 0) {
/* Seeded entry from reconstruct_call_stack_from_native: jcc=0
* (skip-style) but push_cxt did run, so cxt was changed. Restore
* it here so the seeded wrapper doesn't stay stuck on top of the
* cxt chain and phantom-parent every subsequent call from the
* real caller. Real skip-entries (push_call_stack(skip=True)
* without a prior push_cxt) have lower_entry->cxt==0 and skip
* this branch — their cxt was never changed, so nothing to
* restore. */
CLG_(current_state).cxt = lower_entry->cxt;
CLG_(current_fn_stack).top =
CLG_(current_fn_stack).bottom + lower_entry->fn_sp;
}

/* To allow for an assertion in push_call_stack() */
lower_entry->cxt = 0;
Expand Down Expand Up @@ -433,3 +447,111 @@ Int CLG_(unwind_call_stack)(Addr sp, Int minpops)
CLG_DEBUG(4,"- unwind_call_stack\n");
return unwind_count;
}


/* Seed callgrind's shadow call stack from the client's native stack so a
 * later `ret` past unseen frames pops cleanly instead of underflowing.
 *
 * Called on the OFF->ON instrumentation transition: the client (e.g.
 * pytest_codspeed) typically reaches CALLGRIND_START_INSTRUMENTATION several
 * libpython frames deep. Without seeding, csp stays at 0 while the real
 * stack is non-empty, and every subsequent ret trips handleUnderflow and
 * leaks the returned-into fn as a top-level fn= block.
 *
 * For each native frame we push a (jcc=0, skip-style) call_entry with the
 * captured SP and a ret_addr computed from the caller's IP. To make obj-skip
 * cost-folding work across the seeded chain, we also synthesize a BBCC for
 * each non-skipped caller frame so push_call_stack-style `nonskipped`
 * attribution has a target on the first non-skip -> skip transition.
 *
 * Parameters:
 *   tid - thread whose native stack is walked with VG_(get_StackTrace).
 *
 * The function is a no-op when the shadow call stack already holds entries
 * (cs->sp != 0) or when no native frame could be captured. At most
 * CLG_RECON_MAX_FRAMES frames are captured; anything beyond that is not
 * seeded.
 *
 * Side effects: pushes entries on CLG_(current_call_stack), may push
 * contexts via CLG_(push_cxt), may latch fn->skip / fn->obj_skip_checked,
 * and may update CLG_(current_state).nonskipped and
 * CLG_(current_state).bbcc.
 */
#define CLG_RECON_MAX_FRAMES 256

void CLG_(reconstruct_call_stack_from_native)(ThreadId tid)
{
    Addr ips[CLG_RECON_MAX_FRAMES];
    Addr sps[CLG_RECON_MAX_FRAMES];
    call_stack* cs = &CLG_(current_call_stack);

    /* Only seed an empty shadow stack; never stack seeded entries on top
     * of entries that were pushed by real instrumentation. */
    if (cs->sp != 0) return;

    /* The trace is bounded by CLG_RECON_MAX_FRAMES, so the count always
     * fits in an Int; convert once so all frame arithmetic is signed. */
    Int n_frames = (Int)VG_(get_StackTrace)(tid, ips, CLG_RECON_MAX_FRAMES,
                                            sps, NULL, 0);
    if (n_frames <= 0) return;

    /* Caller's synthesized BBCC, latched for use as nonskipped target on
     * the first non-skipped -> skipped transition. */
    BBCC* caller_bbcc = 0;

    /* Push bottom-up: oldest caller first, current frame last. */
    for (Int frame = n_frames - 1; frame >= 0; frame--) {
        fn_node* fn = CLG_(get_fn_node_for_addr)(ips[frame]);

        /* Latch obj-skip on first encounter, matching bbcc.c's check. */
        if (!fn->obj_skip_checked) {
            const HChar* obj = fn->file->obj->name;
            for (Int j = 0; j < CLG_(clo).objs_to_skip_count; j++) {
                if (VG_(strcmp)(obj, CLG_(clo).objs_to_skip[j]) == 0) {
                    fn->skip = True;
                    break;
                }
            }
            fn->obj_skip_checked = True;
        }

        /* Make room for the entry we are about to fill; this has to
         * happen before CLG_(push_cxt) runs for this frame. */
        ensure_stack_size(cs->sp + 1);

        /* Remember the nonskipped target in effect *before* this frame:
         * it is what gets stored in the call_entry below. */
        BBCC* prev_nonskipped = CLG_(current_state).nonskipped;

        /* Anonymous JIT frames (V8 trampolines, generated code with no
         * DebugInfo) resolve to fn->name == "???". Don't push_cxt them:
         * they have no useful identity, and because no later RET ever
         * pops them (JS resumes via dispatch, not C-ABI ret), they would
         * stay stuck on top of the cxt chain forever and become a phantom
         * "???" root of every user fn. Push only a bare call_entry so SP
         * unwind still works; ce->cxt stays 0, signaling pop_call_stack
         * to leave cxt alone. */
        Bool anonymous = (VG_(strcmp)(fn->name, "???") == 0);

        if (!anonymous) {
            CLG_(push_cxt)(fn);

            /* Create a BBCC for non-skipped caller frames. ips[frame] for
             * frame>=1 is "last byte of the call instruction" per
             * VG_(get_StackTrace), so it's never a real BB start and the
             * 0-insn synthetic BB cannot collide with later real
             * instrumentation. The top frame's IP can land on a real BB,
             * so we don't synthesize there -- real BBCC will be created
             * naturally on the first instrumented BB. */
            if (frame > 0 && !fn->skip) {
                Bool seen;
                BBCC* b = CLG_(get_bbcc)(CLG_(get_bb)(ips[frame], NULL, &seen));
                if (!seen) {
                    /* First time this synthetic BB is used: give its BBCC
                     * a recursion array and the context that was just
                     * pushed, then register it in the BBCC hash. */
                    b->rec_array = CLG_(new_recursion)(fn->separate_recursions);
                    b->rec_array[0] = b;
                    b->cxt = CLG_(current_state).cxt;
                    CLG_(insert_bbcc_into_hash)(b);
                }
                caller_bbcc = b;
            }

            /* Mirror push_call_stack's nonskipped transition. */
            if (!fn->skip) {
                CLG_(current_state).nonskipped = 0;
            } else if (prev_nonskipped == 0 && caller_bbcc) {
                CLG_(current_state).nonskipped = caller_bbcc;
                if (!caller_bbcc->skipped)
                    CLG_(init_cost_lz)(CLG_(sets).full, &caller_bbcc->skipped);
            }
        }

        /* Fill the seeded call_entry. jcc stays 0 because there is no
         * recorded call arc for a frame entered before instrumentation
         * was switched on. */
        call_entry* ce = &cs->entry[cs->sp];
        ce->jcc = 0;
        ce->sp = sps[frame];
        /* Return address is one byte past the caller's captured IP; the
         * oldest captured frame has no known caller, so it gets 0. */
        ce->ret_addr = (frame + 1 < n_frames) ? ips[frame + 1] + 1 : 0;
        ce->nonskipped = prev_nonskipped;

        /* Advance and clear the next slot's cxt, as the slot above the top
         * of the stack is expected to carry cxt == 0. */
        cs->sp++;
        ensure_stack_size(cs->sp + 1);
        cs->entry[cs->sp].cxt = 0;
    }

    /* Leave the innermost synthesized caller BBCC as the current BBCC, if
     * any frame produced one. */
    if (caller_bbcc) CLG_(current_state).bbcc = caller_bbcc;
}
13 changes: 13 additions & 0 deletions callgrind/clo.c
Original file line number Diff line number Diff line change
Expand Up @@ -402,12 +402,25 @@ void CLG_(update_fn_config)(fn_node* fn)

/* Append a private copy of `obj_name` to CLG_(clo).objs_to_skip, growing
 * the array by one slot.
 *
 * Diagnostics are emitted as user messages: the name being added, the full
 * skip list after the insertion, and the counters reported by
 * CLG_(count_obj_skip_checked_fns).
 */
void CLG_(add_obj_to_skip)(const HChar* obj_name)
{
    Int    n_checked = 0;
    Int    n_skipped = 0;
    Int    idx;
    HChar* name_copy;

    VG_(message)(Vg_UserMsg, "add_obj_to_skip: '%s'\n", obj_name);

    /* Duplicate first, then grow the list and store the copy in the new
     * last slot. */
    name_copy = VG_(strdup)("cl.clo.aots.1", obj_name);
    CLG_(clo).objs_to_skip_count += 1;
    CLG_(clo).objs_to_skip =
        VG_(realloc)("cl.clo.aots.2",
                     CLG_(clo).objs_to_skip,
                     CLG_(clo).objs_to_skip_count * sizeof(HChar*));
    CLG_(clo).objs_to_skip[CLG_(clo).objs_to_skip_count - 1] = name_copy;

    /* Dump the whole list so the log shows its state after every add. */
    VG_(message)(Vg_UserMsg, "obj-skip list now has %d entries:\n",
                 CLG_(clo).objs_to_skip_count);
    idx = 0;
    while (idx < CLG_(clo).objs_to_skip_count) {
        VG_(message)(Vg_UserMsg, " [%d] '%s'\n", idx, CLG_(clo).objs_to_skip[idx]);
        idx++;
    }

    /* Report the obj_skip_checked / skip counters at the time of this
     * addition. */
    CLG_(count_obj_skip_checked_fns)(&n_checked, &n_skipped);
    VG_(message)(Vg_UserMsg,
                 "fn_nodes already obj_skip_checked: %d (of which marked skip: %d)\n",
                 n_checked, n_skipped);
}


Expand Down
2 changes: 2 additions & 0 deletions callgrind/dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -1636,6 +1636,8 @@ void CLG_(dump_profile)(const HChar* trigger, Bool only_current_thread)

print_bbccs(trigger, only_current_thread);

CLG_(dump_python_fn_summary)();

bbs_done = CLG_(stat).bb_executions++;

if (VG_(clo_verbosity) > 1)
Expand Down
Loading
Loading