From 786ed8815fb116a246bb49a96f92edfc47961800 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 10:53:12 +0200 Subject: [PATCH 01/20] chore: add more debug logs --- callgrind/clo.c | 13 +++++++++++++ callgrind/fn.c | 20 ++++++++++++++++++++ callgrind/global.h | 1 + 3 files changed, 34 insertions(+) diff --git a/callgrind/clo.c b/callgrind/clo.c index fc2084869..74d76501d 100644 --- a/callgrind/clo.c +++ b/callgrind/clo.c @@ -402,12 +402,25 @@ void CLG_(update_fn_config)(fn_node* fn) void CLG_(add_obj_to_skip)(const HChar* obj_name) { + VG_(message)(Vg_UserMsg, "add_obj_to_skip: '%s'\n", obj_name); HChar* dup = VG_(strdup)("cl.clo.aots.1", obj_name); CLG_(clo).objs_to_skip_count++; CLG_(clo).objs_to_skip = VG_(realloc)("cl.clo.aots.2", CLG_(clo).objs_to_skip, CLG_(clo).objs_to_skip_count * sizeof(HChar*)); CLG_(clo).objs_to_skip[CLG_(clo).objs_to_skip_count - 1] = dup; + + VG_(message)(Vg_UserMsg, "obj-skip list now has %d entries:\n", + CLG_(clo).objs_to_skip_count); + for (Int i = 0; i < CLG_(clo).objs_to_skip_count; i++) { + VG_(message)(Vg_UserMsg, " [%d] '%s'\n", i, CLG_(clo).objs_to_skip[i]); + } + + Int checked = 0, skipped = 0; + CLG_(count_obj_skip_checked_fns)(&checked, &skipped); + VG_(message)(Vg_UserMsg, + "fn_nodes already obj_skip_checked: %d (of which marked skip: %d)\n", + checked, skipped); } diff --git a/callgrind/fn.c b/callgrind/fn.c index efa5430de..ff29fa34f 100644 --- a/callgrind/fn.c +++ b/callgrind/fn.c @@ -307,6 +307,26 @@ void CLG_(init_obj_table)(void) obj_table[i] = 0; } +void CLG_(count_obj_skip_checked_fns)(Int* checked, Int* skipped) +{ + *checked = 0; + *skipped = 0; + for (Int i = 0; i < N_OBJ_ENTRIES; i++) { + for (obj_node* obj = obj_table[i]; obj != NULL; obj = obj->next) { + for (Int f = 0; f < N_FILE_ENTRIES; f++) { + for (file_node* file = obj->files[f]; file != NULL; file = file->next) { + for (Int n = 0; n < N_FN_ENTRIES; n++) { + for (fn_node* fn = file->fns[n]; fn != NULL; fn = fn->next) { + if 
(fn->obj_skip_checked) (*checked)++; + if (fn->skip) (*skipped)++; + } + } + } + } + } + } +} + #define HASH_CONSTANT 256 static UInt str_hash(const HChar *s, UInt table_size) diff --git a/callgrind/global.h b/callgrind/global.h index c2fda1cce..3afb06f11 100644 --- a/callgrind/global.h +++ b/callgrind/global.h @@ -723,6 +723,7 @@ void CLG_(set_current_fn_array)(fn_array*); UInt* CLG_(get_fn_entry)(Int n); void CLG_(init_obj_table)(void); +void CLG_(count_obj_skip_checked_fns)(Int* checked, Int* skipped); obj_node* CLG_(get_obj_node)(DebugInfo* si); file_node* CLG_(get_file_node)(obj_node*, const HChar *dirname, const HChar* filename); From 0df7999ab72c9c6f2431a068abcbbd0cb7dfb681 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 11:57:17 +0200 Subject: [PATCH 02/20] chore: add more logs --- callgrind/bbcc.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/callgrind/bbcc.c b/callgrind/bbcc.c index 36b2300e1..1f049bfae 100644 --- a/callgrind/bbcc.c +++ b/callgrind/bbcc.c @@ -730,15 +730,25 @@ void CLG_(setup_bbcc)(BB* bb) skip = node->skip; if (!skip && !node->obj_skip_checked){ HChar* obj_name = node->file->obj->name; - // VG_(printf)(" %s\n", obj_name); + Int cmp_results[CLG_(clo).objs_to_skip_count]; for (int i=0; iskip = True; skip = True; - break; } } + if (!skip && CLG_(clo).objs_to_skip_count > 0) { + VG_(message)(Vg_UserMsg, + "obj_skip miss: fn='%s' obj='%s' (len=%lu, %d entries)\n", + node->name, obj_name, + VG_(strlen)(obj_name), CLG_(clo).objs_to_skip_count); + for (int i=0; iobj_skip_checked = True; } } From a5687f9f25cfa09421566cb1f52845caf039683a Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 12:01:14 +0200 Subject: [PATCH 03/20] chore(callgrind): log obj_skip HIT when fn matches skip list --- callgrind/bbcc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/callgrind/bbcc.c b/callgrind/bbcc.c index 1f049bfae..3520409ca 100644 --- a/callgrind/bbcc.c +++ b/callgrind/bbcc.c 
@@ -738,6 +738,11 @@ void CLG_(setup_bbcc)(BB* bb) skip = True; } } + if (skip) { + VG_(message)(Vg_UserMsg, + "obj_skip HIT: fn='%s' obj='%s'\n", + node->name, obj_name); + } if (!skip && CLG_(clo).objs_to_skip_count > 0) { VG_(message)(Vg_UserMsg, "obj_skip miss: fn='%s' obj='%s' (len=%lu, %d entries)\n", From 81689f25e9265e34b01489f06b3ada1dbca8ae50 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 12:01:40 +0200 Subject: [PATCH 04/20] chore(callgrind): dump per-fn skip state for python objects at dump time Adds CLG_(dump_python_fn_summary) and calls it from dump_profile. For each fn_node whose obj name contains 'python', prints the fn name, obj path, skip flag, and obj_skip_checked flag, with a final tally. Lets us see, per dump, whether interpreter fns ever had their skip check run. --- callgrind/dump.c | 2 ++ callgrind/fn.c | 38 ++++++++++++++++++++++++++++++++++++++ callgrind/global.h | 1 + 3 files changed, 41 insertions(+) diff --git a/callgrind/dump.c b/callgrind/dump.c index 3a3164c4b..bffbd4992 100644 --- a/callgrind/dump.c +++ b/callgrind/dump.c @@ -1636,6 +1636,8 @@ void CLG_(dump_profile)(const HChar* trigger, Bool only_current_thread) print_bbccs(trigger, only_current_thread); + CLG_(dump_python_fn_summary)(); + bbs_done = CLG_(stat).bb_executions++; if (VG_(clo_verbosity) > 1) diff --git a/callgrind/fn.c b/callgrind/fn.c index ff29fa34f..3c855028c 100644 --- a/callgrind/fn.c +++ b/callgrind/fn.c @@ -327,6 +327,44 @@ void CLG_(count_obj_skip_checked_fns)(Int* checked, Int* skipped) } } +static Bool name_contains(const HChar* hay, const HChar* needle) +{ + if (!hay || !needle) return False; + Int hlen = VG_(strlen)(hay), nlen = VG_(strlen)(needle); + for (Int i = 0; i + nlen <= hlen; i++) + if (VG_(strncmp)(hay + i, needle, nlen) == 0) return True; + return False; +} + +void CLG_(dump_python_fn_summary)(void) +{ + Int total = 0, checked = 0, skipped = 0; + VG_(message)(Vg_UserMsg, "=== python fn summary (dump) ===\n"); + for (Int i = 
0; i < N_OBJ_ENTRIES; i++) { + for (obj_node* obj = obj_table[i]; obj != NULL; obj = obj->next) { + if (!name_contains(obj->name, "python")) continue; + for (Int f = 0; f < N_FILE_ENTRIES; f++) { + for (file_node* file = obj->files[f]; file != NULL; file = file->next) { + for (Int n = 0; n < N_FN_ENTRIES; n++) { + for (fn_node* fn = file->fns[n]; fn != NULL; fn = fn->next) { + total++; + if (fn->obj_skip_checked) checked++; + if (fn->skip) skipped++; + VG_(message)(Vg_UserMsg, + " fn='%s' obj='%s' skip=%d checked=%d\n", + fn->name, obj->name, + fn->skip, fn->obj_skip_checked); + } + } + } + } + } + } + VG_(message)(Vg_UserMsg, + "=== python fn summary: total=%d checked=%d skipped=%d ===\n", + total, checked, skipped); +} + #define HASH_CONSTANT 256 static UInt str_hash(const HChar *s, UInt table_size) diff --git a/callgrind/global.h b/callgrind/global.h index 3afb06f11..a3cb6990d 100644 --- a/callgrind/global.h +++ b/callgrind/global.h @@ -724,6 +724,7 @@ UInt* CLG_(get_fn_entry)(Int n); void CLG_(init_obj_table)(void); void CLG_(count_obj_skip_checked_fns)(Int* checked, Int* skipped); +void CLG_(dump_python_fn_summary)(void); obj_node* CLG_(get_obj_node)(DebugInfo* si); file_node* CLG_(get_file_node)(obj_node*, const HChar *dirname, const HChar* filename); From a6e53bb8537e8050b0718df37f39d9832ec1f631 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 12:01:45 +0200 Subject: [PATCH 05/20] chore(callgrind): log new_fn_node creation with fn and obj name Fires once per fn_node, before any call edge is built. Lets us see the exact obj path string Callgrind stores for a function (e.g. PyEval) so we can compare it against the obj_skip list independently of any strcmp result. 
--- callgrind/fn.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/callgrind/fn.c b/callgrind/fn.c index 3c855028c..45c1fda4c 100644 --- a/callgrind/fn.c +++ b/callgrind/fn.c @@ -511,6 +511,10 @@ fn_node* new_fn_node(const HChar *fnname, fn->toggle_collect = False; fn->skip = False; fn->obj_skip_checked = False; + + VG_(message)(Vg_UserMsg, "new_fn_node: fn='%s' obj='%s'\n", + fn->name, + (file && file->obj && file->obj->name) ? file->obj->name : "(null)"); fn->pop_on_jump = CLG_(clo).pop_on_jump; fn->is_malloc = False; fn->is_realloc = False; From 48472d5bb14e67e6635b88609df3b1d263afa2b1 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 12:28:40 +0200 Subject: [PATCH 06/20] fix(callgrind): check obj-skip on every BB entry, not only jk_Call The obj-skip check was gated on jmpkind == jk_Call. When a function in a skipped object was entered via jk_Jump or fall-through (interpreter dispatch, tail calls, perf trampoline, JIT), the skip flag never latched and the function leaked into the dump as its own fn= block. Also instrument the cxt==0 forced push_cxt path with a diagnostic line so we can measure the residual leak when a skipped fn is forced into a top-level context after an instrumentation start or call-stack underflow. --- callgrind/bbcc.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/callgrind/bbcc.c b/callgrind/bbcc.c index 3520409ca..0c87f1ff9 100644 --- a/callgrind/bbcc.c +++ b/callgrind/bbcc.c @@ -725,7 +725,11 @@ void CLG_(setup_bbcc)(BB* bb) } } - if (jmpkind == jk_Call) { + /* Check obj-skip on every BB entry, not only jk_Call. + * The interpreter / perf trampoline can enter functions via jk_Jump + * or fall-through; if we only checked on jk_Call, skip would never + * latch for those fns and they'd leak into the dump. 
*/ + { fn_node* node = CLG_(get_fn_node)(bb); skip = node->skip; if (!skip && !node->obj_skip_checked){ @@ -740,14 +744,15 @@ void CLG_(setup_bbcc)(BB* bb) } if (skip) { VG_(message)(Vg_UserMsg, - "obj_skip HIT: fn='%s' obj='%s'\n", - node->name, obj_name); + "obj_skip HIT: fn='%s' obj='%s' jmpkind=%d\n", + node->name, obj_name, (int)jmpkind); } if (!skip && CLG_(clo).objs_to_skip_count > 0) { VG_(message)(Vg_UserMsg, - "obj_skip miss: fn='%s' obj='%s' (len=%lu, %d entries)\n", + "obj_skip miss: fn='%s' obj='%s' (len=%lu, %d entries) jmpkind=%d\n", node->name, obj_name, - VG_(strlen)(obj_name), CLG_(clo).objs_to_skip_count); + VG_(strlen)(obj_name), CLG_(clo).objs_to_skip_count, + (int)jmpkind); for (int i=0; iname, + push_fn->file->obj->name, + (int)jmpkind, (int)delayed_push); + } + CLG_(push_cxt)(push_fn); } CLG_ASSERT(CLG_(current_fn_stack).top > CLG_(current_fn_stack).bottom); From 731dc7db4c1fd10854c7d54276c27be776600d80 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 12:32:42 +0200 Subject: [PATCH 07/20] test(callgrind): add C reproducer for runtime obj-skip cxt==0 leak Minimal C test that triggers the leak path where a function in a skipped object becomes a top-level fn= block in the dump. The trigger: the lib calls CALLGRIND_START_INSTRUMENTATION from inside one of its own functions, so the first BB callgrind sees post-start lives in the skipped object. With cxt == 0 at that point, setup_bbcc force-pushes the skipped fn as the new top context and it leaks into the output. Currently RED: post-check fails because fn=skipme_run appears in the output despite skipme_run's containing .so being on the skip list. 
--- callgrind/tests/Makefile.am | 19 ++++++++++-- callgrind/tests/runtime_obj_skip_c.c | 30 +++++++++++++++++++ callgrind/tests/runtime_obj_skip_c.post.exp | 1 + callgrind/tests/runtime_obj_skip_c.stderr.exp | 0 callgrind/tests/runtime_obj_skip_c.vgtest | 5 ++++ callgrind/tests/runtime_obj_skip_c_lib.c | 27 +++++++++++++++++ 6 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 callgrind/tests/runtime_obj_skip_c.c create mode 100644 callgrind/tests/runtime_obj_skip_c.post.exp create mode 100644 callgrind/tests/runtime_obj_skip_c.stderr.exp create mode 100644 callgrind/tests/runtime_obj_skip_c.vgtest create mode 100644 callgrind/tests/runtime_obj_skip_c_lib.c diff --git a/callgrind/tests/Makefile.am b/callgrind/tests/Makefile.am index b6dc3de89..8fb23b408 100644 --- a/callgrind/tests/Makefile.am +++ b/callgrind/tests/Makefile.am @@ -13,6 +13,8 @@ EXTRA_DIST = \ find_debuginfo.vgtest find_debuginfo.stderr.exp find_debuginfo.post.exp \ runtime_obj_skip_py.vgtest runtime_obj_skip_py.stderr.exp runtime_obj_skip_py.post.exp \ runtime_obj_skip_py.py runtime_obj_skip_py_shim.c \ + runtime_obj_skip_c.vgtest runtime_obj_skip_c.stderr.exp runtime_obj_skip_c.post.exp \ + runtime_obj_skip_c.c runtime_obj_skip_c_lib.c \ bug497723.stderr.exp bug497723.post.exp bug497723.vgtest \ simwork1.vgtest simwork1.stdout.exp simwork1.stderr.exp \ simwork2.vgtest simwork2.stdout.exp simwork2.stderr.exp \ @@ -31,7 +33,7 @@ EXTRA_DIST = \ inline-crossfile.vgtest inline-crossfile.stderr.exp inline-crossfile.stdout.exp inline-crossfile.post.exp \ inline-crossfile-helper1.h inline-crossfile-helper2.h filter_inline -check_PROGRAMS = clreq find_debuginfo simwork threads inline-samefile inline-crossfile +check_PROGRAMS = clreq find_debuginfo simwork threads inline-samefile inline-crossfile runtime_obj_skip_c AM_CFLAGS += $(AM_FLAG_M3264_PRI) AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) @@ -44,10 +46,21 @@ threads_LDADD = -lpthread # Shim loaded by runtime_obj_skip_py.py via ctypes. 
Built unconditionally; # the test's prereq skips it if the .so is missing. -check_DATA = runtime_obj_skip_py_shim.so +check_DATA = runtime_obj_skip_py_shim.so runtime_obj_skip_c_lib.so runtime_obj_skip_py_shim.so: runtime_obj_skip_py_shim.c $(CC) -shared -fPIC -O2 -I$(top_srcdir) -I$(top_srcdir)/include \ $< -o $@ -CLEANFILES = runtime_obj_skip_py_shim.so +# Shared lib for the runtime_obj_skip_c test. Lives in a separate ELF +# so the main binary can register its path for runtime obj-skip. +runtime_obj_skip_c_lib.so: runtime_obj_skip_c_lib.c + $(CC) -shared -fPIC -O2 -I$(top_srcdir) -I$(top_srcdir)/include \ + $< -o $@ + +runtime_obj_skip_c_LDADD = -ldl +runtime_obj_skip_c_LDFLAGS = $(AM_LDFLAGS) -L. -l:runtime_obj_skip_c_lib.so \ + -Wl,-rpath,'$$ORIGIN' +runtime_obj_skip_c_DEPENDENCIES = runtime_obj_skip_c_lib.so + +CLEANFILES = runtime_obj_skip_py_shim.so runtime_obj_skip_c_lib.so diff --git a/callgrind/tests/runtime_obj_skip_c.c b/callgrind/tests/runtime_obj_skip_c.c new file mode 100644 index 000000000..9e5e650bc --- /dev/null +++ b/callgrind/tests/runtime_obj_skip_c.c @@ -0,0 +1,30 @@ +/* Minimal C reproducer for the runtime obj-skip leak: a fn from a + * skipped object ends up as a top-level fn= block in the callgrind + * output when it is the first BB instrumented after START. + * + * Strategy: register the lib for skip, then call into the lib BEFORE + * starting instrumentation. The lib itself calls + * CALLGRIND_START_INSTRUMENTATION mid-function, so the first BB + * processed by callgrind lives in the skipped object — which trips + * the (cxt == 0) push_cxt path that ignores the skip flag. 
 */ + +#define _GNU_SOURCE +#include <dlfcn.h> +#include <stdio.h> +#include "../callgrind.h" + +extern void skipme_run(int n); + +int main(void) +{ + Dl_info info; + if (dladdr((void*)skipme_run, &info) == 0 || !info.dli_fname) { + fprintf(stderr, "dladdr failed\n"); + return 1; + } + CALLGRIND_ADD_OBJ_SKIP(info.dli_fname); + + skipme_run(1000); + + return 0; +} diff --git a/callgrind/tests/runtime_obj_skip_c.post.exp b/callgrind/tests/runtime_obj_skip_c.post.exp new file mode 100644 index 000000000..d86bac9de --- /dev/null +++ b/callgrind/tests/runtime_obj_skip_c.post.exp @@ -0,0 +1 @@ +OK diff --git a/callgrind/tests/runtime_obj_skip_c.stderr.exp b/callgrind/tests/runtime_obj_skip_c.stderr.exp new file mode 100644 index 000000000..e69de29bb diff --git a/callgrind/tests/runtime_obj_skip_c.vgtest b/callgrind/tests/runtime_obj_skip_c.vgtest new file mode 100644 index 000000000..6b250787f --- /dev/null +++ b/callgrind/tests/runtime_obj_skip_c.vgtest @@ -0,0 +1,5 @@ +prereq: test -f runtime_obj_skip_c && test -f runtime_obj_skip_c_lib.so +prog-asis: ./runtime_obj_skip_c +vgopts: --instr-atstart=no --compress-strings=no --callgrind-out-file=callgrind.out.runtime_obj_skip_c +post: sh -c 'if grep -q "^fn=skipme_func" callgrind.out.runtime_obj_skip_c; then echo "FAIL: skipme_func leaked into top-level fn= block"; else echo OK; fi' +cleanup: rm -f callgrind.out.runtime_obj_skip_c diff --git a/callgrind/tests/runtime_obj_skip_c_lib.c b/callgrind/tests/runtime_obj_skip_c_lib.c new file mode 100644 index 000000000..83f9eebb1 --- /dev/null +++ b/callgrind/tests/runtime_obj_skip_c_lib.c @@ -0,0 +1,27 @@ +/* Library that lives in a separate ELF object so the main binary + * can register its path for runtime obj-skip. + * + * skipme_run() flips instrumentation on from *inside* the skipped + * object, then calls skipme_func.
This is the trigger for the + * `current_state.cxt == 0` push path in setup_bbcc: the very first + * BB after instrumentation start lives in a skipped object, so the + * (cxt==0) clause force-pushes a skipped fn as the new top context + * and it leaks into the dump as a top-level fn= block. */ + +#include "../callgrind.h" + +volatile long sink; + +__attribute__((noinline)) +void skipme_func(int n) +{ + for (int i = 0; i < n; i++) sink += i; +} + +__attribute__((noinline)) +void skipme_run(int n) +{ + CALLGRIND_START_INSTRUMENTATION; + skipme_func(n); + CALLGRIND_STOP_INSTRUMENTATION; +} From bcfb4c9a44d5f94bad191d85461a08cbfbb3e4e6 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 12:39:49 +0200 Subject: [PATCH 08/20] fix(callgrind): drop BBCCs whose top context fn is skip-flagged MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the (cxt == 0) clause in setup_bbcc force-pushes a skipped fn — e.g. the first BB after CALLGRIND_START_INSTRUMENTATION lives in a skipped object — the BBCC ends up with cxt->fn[0]->skip == True. Without filtering, those BBCCs emit a top-level fn= block and the skipped fn leaks into the dump. Filter them out at dump time in print_bbccs_of_thread, right before print_fn_pos would emit the ob=/fl=/fn= header. The call edges from non-skipped callers into skipped fns (cfn=) are unaffected because they're emitted from the caller's BBCC, whose context is not skipped. Also broadens the runtime_obj_skip_c post-check to grep for any fn=skipme*, since the actual leaked fn in the repro is skipme_run (the one calling START_INSTRUMENTATION), not skipme_func. 
--- callgrind/dump.c | 12 +++++++++++- callgrind/tests/runtime_obj_skip_c.vgtest | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/callgrind/dump.c b/callgrind/dump.c index bffbd4992..5780aeae0 100644 --- a/callgrind/dump.c +++ b/callgrind/dump.c @@ -1553,7 +1553,17 @@ static void print_bbccs_of_thread(thread_info* ti) } if (*p == 0) break; - + + /* Don't emit BBCCs whose top context fn is flagged for obj-skip. + * This happens when the (cxt == 0) clause in setup_bbcc force- + * pushes a skipped fn (first BB after instrumentation start that + * landed in a skipped object). Without this filter the skipped fn + * leaks into the dump as a top-level fn= block. */ + if ((*p)->cxt->fn[0]->skip) { + p++; + continue; + } + if (print_fn_pos(print_fp, &lastFnPos, *p)) { /* new function */ diff --git a/callgrind/tests/runtime_obj_skip_c.vgtest b/callgrind/tests/runtime_obj_skip_c.vgtest index 6b250787f..2895e5141 100644 --- a/callgrind/tests/runtime_obj_skip_c.vgtest +++ b/callgrind/tests/runtime_obj_skip_c.vgtest @@ -1,5 +1,5 @@ prereq: test -f runtime_obj_skip_c && test -f runtime_obj_skip_c_lib.so prog-asis: ./runtime_obj_skip_c vgopts: --instr-atstart=no --compress-strings=no --callgrind-out-file=callgrind.out.runtime_obj_skip_c -post: sh -c 'if grep -q "^fn=skipme_func" callgrind.out.runtime_obj_skip_c; then echo "FAIL: skipme_func leaked into top-level fn= block"; else echo OK; fi' +post: sh -c 'if grep -q "^fn=skipme" callgrind.out.runtime_obj_skip_c; then echo "FAIL: skipme_* leaked into top-level fn= block"; else echo OK; fi' cleanup: rm -f callgrind.out.runtime_obj_skip_c From ece679d45739d01a2dc651fae75bc6b0d63dd8ad Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 12:45:39 +0200 Subject: [PATCH 09/20] Revert "fix(callgrind): drop BBCCs whose top context fn is skip-flagged" This reverts commit bcfb4c9a44d5f94bad191d85461a08cbfbb3e4e6. 
--- callgrind/dump.c | 12 +----------- callgrind/tests/runtime_obj_skip_c.vgtest | 2 +- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/callgrind/dump.c b/callgrind/dump.c index 5780aeae0..bffbd4992 100644 --- a/callgrind/dump.c +++ b/callgrind/dump.c @@ -1553,17 +1553,7 @@ static void print_bbccs_of_thread(thread_info* ti) } if (*p == 0) break; - - /* Don't emit BBCCs whose top context fn is flagged for obj-skip. - * This happens when the (cxt == 0) clause in setup_bbcc force- - * pushes a skipped fn (first BB after instrumentation start that - * landed in a skipped object). Without this filter the skipped fn - * leaks into the dump as a top-level fn= block. */ - if ((*p)->cxt->fn[0]->skip) { - p++; - continue; - } - + if (print_fn_pos(print_fp, &lastFnPos, *p)) { /* new function */ diff --git a/callgrind/tests/runtime_obj_skip_c.vgtest b/callgrind/tests/runtime_obj_skip_c.vgtest index 2895e5141..6b250787f 100644 --- a/callgrind/tests/runtime_obj_skip_c.vgtest +++ b/callgrind/tests/runtime_obj_skip_c.vgtest @@ -1,5 +1,5 @@ prereq: test -f runtime_obj_skip_c && test -f runtime_obj_skip_c_lib.so prog-asis: ./runtime_obj_skip_c vgopts: --instr-atstart=no --compress-strings=no --callgrind-out-file=callgrind.out.runtime_obj_skip_c -post: sh -c 'if grep -q "^fn=skipme" callgrind.out.runtime_obj_skip_c; then echo "FAIL: skipme_* leaked into top-level fn= block"; else echo OK; fi' +post: sh -c 'if grep -q "^fn=skipme_func" callgrind.out.runtime_obj_skip_c; then echo "FAIL: skipme_func leaked into top-level fn= block"; else echo OK; fi' cleanup: rm -f callgrind.out.runtime_obj_skip_c From 39340326c00f20920a7a59c0d5297aae4bc46001 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 12:46:32 +0200 Subject: [PATCH 10/20] chore(callgrind): log underflow resets and instrument_state transitions Two diagnostic logs to attribute every (cxt==0) push_cxt event to its root cause: 1. 
handleUnderflow: prints the BB address, fn name, obj, and skip flag for the fn that's about to be force-pushed after a call-stack underflow. Tells us whether leaks are driven by signals, longjmp, or shadow-stack quality. 2. set_instrument_state: logs every ON/OFF transition with the cxt and fn-stack depth at that moment. Distinguishes 'fresh start with no cxt' from 'stop/start cycle that preserved cxt'. Together with the existing 'push_cxt FORCED' log, these let us bucket the leaked frames in real Python runs by their root cause. --- callgrind/bbcc.c | 5 +++++ callgrind/main.c | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/callgrind/bbcc.c b/callgrind/bbcc.c index 0c87f1ff9..da495ee87 100644 --- a/callgrind/bbcc.c +++ b/callgrind/bbcc.c @@ -513,6 +513,11 @@ static void handleUnderflow(BB* bb) CLG_(current_fn_stack).top--; CLG_(current_state).cxt = 0; caller = CLG_(get_fn_node)(bb); + VG_(message)(Vg_UserMsg, + "underflow reset: cxt=0, BB=%#lx, fn-about-to-push='%s' " + "obj='%s' skip=%d\n", + bb_addr(bb), caller->name, + caller->file->obj->name, caller->skip); CLG_(push_cxt)( caller ); if (!seen_before) { diff --git a/callgrind/main.c b/callgrind/main.c index 3761c1448..ee8aa2102 100644 --- a/callgrind/main.c +++ b/callgrind/main.c @@ -1453,6 +1453,13 @@ void CLG_(set_instrument_state)(const HChar* reason, Bool state) reason, state ? "ON" : "OFF"); return; } + VG_(message)(Vg_UserMsg, + "instrument_state -> %s (reason='%s', cxt=%p, " + "fn_stack_depth=%ld)\n", + state ? "ON" : "OFF", reason, + (void*)CLG_(current_state).cxt, + (long)(CLG_(current_fn_stack).top - + CLG_(current_fn_stack).bottom)); CLG_(instrument_state) = state; CLG_DEBUG(2, "%s: Switching instrumentation %s ...\n", reason, state ? 
"ON" : "OFF"); From f2437972363d0a8749d59368e5b7a3f33d44f3bd Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 13:00:19 +0200 Subject: [PATCH 11/20] test(callgrind): C reproducer for cascading underflow obj-skip leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Triggers the call-stack-underflow leak channel observed in the Python case (28 underflow events / run, almost all libpython interpreter frames). Mechanism: - Lib runs recursive skipme_recurse(N) with instrumentation OFF, so callgrind never sees the calls and its csp stays at 0. - At the deepest frame (n==0), CALLGRIND_START_INSTRUMENTATION fires. - Each RET on the way back hits csp == 0, triggers handleUnderflow, resets cxt to 0, and force-pushes the fn we're returning into. - Because that fn is in the skipped lib, it leaks as a top-level fn= block in the dump — N times for an N-deep recursion. With depth=5 the diagnostic logs show 1 (cxt==0) push + 6 underflow resets (5x skipme_recurse + 1x skipme_run), and the .out has fn=skipme_run and fn=skipme_recurse as top-level blocks. --- callgrind/tests/runtime_obj_skip_underflow.c | 22 +++++++++++ .../tests/runtime_obj_skip_underflow_lib.c | 37 +++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 callgrind/tests/runtime_obj_skip_underflow.c create mode 100644 callgrind/tests/runtime_obj_skip_underflow_lib.c diff --git a/callgrind/tests/runtime_obj_skip_underflow.c b/callgrind/tests/runtime_obj_skip_underflow.c new file mode 100644 index 000000000..ffc1e6a6f --- /dev/null +++ b/callgrind/tests/runtime_obj_skip_underflow.c @@ -0,0 +1,22 @@ +/* Driver for the underflow-channel obj-skip leak reproducer. 
 */ + +#define _GNU_SOURCE +#include <dlfcn.h> +#include <stdio.h> +#include "../callgrind.h" + +extern void skipme_run(int depth); + +int main(void) +{ + Dl_info info; + if (dladdr((void*)skipme_run, &info) == 0 || !info.dli_fname) { + fprintf(stderr, "dladdr failed\n"); + return 1; + } + CALLGRIND_ADD_OBJ_SKIP(info.dli_fname); + + skipme_run(5); + + return 0; +} diff --git a/callgrind/tests/runtime_obj_skip_underflow_lib.c b/callgrind/tests/runtime_obj_skip_underflow_lib.c new file mode 100644 index 000000000..abaf58cc6 --- /dev/null +++ b/callgrind/tests/runtime_obj_skip_underflow_lib.c @@ -0,0 +1,37 @@ +/* Library that triggers the call-stack-underflow leak channel in + * callgrind obj-skip. + * + * Setup: recursive function in the skipped lib. Main calls in with + * instrumentation OFF, so callgrind's call stack is never populated. + * At the deepest frame, instrumentation is flipped ON. Each RET on + * the way back then sees csp == 0, hits handleUnderflow, resets + * cxt = 0, and force-pushes the current fn (which lives in the + * skipped lib) as the new top context — leaking N times for an + * N-deep stack. + * + * This is the same shape as Python 3.14's interpreter dispatch + * leaks: deep recursive eval-loop frames where instrumentation was + * started somewhere down the stack and every return pops past an + * empty callgrind stack.
*/ + +#include "../callgrind.h" + +volatile long sink; + +__attribute__((noinline)) +void skipme_recurse(int n) +{ + if (n == 0) { + CALLGRIND_START_INSTRUMENTATION; + return; + } + skipme_recurse(n - 1); + sink += n; +} + +__attribute__((noinline)) +void skipme_run(int depth) +{ + skipme_recurse(depth); + CALLGRIND_STOP_INSTRUMENTATION; +} From 4a7c1d5fecbd32f0995d8b4e5400d6b8ef48950d Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 13:14:50 +0200 Subject: [PATCH 12/20] fix(callgrind): aggregate (cxt==0) and underflow leaks under a sentinel cxt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When setup_bbcc's (cxt==0) clause or handleUnderflow would force-push a skipped fn into the current context, push a synthetic (skipped) fn instead. The skipped fn keeps its costs (routed normally through the sentinel cxt) but never surfaces as its own fn= block — the dump shows a single ob=??? fl=(callgrind-internal) fn=(skipped) block aggregating all leaked frames. The sentinel itself has skip=False so the (cxt==0 && skip) substitution doesn't recurse on it. Created lazily on first need via a singleton in fn.c, attached to the anonymous '???' obj. Verified against both C reproducers (runtime_obj_skip_c and runtime_obj_skip_underflow): no skipme_* fn= blocks appear, totals are preserved. Verified against a non-skipped-attribution test that main / do_main_work still emit normally; the sentinel only engages on the leak paths. 
--- callgrind/bbcc.c | 10 ++++++++++ callgrind/fn.c | 21 +++++++++++++++++++++ callgrind/global.h | 1 + 3 files changed, 32 insertions(+) diff --git a/callgrind/bbcc.c b/callgrind/bbcc.c index da495ee87..0ae5ac36b 100644 --- a/callgrind/bbcc.c +++ b/callgrind/bbcc.c @@ -518,6 +518,13 @@ static void handleUnderflow(BB* bb) "obj='%s' skip=%d\n", bb_addr(bb), caller->name, caller->file->obj->name, caller->skip); + + /* A (sentinel): if the fn we'd return into is itself skipped, push + * the (skipped) sentinel instead so the skipped fn doesn't surface + * as its own fn= block in the dump. */ + if (caller->skip) + caller = CLG_(get_skipped_sentinel)(); + CLG_(push_cxt)( caller ); if (!seen_before) { @@ -837,6 +844,9 @@ void CLG_(setup_bbcc)(BB* bb) push_fn->name, push_fn->file->obj->name, (int)jmpkind, (int)delayed_push); + /* A (sentinel): substitute the (skipped) sentinel so the + * skipped fn doesn't appear as its own fn= block in the dump. */ + push_fn = CLG_(get_skipped_sentinel)(); } CLG_(push_cxt)(push_fn); } diff --git a/callgrind/fn.c b/callgrind/fn.c index 45c1fda4c..dd2239fa1 100644 --- a/callgrind/fn.c +++ b/callgrind/fn.c @@ -336,6 +336,27 @@ static Bool name_contains(const HChar* hay, const HChar* needle) return False; } +static fn_node* new_fn_node(const HChar *fnname, + file_node* file, fn_node* next); + +/* Singleton sentinel fn_node used as a placeholder cxt when we'd + * otherwise be forced to push a skipped fn into an empty (cxt == 0) + * context. Keeping skip == False on the sentinel itself is crucial: + * the (cxt == 0 && skip) check that would push it must NOT recurse + * on the sentinel. */ +static fn_node* skipped_sentinel = NULL; + +fn_node* CLG_(get_skipped_sentinel)(void) +{ + if (skipped_sentinel) return skipped_sentinel; + + obj_node* obj = CLG_(get_obj_node)(NULL); /* anonymous "???" 
obj */ + file_node* file = CLG_(get_file_node)(obj, "", "(callgrind-internal)"); + skipped_sentinel = new_fn_node("(skipped)", file, NULL); + skipped_sentinel->skip = False; + return skipped_sentinel; +} + void CLG_(dump_python_fn_summary)(void) { Int total = 0, checked = 0, skipped = 0; diff --git a/callgrind/global.h b/callgrind/global.h index a3cb6990d..a15c9e07e 100644 --- a/callgrind/global.h +++ b/callgrind/global.h @@ -725,6 +725,7 @@ UInt* CLG_(get_fn_entry)(Int n); void CLG_(init_obj_table)(void); void CLG_(count_obj_skip_checked_fns)(Int* checked, Int* skipped); void CLG_(dump_python_fn_summary)(void); +fn_node* CLG_(get_skipped_sentinel)(void); obj_node* CLG_(get_obj_node)(DebugInfo* si); file_node* CLG_(get_file_node)(obj_node*, const HChar *dirname, const HChar* filename); From 6fbd12bc134d2b2303d5e169b689d37276e9c915 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 15:58:26 +0200 Subject: [PATCH 13/20] Revert "fix(callgrind): aggregate (cxt==0) and underflow leaks under a sentinel cxt" This reverts commit 4a7c1d5fecbd32f0995d8b4e5400d6b8ef48950d. --- callgrind/bbcc.c | 10 ---------- callgrind/fn.c | 21 --------------------- callgrind/global.h | 1 - 3 files changed, 32 deletions(-) diff --git a/callgrind/bbcc.c b/callgrind/bbcc.c index 0ae5ac36b..da495ee87 100644 --- a/callgrind/bbcc.c +++ b/callgrind/bbcc.c @@ -518,13 +518,6 @@ static void handleUnderflow(BB* bb) "obj='%s' skip=%d\n", bb_addr(bb), caller->name, caller->file->obj->name, caller->skip); - - /* A (sentinel): if the fn we'd return into is itself skipped, push - * the (skipped) sentinel instead so the skipped fn doesn't surface - * as its own fn= block in the dump. 
*/ - if (caller->skip) - caller = CLG_(get_skipped_sentinel)(); - CLG_(push_cxt)( caller ); if (!seen_before) { @@ -844,9 +837,6 @@ void CLG_(setup_bbcc)(BB* bb) push_fn->name, push_fn->file->obj->name, (int)jmpkind, (int)delayed_push); - /* A (sentinel): substitute the (skipped) sentinel so the - * skipped fn doesn't appear as its own fn= block in the dump. */ - push_fn = CLG_(get_skipped_sentinel)(); } CLG_(push_cxt)(push_fn); } diff --git a/callgrind/fn.c b/callgrind/fn.c index dd2239fa1..45c1fda4c 100644 --- a/callgrind/fn.c +++ b/callgrind/fn.c @@ -336,27 +336,6 @@ static Bool name_contains(const HChar* hay, const HChar* needle) return False; } -static fn_node* new_fn_node(const HChar *fnname, - file_node* file, fn_node* next); - -/* Singleton sentinel fn_node used as a placeholder cxt when we'd - * otherwise be forced to push a skipped fn into an empty (cxt == 0) - * context. Keeping skip == False on the sentinel itself is crucial: - * the (cxt == 0 && skip) check that would push it must NOT recurse - * on the sentinel. */ -static fn_node* skipped_sentinel = NULL; - -fn_node* CLG_(get_skipped_sentinel)(void) -{ - if (skipped_sentinel) return skipped_sentinel; - - obj_node* obj = CLG_(get_obj_node)(NULL); /* anonymous "???" 
obj */ - file_node* file = CLG_(get_file_node)(obj, "", "(callgrind-internal)"); - skipped_sentinel = new_fn_node("(skipped)", file, NULL); - skipped_sentinel->skip = False; - return skipped_sentinel; -} - void CLG_(dump_python_fn_summary)(void) { Int total = 0, checked = 0, skipped = 0; diff --git a/callgrind/global.h b/callgrind/global.h index a15c9e07e..a3cb6990d 100644 --- a/callgrind/global.h +++ b/callgrind/global.h @@ -725,7 +725,6 @@ UInt* CLG_(get_fn_entry)(Int n); void CLG_(init_obj_table)(void); void CLG_(count_obj_skip_checked_fns)(Int* checked, Int* skipped); void CLG_(dump_python_fn_summary)(void); -fn_node* CLG_(get_skipped_sentinel)(void); obj_node* CLG_(get_obj_node)(DebugInfo* si); file_node* CLG_(get_file_node)(obj_node*, const HChar *dirname, const HChar* filename); From 568c255e118a8a3653f1f96c519e10d4cf41bac5 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 17:56:27 +0200 Subject: [PATCH 14/20] fix(callgrind): seed shadow call stack from native stack at START MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CALLGRIND_START_INSTRUMENTATION fires mid-stack (typical for pytest-codspeed: Python reaches the macro several libpython frames deep), callgrind's csp is 0 but the real stack is non-empty. Every subsequent client `ret` peels a frame callgrind never saw the matching `call` for, trips handleUnderflow, and leaks the returned-into fn as a top-level fn= block — polluting the flamegraph with phantom roots like _PyEval_EvalFrameDefault, PyObject_Vectorcall, etc. Reconstruct the shadow stack from VG_(get_StackTrace) on the OFF->ON transition. For each native frame: push a (jcc=0, skip-style) call_entry with the captured SP and ret_addr=caller_ip+1. For non-skipped caller frames, synthesize a zero-instruction BBCC tagged with that frame's cxt so obj-skip's `nonskipped` mechanism has a target to fold skipped-subtree costs into. 
Anonymous IPs (Python JIT regions, CRT glue) resolve via the existing `???` obj path in get_obj_node, so no special trimming is needed. Un-static new_recursion and insert_bbcc_into_hash (renamed to CLG_-prefixed) so callstack.c can reuse them. --- callgrind/bbcc.c | 18 ++++---- callgrind/callstack.c | 95 +++++++++++++++++++++++++++++++++++++++++++ callgrind/fn.c | 15 +++++++ callgrind/global.h | 4 ++ callgrind/main.c | 1 + 5 files changed, 123 insertions(+), 10 deletions(-) diff --git a/callgrind/bbcc.c b/callgrind/bbcc.c index da495ee87..c370cd4cb 100644 --- a/callgrind/bbcc.c +++ b/callgrind/bbcc.c @@ -240,8 +240,7 @@ static void resize_bbcc_hash(void) } -static __inline -BBCC** new_recursion(int size) +BBCC** CLG_(new_recursion)(int size) { BBCC** bbccs; int i; @@ -313,8 +312,7 @@ BBCC* new_bbcc(BB* bb) * Recursion level doesn't need to be set as this is not included * in the hash key: Only BBCCs with rec level 0 are in hashes. */ -static -void insert_bbcc_into_hash(BBCC* bbcc) +void CLG_(insert_bbcc_into_hash)(BBCC* bbcc) { UInt idx; @@ -389,10 +387,10 @@ static BBCC* clone_bbcc(BBCC* orig, Context* cxt, Int rec_index) bbcc->rec_index = 0; bbcc->cxt = cxt; - bbcc->rec_array = new_recursion(cxt->fn[0]->separate_recursions); + bbcc->rec_array = CLG_(new_recursion)(cxt->fn[0]->separate_recursions); bbcc->rec_array[0] = bbcc; - insert_bbcc_into_hash(bbcc); + CLG_(insert_bbcc_into_hash)(bbcc); } else { if (CLG_(clo).separate_threads) @@ -522,12 +520,12 @@ static void handleUnderflow(BB* bb) if (!seen_before) { /* set rec array for source BBCC: this is at rec level 1 */ - source_bbcc->rec_array = new_recursion(caller->separate_recursions); + source_bbcc->rec_array = CLG_(new_recursion)(caller->separate_recursions); source_bbcc->rec_array[0] = source_bbcc; CLG_ASSERT(source_bbcc->cxt == 0); source_bbcc->cxt = CLG_(current_state).cxt; - insert_bbcc_into_hash(source_bbcc); + CLG_(insert_bbcc_into_hash)(source_bbcc); } CLG_ASSERT(CLG_(current_state).bbcc); @@ -849,10 
+847,10 @@ void CLG_(setup_bbcc)(BB* bb) bbcc->cxt = CLG_(current_state).cxt; bbcc->rec_array = - new_recursion((*CLG_(current_fn_stack).top)->separate_recursions); + CLG_(new_recursion)((*CLG_(current_fn_stack).top)->separate_recursions); bbcc->rec_array[0] = bbcc; - insert_bbcc_into_hash(bbcc); + CLG_(insert_bbcc_into_hash)(bbcc); } else { /* get BBCC with current context */ diff --git a/callgrind/callstack.c b/callgrind/callstack.c index 20669e9cd..aa81aaaf4 100644 --- a/callgrind/callstack.c +++ b/callgrind/callstack.c @@ -25,6 +25,7 @@ */ #include "global.h" +#include "pub_tool_stacktrace.h" /*------------------------------------------------------------*/ /*--- Call stack, operations ---*/ @@ -433,3 +434,97 @@ Int CLG_(unwind_call_stack)(Addr sp, Int minpops) CLG_DEBUG(4,"- unwind_call_stack\n"); return unwind_count; } + + +/* Seed callgrind's shadow call stack from the client's native stack so a + * later `ret` past unseen frames pops cleanly instead of underflowing. + * + * Called on the OFF->ON instrumentation transition: the client (e.g. + * pytest_codspeed) typically reaches CALLGRIND_START_INSTRUMENTATION several + * libpython frames deep. Without seeding, csp stays at 0 while the real + * stack is non-empty, and every subsequent ret trips handleUnderflow and + * leaks the returned-into fn as a top-level fn= block. + * + * For each native frame we push a (jcc=0, skip-style) call_entry with the + * captured SP and a ret_addr computed from the caller's IP. To make obj-skip + * cost-folding work across the seeded chain, we also synthesize a BBCC for + * each non-skipped caller frame so push_call_stack-style `nonskipped` + * attribution has a target on the first non-skip -> skip transition. 
+ */ +#define CLG_RECON_MAX_FRAMES 256 + +void CLG_(reconstruct_call_stack_from_native)(ThreadId tid) +{ + Addr ips[CLG_RECON_MAX_FRAMES]; + Addr sps[CLG_RECON_MAX_FRAMES]; + call_stack* cs = &CLG_(current_call_stack); + + if (cs->sp != 0) return; + + UInt n = VG_(get_StackTrace)(tid, ips, CLG_RECON_MAX_FRAMES, sps, NULL, 0); + if (n == 0) return; + + /* Caller's synthesized BBCC, latched for use as nonskipped target on + * the first non-skipped -> skipped transition. */ + BBCC* caller_bbcc = 0; + + /* Push bottom-up: oldest caller first, current frame last. */ + for (Int frame = n - 1; frame >= 0; frame--) { + fn_node* fn = CLG_(get_fn_node_for_addr)(ips[frame]); + + /* Latch obj-skip on first encounter, matching bbcc.c's check. */ + if (!fn->obj_skip_checked) { + const HChar* obj = fn->file->obj->name; + for (Int j = 0; j < CLG_(clo).objs_to_skip_count; j++) { + if (VG_(strcmp)(obj, CLG_(clo).objs_to_skip[j]) == 0) { + fn->skip = True; + break; + } + } + fn->obj_skip_checked = True; + } + + ensure_stack_size(cs->sp + 1); + BBCC* prev_nonskipped = CLG_(current_state).nonskipped; + CLG_(push_cxt)(fn); + + /* Create a BBCC for non-skipped caller frames. ips[frame] for + * frame>=1 is "last byte of the call instruction" per VG_(get_StackTrace), + * so it's never a real BB start and the 0-insn synthetic BB cannot + * collide with later real instrumentation. The top frame's IP can + * land on a real BB, so we don't synthesize there — real BBCC will + * be created naturally on the first instrumented BB. */ + if (frame > 0 && !fn->skip) { + Bool seen; + BBCC* b = CLG_(get_bbcc)(CLG_(get_bb)(ips[frame], NULL, &seen)); + if (!seen) { + b->rec_array = CLG_(new_recursion)(fn->separate_recursions); + b->rec_array[0] = b; + b->cxt = CLG_(current_state).cxt; + CLG_(insert_bbcc_into_hash)(b); + } + caller_bbcc = b; + } + + /* Mirror push_call_stack's nonskipped transition. 
*/ + if (!fn->skip) { + CLG_(current_state).nonskipped = 0; + } else if (prev_nonskipped == 0 && caller_bbcc) { + CLG_(current_state).nonskipped = caller_bbcc; + if (!caller_bbcc->skipped) + CLG_(init_cost_lz)(CLG_(sets).full, &caller_bbcc->skipped); + } + + call_entry* ce = &cs->entry[cs->sp]; + ce->jcc = 0; + ce->sp = sps[frame]; + ce->ret_addr = (frame + 1 < (Int)n) ? ips[frame + 1] + 1 : 0; + ce->nonskipped = prev_nonskipped; + + cs->sp++; + ensure_stack_size(cs->sp + 1); + cs->entry[cs->sp].cxt = 0; + } + + if (caller_bbcc) CLG_(current_state).bbcc = caller_bbcc; +} diff --git a/callgrind/fn.c b/callgrind/fn.c index 45c1fda4c..41f4a6a00 100644 --- a/callgrind/fn.c +++ b/callgrind/fn.c @@ -581,6 +581,21 @@ fn_node* get_fn_node_inseg(DebugInfo* di, } +/* Resolve a raw code address to a fn_node, creating obj/file/fn entries if + * needed. Addresses without DebugInfo (anonymous JIT mappings, ld glue) + * resolve to the shared `???`/anonymous obj. Used by the START-instrumentation + * stack reconstruction path, which has IPs but no BBs. 
*/ +fn_node* CLG_(get_fn_node_for_addr)(Addr ip) +{ + const HChar *dirname, *filename, *fnname; + UInt line_num; + DebugInfo* di; + + CLG_(get_debug_info)(ip, &dirname, &filename, &fnname, &line_num, &di); + return get_fn_node_inseg(di, dirname, filename, fnname); +} + + Bool CLG_(get_debug_info)(Addr instr_addr, const HChar **dir, const HChar **file, diff --git a/callgrind/global.h b/callgrind/global.h index a3cb6990d..5e7c86af4 100644 --- a/callgrind/global.h +++ b/callgrind/global.h @@ -729,6 +729,7 @@ obj_node* CLG_(get_obj_node)(DebugInfo* si); file_node* CLG_(get_file_node)(obj_node*, const HChar *dirname, const HChar* filename); fn_node* CLG_(get_fn_node)(BB* bb); +fn_node* CLG_(get_fn_node_for_addr)(Addr ip); /* from bbcc.c */ void CLG_(init_bbcc_hash)(bbcc_hash* bbccs); @@ -738,6 +739,8 @@ void CLG_(set_current_bbcc_hash)(bbcc_hash*); void CLG_(forall_bbccs)(void (*func)(BBCC*)); void CLG_(zero_bbcc)(BBCC* bbcc); BBCC* CLG_(get_bbcc)(BB* bb); +BBCC** CLG_(new_recursion)(int size); +void CLG_(insert_bbcc_into_hash)(BBCC* bbcc); BBCC* CLG_(clone_bbcc)(BBCC* orig, Context* cxt, Int rec_index); void CLG_(setup_bbcc)(BB* bb) VG_REGPARM(1); @@ -757,6 +760,7 @@ call_entry* CLG_(get_call_entry)(Int n); void CLG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip); void CLG_(pop_call_stack)(void); Int CLG_(unwind_call_stack)(Addr sp, Int); +void CLG_(reconstruct_call_stack_from_native)(ThreadId tid); /* from context.c */ void CLG_(init_fn_stack)(fn_stack*); diff --git a/callgrind/main.c b/callgrind/main.c index ee8aa2102..90a3c77d5 100644 --- a/callgrind/main.c +++ b/callgrind/main.c @@ -1679,6 +1679,7 @@ Bool CLG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret) case VG_USERREQ__START_INSTRUMENTATION: CLG_(set_instrument_state)("Client Request", True); + CLG_(reconstruct_call_stack_from_native)(tid); *ret = 0; /* meaningless */ break; From dfe44cc9be1b5da758f8d32e077c89df7a4b68ec Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 
27 May 2026 17:56:28 +0200 Subject: [PATCH 15/20] test(callgrind): wire underflow + py3.14 reproducers, strengthen post-checks - runtime_obj_skip_c.vgtest: post-check now greps for any fn=skipme_*. The previous check only looked for skipme_func, which obj-skip folded into the caller so the test passed even with the bug live. - runtime_obj_skip_underflow: wire .vgtest/.post.exp/.stderr.exp + Makefile.am entries (was only checked-in as bare .c sources). - runtime_obj_skip_py314.vgtest: same shim driver as the 3.13 test, exercised with python3.14 (tail-call interpreter). Skipped by prereq if python3.14 isn't on PATH. - filter_stderr: drop diagnostic logging lines added during this investigation (instrument_state, obj_skip HIT/miss, underflow reset, reconstruct_call_stack, python fn summary, fn=...obj=... summary) so .stderr.exp matching stays stable. All four reproducers go RED before the fix, GREEN after. --- callgrind/tests/Makefile.am | 19 ++++++++++++++++--- callgrind/tests/filter_stderr | 6 +++++- callgrind/tests/runtime_obj_skip_c.stderr.exp | 6 ++++++ callgrind/tests/runtime_obj_skip_c.vgtest | 2 +- .../tests/runtime_obj_skip_py314.post.exp | 1 + .../tests/runtime_obj_skip_py314.stderr.exp | 6 ++++++ callgrind/tests/runtime_obj_skip_py314.vgtest | 6 ++++++ .../tests/runtime_obj_skip_underflow.post.exp | 1 + .../runtime_obj_skip_underflow.stderr.exp | 6 ++++++ .../tests/runtime_obj_skip_underflow.vgtest | 5 +++++ 10 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 callgrind/tests/runtime_obj_skip_py314.post.exp create mode 100644 callgrind/tests/runtime_obj_skip_py314.stderr.exp create mode 100644 callgrind/tests/runtime_obj_skip_py314.vgtest create mode 100644 callgrind/tests/runtime_obj_skip_underflow.post.exp create mode 100644 callgrind/tests/runtime_obj_skip_underflow.stderr.exp create mode 100644 callgrind/tests/runtime_obj_skip_underflow.vgtest diff --git a/callgrind/tests/Makefile.am b/callgrind/tests/Makefile.am index 
8fb23b408..b28558eeb 100644 --- a/callgrind/tests/Makefile.am +++ b/callgrind/tests/Makefile.am @@ -13,8 +13,11 @@ EXTRA_DIST = \ find_debuginfo.vgtest find_debuginfo.stderr.exp find_debuginfo.post.exp \ runtime_obj_skip_py.vgtest runtime_obj_skip_py.stderr.exp runtime_obj_skip_py.post.exp \ runtime_obj_skip_py.py runtime_obj_skip_py_shim.c \ + runtime_obj_skip_py314.vgtest runtime_obj_skip_py314.stderr.exp runtime_obj_skip_py314.post.exp \ runtime_obj_skip_c.vgtest runtime_obj_skip_c.stderr.exp runtime_obj_skip_c.post.exp \ runtime_obj_skip_c.c runtime_obj_skip_c_lib.c \ + runtime_obj_skip_underflow.vgtest runtime_obj_skip_underflow.stderr.exp runtime_obj_skip_underflow.post.exp \ + runtime_obj_skip_underflow.c runtime_obj_skip_underflow_lib.c \ bug497723.stderr.exp bug497723.post.exp bug497723.vgtest \ simwork1.vgtest simwork1.stdout.exp simwork1.stderr.exp \ simwork2.vgtest simwork2.stdout.exp simwork2.stderr.exp \ @@ -33,7 +36,7 @@ EXTRA_DIST = \ inline-crossfile.vgtest inline-crossfile.stderr.exp inline-crossfile.stdout.exp inline-crossfile.post.exp \ inline-crossfile-helper1.h inline-crossfile-helper2.h filter_inline -check_PROGRAMS = clreq find_debuginfo simwork threads inline-samefile inline-crossfile runtime_obj_skip_c +check_PROGRAMS = clreq find_debuginfo simwork threads inline-samefile inline-crossfile runtime_obj_skip_c runtime_obj_skip_underflow AM_CFLAGS += $(AM_FLAG_M3264_PRI) AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) @@ -46,7 +49,7 @@ threads_LDADD = -lpthread # Shim loaded by runtime_obj_skip_py.py via ctypes. Built unconditionally; # the test's prereq skips it if the .so is missing. -check_DATA = runtime_obj_skip_py_shim.so runtime_obj_skip_c_lib.so +check_DATA = runtime_obj_skip_py_shim.so runtime_obj_skip_c_lib.so runtime_obj_skip_underflow_lib.so runtime_obj_skip_py_shim.so: runtime_obj_skip_py_shim.c $(CC) -shared -fPIC -O2 -I$(top_srcdir) -I$(top_srcdir)/include \ @@ -63,4 +66,14 @@ runtime_obj_skip_c_LDFLAGS = $(AM_LDFLAGS) -L. 
-l:runtime_obj_skip_c_lib.so \ -Wl,-rpath,'$$ORIGIN' runtime_obj_skip_c_DEPENDENCIES = runtime_obj_skip_c_lib.so -CLEANFILES = runtime_obj_skip_py_shim.so runtime_obj_skip_c_lib.so +# Shared lib for the runtime_obj_skip_underflow test. +runtime_obj_skip_underflow_lib.so: runtime_obj_skip_underflow_lib.c + $(CC) -shared -fPIC -O2 -I$(top_srcdir) -I$(top_srcdir)/include \ + $< -o $@ + +runtime_obj_skip_underflow_LDADD = -ldl +runtime_obj_skip_underflow_LDFLAGS = $(AM_LDFLAGS) -L. -l:runtime_obj_skip_underflow_lib.so \ + -Wl,-rpath,'$$ORIGIN' +runtime_obj_skip_underflow_DEPENDENCIES = runtime_obj_skip_underflow_lib.so + +CLEANFILES = runtime_obj_skip_py_shim.so runtime_obj_skip_c_lib.so runtime_obj_skip_underflow_lib.so diff --git a/callgrind/tests/filter_stderr b/callgrind/tests/filter_stderr index fbaca9b1d..1c51f078a 100755 --- a/callgrind/tests/filter_stderr +++ b/callgrind/tests/filter_stderr @@ -30,4 +30,8 @@ sed "/warning: L4 cache found, using its data for the LL simulation./d" | sed "/Warning: Cannot auto-detect cache config, using defaults./d" | sed "/Run with -v to see./d" | sed "/warning: specified LL cache: line_size .*$/d" | -sed "/warning: simulated LL cache: line_size .*$/d" +sed "/warning: simulated LL cache: line_size .*$/d" | + +# Drop callgrind diagnostic logs (verbose obj-skip / cxt / underflow tracing). +# These are chore-level diagnostics that vary by run/host and aren't assertions. 
+sed -E "/^(add_obj_to_skip|obj-skip list now has| \[[0-9]+\] '|fn_nodes already obj_skip_checked|instrument_state ->|new_fn_node:|obj_skip (HIT|miss):| vs \[[0-9]+\] strcmp=|push_cxt FORCED|underflow reset:|reconstruct_call_stack:|=== python fn summary| fn='[^']*' obj=)/d" diff --git a/callgrind/tests/runtime_obj_skip_c.stderr.exp b/callgrind/tests/runtime_obj_skip_c.stderr.exp index e69de29bb..d0b7820ae 100644 --- a/callgrind/tests/runtime_obj_skip_c.stderr.exp +++ b/callgrind/tests/runtime_obj_skip_c.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/callgrind/tests/runtime_obj_skip_c.vgtest b/callgrind/tests/runtime_obj_skip_c.vgtest index 6b250787f..5817c245a 100644 --- a/callgrind/tests/runtime_obj_skip_c.vgtest +++ b/callgrind/tests/runtime_obj_skip_c.vgtest @@ -1,5 +1,5 @@ prereq: test -f runtime_obj_skip_c && test -f runtime_obj_skip_c_lib.so prog-asis: ./runtime_obj_skip_c vgopts: --instr-atstart=no --compress-strings=no --callgrind-out-file=callgrind.out.runtime_obj_skip_c -post: sh -c 'if grep -q "^fn=skipme_func" callgrind.out.runtime_obj_skip_c; then echo "FAIL: skipme_func leaked into top-level fn= block"; else echo OK; fi' +post: sh -c 'leaked=$(grep "^fn=skipme_" callgrind.out.runtime_obj_skip_c); if [ -n "$leaked" ]; then echo "FAIL: skipped fn leaked into top-level fn= block:"; echo "$leaked"; exit 1; else echo OK; fi' cleanup: rm -f callgrind.out.runtime_obj_skip_c diff --git a/callgrind/tests/runtime_obj_skip_py314.post.exp b/callgrind/tests/runtime_obj_skip_py314.post.exp new file mode 100644 index 000000000..d86bac9de --- /dev/null +++ b/callgrind/tests/runtime_obj_skip_py314.post.exp @@ -0,0 +1 @@ +OK diff --git a/callgrind/tests/runtime_obj_skip_py314.stderr.exp b/callgrind/tests/runtime_obj_skip_py314.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/callgrind/tests/runtime_obj_skip_py314.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git 
a/callgrind/tests/runtime_obj_skip_py314.vgtest b/callgrind/tests/runtime_obj_skip_py314.vgtest new file mode 100644 index 000000000..57368aafe --- /dev/null +++ b/callgrind/tests/runtime_obj_skip_py314.vgtest @@ -0,0 +1,6 @@ +prereq: command -v python3.14 >/dev/null && test -f runtime_obj_skip_py_shim.so +prog-asis: python3.14 +args: runtime_obj_skip_py.py +vgopts: --instr-atstart=no --compress-strings=no --callgrind-out-file=callgrind.out.runtime_obj_skip_py314 +post: sh -c 'c=$(awk "/^ob=/{p=(\$0~/libpython/)} /^fn=/&&p{c++} END{print c+0}" callgrind.out.runtime_obj_skip_py314); if [ "$c" -lt 100 ]; then echo OK; else echo "FAIL libpython fns=$c"; fi' +cleanup: rm -f callgrind.out.runtime_obj_skip_py314 diff --git a/callgrind/tests/runtime_obj_skip_underflow.post.exp b/callgrind/tests/runtime_obj_skip_underflow.post.exp new file mode 100644 index 000000000..d86bac9de --- /dev/null +++ b/callgrind/tests/runtime_obj_skip_underflow.post.exp @@ -0,0 +1 @@ +OK diff --git a/callgrind/tests/runtime_obj_skip_underflow.stderr.exp b/callgrind/tests/runtime_obj_skip_underflow.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/callgrind/tests/runtime_obj_skip_underflow.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/callgrind/tests/runtime_obj_skip_underflow.vgtest b/callgrind/tests/runtime_obj_skip_underflow.vgtest new file mode 100644 index 000000000..c0fc04cd0 --- /dev/null +++ b/callgrind/tests/runtime_obj_skip_underflow.vgtest @@ -0,0 +1,5 @@ +prereq: test -f runtime_obj_skip_underflow && test -f runtime_obj_skip_underflow_lib.so +prog-asis: ./runtime_obj_skip_underflow +vgopts: --instr-atstart=no --compress-strings=no --callgrind-out-file=callgrind.out.runtime_obj_skip_underflow +post: sh -c 'leaked=$(grep "^fn=skipme_" callgrind.out.runtime_obj_skip_underflow); if [ -n "$leaked" ]; then echo "FAIL: skipped fn leaked into top-level fn= block:"; echo "$leaked"; exit 1; else echo OK; fi' +cleanup: rm -f 
callgrind.out.runtime_obj_skip_underflow From 9e62d5b5348bd0d71681b0ad2c80c04c9b1d052b Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 18:18:34 +0200 Subject: [PATCH 16/20] fix(callgrind/tests): link runtime_obj_skip libs via LDADD for --as-needed Move -l:runtime_obj_skip_*_lib.so from LDFLAGS to LDADD so the shared lib appears after the .o files on the link line. On Ubuntu 24.04, ld defaults to --as-needed and was dropping the lib before skipme_run was referenced, causing an undefined-reference link error in CI. --- callgrind/tests/Makefile.am | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/callgrind/tests/Makefile.am b/callgrind/tests/Makefile.am index b28558eeb..be734faea 100644 --- a/callgrind/tests/Makefile.am +++ b/callgrind/tests/Makefile.am @@ -61,9 +61,8 @@ runtime_obj_skip_c_lib.so: runtime_obj_skip_c_lib.c $(CC) -shared -fPIC -O2 -I$(top_srcdir) -I$(top_srcdir)/include \ $< -o $@ -runtime_obj_skip_c_LDADD = -ldl -runtime_obj_skip_c_LDFLAGS = $(AM_LDFLAGS) -L. -l:runtime_obj_skip_c_lib.so \ - -Wl,-rpath,'$$ORIGIN' +runtime_obj_skip_c_LDADD = -l:runtime_obj_skip_c_lib.so -ldl +runtime_obj_skip_c_LDFLAGS = $(AM_LDFLAGS) -L. -Wl,-rpath,'$$ORIGIN' runtime_obj_skip_c_DEPENDENCIES = runtime_obj_skip_c_lib.so # Shared lib for the runtime_obj_skip_underflow test. @@ -71,9 +70,8 @@ runtime_obj_skip_underflow_lib.so: runtime_obj_skip_underflow_lib.c $(CC) -shared -fPIC -O2 -I$(top_srcdir) -I$(top_srcdir)/include \ $< -o $@ -runtime_obj_skip_underflow_LDADD = -ldl -runtime_obj_skip_underflow_LDFLAGS = $(AM_LDFLAGS) -L. -l:runtime_obj_skip_underflow_lib.so \ - -Wl,-rpath,'$$ORIGIN' +runtime_obj_skip_underflow_LDADD = -l:runtime_obj_skip_underflow_lib.so -ldl +runtime_obj_skip_underflow_LDFLAGS = $(AM_LDFLAGS) -L. 
-Wl,-rpath,'$$ORIGIN' runtime_obj_skip_underflow_DEPENDENCIES = runtime_obj_skip_underflow_lib.so CLEANFILES = runtime_obj_skip_py_shim.so runtime_obj_skip_c_lib.so runtime_obj_skip_underflow_lib.so From 939d0075d382a93b77a2ffad9bc3a3ace6066d92 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Wed, 27 May 2026 18:25:47 +0200 Subject: [PATCH 17/20] fix(callgrind/tests): normalize py314 prereq exit code for dash dash's 'command -v' returns 127 when the program is missing, which vg_regtest treats as a fatal abort (it only accepts 0=run or 1=skip). Append '|| exit 1' so the prereq cleanly skips on hosts without python3.14 (e.g. Ubuntu 22.04 CI runners). --- callgrind/tests/runtime_obj_skip_py314.vgtest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/callgrind/tests/runtime_obj_skip_py314.vgtest b/callgrind/tests/runtime_obj_skip_py314.vgtest index 57368aafe..bebb4fd41 100644 --- a/callgrind/tests/runtime_obj_skip_py314.vgtest +++ b/callgrind/tests/runtime_obj_skip_py314.vgtest @@ -1,4 +1,4 @@ -prereq: command -v python3.14 >/dev/null && test -f runtime_obj_skip_py_shim.so +prereq: command -v python3.14 >/dev/null 2>&1 && test -f runtime_obj_skip_py_shim.so || exit 1 prog-asis: python3.14 args: runtime_obj_skip_py.py vgopts: --instr-atstart=no --compress-strings=no --callgrind-out-file=callgrind.out.runtime_obj_skip_py314 From caf470387cf231ca60759ee33d66f49bbbe226c1 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Thu, 28 May 2026 15:43:17 +0200 Subject: [PATCH 18/20] fix(callgrind): skip frame 0 when seeding shadow call stack at START MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The native-stack seed in CLG_(reconstruct_call_stack_from_native) was pushing every frame, including frame 0 (the function that fired CALLGRIND_START_INSTRUMENTATION). 
Seeded call_entries have jcc=0, and pop_call_stack only restores cxt when jcc!=0 — so on frame 0's `ret` the function stayed stuck on top of the cxt chain and phantom-parented every subsequent call from the real caller. Visible in the CodSpeed flamegraph as callgrind_start_instrumentation appearing as the parent of the benchmark body. Stop the seed loop at frame 1; the trailing epilogue of frame 0 is harmlessly attributed to the caller's cxt. Add callgrind/tests/phantom_root as a minimal red/green reproducer: a start_and_return() function fires the macro and returns, then main calls leaf(). Post-check asserts leaf's only caller is main. --- callgrind/callstack.c | 14 +++++++++++-- callgrind/tests/Makefile.am | 3 ++- callgrind/tests/phantom_root.c | 28 +++++++++++++++++++++++++ callgrind/tests/phantom_root.post.exp | 1 + callgrind/tests/phantom_root.stderr.exp | 6 ++++++ callgrind/tests/phantom_root.vgtest | 4 ++++ 6 files changed, 53 insertions(+), 3 deletions(-) create mode 100644 callgrind/tests/phantom_root.c create mode 100644 callgrind/tests/phantom_root.post.exp create mode 100644 callgrind/tests/phantom_root.stderr.exp create mode 100644 callgrind/tests/phantom_root.vgtest diff --git a/callgrind/callstack.c b/callgrind/callstack.c index aa81aaaf4..a58b91908 100644 --- a/callgrind/callstack.c +++ b/callgrind/callstack.c @@ -468,8 +468,18 @@ void CLG_(reconstruct_call_stack_from_native)(ThreadId tid) * the first non-skipped -> skipped transition. */ BBCC* caller_bbcc = 0; - /* Push bottom-up: oldest caller first, current frame last. */ - for (Int frame = n - 1; frame >= 0; frame--) { + /* Push bottom-up: oldest caller first, stopping before frame 0 (the + * function that fired CALLGRIND_START_INSTRUMENTATION). + * + * Why skip frame 0: + * - Seeded call_entries have jcc=0. + * - pop_call_stack only restores cxt when jcc!=0. + * - So frame 0's `ret` would leave it stuck on top of the cxt chain, + * phantom-parenting every later call from the real caller. 
+ * + * Skipping it leaves cxt ending at the genuine caller; frame 0's + * trailing epilogue is harmlessly attributed there. */ + for (Int frame = n - 1; frame >= 1; frame--) { fn_node* fn = CLG_(get_fn_node_for_addr)(ips[frame]); /* Latch obj-skip on first encounter, matching bbcc.c's check. */ diff --git a/callgrind/tests/Makefile.am b/callgrind/tests/Makefile.am index be734faea..4edd5a81c 100644 --- a/callgrind/tests/Makefile.am +++ b/callgrind/tests/Makefile.am @@ -18,6 +18,7 @@ EXTRA_DIST = \ runtime_obj_skip_c.c runtime_obj_skip_c_lib.c \ runtime_obj_skip_underflow.vgtest runtime_obj_skip_underflow.stderr.exp runtime_obj_skip_underflow.post.exp \ runtime_obj_skip_underflow.c runtime_obj_skip_underflow_lib.c \ + phantom_root.vgtest phantom_root.stderr.exp phantom_root.post.exp \ bug497723.stderr.exp bug497723.post.exp bug497723.vgtest \ simwork1.vgtest simwork1.stdout.exp simwork1.stderr.exp \ simwork2.vgtest simwork2.stdout.exp simwork2.stderr.exp \ @@ -36,7 +37,7 @@ EXTRA_DIST = \ inline-crossfile.vgtest inline-crossfile.stderr.exp inline-crossfile.stdout.exp inline-crossfile.post.exp \ inline-crossfile-helper1.h inline-crossfile-helper2.h filter_inline -check_PROGRAMS = clreq find_debuginfo simwork threads inline-samefile inline-crossfile runtime_obj_skip_c runtime_obj_skip_underflow +check_PROGRAMS = clreq find_debuginfo simwork threads inline-samefile inline-crossfile runtime_obj_skip_c runtime_obj_skip_underflow phantom_root AM_CFLAGS += $(AM_FLAG_M3264_PRI) AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) diff --git a/callgrind/tests/phantom_root.c b/callgrind/tests/phantom_root.c new file mode 100644 index 000000000..d224424ee --- /dev/null +++ b/callgrind/tests/phantom_root.c @@ -0,0 +1,28 @@ +/* Reproducer for the seeded-shadow-stack "phantom root" bug. + * + * When CALLGRIND_START_INSTRUMENTATION fires inside a function that then + * returns, the seed pushes that function onto callgrind's cxt chain with + * jcc=0. 
pop_call_stack only restores cxt when jcc!=0, so after the ret + * the cxt is stuck with that function on top, and any subsequent call + * from the real caller appears as a child of it. */ + +#include "../callgrind.h" + +volatile long sink; + +__attribute__((noinline)) +static void leaf(void) { sink++; } + +__attribute__((noinline)) +static void start_and_return(void) +{ + CALLGRIND_START_INSTRUMENTATION; +} + +int main(void) +{ + start_and_return(); + leaf(); + CALLGRIND_STOP_INSTRUMENTATION; + return 0; +} diff --git a/callgrind/tests/phantom_root.post.exp b/callgrind/tests/phantom_root.post.exp new file mode 100644 index 000000000..d86bac9de --- /dev/null +++ b/callgrind/tests/phantom_root.post.exp @@ -0,0 +1 @@ +OK diff --git a/callgrind/tests/phantom_root.stderr.exp b/callgrind/tests/phantom_root.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/callgrind/tests/phantom_root.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/callgrind/tests/phantom_root.vgtest b/callgrind/tests/phantom_root.vgtest new file mode 100644 index 000000000..e44bfb45c --- /dev/null +++ b/callgrind/tests/phantom_root.vgtest @@ -0,0 +1,4 @@ +prog: phantom_root +vgopts: --instr-atstart=no --compress-strings=no --callgrind-out-file=callgrind.out.phantom_root +post: sh -c 'callers=$(awk -v q="'\''" "/^fn=/{fn=\$0; sub(q\"[0-9]+\$\",\"\",fn)} /^cfn=leaf\$/{print fn}" callgrind.out.phantom_root | sort -u); if [ "$callers" != "fn=main" ]; then echo "FAIL: leaf called by unexpected caller(s):"; echo "$callers"; exit 1; else echo OK; fi' +cleanup: rm -f callgrind.out.phantom_root From e31aab95ed82eb93512b012a561593df390508ee Mon Sep 17 00:00:00 2001 From: not-matthias Date: Thu, 28 May 2026 16:05:05 +0200 Subject: [PATCH 19/20] fix(callgrind): restore cxt on ret for seeded shadow-stack entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix only stopped phantom-parenting from 
frame 0. The bug is structural: every seeded call_entry has jcc=0, and pop_call_stack only restores cxt when jcc!=0. With a multi-frame wrapper chain (e.g. Node.js: macro -> C export -> N-API trampoline -> user JS), each ret on the way out of the chain fails to pop the cxt, leaving the deepest un-returned wrapper stuck on top. Fix it where the asymmetry lives: in pop_call_stack, also restore cxt when the entry has a non-zero saved cxt. push_cxt always populates lower_entry->cxt, while real skip-entries (push_call_stack(skip=True) without a prior push_cxt) leave it at 0 — so the new branch fires only for seeded entries. With cxt now popping cleanly through the whole seeded chain, the frame-0 carve-out is no longer needed; the seed loop walks all native frames again. Extend phantom_root to a 3-deep wrapper chain (wrapper_outer -> wrapper_middle -> wrapper_inner fires START and unwinds) so the multi-frame case is locked under test. --- callgrind/callstack.c | 27 +++++++++++++++------------ callgrind/tests/phantom_root.c | 28 ++++++++++++++++++---------- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/callgrind/callstack.c b/callgrind/callstack.c index a58b91908..bb39b048d 100644 --- a/callgrind/callstack.c +++ b/callgrind/callstack.c @@ -362,6 +362,19 @@ void CLG_(pop_call_stack)(void) if (depth == 0) function_left(to_fn); } + else if (lower_entry->cxt != 0) { + /* Seeded entry from reconstruct_call_stack_from_native: jcc=0 + * (skip-style) but push_cxt did run, so cxt was changed. Restore + * it here so the seeded wrapper doesn't stay stuck on top of the + * cxt chain and phantom-parent every subsequent call from the + * real caller. Real skip-entries (push_call_stack(skip=True) + * without a prior push_cxt) have lower_entry->cxt==0 and skip + * this branch — their cxt was never changed, so nothing to + * restore. 
*/ + CLG_(current_state).cxt = lower_entry->cxt; + CLG_(current_fn_stack).top = + CLG_(current_fn_stack).bottom + lower_entry->fn_sp; + } /* To allow for an assertion in push_call_stack() */ lower_entry->cxt = 0; @@ -468,18 +481,8 @@ void CLG_(reconstruct_call_stack_from_native)(ThreadId tid) * the first non-skipped -> skipped transition. */ BBCC* caller_bbcc = 0; - /* Push bottom-up: oldest caller first, stopping before frame 0 (the - * function that fired CALLGRIND_START_INSTRUMENTATION). - * - * Why skip frame 0: - * - Seeded call_entries have jcc=0. - * - pop_call_stack only restores cxt when jcc!=0. - * - So frame 0's `ret` would leave it stuck on top of the cxt chain, - * phantom-parenting every later call from the real caller. - * - * Skipping it leaves cxt ending at the genuine caller; frame 0's - * trailing epilogue is harmlessly attributed there. */ - for (Int frame = n - 1; frame >= 1; frame--) { + /* Push bottom-up: oldest caller first, current frame last. */ + for (Int frame = n - 1; frame >= 0; frame--) { fn_node* fn = CLG_(get_fn_node_for_addr)(ips[frame]); /* Latch obj-skip on first encounter, matching bbcc.c's check. */ diff --git a/callgrind/tests/phantom_root.c b/callgrind/tests/phantom_root.c index d224424ee..a32313441 100644 --- a/callgrind/tests/phantom_root.c +++ b/callgrind/tests/phantom_root.c @@ -1,10 +1,15 @@ /* Reproducer for the seeded-shadow-stack "phantom root" bug. * - * When CALLGRIND_START_INSTRUMENTATION fires inside a function that then - * returns, the seed pushes that function onto callgrind's cxt chain with - * jcc=0. pop_call_stack only restores cxt when jcc!=0, so after the ret - * the cxt is stuck with that function on top, and any subsequent call - * from the real caller appears as a child of it. */ + * When CALLGRIND_START_INSTRUMENTATION fires inside a wrapper chain + * that then unwinds, the seed pushes every native frame onto callgrind's + * cxt with jcc=0. 
pop_call_stack only restores cxt when jcc!=0, so each + * ret on the way back fails to pop the cxt — leaving the deepest + * un-returned wrapper stuck on top, phantom-parenting every later call + * from the real caller. + * + * We model a 3-deep wrapper chain (mirroring e.g. a Node.js binding: + * macro -> C export -> N-API trampoline -> user code) so the fix is + * exercised across multiple consecutive seeded pops, not just frame 0. */ #include "../callgrind.h" @@ -14,14 +19,17 @@ __attribute__((noinline)) static void leaf(void) { sink++; } __attribute__((noinline)) -static void start_and_return(void) -{ - CALLGRIND_START_INSTRUMENTATION; -} +static void wrapper_inner(void) { CALLGRIND_START_INSTRUMENTATION; } + +__attribute__((noinline)) +static void wrapper_middle(void) { wrapper_inner(); } + +__attribute__((noinline)) +static void wrapper_outer(void) { wrapper_middle(); } int main(void) { - start_and_return(); + wrapper_outer(); leaf(); CALLGRIND_STOP_INSTRUMENTATION; return 0; From 72e5f602492e19a45445cbd3577e847219c9ceef Mon Sep 17 00:00:00 2001 From: not-matthias Date: Thu, 28 May 2026 16:43:18 +0200 Subject: [PATCH 20/20] fix(callgrind): don't seed cxt for anonymous (JIT) frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix correctly restored cxt on the way back out of named wrapper frames, but Node.js / Python JIT scenarios have an anonymous frame between the codspeed C wrappers and user code (a V8 trampoline or JIT-generated trampoline, mmap'd in an anonymous region with no DebugInfo). CLG_(get_fn_node_for_addr) resolves that frame to fn->name == "???". The seed used to push_cxt that "???" fn onto the chain, but the JIT frame never RETs via C-ABI before user code runs, so the "???" entry stayed stuck on top of cxt and phantom-parented every user fn — visible in a CodSpeed flamegraph as a "???" 
or 0x{addr} row sitting between the benchmark label and the actual user function, holding the bulk of the inclusive cost. Skip push_cxt and BBCC synthesis for these frames. Still push a bare call_entry so SP-based unwind works for the underflow case; ce->cxt stays 0, which the pop_call_stack else-branch already treats as "no-op cxt restore". Verified against a Node.js benchmark under the codspeed runner: the "???" root previously holding 99.99% of inclusive cost is gone; user JS frames now appear directly under the benchmark. The phantom_root regression test (named-wrapper chain) still passes. An anonymous-frame C reproducer (mmap'd thunk between main and the wrapper) was attempted but hit an unrelated (below main) attribution issue and was dropped — the Node.js manual reproducer covers this case end-to-end. --- callgrind/callstack.c | 64 ++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/callgrind/callstack.c b/callgrind/callstack.c index bb39b048d..cc876e0cc 100644 --- a/callgrind/callstack.c +++ b/callgrind/callstack.c @@ -499,33 +499,47 @@ void CLG_(reconstruct_call_stack_from_native)(ThreadId tid) ensure_stack_size(cs->sp + 1); BBCC* prev_nonskipped = CLG_(current_state).nonskipped; - CLG_(push_cxt)(fn); - - /* Create a BBCC for non-skipped caller frames. ips[frame] for - * frame>=1 is "last byte of the call instruction" per VG_(get_StackTrace), - * so it's never a real BB start and the 0-insn synthetic BB cannot - * collide with later real instrumentation. The top frame's IP can - * land on a real BB, so we don't synthesize there — real BBCC will - * be created naturally on the first instrumented BB. 
*/ - if (frame > 0 && !fn->skip) { - Bool seen; - BBCC* b = CLG_(get_bbcc)(CLG_(get_bb)(ips[frame], NULL, &seen)); - if (!seen) { - b->rec_array = CLG_(new_recursion)(fn->separate_recursions); - b->rec_array[0] = b; - b->cxt = CLG_(current_state).cxt; - CLG_(insert_bbcc_into_hash)(b); + + /* Anonymous JIT frames (V8 trampolines, generated code with no + * DebugInfo) resolve to fn->name == "???". Don't push_cxt them: + * they have no useful identity, and because no later RET ever + * pops them (JS resumes via dispatch, not C-ABI ret), they would + * stay stuck on top of the cxt chain forever and become a phantom + * "???" root of every user fn. Push only a bare call_entry so SP + * unwind still works; ce->cxt stays 0, signaling pop_call_stack + * to leave cxt alone. */ + Bool anonymous = (VG_(strcmp)(fn->name, "???") == 0); + + if (!anonymous) { + CLG_(push_cxt)(fn); + + /* Create a BBCC for non-skipped caller frames. ips[frame] for + * frame>=1 is "last byte of the call instruction" per + * VG_(get_StackTrace), so it's never a real BB start and the + * 0-insn synthetic BB cannot collide with later real + * instrumentation. The top frame's IP can land on a real BB, + * so we don't synthesize there — real BBCC will be created + * naturally on the first instrumented BB. */ + if (frame > 0 && !fn->skip) { + Bool seen; + BBCC* b = CLG_(get_bbcc)(CLG_(get_bb)(ips[frame], NULL, &seen)); + if (!seen) { + b->rec_array = CLG_(new_recursion)(fn->separate_recursions); + b->rec_array[0] = b; + b->cxt = CLG_(current_state).cxt; + CLG_(insert_bbcc_into_hash)(b); + } + caller_bbcc = b; } - caller_bbcc = b; - } - /* Mirror push_call_stack's nonskipped transition. */ - if (!fn->skip) { - CLG_(current_state).nonskipped = 0; - } else if (prev_nonskipped == 0 && caller_bbcc) { - CLG_(current_state).nonskipped = caller_bbcc; - if (!caller_bbcc->skipped) - CLG_(init_cost_lz)(CLG_(sets).full, &caller_bbcc->skipped); + /* Mirror push_call_stack's nonskipped transition. 
*/ + if (!fn->skip) { + CLG_(current_state).nonskipped = 0; + } else if (prev_nonskipped == 0 && caller_bbcc) { + CLG_(current_state).nonskipped = caller_bbcc; + if (!caller_bbcc->skipped) + CLG_(init_cost_lz)(CLG_(sets).full, &caller_bbcc->skipped); + } } call_entry* ce = &cs->entry[cs->sp];