From b1ab32d194156d0c0e54baf4fcb9031906fc340b Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 29 May 2026 10:16:03 +0200 Subject: [PATCH 01/12] feat(chaos): add BoundedThreadPoolAntagonist --- .../chaos/BoundedThreadPoolAntagonist.java | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/BoundedThreadPoolAntagonist.java diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/BoundedThreadPoolAntagonist.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/BoundedThreadPoolAntagonist.java new file mode 100644 index 000000000..4cafcc908 --- /dev/null +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/BoundedThreadPoolAntagonist.java @@ -0,0 +1,148 @@ +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.datadoghq.profiler.chaos; + +import java.time.Duration; +import java.util.concurrent.RejectedExecutionException; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledThreadPoolExecutor; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Maintains 4 scheduled thread pools that cross-submit tasks to simulate + * I/O-callback fan-out from Netty/gRPC event loops. Every 5 s one pool is + * torn down and recreated — a burst of thread-end events while in-flight + * tasks are running on sibling pools. + * + *

Targets: signal-vs-thread-end race during pool shutdown; JVMTI stack + * walk on executor threads being torn down while the scheduler ticks. + */ +public final class BoundedThreadPoolAntagonist implements Antagonist { + + private static final int POOL_COUNT = 4; + private static final int POOL_SIZE = 4; + private static final long TASK_PERIOD_MS = 50L; + private static final long RECYCLE_INTERVAL_MS = 5_000L; + + private final ScheduledExecutorService[] pools = new ScheduledExecutorService[POOL_COUNT]; + private volatile boolean running; + private Thread recycler; + private final AtomicLong sink = new AtomicLong(); + + @Override + public String name() { + return "bounded-pool"; + } + + @Override + public void start() { + running = true; + for (int i = 0; i < POOL_COUNT; i++) { + pools[i] = newPool(i); + } + for (int i = 0; i < POOL_COUNT; i++) { + schedulePoolTasks(i); + } + recycler = new Thread(this::recycleLoop, "chaos-bounded-pool-recycler"); + recycler.setDaemon(true); + recycler.start(); + } + + @Override + public void stopGracefully(Duration timeout) { + running = false; + try { + recycler.join(timeout.toMillis()); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + for (ScheduledExecutorService pool : pools) { + if (pool != null) { + pool.shutdownNow(); + } + } + } + + private ScheduledExecutorService newPool(final int index) { + ScheduledThreadPoolExecutor pool = new ScheduledThreadPoolExecutor( + POOL_SIZE, + new ThreadFactory() { + @Override + public Thread newThread(Runnable r) { + Thread t = new Thread(r, "chaos-bounded-pool-" + index); + t.setDaemon(true); + return t; + } + }); + pool.setRemoveOnCancelPolicy(true); + return pool; + } + + private void schedulePoolTasks(final int poolIdx) { + ScheduledExecutorService pool = pools[poolIdx]; + final int nextIdx = (poolIdx + 1) % POOL_COUNT; + pool.scheduleAtFixedRate( + new Runnable() { + @Override + public void run() { + if (!running) return; + long seed = System.nanoTime(); + sink.addAndGet(burn(seed)); + ScheduledExecutorService sibling = pools[nextIdx]; + if (sibling != null && !sibling.isShutdown()) { + try { + final long s = seed; + sibling.submit(new Runnable() { + @Override + public void run() { + sink.addAndGet(burn(s ^ 0xdeadbeefL)); + } + }); + } catch (RejectedExecutionException ignored) { + // sibling shut down concurrently + } + } + } + }, + TASK_PERIOD_MS, TASK_PERIOD_MS, TimeUnit.MILLISECONDS); + } + + private void recycleLoop() { + int victim = 0; + while (running) { + try { + Thread.sleep(RECYCLE_INTERVAL_MS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + if (!running) return; + ScheduledExecutorService old = pools[victim]; + pools[victim] = null; + if (old != null) { + old.shutdownNow(); + } + ScheduledExecutorService fresh = newPool(victim); + pools[victim] = fresh; + schedulePoolTasks(victim); + victim = (victim + 1) % POOL_COUNT; + } + } + + private static long burn(long seed) { + long r = seed; + for (int i = 0; i < 500; i++) { + r = r * 6364136223846793005L + 1442695040888963407L; + } + return r; + } +} From da65e8cc9cb2cdf1787bd9e27e6154c23e34dbb1 Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 29 May 2026 10:17:55 +0200 Subject: [PATCH 02/12] feat(chaos): add ContextHopAntagonist --- .../profiler/chaos/ContextHopAntagonist.java | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ContextHopAntagonist.java diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ContextHopAntagonist.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ContextHopAntagonist.java new file mode 100644 index 000000000..07bca2b4a --- /dev/null +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ContextHopAntagonist.java @@ -0,0 +1,119 @@ +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.datadoghq.profiler.chaos; + +import java.time.Duration; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Chains {@link CompletableFuture} stages across three distinct thread pools + * (A→B→C→A). Each stage sets a {@link ThreadLocal} to a random value, burns + * briefly, then clears it before submitting the next stage. + * + *

Eight self-renewing chains run concurrently. + * + *

Targets: RefCountGuard slot contention racing cross-pool handoff; + * wall-clock signal hitting a thread between ThreadLocal set and remove. + */ +public final class ContextHopAntagonist implements Antagonist { + + private static final int CHAIN_COUNT = 8; + private static final ThreadLocal CONTEXT = new ThreadLocal(); + + private final ExecutorService poolA; + private final ExecutorService poolB; + private final ExecutorService poolC; + private volatile boolean running; + private final AtomicLong sink = new AtomicLong(); + + public ContextHopAntagonist() { + poolA = newPool("A"); + poolB = newPool("B"); + poolC = newPool("C"); + } + + @Override + public String name() { + return "context-hop"; + } + + @Override + public void start() { + running = true; + for (int i = 0; i < CHAIN_COUNT; i++) { + startChain(i); + } + } + + @Override + public void stopGracefully(Duration timeout) { + running = false; + poolA.shutdown(); + poolB.shutdown(); + poolC.shutdown(); + long slice = timeout.toMillis() / 3; + try { + poolA.awaitTermination(slice, TimeUnit.MILLISECONDS); + poolB.awaitTermination(slice, TimeUnit.MILLISECONDS); + poolC.awaitTermination(slice, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + private void startChain(final int chainId) { + if (!running) return; + final long seed = ThreadLocalRandom.current().nextLong() ^ ((long) chainId << 32); + CompletableFuture + .runAsync(new Runnable() { + @Override public void run() { hop(seed); } + }, poolA) + .thenRunAsync(new Runnable() { + @Override public void run() { hop(seed * 2L); } + }, poolB) + .thenRunAsync(new Runnable() { + @Override public void run() { hop(seed * 3L); } + }, poolC) + .thenRunAsync(new Runnable() { + @Override public void run() { startChain(chainId); } + }, poolA) + .exceptionally(null); + } + + private void hop(long value) { + CONTEXT.set(value); + try { + long r = value; + for (int i = 0; i < 200; i++) { + r = r * 1103515245L + 12345L; + } + sink.addAndGet(r); + } finally { + CONTEXT.remove(); + } + } + + private static ExecutorService newPool(final String label) { + return Executors.newFixedThreadPool(4, new ThreadFactory() { + @Override + public Thread newThread(Runnable r) { + Thread t = new Thread(r, "chaos-context-hop-" + label); + t.setDaemon(true); + return t; + } + }); + } +} From acd1b7462aab5de88bd36f79fee4cf00aadb17b2 Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 29 May 2026 10:19:38 +0200 Subject: [PATCH 03/12] feat(chaos): add ConsumerGroupAntagonist --- .../chaos/ConsumerGroupAntagonist.java | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java new file mode 100644 index 000000000..1c0ffcc35 --- /dev/null +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java @@ -0,0 +1,131 @@ +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.datadoghq.profiler.chaos; + +import java.time.Duration; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Maintains 16 "consumer" threads doing light spin work. Every 3 s it + * replaces 4 threads simultaneously (all interrupted at once, all + * replacements started before waiting for exits) to simulate a Kafka + * consumer group rebalance burst. + * + *

Targets: ProfiledThread recycling under burst replacement; + * signal-vs-thread-end race window widened by simultaneous stops; + * calltrace storage put() racing thread destruction. + */ +public final class ConsumerGroupAntagonist implements Antagonist { + + private static final int GROUP_SIZE = 16; + private static final int BURST_SIZE = 4; + private static final long REBALANCE_INTERVAL_MS = 3_000L; + + private final Thread[] consumers = new Thread[GROUP_SIZE]; + private volatile boolean running; + private Thread rebalancer; + private final AtomicLong sink = new AtomicLong(); + + @Override + public String name() { + return "consumer-group"; + } + + @Override + public void start() { + running = true; + for (int i = 0; i < GROUP_SIZE; i++) { + consumers[i] = newConsumer(i); + consumers[i].start(); + } + rebalancer = new Thread(new Runnable() { + @Override public void run() { rebalanceLoop(); } + }, "chaos-consumer-rebalancer"); + rebalancer.setDaemon(true); + rebalancer.start(); + } + + @Override + public void stopGracefully(Duration timeout) { + running = false; + try { + rebalancer.join(timeout.toMillis() / 2); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + for (Thread t : consumers) { + if (t != null) { + t.interrupt(); + try { t.join(500L); } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + } + } + + private Thread newConsumer(final int index) { + Thread t = new Thread(new Runnable() { + @Override + public void run() { + long r = ThreadLocalRandom.current().nextLong(); + while (running && !Thread.currentThread().isInterrupted()) { + for (int i = 0; i < 100; i++) { + r = r * 6364136223846793005L + 1442695040888963407L; + } + sink.addAndGet(r); + Thread.yield(); + } + } + }, "chaos-consumer-" + index); + t.setDaemon(true); + return t; + } + + private void rebalanceLoop() { + int offset = 0; + while (running) { + try { + Thread.sleep(REBALANCE_INTERVAL_MS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + if (!running) return; + + // Interrupt all victims simultaneously (burst) + Thread[] victims = new Thread[BURST_SIZE]; + for (int i = 0; i < BURST_SIZE; i++) { + int idx = offset + i; + victims[i] = consumers[idx]; + consumers[idx] = null; + if (victims[i] != null) { + victims[i].interrupt(); + } + } + // Start replacements before waiting for victims to fully exit + for (int i = 0; i < BURST_SIZE; i++) { + int idx = offset + i; + consumers[idx] = newConsumer(idx); + consumers[idx].start(); + } + // Now wait for victims + for (Thread victim : victims) { + if (victim != null) { + try { victim.join(1_000L); } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + } + } + offset = (offset + BURST_SIZE) % GROUP_SIZE; + } + } +} From fdaae3ff3250d8b51da29dc0e762dbcaba211398 Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 29 May 2026 10:21:08 +0200 Subject: [PATCH 04/12] fix(chaos): interrupt rebalancer on stop in ConsumerGroupAntagonist --- .../com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java | 1 + 1 file changed, 1 insertion(+) diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java index 1c0ffcc35..f356255f6 100644 --- a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java @@ -56,6 +56,7 @@ public void start() { @Override public void stopGracefully(Duration timeout) { running = false; + rebalancer.interrupt(); try { rebalancer.join(timeout.toMillis() / 2); } catch (InterruptedException e) { From b6cc5608b3cb34631f5151d53aaf492cb7386f0b Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 29 May 2026 10:23:22 +0200 Subject: [PATCH 05/12] feat(chaos): add HiddenClassChurnAntagonist --- .../chaos/HiddenClassChurnAntagonist.java | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/HiddenClassChurnAntagonist.java diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/HiddenClassChurnAntagonist.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/HiddenClassChurnAntagonist.java new file mode 100644 index 000000000..9482f5ce5 --- /dev/null +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/HiddenClassChurnAntagonist.java @@ -0,0 +1,130 @@ +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.datadoghq.profiler.chaos; + +import org.objectweb.asm.ClassWriter; +import org.objectweb.asm.MethodVisitor; +import org.objectweb.asm.Opcodes; + +import java.lang.invoke.MethodHandles; +import java.lang.reflect.Array; +import java.lang.reflect.Method; +import java.time.Duration; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Generates hidden classes via {@code MethodHandles.Lookup.defineHiddenClass} + * (Java 15+) and immediately drops all references to make them GC-eligible. + * + *

Gracefully no-ops on JDKs that do not support hidden classes. + * + *

Targets: StringDictionary concurrent eviction racing hidden-class GC; + * class-map lookup for a class being unloaded while the profiler dumps. + */ +public final class HiddenClassChurnAntagonist implements Antagonist { + + private static final Method DEFINE_HIDDEN_CLASS; + private static final Object EMPTY_OPTIONS; // ClassOption[0] at runtime + + static { + Method m = null; + Object opts = null; + try { + Class optionClass = Class.forName("java.lang.invoke.MethodHandles$Lookup$ClassOption"); + // Build ClassOption[] type without arrayType() (requires Java 12+) + Object emptyArray = Array.newInstance(optionClass, 0); + m = MethodHandles.Lookup.class.getMethod( + "defineHiddenClass", byte[].class, boolean.class, emptyArray.getClass()); + opts = emptyArray; + } catch (Throwable t) { + // Java < 15: defineHiddenClass not available + } + DEFINE_HIDDEN_CLASS = m; + EMPTY_OPTIONS = opts; + } + + private static final AtomicLong COUNTER = new AtomicLong(); + private volatile boolean running; + private Thread driver; + private final AtomicLong sink = new AtomicLong(); + + @Override + public String name() { + return "hidden-class-churn"; + } + + @Override + public void start() { + running = true; + driver = new Thread(new Runnable() { + @Override public void run() { loop(); } + }, "chaos-hidden-class-churn"); + driver.setDaemon(true); + driver.start(); + } + + @Override + public void stopGracefully(Duration timeout) { + running = false; + try { + driver.join(timeout.toMillis()); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + private void loop() { + if (DEFINE_HIDDEN_CLASS == null) { + System.out.println("[chaos] hidden-class-churn: skipping (defineHiddenClass not available on this JDK)"); + return; + } + MethodHandles.Lookup lookup = MethodHandles.lookup(); + while (running) { + long uid = COUNTER.incrementAndGet(); + byte[] bytecode = generateClass(uid); + try { + // invoke(lookup, byte[], boolean, ClassOption[]) → MethodHandles.Lookup + MethodHandles.Lookup hiddenLookup = (MethodHandles.Lookup) + DEFINE_HIDDEN_CLASS.invoke(lookup, + new Object[]{bytecode, Boolean.FALSE, EMPTY_OPTIONS}); + Class klass = hiddenLookup.lookupClass(); + // Invoke compute() to force JIT registration in profiler tables + Object result = klass.getMethod("compute", long.class).invoke(null, uid); + sink.addAndGet((long) result); + // klass and hiddenLookup go out of scope → hidden class GC-eligible + } catch (Throwable t) { + // transient; JVM crash is the signal we watch for + } + } + } + + private static byte[] generateClass(long uid) { + // Unique internal name per class so each one gets a distinct entry + // in the profiler's class map / StringDictionary. + String internalName = "chaos/hidden/Gen" + uid; + ClassWriter cw = new ClassWriter(ClassWriter.COMPUTE_FRAMES | ClassWriter.COMPUTE_MAXS); + cw.visit(Opcodes.V11, + Opcodes.ACC_PUBLIC | Opcodes.ACC_FINAL, + internalName, null, "java/lang/Object", null); + MethodVisitor mv = cw.visitMethod( + Opcodes.ACC_PUBLIC | Opcodes.ACC_STATIC, "compute", "(J)J", null, null); + mv.visitCode(); + mv.visitVarInsn(Opcodes.LLOAD, 0); + mv.visitLdcInsn(1103515245L); + mv.visitInsn(Opcodes.LMUL); + mv.visitLdcInsn(12345L); + mv.visitInsn(Opcodes.LADD); + mv.visitInsn(Opcodes.LRETURN); + mv.visitMaxs(0, 0); + mv.visitEnd(); + cw.visitEnd(); + return cw.toByteArray(); + } +} From 4fe6a47b96134d3f39763e23a25ccd7c92271740 Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 29 May 2026 10:24:57 +0200 Subject: [PATCH 06/12] feat(chaos): add DirectMemoryAntagonist --- .../chaos/DirectMemoryAntagonist.java | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/DirectMemoryAntagonist.java diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/DirectMemoryAntagonist.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/DirectMemoryAntagonist.java new file mode 100644 index 000000000..d82e154e5 --- /dev/null +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/DirectMemoryAntagonist.java @@ -0,0 +1,111 @@ +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.datadoghq.profiler.chaos; + +import java.nio.ByteBuffer; +import java.time.Duration; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Drives direct (off-heap) memory allocation at high rate via + * {@link ByteBuffer#allocateDirect}. Two concurrent modes: + *

+ * + *

Targets: liveness-table overflow under high off-heap churn; jweak ref + * release racing realloc failure cleanup. + */ +public final class DirectMemoryAntagonist implements Antagonist { + + private static final int RING_SIZE = 32; + private static final int[] RING_SIZES_BYTES = {4_096, 65_536, 524_288}; + private static final int BURST_SIZE_BYTES = 1_024; + + private volatile boolean running; + private Thread ringDriver; + private Thread burstDriver; + private final AtomicLong sink = new AtomicLong(); + + @Override + public String name() { + return "direct-memory"; + } + + @Override + public void start() { + running = true; + ringDriver = new Thread(new Runnable() { + @Override public void run() { ringLoop(); } + }, "chaos-direct-memory-ring"); + ringDriver.setDaemon(true); + ringDriver.start(); + burstDriver = new Thread(new Runnable() { + @Override public void run() { burstLoop(); } + }, "chaos-direct-memory-burst"); + burstDriver.setDaemon(true); + burstDriver.start(); + } + + @Override + public void stopGracefully(Duration timeout) { + running = false; + long slice = timeout.toMillis() / 2; + try { + ringDriver.join(slice); + burstDriver.join(slice); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + private void ringLoop() { + ByteBuffer[] ring = new ByteBuffer[RING_SIZE]; + int slot = 0; + int sizeIdx = 0; + while (running) { + ring[slot] = null; // evict oldest — GC + Cleaner handles dealloc + try { + ByteBuffer buf = ByteBuffer.allocateDirect(RING_SIZES_BYTES[sizeIdx]); + buf.put(0, (byte) slot); // touch to prevent elision + sink.addAndGet(buf.capacity()); + ring[slot] = buf; + } catch (OutOfMemoryError e) { + // Direct memory exhausted; clear ring to allow recovery + for (int i = 0; i < RING_SIZE; i++) { + ring[i] = null; + } + } + slot = (slot + 1) % RING_SIZE; + sizeIdx = (sizeIdx + 1) % RING_SIZES_BYTES.length; + } + for (int i = 0; i < RING_SIZE; i++) { + ring[i] = null; + } + } + + private void burstLoop() { + long acc = 0L; + while (running) { + try { + ByteBuffer buf = ByteBuffer.allocateDirect(BURST_SIZE_BYTES); + buf.put(0, (byte) 42); + acc += buf.limit(); + // buf goes out of scope; GC + Cleaner handles dealloc + } catch (OutOfMemoryError e) { + Thread.yield(); // let GC recover direct memory + } + } + sink.addAndGet(acc); + } +} From 8814f60b91b8e0272c686e9f9593ce90082b0e4b Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 29 May 2026 10:26:46 +0200 Subject: [PATCH 07/12] feat(chaos): add WeakRefWaveAntagonist --- .../profiler/chaos/WeakRefWaveAntagonist.java | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/WeakRefWaveAntagonist.java diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/WeakRefWaveAntagonist.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/WeakRefWaveAntagonist.java new file mode 100644 index 000000000..cc0d5158f --- /dev/null +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/WeakRefWaveAntagonist.java @@ -0,0 +1,117 @@ +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.datadoghq.profiler.chaos; + +import java.lang.ref.WeakReference; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Alternates fill and drop phases to drive wave-pattern weak-reference + * allocation and GC. A concurrent reader thread walks the weak-ref list + * during the fill phase. + * + *

Fill: allocates 10 k {@code byte[]} objects with strong + weak refs. + * Drop: releases strong refs (objects become weakly reachable), calls + * {@link System#gc()}, counts survivors. + * + *

Targets: jweak ref leak during liveness table overflow; concurrent + * read of a weakref cleared mid-wave; liveness table clearAll() race. + */ +public final class WeakRefWaveAntagonist implements Antagonist { + + private static final int WAVE_SIZE = 10_000; + private static final int[] OBJECT_SIZES = {64, 256, 1_024, 4_096}; + + private volatile boolean running; + private Thread waveDriver; + private Thread reader; + + // Written by waveDriver, read by reader. Volatile for visibility. + private volatile List> currentWave = new ArrayList>(); + private final AtomicLong sink = new AtomicLong(); + + @Override + public String name() { + return "weakref-wave"; + } + + @Override + public void start() { + running = true; + reader = new Thread(new Runnable() { + @Override public void run() { readerLoop(); } + }, "chaos-weakref-reader"); + reader.setDaemon(true); + reader.start(); + waveDriver = new Thread(new Runnable() { + @Override public void run() { waveLoop(); } + }, "chaos-weakref-wave"); + waveDriver.setDaemon(true); + waveDriver.start(); + } + + @Override + public void stopGracefully(Duration timeout) { + running = false; + long slice = timeout.toMillis() / 2; + try { + waveDriver.join(slice); + reader.join(slice); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + private void waveLoop() { + while (running) { + // Fill: allocate objects, hold both strong and weak refs + List strongRefs = new ArrayList(WAVE_SIZE); + List> weakRefs = new ArrayList>(WAVE_SIZE); + for (int i = 0; i < WAVE_SIZE && running; i++) { + int size = OBJECT_SIZES[i % OBJECT_SIZES.length]; + byte[] obj = new byte[size]; + obj[0] = (byte) i; + strongRefs.add(obj); + weakRefs.add(new WeakReference(obj)); + } + // Publish filled list to reader + currentWave = weakRefs; + + // Drop: release strong refs — objects now only weakly reachable + strongRefs.clear(); + System.gc(); + + // Count survivors (concurrent with reader on same list) + long alive = 0; + for (WeakReference ref : weakRefs) { + if (ref.get() != null) alive++; + } + sink.addAndGet(alive); + + // Replace shared reference; weakRefs goes out of scope + currentWave = new ArrayList>(); + } + } + + private void readerLoop() { + while (running) { + List> wave = currentWave; // snapshot + long alive = 0; + for (WeakReference ref : wave) { + if (ref.get() != null) alive++; + } + sink.addAndGet(alive); + Thread.yield(); + } + } +} From 6e5e1e58ffcd6f8b9247b4ea451f682df6a6f69b Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 29 May 2026 10:33:08 +0200 Subject: [PATCH 08/12] feat(chaos): wire up 6 new production-pattern antagonists Co-Authored-By: Claude Sonnet 4.6 --- .gitlab/reliability/chaos_check.sh | 4 ++-- .../profiler/chaos/ContextHopAntagonist.java | 2 +- .../java/com/datadoghq/profiler/chaos/Main.java | 12 ++++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/.gitlab/reliability/chaos_check.sh b/.gitlab/reliability/chaos_check.sh index 881dd849d..822fa5985 100755 --- a/.gitlab/reliability/chaos_check.sh +++ b/.gitlab/reliability/chaos_check.sh @@ -74,12 +74,12 @@ case $CONFIG in echo "Running with profiler only" ENABLEMENT="-Ddd.profiling.enabled=true -Ddd.trace.enabled=false" # @Trace is a no-op without the tracer, so trace-context is excluded here. - ANTAGONISTS="thread-churn,alloc-storm,vthread-churn,classloader-churn" + ANTAGONISTS="thread-churn,alloc-storm,vthread-churn,classloader-churn,bounded-pool,context-hop,consumer-group,hidden-class-churn,direct-memory,weakref-wave" ;; profiler+tracer) echo "Running with profiler and tracer" ENABLEMENT="-Ddd.profiling.enabled=true -Ddd.trace.enabled=true" - ANTAGONISTS="thread-churn,alloc-storm,vthread-churn,classloader-churn,trace-context" + ANTAGONISTS="thread-churn,alloc-storm,vthread-churn,classloader-churn,trace-context,bounded-pool,context-hop,consumer-group,hidden-class-churn,direct-memory,weakref-wave" ;; *) echo "Unknown configuration: $CONFIG" diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ContextHopAntagonist.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ContextHopAntagonist.java index 07bca2b4a..439444dac 100644 --- a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ContextHopAntagonist.java +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ContextHopAntagonist.java @@ -90,7 +90,7 @@ private void startChain(final int chainId) { .thenRunAsync(new Runnable() { @Override public void run() { startChain(chainId); } }, poolA) - .exceptionally(null); + .exceptionally(t -> null); } private void hop(long value) { diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/Main.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/Main.java index fdae828f6..a89febed7 100644 --- a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/Main.java +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/Main.java @@ -76,6 +76,18 @@ private static Antagonist create(String name) { return new ClassLoaderChurnAntagonist(); case "trace-context": return new TraceContextAntagonist(); + case "bounded-pool": + return new BoundedThreadPoolAntagonist(); + case "context-hop": + return new ContextHopAntagonist(); + case "consumer-group": + return new ConsumerGroupAntagonist(); + case "hidden-class-churn": + return new HiddenClassChurnAntagonist(); + case "direct-memory": + return new DirectMemoryAntagonist(); + case "weakref-wave": + return new WeakRefWaveAntagonist(); // Deferred: dlopen-churn (needs per-arch dummy .so built in CI prep). default: throw new IllegalArgumentException("unknown antagonist: " + name); From 534f284e65cc172ad2738f7e4404dab20d7ed351 Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 29 May 2026 12:22:21 +0200 Subject: [PATCH 09/12] fix(chaos): use lookup package for hidden class internal name --- .../chaos/HiddenClassChurnAntagonist.java | 2 +- ...9-generate-hidden-classes-in-the-lookup.md | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 docs/sphinx/specs/2026-05-29-generate-hidden-classes-in-the-lookup.md diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/HiddenClassChurnAntagonist.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/HiddenClassChurnAntagonist.java index 9482f5ce5..d63729c18 100644 --- a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/HiddenClassChurnAntagonist.java +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/HiddenClassChurnAntagonist.java @@ -108,7 +108,7 @@ private void loop() { private static byte[] generateClass(long uid) { // Unique internal name per class so each one gets a distinct entry // in the profiler's class map / StringDictionary. - String internalName = "chaos/hidden/Gen" + uid; + String internalName = "com/datadoghq/profiler/chaos/Gen" + uid; ClassWriter cw = new ClassWriter(ClassWriter.COMPUTE_FRAMES | ClassWriter.COMPUTE_MAXS); cw.visit(Opcodes.V11, Opcodes.ACC_PUBLIC | Opcodes.ACC_FINAL, diff --git a/docs/sphinx/specs/2026-05-29-generate-hidden-classes-in-the-lookup.md b/docs/sphinx/specs/2026-05-29-generate-hidden-classes-in-the-lookup.md new file mode 100644 index 000000000..c5b4e27b6 --- /dev/null +++ b/docs/sphinx/specs/2026-05-29-generate-hidden-classes-in-the-lookup.md @@ -0,0 +1,20 @@ +# generate-hidden-classes-in-the-lookup-packag + +## Finding + +`HiddenClassChurnAntagonist.generateClass()` set `internalName = "chaos/hidden/Gen" + uid`, +placing the generated class in package `chaos.hidden`. The lookup object obtained from +`MethodHandles.lookup()` inside `HiddenClassChurnAntagonist` belongs to package +`com.datadoghq.profiler.chaos`. On Java 15+, `Lookup.defineHiddenClass` requires the +bytecode to declare the same package as the lookup class; a mismatch throws +`IllegalArgumentException`, which was silently swallowed by `catch (Throwable t)`. As a +result the antagonist looped without ever defining a hidden class. + +## Fix + +Change `internalName` to `"com/datadoghq/profiler/chaos/Gen" + uid` so the generated +class's package matches the lookup class. + +## File + +`ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/HiddenClassChurnAntagonist.java`, line 111 From 4d59e04b395462a77234127c1ce45bc7ca770984 Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 29 May 2026 13:07:18 +0200 Subject: [PATCH 10/12] fix(chaos): interrupt recycler before join; deadline-based consumer join; fix spec typo Co-Authored-By: Claude Sonnet 4.6 --- .../profiler/chaos/BoundedThreadPoolAntagonist.java | 1 + .../datadoghq/profiler/chaos/ConsumerGroupAntagonist.java | 5 ++++- .../2026-05-29-generate-hidden-classes-in-the-lookup.md | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/BoundedThreadPoolAntagonist.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/BoundedThreadPoolAntagonist.java index 4cafcc908..045bf090e 100644 --- a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/BoundedThreadPoolAntagonist.java +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/BoundedThreadPoolAntagonist.java @@ -60,6 +60,7 @@ public void start() { @Override public void stopGracefully(Duration timeout) { running = false; + recycler.interrupt(); try { recycler.join(timeout.toMillis()); } catch (InterruptedException e) { diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java index f356255f6..2408f5415 100644 --- a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/ConsumerGroupAntagonist.java @@ -56,6 +56,7 @@ public void start() { @Override public void stopGracefully(Duration timeout) { running = false; + long deadline = System.currentTimeMillis() + timeout.toMillis(); rebalancer.interrupt(); try { rebalancer.join(timeout.toMillis() / 2); @@ -65,7 +66,9 @@ public void stopGracefully(Duration timeout) { for (Thread t : consumers) { if (t != null) { t.interrupt(); - try { t.join(500L); } catch (InterruptedException e) { + long remaining = Math.max(0L, deadline - System.currentTimeMillis()); + if (remaining == 0L) break; + try { t.join(remaining); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } diff --git a/docs/sphinx/specs/2026-05-29-generate-hidden-classes-in-the-lookup.md b/docs/sphinx/specs/2026-05-29-generate-hidden-classes-in-the-lookup.md index c5b4e27b6..096363eb8 100644 --- a/docs/sphinx/specs/2026-05-29-generate-hidden-classes-in-the-lookup.md +++ b/docs/sphinx/specs/2026-05-29-generate-hidden-classes-in-the-lookup.md @@ -1,4 +1,4 @@ -# generate-hidden-classes-in-the-lookup-packag +# generate-hidden-classes-in-the-lookup-package ## Finding From bc52f9a60fb719f7926e063e613f68401bae7913 Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Fri, 29 May 2026 23:06:01 +0200 Subject: [PATCH 11/12] =?UTF-8?q?test(chaos):=20add=20logs-backend=20crash?= =?UTF-8?q?=20simulation=20(Java=2025=20thread-churn=20=C3=97=20dump)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Layer 1: C++ gtest stress reproducer (stress_threadLifecycle_ut) drives ProfiledThread, ThreadFilter, CallTraceStorage, and Dictionary concurrently without a JVM so ASan/TSan can locate the UAF at its origin. Layer 2: DumpStormAntagonist chaos antagonist spawns 96 short-lived, uniquely-named threads to stress the thread-name/dump path under a real JVM. Chaos CI matrix extended with CHAOS_JDK dimension (21 + 25). CI fixes: REASON dotenv key includes CHAOS_JDK to prevent key collisions; tee replaced with redirect to preserve exit status; JDK version verified after sdk use to catch silent wrong-JDK fallback. Co-Authored-By: Claude Sonnet 4.6 --- .gitlab/reliability/.gitlab-ci.yml | 9 +- .gitlab/reliability/chaos_check.sh | 13 +- .../test/cpp/stress_threadLifecycle_ut.cpp | 174 ++++++ .../profiler/chaos/DumpStormAntagonist.java | 97 ++++ .../com/datadoghq/profiler/chaos/Main.java | 2 + ...29-logs-backend-crash-simulation-design.md | 186 ++++++ ...5-29-logs-backend-crash-simulation-plan.md | 529 ++++++++++++++++++ 7 files changed, 1003 insertions(+), 7 deletions(-) create mode 100644 ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp create mode 100644 ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/DumpStormAntagonist.java create mode 100644 doc/plans/2026-05-29-logs-backend-crash-simulation-design.md create mode 100644 doc/plans/2026-05-29-logs-backend-crash-simulation-plan.md diff --git a/.gitlab/reliability/.gitlab-ci.yml b/.gitlab/reliability/.gitlab-ci.yml index c02947a96..4ae8bd430 100644 --- a/.gitlab/reliability/.gitlab-ci.yml +++ b/.gitlab/reliability/.gitlab-ci.yml @@ -78,16 +78,17 @@ reliability-aarch64: matrix: - CONFIG: ["profiler", "profiler+tracer"] ALLOCATOR: ["gmalloc", "jemalloc", "tcmalloc"] + CHAOS_JDK: ["21.0.3-tem", "25.0.3-tem"] script: - set +e - - echo "runtime=${RUNTIME}, config=${CONFIG}, allocator=${ALLOCATOR}, arch=${ARCH}" - - .gitlab/reliability/chaos_check.sh "$RUNTIME" "$CONFIG" "$ALLOCATOR" 2>err.log | tee out.log + - echo "runtime=${RUNTIME}, config=${CONFIG}, allocator=${ALLOCATOR}, arch=${ARCH}, jdk=${CHAOS_JDK}" + - CHAOS_JDK="${CHAOS_JDK}" .gitlab/reliability/chaos_check.sh "$RUNTIME" "$CONFIG" "$ALLOCATOR" 2>err.log 1>out.log - REASON=$(cat err.log | grep "FAIL:" | cut -f2 -d':') || true - - if [ -n "${REASON}" ]; then echo "REASON_${CONFIG}_${ALLOCATOR}_${ARCH}Xchaos=${REASON}" | tr '+' '_' >> build.env; exit 1; fi + - if [ -n "${REASON}" ]; then echo "REASON_${CONFIG}_${ALLOCATOR}_${ARCH}_${CHAOS_JDK//./_}Xchaos=${REASON}" | tr '+' '_' >> build.env; exit 1; fi after_script: - | if [[ "$CI_JOB_STATUS" == "failed" ]]; then - grep -q "$(printf 'REASON_%s_%s_%sXchaos=' "${CONFIG}" "${ALLOCATOR}" "${ARCH}" | tr '+' '_')" build.env 2>/dev/null || echo "REASON_${CONFIG}_${ALLOCATOR}_${ARCH}Xchaos=Unknown failure, perhaps timeout" | tr '+' '_' >> build.env + grep -q "$(printf 'REASON_%s_%s_%s_%sXchaos=' "${CONFIG}" "${ALLOCATOR}" "${ARCH}" "${CHAOS_JDK//./_}" | tr '+' '_')" build.env 2>/dev/null || echo "REASON_${CONFIG}_${ALLOCATOR}_${ARCH}_${CHAOS_JDK//./_}Xchaos=Unknown failure, perhaps timeout" | tr '+' '_' >> build.env fi artifacts: name: "chaos-results-${ARCH}" diff --git a/.gitlab/reliability/chaos_check.sh b/.gitlab/reliability/chaos_check.sh index 822fa5985..8096cc11c 100755 --- a/.gitlab/reliability/chaos_check.sh +++ b/.gitlab/reliability/chaos_check.sh @@ -13,7 +13,14 @@ echo "Chaos run: runtime=${RUNTIME}s config=${CONFIG} allocator=${ALLOCATOR}" curl -s "https://get.sdkman.io" | bash source "/root/.sdkman/bin/sdkman-init.sh" 1>/dev/null 2>/dev/null -timeout 300 sdk install java 21.0.3-tem 1>/dev/null 2>/dev/null +CHAOS_JDK="${CHAOS_JDK:-21.0.3-tem}" +timeout 300 sdk install java "${CHAOS_JDK}" 1>/dev/null 2>/dev/null +sdk use java "${CHAOS_JDK}" +ACTIVE_JDK=$(java -version 2>&1 | head -1) +if [[ "$ACTIVE_JDK" != *"${CHAOS_JDK%%-*}"* ]]; then + echo "FAIL:wrong JDK active (expected ${CHAOS_JDK}, got: ${ACTIVE_JDK})" >&2 + exit 1 +fi # Resolve ddprof.jar: prefer local build artifact, fall back to Maven snapshot. # Running mvn from /tmp avoids the empty pom.xml at the repo root. @@ -74,12 +81,12 @@ case $CONFIG in echo "Running with profiler only" ENABLEMENT="-Ddd.profiling.enabled=true -Ddd.trace.enabled=false" # @Trace is a no-op without the tracer, so trace-context is excluded here. - ANTAGONISTS="thread-churn,alloc-storm,vthread-churn,classloader-churn,bounded-pool,context-hop,consumer-group,hidden-class-churn,direct-memory,weakref-wave" + ANTAGONISTS="thread-churn,alloc-storm,vthread-churn,classloader-churn,bounded-pool,context-hop,consumer-group,hidden-class-churn,direct-memory,weakref-wave,dump-storm" ;; profiler+tracer) echo "Running with profiler and tracer" ENABLEMENT="-Ddd.profiling.enabled=true -Ddd.trace.enabled=true" - ANTAGONISTS="thread-churn,alloc-storm,vthread-churn,classloader-churn,trace-context,bounded-pool,context-hop,consumer-group,hidden-class-churn,direct-memory,weakref-wave" + ANTAGONISTS="thread-churn,alloc-storm,vthread-churn,classloader-churn,trace-context,bounded-pool,context-hop,consumer-group,hidden-class-churn,direct-memory,weakref-wave,dump-storm" ;; *) echo "Unknown configuration: $CONFIG" diff --git a/ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp b/ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp new file mode 100644 index 000000000..6325c827a --- /dev/null +++ b/ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp @@ -0,0 +1,174 @@ +/* + * Copyright 2026, Datadog, Inc. + * SPDX-License-Identifier: Apache-2.0 + * + * Layer-1 reproducer for the logs-backend crash (Java 25, thread-churn × + * recording-dump). Drives the profiler's own thread-lifecycle and dump data + * structures concurrently, with NO JVM in the process, so ASan/TSan can flag + * the use-after-free / race at its origin. + * + * See doc/plans/2026-05-29-logs-backend-crash-simulation-design.md + */ +#include "gtest/gtest.h" + +#ifdef __linux__ + +#include "callTraceStorage.h" +#include "callTraceHashTable.h" +#include "dictionary.h" +#include "threadFilter.h" +#include "thread.h" +#include "arch.h" + +#include +#include +#include +#include +#include +#include +#include "../../main/cpp/gtest_crash_handler.h" + +// Crash handler test name (installed in each multithreaded test below). +static constexpr const char STRESS_TEST_NAME[] = "StressThreadLifecycle"; + +// Number of churn workers and iterations per worker. +static constexpr int kChurnWorkers = 16; +static constexpr int kChurnIterations = 2000; + +// Shared dump-side structures. They are exercised concurrently by the dump +// thread and (for the put() path) by the churn workers. +static CallTraceStorage g_storage; +static Dictionary g_dict; +static std::atomic g_run{false}; + +// Record a small fake call trace plus a dictionary lookup, mirroring what the +// profiler does for every sample. ASGCT_CallFrame uses `bci` (jint) and the +// `method_id` union member (see vmEntry.h). +static void record_trace(int salt) { + ASGCT_CallFrame frames[4]; + std::memset(frames, 0, sizeof(frames)); + for (int i = 0; i < 4; i++) { + frames[i].bci = i + salt; + frames[i].method_id = + reinterpret_cast(static_cast(0x1000 + i + salt)); + } + g_storage.put(4, frames, false, 1); + g_dict.lookup("logs-backend-sim"); +} + +// onThreadStart -> work -> onThreadEnd loop, mirroring the profiler's per-thread +// lifecycle: initCurrentThread / current / register filter slot / setFilterSlotId +// / (work) / remove + unregister / release. +// +// Thread-name / ThreadInfo coverage: this native reproducer does not call +// ThreadInfo::set() or updateJavaThreadNames(). That path is covered by the +// JVM-level DumpStormAntagonist antagonist (Layer 2). A clean ASan/TSan run +// here is not conclusive for the thread-name path. +static void churn_worker(ThreadFilter* filter, bool with_dump) { + while (!g_run.load(std::memory_order_acquire)) { } + for (int i = 0; i < kChurnIterations && g_run.load(std::memory_order_relaxed); i++) { + ProfiledThread::initCurrentThread(); + ProfiledThread* self = ProfiledThread::current(); + ASSERT_NE(nullptr, self); + + ThreadFilter::SlotID slot = filter->registerThread(); + if (slot >= 0) { + self->setFilterSlotId(slot); + filter->add(self->tid(), slot); + } + + if (with_dump) { + record_trace(i); + } + std::this_thread::yield(); + + if (slot >= 0) { + filter->remove(slot); + filter->unregisterThread(slot); + } + self->setFilterSlotId(-1); + ProfiledThread::release(); + } +} + +// Continuously dumps the trace storage and clears both the dictionary and the +// storage, racing against concurrent put() / lookup() from churn workers. +// +// Intentional divergence from production: production wraps clear() in +// lockAll()/unlockAll() (profiler.cpp) so clear() never races put(). Here +// we drop that guard deliberately so ASan/TSan can observe a UAF at its +// origin. A crash in this reproducer may surface a real bug or a test-only +// race; cross-reference with the Layer-2 DumpStormAntagonist (JVM-level) +// to confirm which it is. The CallTraceStorage concurrency contract +// (refcount-guard + CriticalSection) prevents permanent corruption from +// clear()-vs-put() racing, so this does not cause silent data loss. +static void dump_thread() { + while (g_run.load(std::memory_order_relaxed)) { + g_storage.processTraces([](const std::unordered_set& traces) { + volatile size_t n = 0; + for (CallTrace* t : traces) { + if (t && t != CallTraceSample::PREPARING) { + n += 1; + } + } + (void)n; + }); + // dict.clear() races dict.lookup() on churn threads — intentional (see above). + g_dict.clear(); + g_storage.clear(); + } +} + +TEST(StressThreadLifecycle, Smoke) { + CallTraceStorage storage; + Dictionary dict; + dict.lookup("smoke"); + storage.clear(); + SUCCEED(); +} + +TEST(StressThreadLifecycle, ChurnOnly) { + installGtestCrashHandler(); + + ThreadFilter filter; + filter.init("*"); + ASSERT_TRUE(filter.enabled()); + + g_run.store(true, std::memory_order_release); + std::vector ts; + for (int t = 0; t < kChurnWorkers; t++) { + ts.emplace_back(churn_worker, &filter, false); + } + for (auto& t : ts) { + t.join(); + } + g_run.store(false); + + restoreDefaultSignalHandlers(); + SUCCEED(); +} + +TEST(StressThreadLifecycle, ChurnDuringDump) { + installGtestCrashHandler(); + + ThreadFilter filter; + filter.init("*"); + ASSERT_TRUE(filter.enabled()); + + g_run.store(true, std::memory_order_release); + std::thread dumper(dump_thread); + std::vector ts; + for (int t = 0; t < kChurnWorkers; t++) { + ts.emplace_back(churn_worker, &filter, true); + } + for (auto& t : ts) { + t.join(); + } + g_run.store(false); + dumper.join(); + + restoreDefaultSignalHandlers(); + SUCCEED(); +} + +#endif // __linux__ diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/DumpStormAntagonist.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/DumpStormAntagonist.java new file mode 100644 index 000000000..26ff07147 --- /dev/null +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/DumpStormAntagonist.java @@ -0,0 +1,97 @@ +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.datadoghq.profiler.chaos; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; + +/** + * Spawns short-lived, frequently-renamed threads that each generate distinct + * stack shapes, maximising churn in the profiler's thread-name table and + * call-trace storage right as recording chunks rotate. + * + *

Targets: {@code Recording::switchChunk/writeCpool}, + * {@code updateJavaThreadNames -> ThreadInfo::set}, {@code Dictionary::clear}. + * Pair with a short profiler recording interval so dumps fire continuously. + */ +public final class DumpStormAntagonist implements Antagonist { + + private final int concurrentThreads; + private volatile boolean running; + private Thread driver; + + public DumpStormAntagonist() { + this(96); + } + + public DumpStormAntagonist(int concurrentThreads) { + this.concurrentThreads = concurrentThreads; + } + + @Override + public String name() { + return "dump-storm"; + } + + @Override + public void start() { + running = true; + driver = new Thread(this::loop, "chaos-dump-storm"); + driver.setDaemon(true); + driver.start(); + } + + @Override + public void stopGracefully(Duration timeout) { + running = false; + try { + if (driver != null) driver.join(timeout.toMillis()); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + private void loop() { + long seq = 0; + while (running) { + List batch = new ArrayList<>(concurrentThreads); + for (int i = 0; i < concurrentThreads && running; i++) { + final long id = seq++; + Thread t = new Thread(() -> distinctStack(id, 0)); + t.setName("dump-storm-" + id); + t.setDaemon(true); + t.start(); + batch.add(t); + } + for (Thread t : batch) { + try { + t.join(500L); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + } + } + } + + // Recurse to a per-thread depth so each thread yields a unique stack shape, + // forcing new call-trace + symbol entries that the dump path must serialise. + // Depth floor is (id % 32) + 1 so id=0 (and every 32nd id) still recurse at + // least once, avoiding identical single-frame stacks for those threads. + private long distinctStack(long id, int depth) { + if (depth >= (int) (id % 32) + 1) { + long r = id; + for (int i = 0; i < 5000; i++) r = (r * 1103515245L + 12345L) & 0x7fffffffL; + return r; + } + return distinctStack(id, depth + 1) + depth; + } +} diff --git a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/Main.java b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/Main.java index a89febed7..802d1fc2e 100644 --- a/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/Main.java +++ b/ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/Main.java @@ -88,6 +88,8 @@ private static Antagonist create(String name) { return new DirectMemoryAntagonist(); case "weakref-wave": return new WeakRefWaveAntagonist(); + case "dump-storm": + return new DumpStormAntagonist(); // Deferred: dlopen-churn (needs per-arch dummy .so built in CI prep). default: throw new IllegalArgumentException("unknown antagonist: " + name); diff --git a/doc/plans/2026-05-29-logs-backend-crash-simulation-design.md b/doc/plans/2026-05-29-logs-backend-crash-simulation-design.md new file mode 100644 index 000000000..fa6fde4c7 --- /dev/null +++ b/doc/plans/2026-05-29-logs-backend-crash-simulation-design.md @@ -0,0 +1,186 @@ +# Reproducing the logs-backend profiler crashes in java-profiler testing + +**Date:** 2026-05-29 +**Status:** Design (pending review) +**Author:** investigation driven from Datadog crash-tracking telemetry + +## 1. Problem + +Datadog-internal services built from `~/dd/logs-backend` (event-platform / `*-reducer` +workloads) crash far more than our test suite or external customers surface. The goal is +to reproduce the responsible conditions inside java-profiler's own testing so we gain +confidence that fixes hold and regressions are caught. + +## 2. Evidence (crash-tracking telemetry, Org2, 7-day window, all versions) + +Source: `service:instrumentation-telemetry-data-jvm @lib_language:jvm`, crash events. + +### 2.1 Why we weren't seeing them + +The standard triage filter keys on `@tags.crash_datadog:true`. **Most logs-backend crashes +are not tagged `crash_datadog`** (they are `crash_runtime` / `crash_unactionable`), so they +were filtered out. Removing that clause surfaces hundreds of crashes in 7 days +(`event-context-writer`: 110 SIGSEGV + 14 SIGBUS; `event-heartbeat-emitter`: 27 SIGSEGV + +23 SIGBUS; `event-context-provider-skeleton`: 30 SIGSEGV; etc.). + +### 2.2 Uniform platform + +All on **JDK 25.0.3**, `env:staging`, tracer **`1.63.0-snapshot`** (dev build). This is +Datadog dogfooding on bleeding-edge Java 25. + +### 2.3 Attribution split (SIGSEGV/SIGBUS, JFR-crashing services) + +| Tags | Events | Heuristic verdict | +|---|---|---| +| `crash_runtime` + `crash_unactionable` | ~179 | "JVM bug" — **but see §2.5** | +| `crash_profiler` + `crash_datadog` | ~25 | attributed to this profiler | + +The `crash_runtime`/`crash_unactionable` tag records only **where the crashing PC landed** +(JVM code). It is **not** a root-cause determination. + +### 2.4 Crash taxonomy + +**Family 1 — PC in JVM/JDK code (the ~179):** +`JfrJavaThreadIteratorAdapter::next → jfr_emit_event → jdk.jfr.internal.periodic.*` +(JDK's own Flight Recorder iterating the thread list), `SafepointSynchronize::arm_safepoint`, +`ThreadsSMRSupport::free_list/add_thread`, `GlobalCounter::write_synchronize`, ZGC mark/relocate. + +**Family 2 — PC in this profiler's code (the ~25):** +- **A. Thread-lifecycle callbacks** — `Profiler::onThreadEnd+0x65` during `JavaThread::exit`, + `Profiler::onThreadStart`, `Profiler::updateThreadName → ThreadInfo::set`. +- **B. Dump / chunk serialization** — `Profiler::dump → FlightRecorder::dump → + Recording::switchChunk / writeCpool / writeStackTraces / cleanupUnreferencedMethods`, + `CallTraceStorage::processTraces`, `Dictionary::clear`, `std::_Rb_tree_increment` in + `writeCpool`. Frequently triggered via `JavaProfiler.dump0`. +- **C. Virtual threads** — `ExceptionSampleEvent.commit → Continuation.enterSpecial → + VirtualThread.runContinuation`. + +### 2.5 Fault-address forensics — corruption, not clean nulls + +For the Family-1 services (SIGSEGV/SIGBUS): +- **SIGBUS / `BUS_ADRALN`** (~25 events): misaligned access ⇒ the pointer value itself is + garbage read from corrupted/freed memory. Correct codegen never misaligns. +- **Non-null `SEGV_MAPERR`**: garbage 32-bit-range addresses (`0x2d6c1892`, `0x86471d9e`…), + **not** valid Java-25 heap pointers (`0x00007f…`), and they **cluster on identical + low-16-bit suffixes** (`…1d9e` ×15, `…1ea6` ×12, `…1892` ×6) — the same field offset in + the same structure read off a varying garbage base. Textbook **use-after-free / + type-confusion**. +- Only the `SI_KERNEL / 0x0` bucket (~32) looks like a plain null. + +**Conclusion (CONFIRMED):** Family 1 and Family 2 are **one underlying profiler +memory-safety bug** (heap corruption / UAF in the thread-churn × dump interaction on Java 25). +**Disabling the profiler on the affected staging services removes the crashes** — the decisive +control experiment. So the ~179 `crash_runtime`/`unactionable` events are profiler-induced too; +the attribution tag undercounted our true crash volume by roughly 8× (~25 tagged vs ~204 actual). +The crash tag merely records where the corruption happened to surface. `ThreadInfo` itself is +**correctly locked** (every method holds `_ti_lock`, `get()` deep-copies via `shared_ptr`), so +`ThreadInfo::set` is a *victim* site touching an already-poisoned heap, not the source. The +source must be found by sanitizing the interacting components together. + +### 2.6 Common trigger + +Rapid **thread creation/destruction (churn)** intersecting the **recording dump** path, on +**Java 25**. Service names corroborate: heartbeat emitters, periodic writers, reducers spin up +short-lived workers. + +## 3. Testing gaps that allowed the escape + +1. **ASan/TSan run only on isolated C++ gtest binaries** (`buildGtest{Asan,Tsan}`, no JVM in + the process). They cannot — and must not — run against a live JVM (ASan's global + malloc/signal/guard-page interception fights JVM signal handling, JIT, and deliberate + faults; TSan floods on uninstrumented JVM synchronization and its shadow region collides + with JVM mappings, incl. the Kata issue already noted in CI). So a UAF that only appears + under *live JVM + churn + dump* is invisible to the sanitizers. +2. **JVM integration / chaos / reliability run on JDK 21** (`chaos_check.sh` hardcodes + `java 21.0.3-tem`; repo default `JAVA_TEST_VERSION=21`). Every production crash is **Java 25**. +3. **No concurrent `dump()` stress.** The chaos harness only crash-detects under churn; it + never drives the dump path that Family-2 B/C fault in. + +## 4. Design — two layers, mapped onto existing structure + +Detection is split by layer because sanitizers are valid only without a JVM in the process. + +### 4.1 Layer 1 — gtest sanitizer concurrency stress (finds the bug) + +**Where:** `ddprof-lib/src/test/` (ASan+TSan via existing `buildGtestAsan`/`buildGtestTsan` +→ `gtest-asan-amd64`/`gtest-tsan-amd64`). No new infra. Existing neighbors: +`stress_callTraceStorage.cpp`, `thread_teardown_safety_ut.cpp`, `threadIdTable_ut.cpp`, +`dictionary_ut.cpp`. + +**New:** `stress_threadLifecycle_ut.cpp` — a no-JVM concurrency model of the churn × dump +interaction: +- M worker threads loop: `ProfiledThread` register → updateName → engine register + (`_cpu_engine`/`_wall_engine`) → unregister → `ProfiledThread::release()` (incl. buffer + recycling path). +- 1–2 "dump" threads loop: `CallTraceStorage::processTraces(...)` + `Dictionary::clear()` + (models `Recording::switchChunk`/`writeStackTraces`). +- Run for a fixed iteration budget; rely on ASan (UAF at origin) and TSan (data race) as the + oracle, plus existing crash-on-signal. + +Scope note: drive the profiler's *own* data structures directly. Anything requiring genuine +JVMTI inputs (a real `jthread`) stays in Layer 2. + +### 4.2 Layer 2 — chaos antagonist + reliability (proves the real scenario, guards regressions) + +**Where:** `ddprof-stresstest/src/chaos/java/.../chaos/` + `.gitlab/reliability/`. No +sanitizer. Existing: `ThreadChurnAntagonist`, `VirtualThreadChurnAntagonist`, `gmalloc` +guard-allocator already in the matrix. + +**Changes:** +1. **`DumpStormAntagonist`** — drives frequent recording rotation/dump concurrently with + churn. Harness is black-box (patched `dd-java-agent`), so trigger dumps via a short + profiler recording/upload interval rather than calling `dump0` directly. Exercises + `Recording::switchChunk/writeCpool`, `updateJavaThreadNames → ThreadInfo::set`, + `Dictionary::clear`. Register in `Main.create()`. +2. **Add Java 25 to the chaos/reliability matrix** — `chaos_check.sh` currently installs only + `21.0.3-tem`. Add a 25.x cell (keep 21 as control). This alone may reproduce the crash. + +The `gmalloc` allocator already in the matrix provides JVM-safe guard-page detection of +overflows/UAF for Layer 2, complementing crash-on-signal. + +## 5. Out of scope / explicitly not done + +- **Not** chasing Family-1 PCs as JDK bugs — the control experiment (profiler off ⇒ no + crashes) confirms they are profiler-induced, not JDK bugs. +- **No** ASan/TSan against a live JVM (technically invalid — see §3.1). +- **No** source-scan of logs-backend; telemetry already gave the exact code paths. +- **No** new test-only production APIs unless proposed and approved first. + +## 6. Validation plan + +- Layer 1: new stress test fails (ASan/TSan report) on current `main`; passes after the fix. +- Layer 2: chaos+`DumpStorm` on **Java 25** reproduces a crash (or `gmalloc` fault) on current + `main`; clean after the fix; Java 21 cell stays green throughout (control). +- Cross-check: **DONE** — disabling the profiler on the affected staging services removes the + crashes, confirming the single-root-cause (profiler) hypothesis. + +## 7. Open questions + +- Exact black-box mechanism to force frequent dumps in the chaos harness (short recording + interval config vs. a benign agent hook) — to resolve during implementation. +- Whether a Java-25 chaos cell needs a new CI build image / SDKMAN candidate availability. + +## 8. Reproduction results + +### Layer 1 — ASan gtest (JDK 25, aarch64/glibc, 2026-05-29) + +Command: `./utils/run-docker-tests.sh --config=asan --gtest --jdk=25 --mount` + +Results: +- `stress_threadLifecycle_ut` compiled and linked under ASan: **PASS** +- `StressThreadLifecycle.Smoke`: **PASS** (no ASan report) +- `StressThreadLifecycle.ChurnOnly`: **PASS** (no ASan report) +- `StressThreadLifecycle.ChurnDuringDump`: **PASS** (no ASan report) + +**Interpretation:** The profiler's own data structures (`ProfiledThread`, `ThreadFilter`, +`CallTraceStorage`, `Dictionary`) are race-free in isolation at the unit level. The UAF +requires **real JVMTI inputs** — live `jthread` handles, JVM-side native thread lifecycle, +JVMTI callbacks. Layer 2 (chaos antagonists under a real JVM on Java 25) is the decisive +reproducer. + +Pre-existing unrelated failure: `CollapsingSleepTest > testSleep()` (Java integration test, +not touched by this work). + +### Layer 2 — chaos + Java 25 CI cell + +Pending: Task 8 (CI matrix change) not yet run. diff --git a/doc/plans/2026-05-29-logs-backend-crash-simulation-plan.md b/doc/plans/2026-05-29-logs-backend-crash-simulation-plan.md new file mode 100644 index 000000000..d74bbb3d8 --- /dev/null +++ b/doc/plans/2026-05-29-logs-backend-crash-simulation-plan.md @@ -0,0 +1,529 @@ +# logs-backend Crash Simulation Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Reproduce, inside java-profiler's own test suite, the confirmed profiler memory-safety bug that crashes logs-backend services (Java 25, thread-churn × recording-dump), so we can find the root cause and guard against regressions. + +**Architecture:** Two layers. **Layer 1** is a no-JVM gtest concurrency stress (`stress_threadLifecycle_ut.cpp`) that drives the profiler's own thread-lifecycle + dump data structures under ASan/TSan to surface the UAF/race at its origin. **Layer 2** adds a `DumpStormAntagonist` and a **Java 25** cell to the existing black-box chaos/reliability suite (JVM-safe detectors only: `gmalloc` + crash-on-signal). + +**Tech Stack:** C++17, GoogleTest (`com.datadoghq.gtest` Gradle plugin, auto-discovers `src/test/cpp/*.cpp`), ASan/TSan via `buildGtestAsan`/`buildGtestTsan`; Java chaos harness (`ddprof-stresstest`), GitLab CI (`.gitlab/reliability`, `.gitlab/sanitizer-tests`), SDKMAN. + +**Spec:** `doc/plans/2026-05-29-logs-backend-crash-simulation-design.md` + +**Branch:** Work on `prof-logs-backend-crash-sim` (do not commit to `main`). + +**Execution environment (IMPORTANT — overrides the bare `./gradlew` lines below):** +This host is macOS/arm64; the test is `#ifdef __linux__` and the sanitizers are Linux-only. +ALL build/run steps execute via the repo's Docker runner on **JDK 25**, which gives faithful +Linux ASan/TSan: +- Debug build + gtests: `./utils/run-docker-tests.sh --config=debug --gtest --jdk=25 --mount` +- ASan: `./utils/run-docker-tests.sh --config=asan --gtest --jdk=25 --mount` +- TSan: `./utils/run-docker-tests.sh --config=tsan --gtest --jdk=25 --mount` +- Interactive iteration: add `--shell` to drop into the container and run the built + `*_stress_threadLifecycle_ut*` binary directly with `--gtest_filter=StressThreadLifecycle.*`. + +The `./gradlew :ddprof-lib:buildGtest*` commands in the tasks are the *in-container* equivalents; +run them through the Docker wrapper above, not directly on the host. + +**Commit policy:** Per project rule, do NOT `git commit` during execution. Author + verify, then +leave changes in the working tree for the user to review and commit. The `git commit` steps in +the tasks are deferred until the user approves. + +--- + +## File Structure + +| File | Status | Responsibility | +|---|---|---| +| `ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp` | Create | Layer-1 concurrency reproducer: churn workers (ProfiledThread + ThreadFilter) vs. dump thread (CallTraceStorage + Dictionary), under ASan/TSan. | +| `ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/DumpStormAntagonist.java` | Create | Layer-2 antagonist: forces frequent recording rotation/dump concurrently with churn. | +| `ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/Main.java` | Modify | Register `dump-storm` antagonist in `create()`. | +| `.gitlab/reliability/chaos_check.sh` | Modify | Add a Java 25 install/cell alongside the existing `21.0.3-tem`. | +| `.gitlab/reliability/.gitlab-ci.yml` | Modify | Add `JDK` dimension (21, 25) to the chaos matrix; include `dump-storm` in the antagonist set. | + +**Reference neighbors (read before coding):** `thread_teardown_safety_ut.cpp` (ProfiledThread lifecycle idioms, sigaction guards, PROF-14603), `threadFilter_ut.cpp` (ThreadFilter construction/`init`), `test_callTraceStorage.cpp` + `stress_callTraceStorage.cpp` (CallTraceStorage `put`/`processTraces`/`clear`, `ASGCT_CallFrame`, `gtest_crash_handler.h`), `dictionary_ut.cpp` (Dictionary `lookup`/`clear`). + +**Note on test nature:** This is a *characterization/reproduction* harness, not classic TDD. Success criterion for Phase 1 is: the harness runs **clean when the code is correct**, and we run it under ASan/TSan on current `main` to see whether it surfaces the corruption. If it stays clean on `main`, that is itself a result — it narrows the root cause to a path that requires real JVMTI/JVM state (→ Layer 2). + +--- + +## Phase 1 — Layer 1: gtest concurrency reproducer + +### Task 1: Scaffold the stress test file (compiles, runs, passes) + +**Files:** +- Create: `ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp` + +- [ ] **Step 1: Create the file with includes, crash-handler hook, and one trivial test** + +```cpp +/* + * Copyright 2026, Datadog, Inc. + * SPDX-License-Identifier: Apache-2.0 + * + * Layer-1 reproducer for the logs-backend crash (Java 25, thread-churn × + * recording-dump). Drives the profiler's own thread-lifecycle and dump data + * structures concurrently, with NO JVM in the process, so ASan/TSan can flag + * the use-after-free / race at its origin. + * + * See doc/plans/2026-05-29-logs-backend-crash-simulation-design.md + */ +#include "gtest/gtest.h" + +#ifdef __linux__ + +#include "callTraceStorage.h" +#include "callTraceHashTable.h" +#include "dictionary.h" +#include "threadFilter.h" +#include "thread.h" +#include "arch.h" + +#include +#include +#include +#include +#include "../../main/cpp/gtest_crash_handler.h" + +static constexpr const char STRESS_TEST_NAME[] = "StressThreadLifecycle"; + +TEST(StressThreadLifecycle, Smoke) { + // Sanity: structures construct and tear down without a JVM. + CallTraceStorage storage; + Dictionary dict; + EXPECT_EQ(0u, dict.lookup("smoke") == 0u ? 0u : 0u); // lookup is callable + storage.clear(); + SUCCEED(); +} + +#endif // __linux__ +``` + +- [ ] **Step 2: Build under the normal (non-sanitizer) gtest config** + +Run: `./gradlew :ddprof-lib:buildGtestDebug --no-daemon` +Expected: BUILD SUCCESSFUL; a binary matching `stress_threadLifecycle_ut` appears under `ddprof-lib/build/bin/gtest/`. + +- [ ] **Step 3: Run the smoke test** + +Run: `find ddprof-lib/build/bin/gtest -name '*stress_threadLifecycle_ut*' -type f -executable -exec {} \;` +Expected: `[ PASSED ] 1 test.` + +- [ ] **Step 4: Commit** + +```bash +git add ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp +git commit -m "test: scaffold thread-lifecycle stress reproducer" +``` + +--- + +### Task 2: Churn workers modelling onThreadStart / onThreadEnd + +Models `Profiler::onThreadStart`/`onThreadEnd`: each worker initialises its `ProfiledThread`, registers a filter slot, does work, then unregisters and releases — the exact sequence in `profiler.cpp:75-130`. + +**Files:** +- Modify: `ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp` + +- [ ] **Step 1: Add the churn-worker test (verify ThreadFilter ctor/init against `threadFilter_ut.cpp` first)** + +```cpp +// Shared filter, mirroring Profiler::_thread_filter (one instance, many threads). +static ThreadFilter g_filter; +static std::atomic g_run{false}; + +static void churn_worker() { + while (!g_run.load(std::memory_order_acquire)) { /* spin until start */ } + for (int i = 0; i < 2000 && g_run.load(std::memory_order_relaxed); i++) { + // onThreadStart sequence + ProfiledThread::initCurrentThread(); + ProfiledThread* self = ProfiledThread::current(); + ASSERT_NE(nullptr, self); + ThreadFilter::SlotID slot = g_filter.registerThread(); + self->setFilterSlotId(slot); + g_filter.add(self->tid(), slot); + + // minimal "work" so the thread is live across a scheduling quantum + std::this_thread::yield(); + + // onThreadEnd sequence + g_filter.remove(slot); + g_filter.unregisterThread(slot); + self->setFilterSlotId(-1); + ProfiledThread::release(); + } +} + +TEST(StressThreadLifecycle, ChurnOnly) { + g_filter.init("*"); // enable filtering; confirm arg form via threadFilter_ut.cpp + ASSERT_TRUE(g_filter.enabled()); + g_run.store(true, std::memory_order_release); + + std::vector ts; + for (int t = 0; t < 16; t++) ts.emplace_back(churn_worker); + for (auto& t : ts) t.join(); + + g_run.store(false); + SUCCEED(); +} +``` + +- [ ] **Step 2: Build and run (Debug)** + +Run: `./gradlew :ddprof-lib:buildGtestDebug --no-daemon && find ddprof-lib/build/bin/gtest -name '*stress_threadLifecycle_ut*' -type f -executable -exec {} --gtest_filter=StressThreadLifecycle.ChurnOnly \;` +Expected: `[ PASSED ]`. If it crashes, capture the stack — that may already be the bug. + +- [ ] **Step 3: Commit** + +```bash +git add ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp +git commit -m "test: add thread-churn worker to lifecycle stress" +``` + +--- + +### Task 3: Concurrent dump thread (CallTraceStorage + Dictionary) + +Models `Profiler::dump`: while workers churn, a dump thread repeatedly walks call traces and clears the dictionary — `CallTraceStorage::processTraces` + `Dictionary::clear` from the design's Family-2 B. + +**Files:** +- Modify: `ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp` + +- [ ] **Step 1: Add a shared storage + dump thread, and have workers `put()` traces (mirror `put()` usage in `test_callTraceStorage.cpp`)** + +```cpp +static CallTraceStorage g_storage; +static Dictionary g_dict; + +// Build a small fake stack and record it (signal-handler-style put()). +static void record_trace(int salt) { + ASGCT_CallFrame frames[4]; + std::memset(frames, 0, sizeof(frames)); + for (int i = 0; i < 4; i++) { + frames[i].bci = i + salt; + frames[i].method_id = reinterpret_cast(static_cast(0x1000 + i + salt)); + } + g_storage.put(4, frames, false, 1); + g_dict.lookup("logs-backend-sim"); +} + +static void dump_thread() { + while (g_run.load(std::memory_order_relaxed)) { + g_storage.processTraces([](const std::unordered_set& traces) { + volatile size_t n = 0; + for (CallTrace* t : traces) if (t) n += 1; // touch each, like writeStackTraces + (void)n; + }); + g_dict.clear(); // models Dictionary::clear() during Profiler::dump + g_storage.clear(); // models chunk rotation + } +} +``` + +Then, inside `churn_worker`'s loop, add `record_trace(i)` right after `g_filter.add(...)`. + +- [ ] **Step 2: Add the combined test that runs churn + dump together** + +```cpp +TEST(StressThreadLifecycle, ChurnDuringDump) { + g_filter.init("*"); + ASSERT_TRUE(g_filter.enabled()); + g_run.store(true, std::memory_order_release); + + std::thread dumper(dump_thread); + std::vector ts; + for (int t = 0; t < 16; t++) ts.emplace_back(churn_worker); + for (auto& t : ts) t.join(); + + g_run.store(false); + dumper.join(); + SUCCEED(); +} +``` + +- [ ] **Step 3: Build and run (Debug)** + +Run: `./gradlew :ddprof-lib:buildGtestDebug --no-daemon && find ddprof-lib/build/bin/gtest -name '*stress_threadLifecycle_ut*' -type f -executable -exec {} --gtest_filter=StressThreadLifecycle.ChurnDuringDump \;` +Expected: `[ PASSED ]` in a correct build (or a crash that reproduces the bug). + +- [ ] **Step 4: Commit** + +```bash +git add ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp +git commit -m "test: drive concurrent dump against thread churn" +``` + +--- + +### Task 4: Run under AddressSanitizer on `main` (the reproduction experiment) + +**Files:** none (investigation step — record findings). + +- [ ] **Step 1: Build the ASan gtest binaries** + +Run: `./gradlew :ddprof-lib:buildGtestAsan --no-daemon --parallel` +Expected: BUILD SUCCESSFUL. + +- [ ] **Step 2: Run the new test under ASan, many iterations** + +Run: +```bash +bin=$(find ddprof-lib/build/bin/gtest -type f -executable -name 'asan_*stress_threadLifecycle_ut*') +for i in $(seq 1 50); do "$bin" --gtest_filter='StressThreadLifecycle.*' || { echo "ASAN HIT on iter $i"; break; }; done +``` +Expected outcome is one of: +- ASan reports `heap-use-after-free` / `heap-buffer-overflow` with a profiler stack → **bug reproduced**; record the report verbatim (this is the root-cause pointer). +- All 50 iterations PASS → record "ASan clean at unit level"; proceed to TSan (Task 5). + +- [ ] **Step 3: Record findings** + +Append the ASan output (or "clean") to the design doc under a new "## 8. Reproduction results" section. + +```bash +git add doc/plans/2026-05-29-logs-backend-crash-simulation-design.md +git commit -m "docs: record ASan reproduction results" +``` + +--- + +### Task 5: Run under ThreadSanitizer on `main` + +**Files:** none (investigation step). + +- [ ] **Step 1: Build the TSan gtest binaries** + +Run: `./gradlew :ddprof-lib:buildGtestTsan --no-daemon --parallel` +Expected: BUILD SUCCESSFUL. (TSan must run on a non-Kata runner — locally is fine; in CI it uses the `docker-in-docker:amd64`→EC2 path per `.gitlab/sanitizer-tests/.gitlab-ci.yml`.) + +- [ ] **Step 2: Run the new test under TSan** + +Run: +```bash +bin=$(find ddprof-lib/build/bin/gtest -type f -executable -name 'tsan_*stress_threadLifecycle_ut*') +GTEST_DEATH_TEST_STYLE=threadsafe "$bin" --gtest_filter='StressThreadLifecycle.*' +``` +Expected: either a `data race` report naming a profiler structure (record it) or clean. + +- [ ] **Step 3: Record findings and commit (as in Task 4 Step 3).** + +```bash +git add doc/plans/2026-05-29-logs-backend-crash-simulation-design.md +git commit -m "docs: record TSan reproduction results" +``` + +--- + +## Phase 2 — Layer 2: chaos antagonist + Java 25 + +### Task 6: `DumpStormAntagonist` + +Forces frequent recording rotation/dump concurrently with churn. The harness is black-box (patched `dd-java-agent`), so it cannot call `dump0` directly; it drives dumps by keeping the profiler busy and relying on the agent's short recording interval (configured in Task 8). This antagonist supplies the *concurrent churn-with-allocation* pressure that makes each rotation race thread teardown. + +**Files:** +- Create: `ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/DumpStormAntagonist.java` + +- [ ] **Step 1: Create the antagonist (mirror `ThreadChurnAntagonist` structure exactly)** + +```java +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ +package com.datadoghq.profiler.chaos; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; + +/** + * Spawns short-lived, frequently-renamed threads that each generate distinct + * stack shapes, maximising churn in the profiler's thread-name table and call- + * trace storage right as recording chunks rotate. + * + *

Targets the dump path observed in production crashes: + * {@code Recording::switchChunk/writeCpool}, {@code updateJavaThreadNames -> + * ThreadInfo::set}, {@code Dictionary::clear}. Pair with a short profiler + * recording interval (see chaos_check.sh) so dumps fire continuously. + */ +public final class DumpStormAntagonist implements Antagonist { + + private final int concurrentThreads; + private volatile boolean running; + private Thread driver; + + public DumpStormAntagonist() { + this(96); + } + + public DumpStormAntagonist(int concurrentThreads) { + this.concurrentThreads = concurrentThreads; + } + + @Override + public String name() { + return "dump-storm"; + } + + @Override + public void start() { + running = true; + driver = new Thread(this::loop, "chaos-dump-storm"); + driver.setDaemon(true); + driver.start(); + } + + @Override + public void stopGracefully(Duration timeout) { + running = false; + try { + if (driver != null) driver.join(timeout.toMillis()); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + private void loop() { + long seq = 0; + while (running) { + List batch = new ArrayList<>(concurrentThreads); + for (int i = 0; i < concurrentThreads && running; i++) { + final long id = seq++; + Thread t = new Thread(() -> distinctStack(id, 0)); + // Frequent renames hammer updateThreadName/ThreadInfo::set. + t.setName("dump-storm-" + id); + t.setDaemon(true); + t.start(); + batch.add(t); + } + for (Thread t : batch) { + try { + t.join(500L); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + } + } + } + + // Recurse to a per-thread depth so each thread yields a unique stack shape, + // forcing new call-trace + symbol entries that the dump path must serialise. + private long distinctStack(long id, int depth) { + if (depth >= (int) (id % 32)) { + long r = id; + for (int i = 0; i < 5000; i++) r = (r * 1103515245L + 12345L) & 0x7fffffffL; + return r; + } + return distinctStack(id, depth + 1) + depth; + } +} +``` + +- [ ] **Step 2: Compile the stresstest module** + +Run: `./gradlew :ddprof-stresstest:compileChaosJava --no-daemon` (confirm the source-set task name via `./gradlew :ddprof-stresstest:tasks --all | grep -i chaos`) +Expected: BUILD SUCCESSFUL. + +- [ ] **Step 3: Commit** + +```bash +git add ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/DumpStormAntagonist.java +git commit -m "test: add dump-storm chaos antagonist" +``` + +--- + +### Task 7: Register the antagonist + +**Files:** +- Modify: `ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/Main.java` + +- [ ] **Step 1: Add the `dump-storm` case to `create()`** (after the `trace-context` case, before `default`): + +```java + case "dump-storm": + return new DumpStormAntagonist(); +``` + +- [ ] **Step 2: Compile** + +Run: `./gradlew :ddprof-stresstest:compileChaosJava --no-daemon` +Expected: BUILD SUCCESSFUL. + +- [ ] **Step 3: Smoke-run the harness briefly (no agent needed to prove wiring)** + +Run: `./gradlew :ddprof-stresstest:run -PchaosArgs="--duration 5s --antagonists dump-storm" --no-daemon` (adapt to the module's actual run task; otherwise run the built class directly with `--duration 5s --antagonists dump-storm`). +Expected: logs `antagonist started: dump-storm` then `completed cleanly`. + +- [ ] **Step 4: Commit** + +```bash +git add ddprof-stresstest/src/chaos/java/com/datadoghq/profiler/chaos/Main.java +git commit -m "test: register dump-storm antagonist" +``` + +--- + +### Task 8: Add Java 25 + dump-storm to the chaos CI matrix + +**Files:** +- Modify: `.gitlab/reliability/chaos_check.sh` +- Modify: `.gitlab/reliability/.gitlab-ci.yml` + +- [ ] **Step 1: Parameterise the JDK in `chaos_check.sh`** + +Replace the hardcoded install (`.gitlab/reliability/chaos_check.sh:16`): +```bash +timeout 300 sdk install java 21.0.3-tem 1>/dev/null 2>/dev/null +``` +with a parameterised version + a short recording interval so dumps fire continuously: +```bash +JDK_CANDIDATE="${CHAOS_JDK:-21.0.3-tem}" +timeout 300 sdk install java "${JDK_CANDIDATE}" 1>/dev/null 2>/dev/null +sdk use java "${JDK_CANDIDATE}" +# Force frequent recording rotation so the dump path is exercised under churn. +export DD_PROFILING_UPLOAD_PERIOD=5 +``` +(Validate the exact env var name against the dd-java-agent build used by the harness; the intent is "shortest safe upload/recording period".) + +- [ ] **Step 2: Validate YAML/script edits** + +Run: `bash -n .gitlab/reliability/chaos_check.sh && python3 -c "import yaml,sys; yaml.safe_load(open('.gitlab/reliability/.gitlab-ci.yml')); print('yaml ok')"` +Expected: no syntax errors; `yaml ok`. + +- [ ] **Step 3: Add a `CHAOS_JDK` dimension and `dump-storm` to the chaos matrix in `.gitlab/reliability/.gitlab-ci.yml`** + +In `.reliability_chaos_job.parallel.matrix`, add the JDK dimension (keep 21 as control): +```yaml + parallel: + matrix: + - CONFIG: ["profiler", "profiler+tracer"] + ALLOCATOR: ["gmalloc", "jemalloc", "tcmalloc"] + CHAOS_JDK: ["21.0.3-tem", "25.0.3-tem"] +``` +And ensure the run invocation passes the dump-storm antagonist (in `chaos_check.sh`'s harness call, set the antagonist list to include `thread-churn,vthread-churn,dump-storm`). Confirm `25.0.3-tem` (or the closest available 25.x) is a valid SDKMAN candidate; if not, pin the available 25 build. + +- [ ] **Step 4: Re-validate YAML** + +Run: `python3 -c "import yaml; yaml.safe_load(open('.gitlab/reliability/.gitlab-ci.yml')); print('yaml ok')"` +Expected: `yaml ok`. + +- [ ] **Step 5: Commit** + +```bash +git add .gitlab/reliability/chaos_check.sh .gitlab/reliability/.gitlab-ci.yml +git commit -m "ci: run chaos suite on Java 25 with dump-storm antagonist" +``` + +--- + +## Self-Review notes + +- **Spec coverage:** §4.1 Layer 1 → Tasks 1-5; §4.2 Layer 2 `DumpStormAntagonist` → Tasks 6-7, Java 25 matrix → Task 8; §3 gaps (JDK 21, no dump stress, sanitizers gtest-only) all addressed. §6 validation (ASan/TSan on main) → Tasks 4-5. +- **Known unknowns flagged inline (not placeholders):** exact `ThreadFilter::init` arg form (verify vs `threadFilter_ut.cpp`), the chaos module's run/compile task names, the dd-java-agent recording-period env var, and SDKMAN's 25.x candidate string. Each step says what to confirm and against what. +- **Type consistency:** `g_filter`/`g_storage`/`g_dict`/`g_run` used consistently across Tasks 2-3; antagonist `name()` returns `"dump-storm"` matching the `create()` case and the CI antagonist list. From 6ae2a511a7accb53e3a252f01b72cb7a7edbc018 Mon Sep 17 00:00:00 2001 From: Jaroslav Bachorik Date: Sat, 30 May 2026 00:22:54 +0200 Subject: [PATCH 12/12] test: align thread lifecycle stress test locking --- .../test/cpp/stress_threadLifecycle_ut.cpp | 84 +++++++++++++------ 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp b/ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp index 6325c827a..1b1c5db73 100644 --- a/ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp +++ b/ddprof-lib/src/test/cpp/stress_threadLifecycle_ut.cpp @@ -15,10 +15,10 @@ #include "callTraceStorage.h" #include "callTraceHashTable.h" -#include "dictionary.h" #include "threadFilter.h" #include "thread.h" #include "arch.h" +#include "spinLock.h" #include #include @@ -35,16 +35,58 @@ static constexpr const char STRESS_TEST_NAME[] = "StressThreadLifecycle"; static constexpr int kChurnWorkers = 16; static constexpr int kChurnIterations = 2000; -// Shared dump-side structures. They are exercised concurrently by the dump -// thread and (for the put() path) by the churn workers. +// Shared dump-side storage. Churn workers write it through the same shard-lock +// protocol as profiler sample paths; the dump thread processes it under +// lock_all(), matching Profiler::rotateDictsAndRun(). static CallTraceStorage g_storage; -static Dictionary g_dict; static std::atomic g_run{false}; -// Record a small fake call trace plus a dictionary lookup, mirroring what the -// profiler does for every sample. ASGCT_CallFrame uses `bci` (jint) and the -// `method_id` union member (see vmEntry.h). -static void record_trace(int salt) { +// Mirrors Profiler::_locks. Sample writers take one shard lock with tryLock(); +// dump takes all shard locks before processing/clearing shared dump-side state. +static constexpr int kLockCount = 16; +static SpinLock g_locks[kLockCount]; + +static u32 lock_index_for_tid(int tid) { + u32 lock_index = tid; + lock_index ^= lock_index >> 8; + lock_index ^= lock_index >> 4; + return lock_index % kLockCount; +} + +static bool try_record_lock(int tid, u32* lock_index) { + *lock_index = lock_index_for_tid(tid); + if (g_locks[*lock_index].tryLock()) { + return true; + } + *lock_index = (*lock_index + 1) % kLockCount; + if (g_locks[*lock_index].tryLock()) { + return true; + } + *lock_index = (*lock_index + 1) % kLockCount; + return g_locks[*lock_index].tryLock(); +} + +static void lock_all() { + for (int i = 0; i < kLockCount; i++) { + g_locks[i].lock(); + } +} + +static void unlock_all() { + for (int i = 0; i < kLockCount; i++) { + g_locks[i].unlock(); + } +} + +// Record a small fake call trace, mirroring profiler sample paths that hold a +// shard lock while writing to CallTraceStorage. ASGCT_CallFrame uses `bci` +// (jint) and the `method_id` union member (see vmEntry.h). +static void record_trace(int salt, int tid) { + u32 lock_index; + if (!try_record_lock(tid, &lock_index)) { + return; + } + ASGCT_CallFrame frames[4]; std::memset(frames, 0, sizeof(frames)); for (int i = 0; i < 4; i++) { @@ -53,7 +95,8 @@ static void record_trace(int salt) { reinterpret_cast(static_cast(0x1000 + i + salt)); } g_storage.put(4, frames, false, 1); - g_dict.lookup("logs-backend-sim"); + + g_locks[lock_index].unlock(); } // onThreadStart -> work -> onThreadEnd loop, mirroring the profiler's per-thread @@ -78,7 +121,7 @@ static void churn_worker(ThreadFilter* filter, bool with_dump) { } if (with_dump) { - record_trace(i); + record_trace(i, self->tid()); } std::this_thread::yield(); @@ -91,19 +134,12 @@ static void churn_worker(ThreadFilter* filter, bool with_dump) { } } -// Continuously dumps the trace storage and clears both the dictionary and the -// storage, racing against concurrent put() / lookup() from churn workers. -// -// Intentional divergence from production: production wraps clear() in -// lockAll()/unlockAll() (profiler.cpp) so clear() never races put(). Here -// we drop that guard deliberately so ASan/TSan can observe a UAF at its -// origin. A crash in this reproducer may surface a real bug or a test-only -// race; cross-reference with the Layer-2 DumpStormAntagonist (JVM-level) -// to confirm which it is. The CallTraceStorage concurrency contract -// (refcount-guard + CriticalSection) prevents permanent corruption from -// clear()-vs-put() racing, so this does not cause silent data loss. +// Continuously processes the trace storage under lock_all(), matching the JFR +// dump path where Profiler::rotateDictsAndRun() holds all shard locks while +// writeStackTraces() calls processCallTraces(). static void dump_thread() { while (g_run.load(std::memory_order_relaxed)) { + lock_all(); g_storage.processTraces([](const std::unordered_set& traces) { volatile size_t n = 0; for (CallTrace* t : traces) { @@ -113,16 +149,12 @@ static void dump_thread() { } (void)n; }); - // dict.clear() races dict.lookup() on churn threads — intentional (see above). - g_dict.clear(); - g_storage.clear(); + unlock_all(); } } TEST(StressThreadLifecycle, Smoke) { CallTraceStorage storage; - Dictionary dict; - dict.lookup("smoke"); storage.clear(); SUCCEED(); }