diff --git a/qa/L0_cppunittest/test.sh b/qa/L0_cppunittest/test.sh index 0b83747c0e..c7499282f4 100755 --- a/qa/L0_cppunittest/test.sh +++ b/qa/L0_cppunittest/test.sh @@ -4,6 +4,9 @@ set -e +: ${XML_LOG_DIR:=/logs} +mkdir -p "$XML_LOG_DIR" + # Find TE : ${TE_PATH:=/opt/transformerengine} TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}') @@ -17,4 +20,4 @@ cd $TE_PATH/tests/cpp cmake -GNinja -Bbuild . cmake --build build export OMP_NUM_THREADS=$((NUM_PHYSICAL_CORES / NUM_PARALLEL_JOBS)) -ctest --test-dir build -j$NUM_PARALLEL_JOBS +ctest --test-dir build -j$NUM_PARALLEL_JOBS --output-junit $XML_LOG_DIR/ctest_cppunittest.xml diff --git a/tests/pytorch/attention/test_attention_with_cp.py b/tests/pytorch/attention/test_attention_with_cp.py index ecd0090a3b..5aaf67061b 100644 --- a/tests/pytorch/attention/test_attention_with_cp.py +++ b/tests/pytorch/attention/test_attention_with_cp.py @@ -22,7 +22,7 @@ _current_file = pathlib.Path(__file__).resolve() sys.path.append(str(_current_file.parent.parent)) -from utils import ModelConfig, get_available_attention_backends +from utils import ModelConfig, get_available_attention_backends, run_distributed pytest_logging_level = logging.getLevelName(logging.root.level) @@ -125,7 +125,7 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type): if not flash_attn_supported: pytest.skip("No attention backend available.") - subprocess.run( + run_distributed( get_bash_arguments( num_gpus_per_node=num_gpus, dtype=dtype, @@ -135,7 +135,6 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type): cp_comm_type=cp_comm_type, log_level=pytest_logging_level, ), - check=True, ) @@ -368,7 +367,7 @@ def test_cp_with_fused_attention( if not fused_attn_supported: pytest.skip("No attention backend available.") - subprocess.run( + run_distributed( get_bash_arguments( num_gpus_per_node=num_gpus, dtype=dtype, @@ -384,5 +383,4 @@ def 
test_cp_with_fused_attention( is_training=is_training, log_level=pytest_logging_level, ), - check=True, ) diff --git a/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py b/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py index 1606641b78..7de6142537 100644 --- a/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py +++ b/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py @@ -10,6 +10,9 @@ import sys import pathlib +sys.path.append(str(pathlib.Path(__file__).resolve().parent.parent)) +from utils import run_distributed + import pytest import torch from torch import nn @@ -1207,7 +1210,7 @@ def test_nvfp4_partial_cast_matches_full(world_size: int) -> None: current_file, "--parallel-nvfp4-partial", ] - subprocess.run(command, check=True) + run_distributed(command) def test_single_gpu_partial_cast_vs_full(): diff --git a/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py b/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py index 603433e0da..3dcefd46fd 100644 --- a/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py +++ b/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py @@ -38,7 +38,7 @@ # Import utility functions _current_file = pathlib.Path(__file__).resolve() sys.path.append(str(_current_file.parent.parent)) -from utils import dtype_tols, make_recipe, str_to_dtype +from utils import dtype_tols, make_recipe, run_distributed, str_to_dtype # Check if FP8 is supported fp8_available, reason_for_no_fp8 = te.is_fp8_available(return_reason=True) @@ -463,7 +463,7 @@ def test_fuser_ops_with_userbuffers( env["NVTE_ALLOW_NONDETERMINISTIC_ALGO"] = "0" # Launch parallel job - result = subprocess.run(command, check=True, env=env) + run_distributed(command, env=env) def main() -> None: diff --git a/tests/pytorch/distributed/test_torch_fsdp2.py b/tests/pytorch/distributed/test_torch_fsdp2.py index aca8d6d692..b0a364905f 100644 --- a/tests/pytorch/distributed/test_torch_fsdp2.py +++ 
b/tests/pytorch/distributed/test_torch_fsdp2.py @@ -3,9 +3,13 @@ # See LICENSE for license information. import os +import sys import subprocess from pathlib import Path +sys.path.append(str(Path(__file__).resolve().parent.parent)) +from utils import run_distributed + import pytest import torch @@ -20,7 +24,7 @@ def test_fsdp2_model_tests(): """All FSDP2 model tests (parametrized internally by recipe, fp8_init, sharding, layer).""" test_path = _FSDP2_DIR / "run_fsdp2_model.py" - result = subprocess.run( + run_distributed( [ "torchrun", f"--nproc_per_node={NUM_PROCS}", @@ -32,10 +36,10 @@ def test_fsdp2_model_tests(): "-s", "--tb=short", ], + valid_returncodes=(0, 5), env=os.environ, timeout=600, ) - assert result.returncode in (0, 5), f"Inner pytest failed with exit code {result.returncode}" @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs") @@ -44,7 +48,7 @@ def test_fsdp2_fused_adam_tests(): """All FSDP2 FusedAdam tests (parametrized internally by recipe, test variant).""" test_path = _FSDP2_DIR / "run_fsdp2_fused_adam.py" nproc = min(NUM_PROCS, 2) - result = subprocess.run( + run_distributed( [ "torchrun", f"--nproc_per_node={nproc}", @@ -56,10 +60,10 @@ def test_fsdp2_fused_adam_tests(): "-s", "--tb=short", ], + valid_returncodes=(0, 5), env=os.environ, timeout=600, ) - assert result.returncode in (0, 5), f"Inner pytest failed with exit code {result.returncode}" def test_dummy() -> None: diff --git a/tests/pytorch/utils.py b/tests/pytorch/utils.py index 317240fb78..929f02453d 100644 --- a/tests/pytorch/utils.py +++ b/tests/pytorch/utils.py @@ -6,8 +6,9 @@ import logging import os +import subprocess from contextlib import contextmanager -from typing import Optional, Tuple, Dict, Any, List +from typing import Optional, Sequence, Tuple, Dict, Any, List from packaging.version import Version as PkgVersion import torch @@ -407,3 +408,34 @@ def assert_close_grads( assert actual is not None assert expected is not None assert_close(actual.grad, expected.grad, 
def run_distributed(
    args: Sequence[str],
    *,
    valid_returncodes: Sequence[int] = (0,),
    **kwargs,
) -> subprocess.CompletedProcess:
    """Run a distributed subprocess with stderr capture for better error reporting.

    stdout streams to the terminal in real time for interactive debugging.
    On failure, the captured stderr (containing Python tracebacks) is included
    in the AssertionError so pytest writes it into the JUnit XML report.

    Args:
        args: Command and arguments to run.
        valid_returncodes: Return codes considered success (default: (0,)).
            Use (0, 5) for inner pytest runs, where 5 is pytest's
            EXIT_NOTESTSCOLLECTED ("no tests were collected").
        **kwargs: Passed through to subprocess.run (e.g. env, timeout).
            stderr and text may be overridden; they default to
            subprocess.PIPE and True respectively.

    Returns:
        The subprocess.CompletedProcess of the finished command.

    Raises:
        AssertionError: If the return code is not in ``valid_returncodes``.
    """
    # setdefault (rather than passing the keywords directly) lets a caller
    # override stderr/text via **kwargs without a duplicate-keyword TypeError.
    kwargs.setdefault("stderr", subprocess.PIPE)
    kwargs.setdefault("text", True)
    result = subprocess.run(args, **kwargs)
    if result.returncode not in valid_returncodes:
        cmd_str = " ".join(str(a) for a in args)
        msg = f"Command exited with code {result.returncode}:\n {cmd_str}\n"
        # result.stderr is None when the caller overrode stderr handling.
        if result.stderr:
            # Keep only the tail: the traceback sits at the end, and the
            # JUnit XML report should stay a manageable size.
            stderr_tail = result.stderr[-4000:]
            if len(result.stderr) > 4000:
                stderr_tail = "... [truncated] ...\n" + stderr_tail
            msg += f"\n--- stderr ---\n{stderr_tail}"
        raise AssertionError(msg)
    return result