From cf66604c8a83a38118ab687c862015caea73f157 Mon Sep 17 00:00:00 2001
From: Sudhakar Singh <sudhakars@nvidia.com>
Date: Wed, 25 Mar 2026 16:27:39 -0700
Subject: [PATCH 1/2] Capture subprocess stderr in distributed tests for better
 CI error reporting

Distributed tests launch subprocesses via torch.distributed.launch/torchrun.
When these fail, pytest only captures the CalledProcessError from the parent
process, not the actual worker traceback. This makes CI JUnit XML reports
show "exit code 1" with no useful error detail.

Add run_distributed() utility to tests/pytorch/utils.py that captures stderr
while letting stdout stream to the terminal. On failure, the tail of the
worker's stderr (the last 4000 characters, which normally contain the actual
Python traceback) is included in the AssertionError, which pytest writes into
the JUnit XML report.

Behavior:
- Interactive use: stdout streams in real time (unchanged); stderr is now
  buffered and shown only on failure, as part of the assertion message
- CI/JUnit XML: failure reports now include the actual worker traceback

Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com>
---
 .../attention/test_attention_with_cp.py       |  8 ++---
 .../test_cast_master_weights_to_fp8.py        |  5 ++-
 .../test_fusible_ops_with_userbuffers.py      |  4 +--
 tests/pytorch/distributed/test_torch_fsdp2.py | 12 ++++---
 tests/pytorch/utils.py                        | 34 ++++++++++++++++++-
 5 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/tests/pytorch/attention/test_attention_with_cp.py b/tests/pytorch/attention/test_attention_with_cp.py
index ecd0090a3b..5aaf67061b 100644
--- a/tests/pytorch/attention/test_attention_with_cp.py
+++ b/tests/pytorch/attention/test_attention_with_cp.py
@@ -22,7 +22,7 @@
 
 _current_file = pathlib.Path(__file__).resolve()
 sys.path.append(str(_current_file.parent.parent))
-from utils import ModelConfig, get_available_attention_backends
+from utils import ModelConfig, get_available_attention_backends, run_distributed
 
 pytest_logging_level = logging.getLevelName(logging.root.level)
 
@@ -125,7 +125,7 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type):
     if not flash_attn_supported:
         pytest.skip("No attention backend available.")
 
-    subprocess.run(
+    run_distributed(
         get_bash_arguments(
             num_gpus_per_node=num_gpus,
             dtype=dtype,
@@ -135,7 +135,6 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type):
             cp_comm_type=cp_comm_type,
             log_level=pytest_logging_level,
         ),
-        check=True,
     )
 
 
@@ -368,7 +367,7 @@ def test_cp_with_fused_attention(
     if not fused_attn_supported:
         pytest.skip("No attention backend available.")
 
-    subprocess.run(
+    run_distributed(
         get_bash_arguments(
             num_gpus_per_node=num_gpus,
             dtype=dtype,
@@ -384,5 +383,4 @@ def test_cp_with_fused_attention(
             is_training=is_training,
             log_level=pytest_logging_level,
         ),
-        check=True,
     )
diff --git a/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py b/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py
index 1606641b78..7de6142537 100644
--- a/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py
+++ b/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py
@@ -10,6 +10,9 @@
 import sys
 import pathlib
 
+sys.path.append(str(pathlib.Path(__file__).resolve().parent.parent))
+from utils import run_distributed
+
 import pytest
 import torch
 from torch import nn
@@ -1207,7 +1210,7 @@ def test_nvfp4_partial_cast_matches_full(world_size: int) -> None:
         current_file,
         "--parallel-nvfp4-partial",
     ]
-    subprocess.run(command, check=True)
+    run_distributed(command)
 
 
 def test_single_gpu_partial_cast_vs_full():
diff --git a/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py b/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py
index 603433e0da..3dcefd46fd 100644
--- a/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py
+++ b/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py
@@ -38,7 +38,7 @@
 # Import utility functions
 _current_file = pathlib.Path(__file__).resolve()
 sys.path.append(str(_current_file.parent.parent))
-from utils import dtype_tols, make_recipe, str_to_dtype
+from utils import dtype_tols, make_recipe, run_distributed, str_to_dtype
 
 # Check if FP8 is supported
 fp8_available, reason_for_no_fp8 = te.is_fp8_available(return_reason=True)
@@ -463,7 +463,7 @@ def test_fuser_ops_with_userbuffers(
     env["NVTE_ALLOW_NONDETERMINISTIC_ALGO"] = "0"
 
     # Launch parallel job
-    result = subprocess.run(command, check=True, env=env)
+    run_distributed(command, env=env)
 
 
 def main() -> None:
diff --git a/tests/pytorch/distributed/test_torch_fsdp2.py b/tests/pytorch/distributed/test_torch_fsdp2.py
index aca8d6d692..b0a364905f 100644
--- a/tests/pytorch/distributed/test_torch_fsdp2.py
+++ b/tests/pytorch/distributed/test_torch_fsdp2.py
@@ -3,9 +3,13 @@
 # See LICENSE for license information.
 
 import os
+import sys
 import subprocess
 from pathlib import Path
 
+sys.path.append(str(Path(__file__).resolve().parent.parent))
+from utils import run_distributed
+
 import pytest
 import torch
 
@@ -20,7 +24,7 @@
 def test_fsdp2_model_tests():
     """All FSDP2 model tests (parametrized internally by recipe, fp8_init, sharding, layer)."""
     test_path = _FSDP2_DIR / "run_fsdp2_model.py"
-    result = subprocess.run(
+    run_distributed(
         [
             "torchrun",
             f"--nproc_per_node={NUM_PROCS}",
@@ -32,10 +36,10 @@ def test_fsdp2_model_tests():
             "-s",
             "--tb=short",
         ],
+        valid_returncodes=(0, 5),
         env=os.environ,
         timeout=600,
     )
-    assert result.returncode in (0, 5), f"Inner pytest failed with exit code {result.returncode}"
 
 
 @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs")
@@ -44,7 +48,7 @@ def test_fsdp2_fused_adam_tests():
     """All FSDP2 FusedAdam tests (parametrized internally by recipe, test variant)."""
     test_path = _FSDP2_DIR / "run_fsdp2_fused_adam.py"
     nproc = min(NUM_PROCS, 2)
-    result = subprocess.run(
+    run_distributed(
         [
             "torchrun",
             f"--nproc_per_node={nproc}",
@@ -56,10 +60,10 @@ def test_fsdp2_fused_adam_tests():
             "-s",
             "--tb=short",
         ],
+        valid_returncodes=(0, 5),
         env=os.environ,
         timeout=600,
     )
-    assert result.returncode in (0, 5), f"Inner pytest failed with exit code {result.returncode}"
 
 
 def test_dummy() -> None:
diff --git a/tests/pytorch/utils.py b/tests/pytorch/utils.py
index 317240fb78..929f02453d 100644
--- a/tests/pytorch/utils.py
+++ b/tests/pytorch/utils.py
@@ -6,8 +6,9 @@
 
 import logging
 import os
+import subprocess
 from contextlib import contextmanager
-from typing import Optional, Tuple, Dict, Any, List
+from typing import Optional, Sequence, Tuple, Dict, Any, List
 from packaging.version import Version as PkgVersion
 
 import torch
@@ -407,3 +408,34 @@ def assert_close_grads(
     assert actual is not None
     assert expected is not None
     assert_close(actual.grad, expected.grad, **kwargs)
+
+
+def run_distributed(
+    args: Sequence[str],
+    *,
+    valid_returncodes: Sequence[int] = (0,),
+    **kwargs,
+) -> subprocess.CompletedProcess:
+    """Run a distributed subprocess with stderr capture for better error reporting.
+
+    stdout streams to the terminal in real time for interactive debugging.
+    stderr is buffered; if the exit code is not in valid_returncodes, its last
+    4000 characters are embedded in the AssertionError that is raised.
+
+    Args:
+        args: Command and arguments to run.
+        valid_returncodes: Return codes considered success (default: (0,)).
+            Use (0, 5) for inner pytest runs where 5 means all tests skipped.
+        **kwargs: Passed through to subprocess.run (e.g. env, timeout).
+    """
+    # errors="replace": undecodable worker output must not raise UnicodeDecodeError here.
+    result = subprocess.run(args, stderr=subprocess.PIPE, text=True, errors="replace", **kwargs)
+    if result.returncode not in valid_returncodes:
+        cmd_str = " ".join(str(a) for a in args)
+        msg = f"Command exited with code {result.returncode}:\n  {cmd_str}\n"
+        if result.stderr:
+            tail = result.stderr[-4000:]
+            marker = "... [truncated] ...\n" if len(result.stderr) > 4000 else ""
+            msg += f"\n--- stderr ---\n{marker}{tail}"
+        raise AssertionError(msg)
+    return result

From fb9812c144bc997229f23c5b11dcb7a3f48456c1 Mon Sep 17 00:00:00 2001
From: Sudhakar Singh <sudhakars@nvidia.com>
Date: Wed, 25 Mar 2026 18:38:33 -0700
Subject: [PATCH 2/2] Add JUnit XML output to ctest in L0_cppunittest

Add --output-junit flag so ctest writes JUnit XML to $XML_LOG_DIR
(default: /logs), matching the pattern used by pytest tests. The XML is
written before ctest exits, so it's captured even on test failure.

Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com>
---
 qa/L0_cppunittest/test.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/qa/L0_cppunittest/test.sh b/qa/L0_cppunittest/test.sh
index 0b83747c0e..c7499282f4 100755
--- a/qa/L0_cppunittest/test.sh
+++ b/qa/L0_cppunittest/test.sh
@@ -4,6 +4,9 @@
 
 set -e
 
+: ${XML_LOG_DIR:=/logs}
+mkdir -p "$XML_LOG_DIR"
+
 # Find TE
 : ${TE_PATH:=/opt/transformerengine}
 TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}')
@@ -17,4 +20,4 @@ cd $TE_PATH/tests/cpp
 cmake -GNinja -Bbuild .
 cmake --build build
 export OMP_NUM_THREADS=$((NUM_PHYSICAL_CORES / NUM_PARALLEL_JOBS))
-ctest --test-dir build -j$NUM_PARALLEL_JOBS
+ctest --test-dir build -j$NUM_PARALLEL_JOBS --output-junit "$XML_LOG_DIR/ctest_cppunittest.xml"