From cf66604c8a83a38118ab687c862015caea73f157 Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Wed, 25 Mar 2026 16:27:39 -0700 Subject: [PATCH 1/2] Capture subprocess stderr in distributed tests for better CI error reporting Distributed tests launch subprocesses via torch.distributed.launch/torchrun. When these fail, pytest only captures the CalledProcessError from the parent process, not the actual worker traceback. This makes CI JUnit XML reports show "exit code 1" with no useful error detail. Add run_distributed() utility to tests/pytorch/utils.py that captures stderr while letting stdout stream to the terminal. On failure, the worker's stderr (containing the actual Python traceback) is included in the AssertionError, which pytest writes into the JUnit XML report. Behavior: - Interactive use: stdout streams in real time (unchanged), stderr shown on failure - CI/JUnit XML: failure reports now include the actual worker traceback Signed-off-by: Sudhakar Singh --- .../attention/test_attention_with_cp.py | 8 ++--- .../test_cast_master_weights_to_fp8.py | 5 ++- .../test_fusible_ops_with_userbuffers.py | 4 +-- tests/pytorch/distributed/test_torch_fsdp2.py | 12 ++++--- tests/pytorch/utils.py | 34 ++++++++++++++++++- 5 files changed, 50 insertions(+), 13 deletions(-) diff --git a/tests/pytorch/attention/test_attention_with_cp.py b/tests/pytorch/attention/test_attention_with_cp.py index ecd0090a3b..5aaf67061b 100644 --- a/tests/pytorch/attention/test_attention_with_cp.py +++ b/tests/pytorch/attention/test_attention_with_cp.py @@ -22,7 +22,7 @@ _current_file = pathlib.Path(__file__).resolve() sys.path.append(str(_current_file.parent.parent)) -from utils import ModelConfig, get_available_attention_backends +from utils import ModelConfig, get_available_attention_backends, run_distributed pytest_logging_level = logging.getLevelName(logging.root.level) @@ -125,7 +125,7 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type): if not flash_attn_supported: 
pytest.skip("No attention backend available.") - subprocess.run( + run_distributed( get_bash_arguments( num_gpus_per_node=num_gpus, dtype=dtype, @@ -135,7 +135,6 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type): cp_comm_type=cp_comm_type, log_level=pytest_logging_level, ), - check=True, ) @@ -368,7 +367,7 @@ def test_cp_with_fused_attention( if not fused_attn_supported: pytest.skip("No attention backend available.") - subprocess.run( + run_distributed( get_bash_arguments( num_gpus_per_node=num_gpus, dtype=dtype, @@ -384,5 +383,4 @@ def test_cp_with_fused_attention( is_training=is_training, log_level=pytest_logging_level, ), - check=True, ) diff --git a/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py b/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py index 1606641b78..7de6142537 100644 --- a/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py +++ b/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py @@ -10,6 +10,9 @@ import sys import pathlib +sys.path.append(str(pathlib.Path(__file__).resolve().parent.parent)) +from utils import run_distributed + import pytest import torch from torch import nn @@ -1207,7 +1210,7 @@ def test_nvfp4_partial_cast_matches_full(world_size: int) -> None: current_file, "--parallel-nvfp4-partial", ] - subprocess.run(command, check=True) + run_distributed(command) def test_single_gpu_partial_cast_vs_full(): diff --git a/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py b/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py index 603433e0da..3dcefd46fd 100644 --- a/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py +++ b/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py @@ -38,7 +38,7 @@ # Import utility functions _current_file = pathlib.Path(__file__).resolve() sys.path.append(str(_current_file.parent.parent)) -from utils import dtype_tols, make_recipe, str_to_dtype +from utils import dtype_tols, make_recipe, run_distributed, 
str_to_dtype # Check if FP8 is supported fp8_available, reason_for_no_fp8 = te.is_fp8_available(return_reason=True) @@ -463,7 +463,7 @@ def test_fuser_ops_with_userbuffers( env["NVTE_ALLOW_NONDETERMINISTIC_ALGO"] = "0" # Launch parallel job - result = subprocess.run(command, check=True, env=env) + run_distributed(command, env=env) def main() -> None: diff --git a/tests/pytorch/distributed/test_torch_fsdp2.py b/tests/pytorch/distributed/test_torch_fsdp2.py index aca8d6d692..b0a364905f 100644 --- a/tests/pytorch/distributed/test_torch_fsdp2.py +++ b/tests/pytorch/distributed/test_torch_fsdp2.py @@ -3,9 +3,13 @@ # See LICENSE for license information. import os +import sys import subprocess from pathlib import Path +sys.path.append(str(Path(__file__).resolve().parent.parent)) +from utils import run_distributed + import pytest import torch @@ -20,7 +24,7 @@ def test_fsdp2_model_tests(): """All FSDP2 model tests (parametrized internally by recipe, fp8_init, sharding, layer).""" test_path = _FSDP2_DIR / "run_fsdp2_model.py" - result = subprocess.run( + run_distributed( [ "torchrun", f"--nproc_per_node={NUM_PROCS}", @@ -32,10 +36,10 @@ def test_fsdp2_model_tests(): "-s", "--tb=short", ], + valid_returncodes=(0, 5), env=os.environ, timeout=600, ) - assert result.returncode in (0, 5), f"Inner pytest failed with exit code {result.returncode}" @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs") @@ -44,7 +48,7 @@ def test_fsdp2_fused_adam_tests(): """All FSDP2 FusedAdam tests (parametrized internally by recipe, test variant).""" test_path = _FSDP2_DIR / "run_fsdp2_fused_adam.py" nproc = min(NUM_PROCS, 2) - result = subprocess.run( + run_distributed( [ "torchrun", f"--nproc_per_node={nproc}", @@ -56,10 +60,10 @@ def test_fsdp2_fused_adam_tests(): "-s", "--tb=short", ], + valid_returncodes=(0, 5), env=os.environ, timeout=600, ) - assert result.returncode in (0, 5), f"Inner pytest failed with exit code {result.returncode}" def test_dummy() -> None: diff --git 
a/tests/pytorch/utils.py b/tests/pytorch/utils.py index 317240fb78..929f02453d 100644 --- a/tests/pytorch/utils.py +++ b/tests/pytorch/utils.py @@ -6,8 +6,9 @@ import logging import os +import subprocess from contextlib import contextmanager -from typing import Optional, Tuple, Dict, Any, List +from typing import Optional, Sequence, Tuple, Dict, Any, List from packaging.version import Version as PkgVersion import torch @@ -407,3 +408,34 @@ def assert_close_grads( assert actual is not None assert expected is not None assert_close(actual.grad, expected.grad, **kwargs) + + +def run_distributed( + args: Sequence[str], + *, + valid_returncodes: Sequence[int] = (0,), + **kwargs, +) -> subprocess.CompletedProcess: + """Run a distributed subprocess with stderr capture for better error reporting. + + stdout streams to the terminal in real time for interactive debugging. + On failure, stderr (containing Python tracebacks) is included in the + AssertionError so pytest writes it into the JUnit XML report. + + Args: + args: Command and arguments to run. + valid_returncodes: Return codes considered success (default: (0,)). + Use (0, 5) for inner pytest runs, where exit code 5 means no tests were collected. + **kwargs: Passed through to subprocess.run (e.g. env, timeout). + """ + result = subprocess.run(args, stderr=subprocess.PIPE, text=True, **kwargs) + if result.returncode not in valid_returncodes: + cmd_str = " ".join(str(a) for a in args) + msg = f"Command exited with code {result.returncode}:\n {cmd_str}\n" + if result.stderr: + stderr_tail = result.stderr[-4000:] + if len(result.stderr) > 4000: + stderr_tail = "... 
[truncated] ...\n" + stderr_tail + msg += f"\n--- stderr ---\n{stderr_tail}" + raise AssertionError(msg) + return result From fb9812c144bc997229f23c5b11dcb7a3f48456c1 Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Wed, 25 Mar 2026 18:38:33 -0700 Subject: [PATCH 2/2] Add JUnit XML output to ctest in L0_cppunittest Add --output-junit flag so ctest writes JUnit XML to $XML_LOG_DIR (default: /logs), matching the pattern used by pytest tests. The XML is written before ctest exits, so it's captured even on test failure. Signed-off-by: Sudhakar Singh --- qa/L0_cppunittest/test.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qa/L0_cppunittest/test.sh b/qa/L0_cppunittest/test.sh index 0b83747c0e..c7499282f4 100755 --- a/qa/L0_cppunittest/test.sh +++ b/qa/L0_cppunittest/test.sh @@ -4,6 +4,9 @@ set -e +: ${XML_LOG_DIR:=/logs} +mkdir -p "$XML_LOG_DIR" + # Find TE : ${TE_PATH:=/opt/transformerengine} TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}') @@ -17,4 +20,4 @@ cd $TE_PATH/tests/cpp cmake -GNinja -Bbuild . cmake --build build export OMP_NUM_THREADS=$((NUM_PHYSICAL_CORES / NUM_PARALLEL_JOBS)) -ctest --test-dir build -j$NUM_PARALLEL_JOBS +ctest --test-dir build -j$NUM_PARALLEL_JOBS --output-junit $XML_LOG_DIR/ctest_cppunittest.xml