diff --git a/qa/L0_cppunittest/test.sh b/qa/L0_cppunittest/test.sh index 0b83747c0e..c7499282f4 100755 --- a/qa/L0_cppunittest/test.sh +++ b/qa/L0_cppunittest/test.sh @@ -4,6 +4,9 @@ set -e +: ${XML_LOG_DIR:=/logs} +mkdir -p "$XML_LOG_DIR" + # Find TE : ${TE_PATH:=/opt/transformerengine} TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}') @@ -17,4 +20,4 @@ cd $TE_PATH/tests/cpp cmake -GNinja -Bbuild . cmake --build build export OMP_NUM_THREADS=$((NUM_PHYSICAL_CORES / NUM_PARALLEL_JOBS)) -ctest --test-dir build -j$NUM_PARALLEL_JOBS +ctest --test-dir build -j$NUM_PARALLEL_JOBS --output-junit $XML_LOG_DIR/ctest_cppunittest.xml diff --git a/tests/pytorch/attention/test_attention_with_cp.py b/tests/pytorch/attention/test_attention_with_cp.py index ecd0090a3b..5aaf67061b 100644 --- a/tests/pytorch/attention/test_attention_with_cp.py +++ b/tests/pytorch/attention/test_attention_with_cp.py @@ -22,7 +22,7 @@ _current_file = pathlib.Path(__file__).resolve() sys.path.append(str(_current_file.parent.parent)) -from utils import ModelConfig, get_available_attention_backends +from utils import ModelConfig, get_available_attention_backends, run_distributed pytest_logging_level = logging.getLevelName(logging.root.level) @@ -125,7 +125,7 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type): if not flash_attn_supported: pytest.skip("No attention backend available.") - subprocess.run( + run_distributed( get_bash_arguments( num_gpus_per_node=num_gpus, dtype=dtype, @@ -135,7 +135,6 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type): cp_comm_type=cp_comm_type, log_level=pytest_logging_level, ), - check=True, ) @@ -368,7 +367,7 @@ def test_cp_with_fused_attention( if not fused_attn_supported: pytest.skip("No attention backend available.") - subprocess.run( + run_distributed( get_bash_arguments( num_gpus_per_node=num_gpus, dtype=dtype, @@ -384,5 +383,4 @@ def 
test_cp_with_fused_attention( is_training=is_training, log_level=pytest_logging_level, ), - check=True, ) diff --git a/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py b/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py index 1606641b78..7de6142537 100644 --- a/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py +++ b/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py @@ -10,6 +10,9 @@ import sys import pathlib +sys.path.append(str(pathlib.Path(__file__).resolve().parent.parent)) +from utils import run_distributed + import pytest import torch from torch import nn @@ -1207,7 +1210,7 @@ def test_nvfp4_partial_cast_matches_full(world_size: int) -> None: current_file, "--parallel-nvfp4-partial", ] - subprocess.run(command, check=True) + run_distributed(command) def test_single_gpu_partial_cast_vs_full(): diff --git a/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py b/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py index 603433e0da..3dcefd46fd 100644 --- a/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py +++ b/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py @@ -38,7 +38,7 @@ # Import utility functions _current_file = pathlib.Path(__file__).resolve() sys.path.append(str(_current_file.parent.parent)) -from utils import dtype_tols, make_recipe, str_to_dtype +from utils import dtype_tols, make_recipe, run_distributed, str_to_dtype # Check if FP8 is supported fp8_available, reason_for_no_fp8 = te.is_fp8_available(return_reason=True) @@ -463,7 +463,7 @@ def test_fuser_ops_with_userbuffers( env["NVTE_ALLOW_NONDETERMINISTIC_ALGO"] = "0" # Launch parallel job - result = subprocess.run(command, check=True, env=env) + run_distributed(command, env=env) def main() -> None: diff --git a/tests/pytorch/distributed/test_torch_fsdp2.py b/tests/pytorch/distributed/test_torch_fsdp2.py index aca8d6d692..b0a364905f 100644 --- a/tests/pytorch/distributed/test_torch_fsdp2.py +++ 
b/tests/pytorch/distributed/test_torch_fsdp2.py @@ -3,9 +3,13 @@ # See LICENSE for license information. import os +import sys import subprocess from pathlib import Path +sys.path.append(str(Path(__file__).resolve().parent.parent)) +from utils import run_distributed + import pytest import torch @@ -20,7 +24,7 @@ def test_fsdp2_model_tests(): """All FSDP2 model tests (parametrized internally by recipe, fp8_init, sharding, layer).""" test_path = _FSDP2_DIR / "run_fsdp2_model.py" - result = subprocess.run( + run_distributed( [ "torchrun", f"--nproc_per_node={NUM_PROCS}", @@ -32,10 +36,10 @@ def test_fsdp2_model_tests(): "-s", "--tb=short", ], + valid_returncodes=(0, 5), env=os.environ, timeout=600, ) - assert result.returncode in (0, 5), f"Inner pytest failed with exit code {result.returncode}" @pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs") @@ -44,7 +48,7 @@ def test_fsdp2_fused_adam_tests(): """All FSDP2 FusedAdam tests (parametrized internally by recipe, test variant).""" test_path = _FSDP2_DIR / "run_fsdp2_fused_adam.py" nproc = min(NUM_PROCS, 2) - result = subprocess.run( + run_distributed( [ "torchrun", f"--nproc_per_node={nproc}", @@ -56,10 +60,10 @@ def test_fsdp2_fused_adam_tests(): "-s", "--tb=short", ], + valid_returncodes=(0, 5), env=os.environ, timeout=600, ) - assert result.returncode in (0, 5), f"Inner pytest failed with exit code {result.returncode}" def test_dummy() -> None: diff --git a/tests/pytorch/utils.py b/tests/pytorch/utils.py index 317240fb78..929f02453d 100644 --- a/tests/pytorch/utils.py +++ b/tests/pytorch/utils.py @@ -6,8 +6,9 @@ import logging import os +import subprocess from contextlib import contextmanager -from typing import Optional, Tuple, Dict, Any, List +from typing import Optional, Sequence, Tuple, Dict, Any, List from packaging.version import Version as PkgVersion import torch @@ -407,3 +408,34 @@ def assert_close_grads( assert actual is not None assert expected is not None assert_close(actual.grad, expected.grad, 
def run_distributed(
    args: Sequence[str],
    *,
    valid_returncodes: Sequence[int] = (0,),
    **kwargs,
) -> subprocess.CompletedProcess:
    """Run a distributed subprocess with stderr capture for better error reporting.

    stdout streams to the terminal in real time for interactive debugging.
    On failure, the captured stderr (containing Python tracebacks) is included
    in the AssertionError so pytest writes it into the JUnit XML report.

    Args:
        args: Command and arguments to run.
        valid_returncodes: Return codes considered success (default: (0,)).
            Use (0, 5) for inner pytest runs, where 5 is pytest's
            EXIT_NOTESTSCOLLECTED ("no tests were collected").
        **kwargs: Passed through to subprocess.run (e.g. env, timeout).
            stderr and text may be overridden; they default to
            subprocess.PIPE and True respectively.

    Returns:
        The subprocess.CompletedProcess of the finished command.

    Raises:
        AssertionError: If the return code is not in ``valid_returncodes``.
    """
    # setdefault (rather than passing the keywords directly) lets a caller
    # override stderr/text via **kwargs without a duplicate-keyword TypeError.
    kwargs.setdefault("stderr", subprocess.PIPE)
    kwargs.setdefault("text", True)
    result = subprocess.run(args, **kwargs)
    if result.returncode not in valid_returncodes:
        cmd_str = " ".join(str(a) for a in args)
        msg = f"Command exited with code {result.returncode}:\n {cmd_str}\n"
        # result.stderr is None when the caller overrode stderr handling.
        if result.stderr:
            # Keep only the tail: the traceback sits at the end, and the
            # JUnit XML report should stay a manageable size.
            stderr_tail = result.stderr[-4000:]
            if len(result.stderr) > 4000:
                stderr_tail = "... [truncated] ...\n" + stderr_tail
            msg += f"\n--- stderr ---\n{stderr_tail}"
        raise AssertionError(msg)
    return result