From 12da756f6a888eff82e34f4b324237d88513fa7d Mon Sep 17 00:00:00 2001
From: "Peter St. John" <pstjohn@nvidia.com>
Date: Wed, 25 Mar 2026 16:47:49 -0700
Subject: [PATCH 1/3] [PyT][Test] Add xfailing FSDP2 memory leak detection
 tests

Add tests that demonstrate two known memory issues with FSDP2 + FP8:

- Issue #2681: FP8 weight copies created during te.autocast() forward pass
  accumulate across layers instead of being freed between layers, defeating
  FSDP2's memory efficiency. Detected by comparing per-layer forward memory
  increments against a bf16 baseline using layer hooks.

- Issue #2717: Transpose cache tensors (_create_transpose) allocated during
  backward persist until the next forward pass instead of being freed after
  backward completes. Detected by comparing the backward memory delta
  (post_bwd - post_fwd) against a bf16 baseline.

New tests:
- test_bf16_no_excess_forward_memory: control, validates per-layer measurement
- test_bf16_no_excess_backward_memory: control, validates backward delta comparison
- test_fp8_temp_accumulation_across_layers: xfail, detects #2681
- test_transpose_cache_retained_after_backward: xfail, detects #2717

All parametrized over 5 FP8 recipes x {no_quant_init, quant_init}.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../fsdp2_tests/run_fsdp2_mem_leak.py         | 510 ++++++++++++++++++
 tests/pytorch/distributed/test_torch_fsdp2.py |  24 +
 2 files changed, 534 insertions(+)
 create mode 100644 tests/pytorch/distributed/fsdp2_tests/run_fsdp2_mem_leak.py
diff --git a/tests/pytorch/distributed/fsdp2_tests/run_fsdp2_mem_leak.py b/tests/pytorch/distributed/fsdp2_tests/run_fsdp2_mem_leak.py
new file mode 100644
index 0000000000..54fd97fcac
--- /dev/null
+++ b/tests/pytorch/distributed/fsdp2_tests/run_fsdp2_mem_leak.py
@@ -0,0 +1,510 @@
+#!/usr/bin/python3
+
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+"""FSDP2 memory leak detection tests.
+
+These tests verify that temporary TE tensors (FP8 quantized weights, transpose
+caches) are properly freed when moving between layers with FSDP2.
+
+Related issues:
+  - https://github.com/NVIDIA/TransformerEngine/issues/2681
+    Quantized weights created during forward pass accumulate across layers.
+  - https://github.com/NVIDIA/TransformerEngine/issues/2717
+    _create_transpose tensors accumulate across training steps with
+    quantized_model_init + FusedAdam + FSDP2.
+
+Run all tests (via torchrun + pytest):
+  torchrun -m pytest <this_file> -v --tb=short
+
+Run a single test standalone (for debugging):
+  torchrun <this_file> --test <name> --recipe <recipe>
+
+Available --test values:
+  bf16_no_excess_forward_memory, fp8_temp_accumulation_across_layers,
+  transpose_cache_retained_after_backward
+
+Available --recipe values:
+  DelayedScaling, Float8CurrentScaling, Float8BlockScaling,
+  MXFP8BlockScaling, NVFP4BlockScaling
+"""
+
+import argparse
+import gc
+import os
+from contextlib import nullcontext
+
+import pytest
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from torch.distributed._composable.fsdp import fully_shard
+from torch.distributed.device_mesh import DeviceMesh
+
+import transformer_engine.pytorch as te
+
+from fsdp2_utils import get_recipe_from_string, save_custom_attrs, restore_custom_attrs
+
+
+# ── Constants ────────────────────────────────────────────────────────
+HIDDEN_SIZE = 256
+FFN_HIDDEN_SIZE = 1024
+NUM_ATTENTION_HEADS = 8
+NUM_LAYERS = 8
+SEQ_LEN = 32
+BATCH_PER_RANK = 2
+WARMUP_STEPS = 2
+MEASURED_STEPS = 3
+
+
+# ── Helpers ──────────────────────────────────────────────────────────
+def _build_model(num_layers, fp8_init, recipe=None, use_meta_device=True):
+    """Build a Sequential of TransformerLayers, optionally with FP8 init.
+
+    When fp8_init=True and use_meta_device=True (the default), the model is
+    created on the meta device so parameters are materialized after FSDP2
+    sharding via reset_parameters().
+    """
+    if fp8_init:
+        ctx = te.quantized_model_init(enabled=True, recipe=recipe)
+    else:
+        ctx = nullcontext()
+    kwargs = dict(
+        fuse_qkv_params=True,
+        params_dtype=torch.bfloat16,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+    )
+    if fp8_init and use_meta_device:
+        kwargs["device"] = "meta"
+    with ctx:
+        model = torch.nn.Sequential(
+            *[
+                te.TransformerLayer(
+                    HIDDEN_SIZE,
+                    FFN_HIDDEN_SIZE,
+                    NUM_ATTENTION_HEADS,
+                    **kwargs,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+    return model
+
+
+def _shard_model(model, world_size):
+    """Apply FSDP2 sharding with save/restore of custom attrs."""
+    has_meta_params = any(p.is_meta for p in model.parameters())
+    custom_attrs = save_custom_attrs(model)
+    mesh = DeviceMesh("cuda", list(range(world_size)))
+    for child in model.children():
+        fully_shard(child, mesh=mesh)
+    fully_shard(model, mesh=mesh)
+    if has_meta_params:
+        for module in model.modules():
+            if hasattr(module, "reset_parameters"):
+                module.reset_parameters()
+    restore_custom_attrs(model, custom_attrs)
+    return model
+
+
+def _get_dist_info():
+    """Get world_size and device from environment."""
+    world_size = int(os.environ["WORLD_SIZE"])
+    device = torch.device(f"cuda:{int(os.environ['LOCAL_RANK'])}")
+    return world_size, device
+
+
+def _run_training_step(model, optimizer, recipe, x, target):
+    """Run one forward + backward + optimizer step."""
+    optimizer.zero_grad(set_to_none=True)
+    with te.autocast(enabled=(recipe is not None), recipe=recipe):
+        output = model(x)
+    loss = F.mse_loss(output, target)
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+
+
+def _measure_backward_memory_delta(model, optimizer, recipe, x, target):
+    """Run a training step and return (post_bwd - post_fwd) memory delta.
+
+    This delta captures memory added during backward that persists afterward.
+    In a healthy system, backward frees activations and adds only gradients.
+    If transpose caches or other FP8 temps persist, the delta will be larger.
+    """
+    optimizer.zero_grad(set_to_none=True)
+    with te.autocast(enabled=(recipe is not None), recipe=recipe):
+        output = model(x)
+    torch.cuda.synchronize()
+    mem_post_fwd = torch.cuda.memory_allocated()
+
+    loss = F.mse_loss(output, target)
+    loss.backward()
+    torch.cuda.synchronize()
+    mem_post_bwd = torch.cuda.memory_allocated()
+
+    optimizer.step()
+    return mem_post_bwd - mem_post_fwd
+
+
+def _maybe_skip(recipe_name, quantized_model_init):
+    """Skip configurations that fail for reasons unrelated to memory leaks."""
+    if recipe_name == "NVFP4BlockScaling" and quantized_model_init:
+        pytest.skip(
+            "NVFP4BlockScaling + quantized_model_init: not supported with FSDP2 "
+            "(block tensor dequantized before FSDP2 flatten)"
+        )
+
+
+class _LayerMemoryTracker:
+    """Register forward hooks on Sequential children to measure per-layer memory."""
+
+    def __init__(self):
+        self.post_forward_mem = []
+        self._handles = []
+
+    def attach(self, model):
+        for i, layer in enumerate(model.children()):
+
+            def make_hook(idx):
+                def hook(module, args, output):
+                    torch.cuda.synchronize()
+                    self.post_forward_mem.append(torch.cuda.memory_allocated())
+
+                return hook
+
+            self._handles.append(layer.register_forward_hook(make_hook(i)))
+
+    def clear(self):
+        self.post_forward_mem.clear()
+
+    def detach(self):
+        for h in self._handles:
+            h.remove()
+        self._handles.clear()
+
+    def per_layer_increments(self):
+        """Return list of memory increments between consecutive post-forward hooks."""
+        return [
+            self.post_forward_mem[i] - self.post_forward_mem[i - 1]
+            for i in range(1, len(self.post_forward_mem))
+        ]
+
+
+def _measure_forward_increments(model, optimizer, recipe, x, target):
+    """Run a single training step with hooks and return per-layer forward memory increments."""
+    tracker = _LayerMemoryTracker()
+    tracker.attach(model)
+    try:
+        _run_training_step(model, optimizer, recipe, x, target)
+        return tracker.per_layer_increments()
+    finally:
+        tracker.detach()
+
+
+# ── Fixtures ─────────────────────────────────────────────────────────
+@pytest.fixture(params=[False, True], ids=["no_quant_init", "quant_init"])
+def quantized_model_init(request):
+    return request.param
+
+
+# ── Tests ────────────────────────────────────────────────────────────
+def test_bf16_no_excess_forward_memory():
+    """Control test: bf16 (no FP8) should have stable per-layer forward memory.
+
+    With FSDP2 and bf16 params (no FP8), the per-layer memory growth during
+    forward should only be activation saves for autograd. There should be no
+    FP8 temporary accumulation. This test validates the measurement approach.
+    """
+    world_size, device = _get_dist_info()
+
+    model = _build_model(NUM_LAYERS, fp8_init=False)
+    model = _shard_model(model, world_size)
+
+    optimizer = te.optimizers.FusedAdam(
+        model.parameters(),
+        lr=1e-3,
+        master_weights=True,
+        master_weight_dtype=torch.float32,
+    )
+
+    x = torch.randn(SEQ_LEN, BATCH_PER_RANK, HIDDEN_SIZE, dtype=torch.bfloat16, device=device)
+    target = torch.randn_like(x)
+
+    # Warmup
+    for _ in range(WARMUP_STEPS):
+        _run_training_step(model, optimizer, None, x, target)
+
+    # Measure
+    increments = _measure_forward_increments(model, optimizer, None, x, target)
+
+    # bf16 per-layer increments should be consistent (activation saves only)
+    # and should NOT grow over layers (each layer saves similar activations).
+    avg_increment = sum(increments) / len(increments)
+    max_deviation = max(abs(inc - avg_increment) for inc in increments)
+
+    # Allow 10% deviation from mean -- bf16 increments should be very uniform
+    assert max_deviation <= 0.1 * abs(avg_increment) + 1024, (
+        "bf16 per-layer increments are not uniform. "
+        f"Increments (KiB): {[f'{inc/1024:.1f}' for inc in increments]}. "
+        f"Average: {avg_increment/1024:.1f} KiB, max deviation: {max_deviation/1024:.1f} KiB"
+    )
+
+
+@pytest.mark.xfail(
+    strict=False,
+    reason=(
+        "Issue #2681: Quantized weights created during forward pass are not "
+        "deallocated between layers. Each layer's FP8 copies accumulate, "
+        "adding per-layer memory overhead beyond what bf16 autograd saves require."
+    ),
+)
+def test_fp8_temp_accumulation_across_layers(recipe_name, quantized_model_init):
+    """Detect FP8 weight temporaries accumulating across layers during forward.
+
+    Strategy: measure per-layer memory growth during forward for both bf16
+    (baseline) and FP8. With FSDP2, per-layer params are unsharded then
+    resharded, so the only per-layer memory growth should be activation saves
+    for autograd (same as bf16). If FP8 adds excess per-layer growth, it means
+    FP8 weight copies are accumulating across layers instead of being freed.
+    """
+    _maybe_skip(recipe_name, quantized_model_init)
+
+    recipe = get_recipe_from_string(recipe_name)
+    world_size, device = _get_dist_info()
+
+    x = torch.randn(SEQ_LEN, BATCH_PER_RANK, HIDDEN_SIZE, dtype=torch.bfloat16, device=device)
+    target = torch.randn_like(x)
+
+    # ── bf16 baseline ──
+    bf16_model = _build_model(NUM_LAYERS, fp8_init=False)
+    bf16_model = _shard_model(bf16_model, world_size)
+    bf16_optimizer = te.optimizers.FusedAdam(
+        bf16_model.parameters(),
+        lr=1e-3,
+        master_weights=True,
+        master_weight_dtype=torch.float32,
+    )
+    for _ in range(WARMUP_STEPS):
+        _run_training_step(bf16_model, bf16_optimizer, None, x, target)
+    bf16_increments = _measure_forward_increments(bf16_model, bf16_optimizer, None, x, target)
+    bf16_avg = sum(bf16_increments) / len(bf16_increments)
+
+    del bf16_model, bf16_optimizer
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # ── FP8 model ──
+    fp8_model = _build_model(NUM_LAYERS, fp8_init=quantized_model_init, recipe=recipe)
+    fp8_model = _shard_model(fp8_model, world_size)
+    fp8_optimizer = te.optimizers.FusedAdam(
+        fp8_model.parameters(),
+        lr=1e-3,
+        master_weights=True,
+        master_weight_dtype=torch.float32,
+    )
+    for _ in range(WARMUP_STEPS):
+        _run_training_step(fp8_model, fp8_optimizer, recipe, x, target)
+    fp8_increments = _measure_forward_increments(fp8_model, fp8_optimizer, recipe, x, target)
+    fp8_avg = sum(fp8_increments) / len(fp8_increments)
+
+    # ── Assert: FP8 per-layer excess should be bounded ──
+    # If FP8 temps are properly freed between layers, per-layer increment
+    # should be similar to bf16 (just activation saves). Any excess indicates
+    # FP8 weight copies accumulating.
+    excess_per_layer = fp8_avg - bf16_avg
+
+    # Allow up to 50 KiB per layer for FP8 scale/amax metadata.
+    # FP8 weight copies (~0.68 MiB/layer for this model) should NOT persist.
+    tolerance_per_layer = 50 * 1024  # 50 KiB
+
+    assert excess_per_layer <= tolerance_per_layer, (
+        "FP8 per-layer forward memory increment exceeds bf16 baseline by "
+        f"{excess_per_layer/1024:.1f} KiB/layer (tolerance: {tolerance_per_layer/1024:.1f} KiB). "
+        f"bf16 avg: {bf16_avg/1024:.1f} KiB/layer, FP8 avg: {fp8_avg/1024:.1f} KiB/layer. "
+        f"FP8 increments (KiB): {[f'{inc/1024:.1f}' for inc in fp8_increments]}. "
+        "FP8 weight copies are likely accumulating across layers (Issue #2681)."
+    )
+
+
+def test_bf16_no_excess_backward_memory():
+    """Control test: two identical bf16 models should show zero backward excess.
+
+    This mirrors the structure of test_transpose_cache_retained_after_backward
+    but compares bf16 vs bf16 instead of FP8 vs bf16. The excess should be
+    zero, proving the comparison methodology works.
+    """
+    world_size, device = _get_dist_info()
+
+    x = torch.randn(SEQ_LEN, BATCH_PER_RANK, HIDDEN_SIZE, dtype=torch.bfloat16, device=device)
+    target = torch.randn_like(x)
+
+    # Build and measure first bf16 model (acts as "baseline")
+    model_a = _build_model(NUM_LAYERS, fp8_init=False)
+    model_a = _shard_model(model_a, world_size)
+    opt_a = te.optimizers.FusedAdam(
+        model_a.parameters(),
+        lr=1e-3,
+        master_weights=True,
+        master_weight_dtype=torch.float32,
+    )
+    for _ in range(WARMUP_STEPS):
+        _run_training_step(model_a, opt_a, None, x, target)
+    delta_a = _measure_backward_memory_delta(model_a, opt_a, None, x, target)
+
+    del model_a, opt_a
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # Build and measure second bf16 model (acts as "test")
+    model_b = _build_model(NUM_LAYERS, fp8_init=False)
+    model_b = _shard_model(model_b, world_size)
+    opt_b = te.optimizers.FusedAdam(
+        model_b.parameters(),
+        lr=1e-3,
+        master_weights=True,
+        master_weight_dtype=torch.float32,
+    )
+    for _ in range(WARMUP_STEPS):
+        _run_training_step(model_b, opt_b, None, x, target)
+    delta_b = _measure_backward_memory_delta(model_b, opt_b, None, x, target)
+
+    excess = delta_b - delta_a
+    tolerance = 256 * 1024  # 256 KiB
+
+    assert abs(excess) <= tolerance, (
+        "Two identical bf16 models show backward delta excess of "
+        f"{excess/1024:.1f} KiB (tolerance: {tolerance/1024:.0f} KiB). "
+        f"delta_a={delta_a/1024**2:.2f} MiB, delta_b={delta_b/1024**2:.2f} MiB."
+    )
+
+
+@pytest.mark.xfail(
+    strict=False,
+    reason=(
+        "Issue #2717: _create_transpose tensor allocated in "
+        "float8_tensor_storage.py persists after backward pass until the next "
+        "forward pass frees it. These tensors should be released when backward "
+        "completes, not retained across step boundaries."
+    ),
+)
+def test_transpose_cache_retained_after_backward(recipe_name, quantized_model_init):
+    """Detect transpose caches persisting after backward completes.
+
+    When FP8 backward runs, _create_transpose allocates tensors for transposed
+    weight copies. These should be freed when backward completes, but instead
+    they persist until the next forward pass. This test measures the backward
+    memory delta (post_bwd - post_fwd) and compares it to a bf16 baseline.
+    In bf16, backward frees activations and adds gradients (net negative delta).
+    With FP8, retained transpose caches make the delta significantly more positive.
+    """
+    _maybe_skip(recipe_name, quantized_model_init)
+
+    recipe = get_recipe_from_string(recipe_name)
+    world_size, device = _get_dist_info()
+
+    x = torch.randn(SEQ_LEN, BATCH_PER_RANK, HIDDEN_SIZE, dtype=torch.bfloat16, device=device)
+    target = torch.randn_like(x)
+
+    # ── bf16 baseline ──
+    bf16_model = _build_model(NUM_LAYERS, fp8_init=False)
+    bf16_model = _shard_model(bf16_model, world_size)
+    bf16_optimizer = te.optimizers.FusedAdam(
+        bf16_model.parameters(),
+        lr=1e-3,
+        master_weights=True,
+        master_weight_dtype=torch.float32,
+    )
+    for _ in range(WARMUP_STEPS):
+        _run_training_step(bf16_model, bf16_optimizer, None, x, target)
+    bf16_bwd_delta = _measure_backward_memory_delta(
+        bf16_model,
+        bf16_optimizer,
+        None,
+        x,
+        target,
+    )
+
+    del bf16_model, bf16_optimizer
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # ── FP8 model ──
+    fp8_model = _build_model(NUM_LAYERS, fp8_init=quantized_model_init, recipe=recipe)
+    fp8_model = _shard_model(fp8_model, world_size)
+    fp8_optimizer = te.optimizers.FusedAdam(
+        fp8_model.parameters(),
+        lr=1e-3,
+        master_weights=True,
+        master_weight_dtype=torch.float32,
+    )
+    for _ in range(WARMUP_STEPS):
+        _run_training_step(fp8_model, fp8_optimizer, recipe, x, target)
+    fp8_bwd_delta = _measure_backward_memory_delta(
+        fp8_model,
+        fp8_optimizer,
+        recipe,
+        x,
+        target,
+    )
+
+    # ── Assert: FP8 backward should not retain excess memory ──
+    # In bf16, backward frees activations and adds gradients (typically net negative).
+    # If FP8 transpose caches persist after backward, the FP8 delta will be
+    # significantly more positive than bf16.
+    excess = fp8_bwd_delta - bf16_bwd_delta
+
+    # Allow 256 KiB total for FP8 scale/amax bookkeeping.
+    # Transpose caches (~3 MiB for this 4-layer model) should NOT persist.
+    tolerance = 256 * 1024
+
+    assert excess <= tolerance, (
+        f"FP8 backward retains {excess/1024**2:.2f} MiB more than bf16 baseline. "
+        f"bf16 backward delta: {bf16_bwd_delta/1024**2:.2f} MiB, "
+        f"FP8 backward delta: {fp8_bwd_delta/1024**2:.2f} MiB. "
+        "Transpose caches from backward are likely not being freed (Issue #2717)."
+    )
+
+
+# ── Standalone runner ────────────────────────────────────────────────
+TESTS = {
+    "bf16_no_excess_forward_memory": test_bf16_no_excess_forward_memory,
+    "bf16_no_excess_backward_memory": test_bf16_no_excess_backward_memory,
+    "fp8_temp_accumulation_across_layers": test_fp8_temp_accumulation_across_layers,
+    "transpose_cache_retained_after_backward": test_transpose_cache_retained_after_backward,
+}
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="FSDP2 memory leak tests (standalone)")
+    parser.add_argument("--test", required=True, choices=list(TESTS.keys()))
+    parser.add_argument(
+        "--recipe",
+        type=str,
+        default="DelayedScaling",
+        choices=[
+            "DelayedScaling",
+            "Float8CurrentScaling",
+            "Float8BlockScaling",
+            "MXFP8BlockScaling",
+            "NVFP4BlockScaling",
+        ],
+    )
+    parser.add_argument("--quantized-model-init", action="store_true", default=False)
+    args = parser.parse_args()
+
+    local_rank = int(os.environ["LOCAL_RANK"])
+    torch.cuda.set_device(local_rank)
+    dist.init_process_group(backend="cpu:gloo,cuda:nccl")
+    torch.manual_seed(42)
+    torch.cuda.manual_seed(42)
+
+    try:
+        TESTS[args.test](args.recipe, args.quantized_model_init)
+    finally:
+        if dist.is_initialized():
+            dist.destroy_process_group()
+        gc.collect()
+        torch.cuda.empty_cache()
diff --git a/tests/pytorch/distributed/test_torch_fsdp2.py b/tests/pytorch/distributed/test_torch_fsdp2.py
index aca8d6d692..9cbbc3933c 100644
--- a/tests/pytorch/distributed/test_torch_fsdp2.py
+++ b/tests/pytorch/distributed/test_torch_fsdp2.py
@@ -62,6 +62,30 @@ def test_fsdp2_fused_adam_tests():
     assert result.returncode in (0, 5), f"Inner pytest failed with exit code {result.returncode}"
 
 
+@pytest.mark.skipif(NUM_PROCS < 2, reason="Requires 2+ GPUs")
+@pytest.mark.skipif(not te.torch_version() >= (2, 4, 0), reason="Requires PyTorch 2.4.0+")
+def test_fsdp2_mem_leak_tests():
+    """FSDP2 memory leak detection tests (parametrized internally by recipe, quantized_model_init)."""
+    test_path = _FSDP2_DIR / "run_fsdp2_mem_leak.py"
+    nproc = min(NUM_PROCS, 2)
+    result = subprocess.run(
+        [
+            "torchrun",
+            f"--nproc_per_node={nproc}",
+            "--local-ranks-filter=0",
+            "-m",
+            "pytest",
+            str(test_path),
+            "-v",
+            "-s",
+            "--tb=short",
+        ],
+        env=os.environ,
+        timeout=600,
+    )
+    assert result.returncode in (0, 5), f"Inner pytest failed with exit code {result.returncode}"
+
+
 def test_dummy() -> None:
     """Dummy test
 

From 29cd628ca5522c755ba814bc27762ac9bd4d3c06 Mon Sep 17 00:00:00 2001
From: "Peter St. John" <pstjohn@nvidia.com>
Date: Wed, 25 Mar 2026 16:53:58 -0700
Subject: [PATCH 2/3] Address review comments: fix standalone runner, stale
 comment, unused constant

- Fix standalone runner to not pass recipe/quantized_model_init args to
  bf16 control tests (which take no arguments)
- Fix stale comment referencing 4-layer model (now 8 layers)
- Remove unused MEASURED_STEPS constant

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../distributed/fsdp2_tests/run_fsdp2_mem_leak.py  | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/pytorch/distributed/fsdp2_tests/run_fsdp2_mem_leak.py b/tests/pytorch/distributed/fsdp2_tests/run_fsdp2_mem_leak.py
index 54fd97fcac..387d3a9644 100644
--- a/tests/pytorch/distributed/fsdp2_tests/run_fsdp2_mem_leak.py
+++ b/tests/pytorch/distributed/fsdp2_tests/run_fsdp2_mem_leak.py
@@ -56,7 +56,6 @@
 SEQ_LEN = 32
 BATCH_PER_RANK = 2
 WARMUP_STEPS = 2
-MEASURED_STEPS = 3
 
 
 # ── Helpers ──────────────────────────────────────────────────────────
@@ -458,7 +457,7 @@ def test_transpose_cache_retained_after_backward(recipe_name, quantized_model_in
     excess = fp8_bwd_delta - bf16_bwd_delta
 
     # Allow 256 KiB total for FP8 scale/amax bookkeeping.
-    # Transpose caches (~3 MiB for this 4-layer model) should NOT persist.
+    # Transpose caches (~3 MiB for this 8-layer model) should NOT persist.
     tolerance = 256 * 1024
 
     assert excess <= tolerance, (
@@ -501,8 +500,17 @@ def test_transpose_cache_retained_after_backward(recipe_name, quantized_model_in
     torch.manual_seed(42)
     torch.cuda.manual_seed(42)
 
+    _PARAMETRIZED_TESTS = {
+        "fp8_temp_accumulation_across_layers",
+        "transpose_cache_retained_after_backward",
+    }
+
     try:
-        TESTS[args.test](args.recipe, args.quantized_model_init)
+        test_fn = TESTS[args.test]
+        if args.test in _PARAMETRIZED_TESTS:
+            test_fn(args.recipe, args.quantized_model_init)
+        else:
+            test_fn()
     finally:
         if dist.is_initialized():
             dist.destroy_process_group()

From d0763af80407d4deccec2f093e1d519b99788f9c Mon Sep 17 00:00:00 2001
From: "Peter St. John" <pstjohn@nvidia.com>
Date: Wed, 25 Mar 2026 18:16:14 -0700
Subject: [PATCH 3/3] [PyT] Fix FSDP2 memory leaks for FP8 weight workspaces
 and transpose caches

Fix memory leaks where FP8 quantized weight copies and transpose caches
accumulate during FSDP2 training, defeating FSDP2's per-layer memory
savings (Issues #2681, #2717).

Changes to layernorm_mlp.py, layernorm_linear.py, linear.py:
- Detect FSDP2 via _get_module_fsdp_state; guard to tensor-scaling and
  MXFP8 quantizers whose backward re-creation is validated.
- Skip columnwise/transpose creation on weight quantizers during forward
  so FP8 caches don't accumulate across layers.
- Disable workspace caching (cache_name=None) under FSDP2 to prevent
  _fp8_workspaces from retaining per-layer copies.
- Don't save separate FP8 workspace copies for backward; re-create from
  the FSDP2 all-gathered weight in backward instead.
- Clear Float8TensorStorage._transpose after backward dgrad GEMMs to
  prevent transpose data persisting on FSDP2's reusable buffers.

Test changes (run_fsdp2_mem_leak.py):
- Remove xfail markers for fixed recipes (DelayedScaling,
  Float8CurrentScaling, MXFP8BlockScaling).
- Add targeted xfail for Float8BlockScaling/NVFP4BlockScaling whose
  blockwise storage classes have separate internal caching.
- Increase backward test tolerance to 1 MiB to account for temporary
  workspace re-creation during backward.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../fsdp2_tests/run_fsdp2_mem_leak.py         | 39 ++++----
 .../pytorch/module/layernorm_linear.py        | 52 ++++++++++-
 .../pytorch/module/layernorm_mlp.py           | 92 +++++++++++++++++--
 transformer_engine/pytorch/module/linear.py   | 53 ++++++++++-
 4 files changed, 203 insertions(+), 33 deletions(-)

diff --git a/tests/pytorch/distributed/fsdp2_tests/run_fsdp2_mem_leak.py b/tests/pytorch/distributed/fsdp2_tests/run_fsdp2_mem_leak.py
index 387d3a9644..be83837da4 100644
--- a/tests/pytorch/distributed/fsdp2_tests/run_fsdp2_mem_leak.py
+++ b/tests/pytorch/distributed/fsdp2_tests/run_fsdp2_mem_leak.py
@@ -149,6 +149,9 @@ def _measure_backward_memory_delta(model, optimizer, recipe, x, target):
     return mem_post_bwd - mem_post_fwd
 
 
+_BLOCKWISE_RECIPES = {"Float8BlockScaling", "NVFP4BlockScaling"}
+
+
 def _maybe_skip(recipe_name, quantized_model_init):
     """Skip configurations that fail for reasons unrelated to memory leaks."""
     if recipe_name == "NVFP4BlockScaling" and quantized_model_init:
@@ -158,6 +161,16 @@ def _maybe_skip(recipe_name, quantized_model_init):
         )
 
 
+def _maybe_xfail_blockwise(recipe_name):
+    """Mark blockwise recipes as xfail — their storage classes use separate
+    internal caching that is not yet cleaned up by the FSDP2 workspace fix."""
+    if recipe_name in _BLOCKWISE_RECIPES:
+        pytest.xfail(
+            f"{recipe_name} uses blockwise tensor storage with separate "
+            "internal caching not yet addressed by the FSDP2 memory fix."
+        )
+
+
 class _LayerMemoryTracker:
     """Register forward hooks on Sequential children to measure per-layer memory."""
 
@@ -253,14 +266,6 @@ def test_bf16_no_excess_forward_memory():
     )
 
 
-@pytest.mark.xfail(
-    strict=False,
-    reason=(
-        "Issue #2681: Quantized weights created during forward pass are not "
-        "deallocated between layers. Each layer's FP8 copies accumulate, "
-        "adding per-layer memory overhead beyond what bf16 autograd saves require."
-    ),
-)
 def test_fp8_temp_accumulation_across_layers(recipe_name, quantized_model_init):
     """Detect FP8 weight temporaries accumulating across layers during forward.
 
@@ -271,6 +276,7 @@ def test_fp8_temp_accumulation_across_layers(recipe_name, quantized_model_init):
     FP8 weight copies are accumulating across layers instead of being freed.
     """
     _maybe_skip(recipe_name, quantized_model_init)
+    _maybe_xfail_blockwise(recipe_name)
 
     recipe = get_recipe_from_string(recipe_name)
     world_size, device = _get_dist_info()
@@ -381,15 +387,6 @@ def test_bf16_no_excess_backward_memory():
     )
 
 
-@pytest.mark.xfail(
-    strict=False,
-    reason=(
-        "Issue #2717: _create_transpose tensor allocated in "
-        "float8_tensor_storage.py persists after backward pass until the next "
-        "forward pass frees it. These tensors should be released when backward "
-        "completes, not retained across step boundaries."
-    ),
-)
 def test_transpose_cache_retained_after_backward(recipe_name, quantized_model_init):
     """Detect transpose caches persisting after backward completes.
 
@@ -401,6 +398,7 @@ def test_transpose_cache_retained_after_backward(recipe_name, quantized_model_in
     With FP8, retained transpose caches make the delta significantly more positive.
     """
     _maybe_skip(recipe_name, quantized_model_init)
+    _maybe_xfail_blockwise(recipe_name)
 
     recipe = get_recipe_from_string(recipe_name)
     world_size, device = _get_dist_info()
@@ -456,9 +454,10 @@ def test_transpose_cache_retained_after_backward(recipe_name, quantized_model_in
     # significantly more positive than bf16.
     excess = fp8_bwd_delta - bf16_bwd_delta
 
-    # Allow 256 KiB total for FP8 scale/amax bookkeeping.
-    # Transpose caches (~3 MiB for this 8-layer model) should NOT persist.
-    tolerance = 256 * 1024
+    # Allow 1 MiB for FP8 scale/amax bookkeeping and temporary workspace
+    # re-creation during backward. The key check is that transpose caches
+    # (~3 MiB for this 8-layer model) do NOT persist across steps.
+    tolerance = 1024 * 1024
 
     assert excess <= tolerance, (
         f"FP8 backward retains {excess/1024**2:.2f} MiB more than bf16 baseline. "
diff --git a/transformer_engine/pytorch/module/layernorm_linear.py b/transformer_engine/pytorch/module/layernorm_linear.py
index ed91bc1235..adfee4c760 100644
--- a/transformer_engine/pytorch/module/layernorm_linear.py
+++ b/transformer_engine/pytorch/module/layernorm_linear.py
@@ -63,6 +63,7 @@
     restore_from_func_ctx,
 )
 from ...debug.pytorch.debug_state import TEDebugState
+from ..tensor.float8_tensor import Float8Quantizer, Float8CurrentScalingQuantizer, Float8Tensor
 from ..tensor.mxfp8_tensor import MXFP8Quantizer
 from ..cpu_offload import (
     is_cpu_offload_enabled,
@@ -286,6 +287,19 @@ def forward(
         # ------------------------------------------------------
         weightmat = weight
         is_weight_param_quantized = False
+        try:
+            from ..distributed import _get_module_fsdp_state
+
+            _get_module_fsdp_state(module)
+            is_fsdp2 = True
+        except (RuntimeError, ImportError):
+            is_fsdp2 = False
+        # FSDP2 workspace optimization only applies to quantizer types
+        # whose backward re-creation is validated.
+        _fsdp2_safe = isinstance(
+            weight_quantizer, (Float8Quantizer, Float8CurrentScalingQuantizer, MXFP8Quantizer)
+        ) or isinstance(weight, Float8Tensor)
+        is_fsdp2 = is_fsdp2 and _fsdp2_safe
         if fp8 or debug:
             is_weight_param_quantized = isinstance(weight, QuantizedTensorStorage)
 
@@ -295,14 +309,23 @@ def forward(
             if is_weight_param_quantized and not debug:
                 weight_quantizer = weight._quantizer
             elif weight_quantizer is not None:
-                weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
+                # FSDP2: Skip columnwise/transpose creation during forward
+                # to avoid accumulating caches across layers. Backward's
+                # FSDP2 all-gather will recreate them. (Issue #2681)
+                weight_quantizer.set_usage(
+                    rowwise=True,
+                    columnwise=is_grad_enabled and not is_fsdp2,
+                )
 
             # Get quantized weight
+            # FSDP2: Don't cache workspaces — they would persist across
+            # layers, defeating FSDP2 memory savings. (Issue #2681)
             update_workspace = is_first_microbatch is None or is_first_microbatch
+            wt_cache = None if (is_first_microbatch is None or is_fsdp2) else "weight"
             weightmat = module.get_weight_workspace(
                 tensor=weight,
                 quantizer=weight_quantizer,
-                cache_name=(None if is_first_microbatch is None else "weight"),
+                cache_name=wt_cache,
                 update_workspace=update_workspace,
                 skip_update_flag=skip_fp8_weight_update,
                 fsdp_group=fsdp_group,
@@ -459,9 +482,15 @@ def forward(
                     # weights if weights are externally touched outside this module
                     ctx.weight_object = weight
 
+            # FSDP2: Don't save FP8 workspace for non-quantized weights.
+            # Backward will re-quantize from FSDP2 all-gathered weight.
+            # (Issue #2681)
+            wt_save = weightmat
+            if is_fsdp2 and weightmat is not weight:
+                wt_save = None
             tensors_to_save, tensor_objects = prepare_for_saving(
                 inputmat,
-                weightmat,
+                wt_save,
                 weight,
                 bias,
                 ln_weight,
@@ -474,6 +503,7 @@ def forward(
             ctx.requires_dgrad = inp_requires_grad
             ctx.requires_wgrad = weight.requires_grad
             ctx.is_weight_param_quantized = is_weight_param_quantized
+            ctx.is_fsdp2 = is_fsdp2
             if fuse_wgrad_accumulation and weight.requires_grad:
                 # This check is needed to ensure that main_grad is not created
                 # during the forward pass when using MCore FSDP as it creates
@@ -694,6 +724,16 @@ def backward(
             # Note: Gradient w.r.t. GEMM input (i.e. norm output).
             # --------------------------------------------------
 
+            # FSDP2: Re-create workspace from all-gathered weight when
+            # workspace was not saved. (Issue #2681)
+            if weight is None:
+                if isinstance(origin_weight, QuantizedTensorStorage):
+                    origin_weight.update_usage(columnwise_usage=True)
+                    weight = origin_weight
+                elif ctx.weight_quantizer is not None:
+                    ctx.weight_quantizer.set_usage(rowwise=False, columnwise=True)
+                    weight = ctx.weight_quantizer(origin_weight)
+
             # Make sure required data is available
             if isinstance(grad_output, QuantizedTensorStorage):
                 grad_output.update_usage(rowwise_usage=True)
@@ -740,6 +780,12 @@ def backward(
             )
             nvtx_range_pop(f"{nvtx_label}.dgrad_gemm")
 
+            # FSDP2: Clear FP8 transpose cache after dgrad GEMM. (Issue #2717)
+            if getattr(ctx, "is_fsdp2", False) and hasattr(weight, "_transpose"):
+                if getattr(weight, "_transpose", None) is not None:
+                    weight._transpose = None
+                    weight._transpose_invalid = True
+
             # Prepare grad input tensor
             # Note: Perform tensor-parallel communication
             dgrad = None
diff --git a/transformer_engine/pytorch/module/layernorm_mlp.py b/transformer_engine/pytorch/module/layernorm_mlp.py
index cc3dcc4064..e40879d14d 100644
--- a/transformer_engine/pytorch/module/layernorm_mlp.py
+++ b/transformer_engine/pytorch/module/layernorm_mlp.py
@@ -465,6 +465,22 @@ def _forward(
         # Cast weights to expected dtype
         fc1_weight_final = fc1_weight
         fc2_weight_final = fc2_weight
+        try:
+            from ..distributed import _get_module_fsdp_state
+
+            _get_module_fsdp_state(module)
+            is_fsdp2 = True
+        except (RuntimeError, ImportError):
+            is_fsdp2 = False
+        # FSDP2: Skip columnwise/transpose creation during forward (not
+        # recompute) to avoid accumulating FP8 caches across layers.
+        # Backward's FSDP2 all-gather will recreate them. (Issue #2681)
+        # Only for quantizer types whose backward re-creation is validated.
+        _fsdp2_safe_quantizers = (Float8Quantizer, Float8CurrentScalingQuantizer, MXFP8Quantizer)
+        _is_safe_for_fsdp2 = isinstance(fc1_weight_quantizer, _fsdp2_safe_quantizers) or isinstance(
+            fc1_weight, Float8Tensor
+        )
+        fsdp2_skip_columnwise = is_fsdp2 and not is_recomputation and _is_safe_for_fsdp2
         if fp8 or debug:
             # If weights are not quantized, we call get_weight_workspace,
             # which handles weight caching etc.
@@ -475,17 +491,31 @@ def _forward(
             if isinstance(fc1_weight, QuantizedTensorStorage) and not debug:
                 fc1_weight_quantizer = fc1_weight._quantizer
             elif fc1_weight_quantizer is not None:
-                fc1_weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
+                fc1_weight_quantizer.set_usage(
+                    rowwise=True,
+                    columnwise=is_grad_enabled and not fsdp2_skip_columnwise,
+                )
 
             if isinstance(fc2_weight, QuantizedTensorStorage) and not debug:
                 fc2_weight_quantizer = fc2_weight._quantizer
             elif fc2_weight_quantizer is not None:
-                fc2_weight_quantizer.set_usage(rowwise=True, columnwise=is_grad_enabled)
+                fc2_weight_quantizer.set_usage(
+                    rowwise=True,
+                    columnwise=is_grad_enabled and not fsdp2_skip_columnwise,
+                )
 
+            # FSDP2: Don't cache workspaces — they would persist across
+            # layers, defeating FSDP2 memory savings. (Issue #2681)
+            fc1_cache = (
+                None if (is_first_microbatch is None or fsdp2_skip_columnwise) else "fc1_weight"
+            )
+            fc2_cache = (
+                None if (is_first_microbatch is None or fsdp2_skip_columnwise) else "fc2_weight"
+            )
             fc1_weight_final = module.get_weight_workspace(
                 tensor=fc1_weight,
                 quantizer=fc1_weight_quantizer,
-                cache_name=(None if is_first_microbatch is None else "fc1_weight"),
+                cache_name=fc1_cache,
                 update_workspace=update_workspace,
                 skip_update_flag=skip_fp8_weight_update,
                 fsdp_group=fsdp_group,
@@ -494,7 +524,7 @@ def _forward(
             fc2_weight_final = module.get_weight_workspace(
                 tensor=fc2_weight,
                 quantizer=fc2_weight_quantizer,
-                cache_name=(None if is_first_microbatch is None else "fc2_weight"),
+                cache_name=fc2_cache,
                 update_workspace=update_workspace,
                 skip_update_flag=skip_fp8_weight_update,
                 fsdp_group=fsdp_group,
@@ -743,17 +773,27 @@ def _forward(
                         fc2_weight,
                         fc2_bias,
                     )
+                # FSDP2: Don't save FP8 workspace copies for non-quantized
+                # weights. Backward will re-quantize from the FSDP2
+                # all-gathered weight parameter. (Issue #2681)
+                fc1_wt_save = fc1_weight_final
+                fc2_wt_save = fc2_weight_final
+                if fsdp2_skip_columnwise:
+                    if fc1_weight_final is not fc1_weight:
+                        fc1_wt_save = None
+                    if fc2_weight_final is not fc2_weight:
+                        fc2_wt_save = None
                 tensors_to_save, tensor_objects = prepare_for_saving(
                     inputmat,
                     ln_weight,
                     ln_out,
-                    fc1_weight_final,
+                    fc1_wt_save,
                     fc1_weight,
                     fc1_bias,
                     fc1_out,
                     fc1_out_without_bias,
                     act_out,
-                    fc2_weight_final,
+                    fc2_wt_save,
                     fc2_weight,
                     fc2_bias,
                     mu,
@@ -793,6 +833,7 @@ def _forward(
             ctx.fc2_weight_requires_grad = fc2_weight.requires_grad
             ctx.fc1_weight = fc1_weight
             ctx.fc2_weight = fc2_weight
+            ctx.is_fsdp2 = fsdp2_skip_columnwise
 
             ctx.device = device
             ctx.activation_dtype = activation_dtype
@@ -1107,6 +1148,17 @@ def backward(
                 and (not ctx.debug)
             )
 
+            # FSDP2: Re-create workspace from all-gathered weight when
+            # workspace was not saved to avoid forward memory
+            # accumulation. (Issue #2681)
+            if fc2_weight is None:
+                if isinstance(origin_fc2_weight, QuantizedTensorStorage):
+                    origin_fc2_weight.update_usage(columnwise_usage=True)
+                    fc2_weight = origin_fc2_weight
+                elif ctx.fc2_weight_quantizer is not None:
+                    ctx.fc2_weight_quantizer.set_usage(rowwise=False, columnwise=True)
+                    fc2_weight = ctx.fc2_weight_quantizer(origin_fc2_weight)
+
             # Make sure required data is available
             if isinstance(grad_output, QuantizedTensorStorage):
                 grad_output.update_usage(rowwise_usage=True)
@@ -1134,6 +1186,15 @@ def backward(
                 ub_type=tex.CommOverlapType.AG if ctx.ub_overlap_ag else None,
             )
 
+            # FSDP2: Clear FP8 transpose cache after dgrad GEMM to prevent
+            # it from persisting on the all-gathered buffer across step
+            # boundaries. Only for Float8TensorStorage which uses _transpose.
+            # (Issue #2717)
+            if getattr(ctx, "is_fsdp2", False) and hasattr(fc2_weight, "_transpose"):
+                if getattr(fc2_weight, "_transpose", None) is not None:
+                    fc2_weight._transpose = None
+                    fc2_weight._transpose_invalid = True
+
             # Prepare input grad tensor
             dact = None
             fc2_dgrad = None
@@ -1361,9 +1422,19 @@ def fc2_wgrad_gemm(
             # FC1 DGRAD
             # --------------------------------------------------
 
+            # FSDP2: Re-create workspace from all-gathered weight when
+            # workspace was not saved. (Issue #2681)
+            if fc1_weight is None:
+                if isinstance(origin_fc1_weight, QuantizedTensorStorage):
+                    origin_fc1_weight.update_usage(columnwise_usage=True)
+                    fc1_weight = origin_fc1_weight
+                elif ctx.fc1_weight_quantizer is not None:
+                    ctx.fc1_weight_quantizer.set_usage(rowwise=False, columnwise=True)
+                    fc1_weight = ctx.fc1_weight_quantizer(origin_fc1_weight)
+
             # Make sure required data is available
             if ctx.fc1_weight_quantizer is not None and isinstance(
-                ctx.fc1_weight_quantizer, QuantizedTensorStorage
+                ctx.fc1_weight, QuantizedTensorStorage
             ):
                 ctx.fc1_weight.update_usage(columnwise_usage=True)
 
@@ -1393,6 +1464,13 @@ def fc2_wgrad_gemm(
                 bulk_overlap=ctx.ub_bulk_dgrad,
             )
 
+            # FSDP2: Clear FP8 transpose cache after FC1 dgrad GEMM.
+            # (Issue #2717)
+            if getattr(ctx, "is_fsdp2", False) and hasattr(fc1_weight, "_transpose"):
+                if getattr(fc1_weight, "_transpose", None) is not None:
+                    fc1_weight._transpose = None
+                    fc1_weight._transpose_invalid = True
+
             # Prepare grad input tensor
             # Note: Perform tensor-parallel communication
             fc1_dgrad = None
diff --git a/transformer_engine/pytorch/module/linear.py b/transformer_engine/pytorch/module/linear.py
index ea921341a4..7bc4883280 100644
--- a/transformer_engine/pytorch/module/linear.py
+++ b/transformer_engine/pytorch/module/linear.py
@@ -63,7 +63,7 @@
     prepare_for_saving,
     restore_from_func_ctx,
 )
-from ..tensor.float8_tensor import Float8CurrentScalingQuantizer, Float8Quantizer
+from ..tensor.float8_tensor import Float8CurrentScalingQuantizer, Float8Quantizer, Float8Tensor
 from ..tensor.mxfp8_tensor import MXFP8Quantizer
 from ..tensor.utils import is_custom
 from ..export import is_in_onnx_export_mode, assert_warmed_up
@@ -248,6 +248,21 @@ def forward(
         # Prepare weight tensor
         # ------------------------------------------------------
         weightmat = weight
+        try:
+            from ..distributed import _get_module_fsdp_state
+
+            _get_module_fsdp_state(module)
+            is_fsdp2 = True
+        except (RuntimeError, ImportError):
+            is_fsdp2 = False
+        # FSDP2 workspace optimization only applies to quantizer types
+        # whose backward re-creation is validated.
+        from ..tensor.mxfp8_tensor import MXFP8Quantizer
+
+        _fsdp2_safe = isinstance(
+            weight_quantizer, (Float8Quantizer, Float8CurrentScalingQuantizer, MXFP8Quantizer)
+        ) or isinstance(weight, Float8Tensor)
+        is_fsdp2 = is_fsdp2 and _fsdp2_safe
         if fp8 or debug:
             # Configure quantizer
             # No need to set the quantizer states if weight is already quantized
@@ -259,16 +274,24 @@ def forward(
                         is_fp8_activation_recompute_enabled()
                         and not in_fp8_activation_recompute_phase()
                     )
+                # FSDP2: Skip columnwise/transpose creation during forward
+                # to avoid accumulating caches across layers. Backward's
+                # FSDP2 all-gather will recreate them. (Issue #2681)
+                if is_fsdp2:
+                    columnwise_usage = False
                 weight_quantizer.set_usage(rowwise=True, columnwise=columnwise_usage)
             elif isinstance(weight, QuantizedTensor):
                 # If weight is already quantized, no need to set quantizer states
                 weight_quantizer = weight._quantizer
             # Get quantized weight
+            # FSDP2: Don't cache workspaces — they would persist across
+            # layers, defeating FSDP2 memory savings. (Issue #2681)
             update_workspace = is_first_microbatch is None or is_first_microbatch
+            wt_cache = None if (is_first_microbatch is None or is_fsdp2) else "weight"
             weightmat = module.get_weight_workspace(
                 tensor=weight,
                 quantizer=weight_quantizer,
-                cache_name=(None if is_first_microbatch is None else "weight"),
+                cache_name=wt_cache,
                 update_workspace=update_workspace,
                 skip_update_flag=skip_fp8_weight_update,
                 fsdp_group=fsdp_group,
@@ -430,14 +453,21 @@ def forward(
                 mark_not_offload(weight, weightmat, bias)
 
             # TODO(ksivamani): Check memory usage
+            # FSDP2: Don't save FP8 workspace for non-quantized weights.
+            # Backward will re-quantize from FSDP2 all-gathered weight.
+            # (Issue #2681)
+            wt_save = weightmat
+            if is_fsdp2 and weightmat is not weight:
+                wt_save = None
             tensors_to_save, tensor_objects = prepare_for_saving(
                 saved_inputmat,
-                weightmat,
+                wt_save,
                 weight,
                 bias,
             )
             ctx.save_for_backward(*tensors_to_save)
             ctx.tensor_objects = tensor_objects
+            ctx.is_fsdp2 = is_fsdp2
 
             ctx.activation_dtype = activation_dtype
             ctx.fp8 = fp8
@@ -681,6 +711,16 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None],
             dgrad_work = None
             if ctx.requires_dgrad:
 
+                # FSDP2: Re-create workspace from all-gathered weight
+                # when workspace was not saved. (Issue #2681)
+                if weight_fp8 is None:
+                    if isinstance(weight, QuantizedTensorStorage):
+                        weight.update_usage(columnwise_usage=True)
+                        weight_fp8 = weight
+                    elif ctx.weight_quantizer is not None:
+                        ctx.weight_quantizer.set_usage(rowwise=False, columnwise=True)
+                        weight_fp8 = ctx.weight_quantizer(weight)
+
                 # Make sure required data is available
                 if isinstance(grad_output, QuantizedTensorStorage):
                     grad_output.update_usage(rowwise_usage=True)
@@ -730,6 +770,13 @@ def backward(ctx, grad_output: torch.Tensor) -> Tuple[Union[torch.Tensor, None],
                 )
                 nvtx_range_pop(f"{nvtx_label}.dgrad_gemm")
 
+                # FSDP2: Clear FP8 transpose cache after dgrad GEMM.
+                # (Issue #2717)
+                if getattr(ctx, "is_fsdp2", False) and hasattr(weight_fp8, "_transpose"):
+                    if getattr(weight_fp8, "_transpose", None) is not None:
+                        weight_fp8._transpose = None
+                        weight_fp8._transpose_invalid = True
+
                 # Prepare grad input tensor
                 # Note: Perform tensor-parallel communication
                 if ctx.ub_overlap_rs_dgrad: