From 9ed4506553b363c554c1ff7051d20ccb7ee098ba Mon Sep 17 00:00:00 2001
From: nileshnegi <Nilesh.Negi@amd.com>
Date: Tue, 19 May 2026 18:04:04 -0500
Subject: [PATCH 1/3] Fix memory allocation bug to use the correct hipDevice

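Without a hipSetDevice call before the allocation, memory can end up on
an unintended device when executing fabric-handle based transfers. Set
the NUMA policy / active GPU before the memHandle branch so that both
the sharable and the regular allocation paths use the intended device.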
---
 src/header/TransferBench.hpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp
index 5c16a78..e012080 100644
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp
@@ -1495,6 +1495,14 @@ namespace {
       deviceIdx = GetClosestCpuNumaToGpu(memDevice.memIndex);
     }
 
+    if (IsCpuMemType(memType)) {
+      // Set NUMA policy prior to call to hipHostMalloc
+      numa_set_preferred(deviceIdx);
+    } else if (IsGpuMemType(memType)) {
+      // Switch to the appropriate GPU
+      ERR_CHECK(hipSetDevice(deviceIdx));
+    }
+
     // If memHandle is provided, allocate sharable memory
     if (memHandle != NULL) {
 #ifdef POD_COMM_ENABLED
@@ -1532,6 +1540,7 @@ namespace {
         memset(*memPtr, 0, roundedUpBytes);
         // Check that the allocated pages are actually on the correct NUMA node
         ERR_CHECK(CheckPages((char*)*memPtr, roundedUpBytes, deviceIdx));
+        numa_set_preferred(-1);
       } else if (IsGpuMemType(memType)) {
         ERR_CHECK(hipSetDevice(memDevice.memIndex));
         ERR_CHECK(hipMemset(*memPtr, 0, numBytes));
@@ -1547,9 +1556,6 @@ namespace {
 
     if (IsCpuMemType(memType)) {
 
-      // Set NUMA policy prior to call to hipHostMalloc
-      numa_set_preferred(deviceIdx);
-
       // Allocate host-pinned memory (should respect NUMA mem policy)
       int flags = 0;
 #if !defined (__NVCC__)
@@ -1590,8 +1596,6 @@ namespace {
       // Reset to default numa mem policy
       numa_set_preferred(-1);
     } else if (IsGpuMemType(memType)) {
-      // Switch to the appropriate GPU
-      ERR_CHECK(hipSetDevice(memDevice.memIndex));
 
       if (memType == MEM_GPU) {
         // Allocate GPU memory on appropriate device

From 625ed6cd489725218031ac3b366aa7e8a91ec298 Mon Sep 17 00:00:00 2001
From: nileshnegi <Nilesh.Negi@amd.com>
Date: Tue, 19 May 2026 22:34:09 -0500
Subject: [PATCH 2/3] Fix NUMA policy leak, redundant hipSetDevice, and
 device-guard gaps

- Reset numa_set_preferred(-1) before ERR_FATAL early return in the
  non-POD_COMM_ENABLED path; without this the NUMA policy stays dirty
  for subsequent CPU allocations in the same process
- Use memDevice.memIndex directly in the top-level hipSetDevice call
  instead of deviceIdx, which is NUMA-remapped for CPU types only;
  documents that the MEM_CPU_CLOSEST remapping does not apply to GPU
- Remove now-redundant hipSetDevice inside the POD_COMM GPU memHandle
  branch; device was already set at the top of AllocateMemory
- Guard CollectTopology GPU agent probe loop with hipSetDevice(i) so
  each AllocateMemory call targets the correct device

Co-authored-by: Claude <claude@anthropic.com>
---
 src/header/TransferBench.hpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp
index e012080..05cb72d 100644
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp
@@ -1499,8 +1499,9 @@ namespace {
       // Set NUMA policy prior to call to hipHostMalloc
       numa_set_preferred(deviceIdx);
     } else if (IsGpuMemType(memType)) {
-      // Switch to the appropriate GPU
-      ERR_CHECK(hipSetDevice(deviceIdx));
+      // Switch to the appropriate GPU — use memDevice.memIndex directly since
+      // the MEM_CPU_CLOSEST NUMA remapping above only applies to CPU types
+      ERR_CHECK(hipSetDevice(memDevice.memIndex));
     }
 
     // If memHandle is provided, allocate sharable memory
@@ -1542,12 +1543,12 @@ namespace {
         ERR_CHECK(CheckPages((char*)*memPtr, roundedUpBytes, deviceIdx));
         numa_set_preferred(-1);
       } else if (IsGpuMemType(memType)) {
-        ERR_CHECK(hipSetDevice(memDevice.memIndex));
         ERR_CHECK(hipMemset(*memPtr, 0, numBytes));
         ERR_CHECK(hipDeviceSynchronize());
       }
       return ERR_NONE;
 #else
+      if (IsCpuMemType(memType)) numa_set_preferred(-1);
       return {ERR_FATAL, "Unable to allocate sharable memory if not compiled with pod communication support"};
 #endif
     } else {
@@ -7787,6 +7788,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       gpuAgents.clear();
       char *tempBuffer;
       for (int i = 0; i < numGpus; i++) {
+        if (hipSetDevice(i) != hipSuccess) continue;
         AllocateMemory({MEM_GPU, i}, 1024, (void**)&tempBuffer);
         hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL);
         gpuAgents.push_back(info.agentOwner);

From 7b8f335af49ae6875faaf5021d827fca8dda3360 Mon Sep 17 00:00:00 2001
From: nileshnegi <Nilesh.Negi@amd.com>
Date: Wed, 20 May 2026 01:24:55 -0500
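Subject: [PATCH 3/3] Update bug fix: hipSetDevice(deviceIdx), drop loop
 guard

- Call hipSetDevice(deviceIdx) at the top of AllocateMemory again instead
  of hipSetDevice(memDevice.memIndex), and leave a comment that this call
  must be revisited if the deviceIdx remapping above it changes
- Remove the hipSetDevice(i) guard from the GPU agent probe loop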
---
 src/header/TransferBench.hpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp
index 05cb72d..4a841e7 100644
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp
@@ -1499,9 +1499,9 @@ namespace {
       // Set NUMA policy prior to call to hipHostMalloc
       numa_set_preferred(deviceIdx);
     } else if (IsGpuMemType(memType)) {
-      // Switch to the appropriate GPU — use memDevice.memIndex directly since
-      // the MEM_CPU_CLOSEST NUMA remapping above only applies to CPU types
-      ERR_CHECK(hipSetDevice(memDevice.memIndex));
+      // Switch to the appropriate GPU
+      // IMPORTANT: deviceIdx is passed as the GPU index here; revisit if the remapping above changes
+      ERR_CHECK(hipSetDevice(deviceIdx));
     }
 
     // If memHandle is provided, allocate sharable memory
@@ -7788,7 +7788,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
       gpuAgents.clear();
       char *tempBuffer;
       for (int i = 0; i < numGpus; i++) {
-        if (hipSetDevice(i) != hipSuccess) continue;
         AllocateMemory({MEM_GPU, i}, 1024, (void**)&tempBuffer);
         hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL);
         gpuAgents.push_back(info.agentOwner);