From 9ed4506553b363c554c1ff7051d20ccb7ee098ba Mon Sep 17 00:00:00 2001 From: nileshnegi Date: Tue, 19 May 2026 18:04:04 -0500 Subject: [PATCH 1/3] Fix memory allocation bug to use the correct hipDevice Not using hipSetDevice before allocating memory can use unintended deviceIdx when executing fabric-handle based transfers --- src/header/TransferBench.hpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index 5c16a78..e012080 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -1495,6 +1495,14 @@ namespace { deviceIdx = GetClosestCpuNumaToGpu(memDevice.memIndex); } + if (IsCpuMemType(memType)) { + // Set NUMA policy prior to call to hipHostMalloc + numa_set_preferred(deviceIdx); + } else if (IsGpuMemType(memType)) { + // Switch to the appropriate GPU + ERR_CHECK(hipSetDevice(deviceIdx)); + } + // If memHandle is provided, allocate sharable memory if (memHandle != NULL) { #ifdef POD_COMM_ENABLED @@ -1532,6 +1540,7 @@ namespace { memset(*memPtr, 0, roundedUpBytes); // Check that the allocated pages are actually on the correct NUMA node ERR_CHECK(CheckPages((char*)*memPtr, roundedUpBytes, deviceIdx)); + numa_set_preferred(-1); } else if (IsGpuMemType(memType)) { ERR_CHECK(hipSetDevice(memDevice.memIndex)); ERR_CHECK(hipMemset(*memPtr, 0, numBytes)); @@ -1547,9 +1556,6 @@ namespace { if (IsCpuMemType(memType)) { - // Set NUMA policy prior to call to hipHostMalloc - numa_set_preferred(deviceIdx); - // Allocate host-pinned memory (should respect NUMA mem policy) int flags = 0; #if !defined (__NVCC__) @@ -1590,8 +1596,6 @@ namespace { // Reset to default numa mem policy numa_set_preferred(-1); } else if (IsGpuMemType(memType)) { - // Switch to the appropriate GPU - ERR_CHECK(hipSetDevice(memDevice.memIndex)); if (memType == MEM_GPU) { // Allocate GPU memory on appropriate device From 625ed6cd489725218031ac3b366aa7e8a91ec298 Mon Sep 17 00:00:00 2001 From: 
nileshnegi Date: Tue, 19 May 2026 22:34:09 -0500 Subject: [PATCH 2/3] Fix NUMA policy leak, redundant hipSetDevice, and device-guard gaps - Reset numa_set_preferred(-1) before ERR_FATAL early return in the non-POD_COMM_ENABLED path; without this the NUMA policy stays dirty for subsequent CPU allocations in the same process - Use memDevice.memIndex directly in the top-level hipSetDevice call instead of deviceIdx, which is NUMA-remapped for CPU types only; documents that the MEM_CPU_CLOSEST remapping does not apply to GPU - Remove now-redundant hipSetDevice inside the POD_COMM GPU memHandle branch; device was already set at the top of AllocateMemory - Guard CollectTopology GPU agent probe loop with hipSetDevice(i) so each AllocateMemory call targets the correct device Co-authored-by: Claude --- src/header/TransferBench.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index e012080..05cb72d 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -1499,8 +1499,9 @@ namespace { // Set NUMA policy prior to call to hipHostMalloc numa_set_preferred(deviceIdx); } else if (IsGpuMemType(memType)) { - // Switch to the appropriate GPU - ERR_CHECK(hipSetDevice(deviceIdx)); + // Switch to the appropriate GPU — use memDevice.memIndex directly since + // the MEM_CPU_CLOSEST NUMA remapping above only applies to CPU types + ERR_CHECK(hipSetDevice(memDevice.memIndex)); } // If memHandle is provided, allocate sharable memory @@ -1542,12 +1543,12 @@ namespace { ERR_CHECK(CheckPages((char*)*memPtr, roundedUpBytes, deviceIdx)); numa_set_preferred(-1); } else if (IsGpuMemType(memType)) { - ERR_CHECK(hipSetDevice(memDevice.memIndex)); ERR_CHECK(hipMemset(*memPtr, 0, numBytes)); ERR_CHECK(hipDeviceSynchronize()); } return ERR_NONE; #else + if (IsCpuMemType(memType)) numa_set_preferred(-1); return {ERR_FATAL, "Unable to allocate sharable memory if not compiled with pod 
communication support"}; #endif } else { @@ -7787,6 +7788,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) gpuAgents.clear(); char *tempBuffer; for (int i = 0; i < numGpus; i++) { + if (hipSetDevice(i) != hipSuccess) continue; AllocateMemory({MEM_GPU, i}, 1024, (void**)&tempBuffer); hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL); gpuAgents.push_back(info.agentOwner); From 7b8f335af49ae6875faaf5021d827fca8dda3360 Mon Sep 17 00:00:00 2001 From: nileshnegi Date: Wed, 20 May 2026 01:24:55 -0500 Subject: [PATCH 3/3] Update bug fix - Switch the top-level hipSetDevice call in AllocateMemory back to deviceIdx and leave a reminder that it must be revisited if the NUMA remapping above it changes - Remove the hipSetDevice(i) guard from the CollectTopology GPU agent probe loop --- src/header/TransferBench.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index 05cb72d..4a841e7 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -1499,9 +1499,9 @@ namespace { // Set NUMA policy prior to call to hipHostMalloc numa_set_preferred(deviceIdx); } else if (IsGpuMemType(memType)) { - // Switch to the appropriate GPU — use memDevice.memIndex directly since - // the MEM_CPU_CLOSEST NUMA remapping above only applies to CPU types - ERR_CHECK(hipSetDevice(memDevice.memIndex)); + // Switch to the appropriate GPU + // IMP: if the remapping above changes, remember to modify this! + ERR_CHECK(hipSetDevice(deviceIdx)); } // If memHandle is provided, allocate sharable memory @@ -7788,7 +7788,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid) gpuAgents.clear(); char *tempBuffer; for (int i = 0; i < numGpus; i++) { - if (hipSetDevice(i) != hipSuccess) continue; AllocateMemory({MEM_GPU, i}, 1024, (void**)&tempBuffer); hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL); gpuAgents.push_back(info.agentOwner);