ROCm · pemeliya · Jun 12, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/include/mori/application/transport/sdma/anvil.hpp b/include/mori/application/transport/sdma/anvil.hpp
@@ -102,16 +102,25 @@ class AnvilLib {
 
   int getSdmaEngineId(int srcDeviceId, int dstDeviceId);
 
+  // KFD topology node id for a HIP device id.
+  uint32_t getNodeId(int deviceId);
+
+  // Bitmask (bit i set => SDMA engine id i) of the engines KFD recommends for
+  // the src->dst xGMI link to reach maximum bandwidth (sysfs property
+  // recommended_sdma_engine_id_mask). Returns 0 if the link or the property is
+  // unavailable, in which case callers fall back to the static OAM map.
+  uint32_t getRecommendedEngineMask(int srcDeviceId, int dstDeviceId);
+
   struct PairHash {
     std::size_t operator()(const std::pair<int, int>& p) const {
       return std::hash<int>()(p.first) ^ (std::hash<int>()(p.second) << 16);
     }
   };
+  using SdmaQueueVector = std::vector<std::unique_ptr<SdmaQueue>>;
 
   std::once_flag init_flag;
   std::mutex channels_mutex_;
-  std::unordered_map<std::pair<int, int>, std::vector<std::unique_ptr<SdmaQueue>>, PairHash>
-      sdma_channels_;
+  std::unordered_map<std::pair<int, int>, SdmaQueueVector, PairHash> sdma_channels_;
 };
 
 extern AnvilLib& anvil;

diff --git a/src/application/transport/sdma/anvil.cpp b/src/application/transport/sdma/anvil.cpp
@@ -29,9 +29,11 @@
 
 #include "mori/application/transport/sdma/anvil.hpp"
 
+#include <cstdlib>
 #include <cstring>
 #include <fstream>
 #include <iostream>
+#include <stdexcept>
 namespace anvil {
 
 auto checkHsaError = [](hsa_status_t s, const char* msg, const char* file, int line) {
@@ -160,13 +162,6 @@ SdmaQueue::SdmaQueue(int localDeviceId, int remoteDeviceId, hsa_agent_t& localAg
     // return status;
   }
 
-  // std::cout << "Allocating queue for engine " << engineId << " on device " << localDeviceId << "
-  // to device "
-  //           << remoteDeviceId << std::endl;
-  // std::cout << "original device id: " << originalDeviceId << " local " << localDeviceId << "
-  // remote " << remoteDeviceId
-  //           << " local node " << localNodeId << std::endl;
-
   // Allocate SDMA queue buffer on device side, requires ExecuteAccess
   HsaMemFlags memFlags = {};
   memFlags.ui32.NonPaged = 1;
@@ -187,10 +182,10 @@ SdmaQueue::SdmaQueue(int localDeviceId, int remoteDeviceId, hsa_agent_t& localAg
   // TODO needed here?
   memset(&queue_, 0, sizeof(HsaQueueResource));
 
-  CHECK_HSAKMT_SUCCESS(hsaKmtCreateQueueExt(localNodeId, HSA_QUEUE_SDMA_BY_ENG_ID,
-                                            DEFAULT_QUEUE_PERCENTAGE, DEFAULT_PRIORITY, engineId,
-                                            queueBuffer_, SDMA_QUEUE_SIZE, nullptr, &queue_),
-                       "Failed");
+  CHECK_HSAKMT_SUCCESS(
+      hsaKmtCreateQueueExt(localNodeId, HSA_QUEUE_SDMA_BY_ENG_ID, 100, HSA_QUEUE_PRIORITY_MAXIMUM,
+                           engineId, queueBuffer_, SDMA_QUEUE_SIZE, nullptr, &queue_),
+      "Failed");
 
   // Populate Device Handle
   // TODO uncached
@@ -220,7 +215,6 @@ SdmaQueue::SdmaQueue(int localDeviceId, int remoteDeviceId, hsa_agent_t& localAg
 }
 
 SdmaQueue::~SdmaQueue() {
-  // TODO catch exception?
   CHECK_HSAKMT_SUCCESS(hsaKmtDestroyQueue(queue_.QueueId), "Failed to destroy queue.");
   CHECK_HIP_ERROR(hipFree(deviceHandle_));
   CHECK_HIP_ERROR(hipFree(cachedWptr_));
@@ -265,16 +259,64 @@ void AnvilLib::init() {
 }
 
 bool AnvilLib::connect(int srcDeviceId, int dstDeviceId, int numChannels) {
-  uint32_t engineId = getSdmaEngineId(srcDeviceId, dstDeviceId);
   std::lock_guard<std::mutex> lock(channels_mutex_);
+  // Channels are dealt round-robin over the SDMA engines recommended for this
+  // peer link. On MI350 the mask typically reports 2 engines per peer; on
+  // platforms with a single recommended engine all channels share it.
+  std::vector<uint32_t> engineIds;
+  if (srcDeviceId != dstDeviceId) {
+    // Expand the KFD bitmask into engine ids: bit i set => engine id i.
+    uint32_t pending = getRecommendedEngineMask(srcDeviceId, dstDeviceId);
+    for (uint32_t id = 0; id < 32; ++id, pending >>= 1) {
+      if (pending & 1u) engineIds.push_back(id);
+    }
+    // Fall back to the static OAM table if KFD did not report a mask.
+    if (engineIds.empty()) {
+      engineIds.push_back(static_cast<uint32_t>(getSdmaEngineId(srcDeviceId, dstDeviceId)));
+    }
+  } else {
+    // A loopback copy never traverses xGMI and has no self io_link, so KFD
+    // reports no recommended engine. Use a general (non-xGMI) SDMA engine.
+    engineIds.push_back(0);
+  }
+  const int engineCount = static_cast<int>(engineIds.size());
+
   auto key = std::make_pair(srcDeviceId, dstDeviceId);
   for (int c = 0; c < numChannels; ++c) {
+    uint32_t engineId = engineIds[c % engineCount];
     sdma_channels_[key].emplace_back(
         std::make_unique<SdmaQueue>(srcDeviceId, dstDeviceId, gpuAgents_[srcDeviceId], engineId));
   }
   return true;
 }
 
+uint32_t AnvilLib::getNodeId(int deviceId) {  // deviceId indexes gpuAgents_ without a range check
+  uint32_t nodeId = 0;  // written by HSA: the agent's HSA_AGENT_INFO_NODE attribute
+  CHECK_HSA_ERROR(hsa_agent_get_info(gpuAgents_[deviceId], HSA_AGENT_INFO_NODE, &nodeId));
+  return nodeId;
+}
+
+uint32_t AnvilLib::getRecommendedEngineMask(int srcDeviceId, int dstDeviceId) {
+  // A failed or empty topology query yields 0 so the caller can fall back.
+  const uint32_t fromNode = getNodeId(srcDeviceId);
+  const uint32_t toNode = getNodeId(dstDeviceId);
+
+  HsaNodeProperties nodeProps{};
+  const bool haveProps = hsaKmtGetNodeProperties(fromNode, &nodeProps) == HSAKMT_STATUS_SUCCESS;
+  if (!haveProps || nodeProps.NumIOLinks == 0) return 0;
+
+  std::vector<HsaIoLinkProperties> ioLinks(nodeProps.NumIOLinks);
+  const auto queried =
+      hsaKmtGetNodeIoLinkProperties(fromNode, nodeProps.NumIOLinks, ioLinks.data());
+  if (queried != HSAKMT_STATUS_SUCCESS) return 0;
+
+  // The first io_link whose far end is the destination node carries the mask.
+  for (std::size_t i = 0; i < ioLinks.size(); ++i) {
+    if (ioLinks[i].NodeTo == toNode) return ioLinks[i].RecSdmaEngIdMask;
+  }
+  return 0;
+}
+
 SdmaQueue* AnvilLib::getSdmaQueue(int srcDeviceId, int dstDeviceId, int channel_idx) {
   std::lock_guard<std::mutex> lock(channels_mutex_);
   auto key = std::make_pair(srcDeviceId, dstDeviceId);