Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions include/mori/application/transport/sdma/anvil.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,16 +102,25 @@ class AnvilLib {

// SDMA engine id for the src->dst device pair. connect() uses this as the
// fallback (the static OAM table) when KFD reports no recommended engine mask
// for the link.
int getSdmaEngineId(int srcDeviceId, int dstDeviceId);

// KFD topology node id for a HIP device id.
uint32_t getNodeId(int deviceId);

// Bitmask of SDMA engine ids KFD recommends for the src->dst xGMI link to
// reach maximum bandwidth (sysfs recommended_sdma_engine_id_mask). Returns 0
// if the link or property is unavailable, in which case callers fall back to
// the static OAM map.
uint32_t getRecommendedEngineMask(int srcDeviceId, int dstDeviceId);

struct PairHash {
  // Hash functor for (src, dst) device-id pairs used as unordered_map keys.
  // Combines the element hashes as hash(first) XOR (hash(second) << 16).
  std::size_t operator()(const std::pair<int, int>& p) const {
    const std::hash<int> hasher{};
    const std::size_t firstHash = hasher(p.first);
    const std::size_t secondHash = hasher(p.second);
    return firstHash ^ (secondHash << 16);
  }
};
using SdmaQueueVector = std::vector<std::unique_ptr<SdmaQueue>>;

std::once_flag init_flag;
std::mutex channels_mutex_;
std::unordered_map<std::pair<int, int>, std::vector<std::unique_ptr<SdmaQueue>>, PairHash>
sdma_channels_;
std::unordered_map<std::pair<int, int>, SdmaQueueVector, PairHash> sdma_channels_;
};

extern AnvilLib& anvil;
Expand Down
68 changes: 55 additions & 13 deletions src/application/transport/sdma/anvil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@

#include "mori/application/transport/sdma/anvil.hpp"

#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <stdexcept>
Comment thread
Copilot marked this conversation as resolved.
namespace anvil {

auto checkHsaError = [](hsa_status_t s, const char* msg, const char* file, int line) {
Expand Down Expand Up @@ -160,13 +162,6 @@ SdmaQueue::SdmaQueue(int localDeviceId, int remoteDeviceId, hsa_agent_t& localAg
// return status;
}

// std::cout << "Allocating queue for engine " << engineId << " on device " << localDeviceId << "
// to device "
// << remoteDeviceId << std::endl;
// std::cout << "original device id: " << originalDeviceId << " local " << localDeviceId << "
// remote " << remoteDeviceId
// << " local node " << localNodeId << std::endl;

// Allocate SDMA queue buffer on device side, requires ExecuteAccess
HsaMemFlags memFlags = {};
memFlags.ui32.NonPaged = 1;
Expand All @@ -187,10 +182,10 @@ SdmaQueue::SdmaQueue(int localDeviceId, int remoteDeviceId, hsa_agent_t& localAg
// TODO needed here?
memset(&queue_, 0, sizeof(HsaQueueResource));

CHECK_HSAKMT_SUCCESS(hsaKmtCreateQueueExt(localNodeId, HSA_QUEUE_SDMA_BY_ENG_ID,
DEFAULT_QUEUE_PERCENTAGE, DEFAULT_PRIORITY, engineId,
queueBuffer_, SDMA_QUEUE_SIZE, nullptr, &queue_),
"Failed");
CHECK_HSAKMT_SUCCESS(
hsaKmtCreateQueueExt(localNodeId, HSA_QUEUE_SDMA_BY_ENG_ID, 100, HSA_QUEUE_PRIORITY_MAXIMUM,
engineId, queueBuffer_, SDMA_QUEUE_SIZE, nullptr, &queue_),
"Failed");
Comment on lines -190 to +188

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the motivation for bumping the SDMA queue priority from NORMAL (0) to MAXIMUM (3) here? I just want to make sure we understand the impact on other queues (compute and other SDMA queues) sharing the GPU. Did you see this help in practice?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this only affects SDMA queues. I took it from the ROCr runtime, which also uses maximum priority.

QueuePercentage = 100
This is the max share of the engine's scheduling quantum the queue may consume; 100 means "no artificial throttle." It's passed straight through to KFD. In practice it does not give you fine-grained SDMA bandwidth control — partial values just cap the queue's scheduling slice, and every real user (ROCr included) passes 100. So 100 is the right value; anything lower would only throttle yourself.

Priority = HSA_QUEUE_PRIORITY_MAXIMUM
The thunk remaps the enum (-3..3) into KFD's 0–15 range:

static uint32_t priority_map[] = {0, 3, 5, 7, 9, 11, 15};
So MAXIMUM (3) → KFD priority 15, vs NORMAL (0) → 7. The kernel hardware scheduler uses this only for arbitration ordering among queues contending for the same SDMA engine.

Two things worth keeping in mind:

Priority is relative. It only helps you win against lower-priority queues. Since ROCr's internal copy queues are also MAX, you won't out-prioritize them — you'll be on par (which is the intended behavior).


// Populate Device Handle
// TODO uncached
Expand Down Expand Up @@ -220,7 +215,6 @@ SdmaQueue::SdmaQueue(int localDeviceId, int remoteDeviceId, hsa_agent_t& localAg
}

SdmaQueue::~SdmaQueue() {
// TODO catch exception?
CHECK_HSAKMT_SUCCESS(hsaKmtDestroyQueue(queue_.QueueId), "Failed to destroy queue.");
CHECK_HIP_ERROR(hipFree(deviceHandle_));
CHECK_HIP_ERROR(hipFree(cachedWptr_));
Expand Down Expand Up @@ -265,16 +259,64 @@ void AnvilLib::init() {
}

// Creates `numChannels` SDMA queues for the (srcDeviceId, dstDeviceId) pair and
// appends them to sdma_channels_. The whole operation runs under
// channels_mutex_. Channels are assigned to engines round-robin over the
// engine list selected below. Always returns true; a failure inside the
// SdmaQueue constructor is not handled here.
bool AnvilLib::connect(int srcDeviceId, int dstDeviceId, int numChannels) {
  std::lock_guard<std::mutex> lock(channels_mutex_);
  // Spread the channels across the engines recommended for this peer link. On
  // MI350 the mask typically reports 2 engines per peer; on platforms with a
  // single recommended engine all channels share it.
  std::vector<uint32_t> engines;
  if (srcDeviceId == dstDeviceId) {
    // A loopback copy never traverses xGMI and has no self io_link, so KFD
    // reports no recommended engine. Use a general (non-xGMI) SDMA engine.
    engines.push_back(0);
  } else {
    const uint32_t mask = getRecommendedEngineMask(srcDeviceId, dstDeviceId);
    for (uint32_t b = 0; b < 32; ++b) {
      if (mask & (1u << b)) engines.push_back(b);
    }
    // Fall back to the static OAM table if KFD did not report a mask.
    if (engines.empty()) {
      engines.push_back(static_cast<uint32_t>(getSdmaEngineId(srcDeviceId, dstDeviceId)));
    }
  }
  // Every branch above pushes at least one engine, so the modulo below is safe.
  const std::size_t numEngines = engines.size();

  // Nothing to create: return before touching the map so that no empty entry
  // is inserted for this pair.
  if (numChannels <= 0) {
    return true;
  }

  // Look the per-pair vector up once instead of on every iteration.
  auto& channels = sdma_channels_[std::make_pair(srcDeviceId, dstDeviceId)];
  channels.reserve(channels.size() + static_cast<std::size_t>(numChannels));
  for (int c = 0; c < numChannels; ++c) {
    const uint32_t engineId = engines[static_cast<std::size_t>(c) % numEngines];
    channels.emplace_back(
        std::make_unique<SdmaQueue>(srcDeviceId, dstDeviceId, gpuAgents_[srcDeviceId], engineId));
  }
  return true;
}

// Returns the HSA_AGENT_INFO_NODE attribute of the HSA agent stored at
// gpuAgents_[deviceId] (the KFD topology node id, per the header comment).
// deviceId is used as an index without a bounds check in this function.
uint32_t AnvilLib::getNodeId(int deviceId) {
// Zero-initialized so the value is defined even if the query writes nothing.
uint32_t nodeId = 0;
// NOTE(review): the failure behavior of CHECK_HSA_ERROR is defined outside this
// view (presumably reports/throws via checkHsaError) — confirm before relying on it.
CHECK_HSA_ERROR(hsa_agent_get_info(gpuAgents_[deviceId], HSA_AGENT_INFO_NODE, &nodeId));
return nodeId;
}

uint32_t AnvilLib::getRecommendedEngineMask(int srcDeviceId, int dstDeviceId) {
uint32_t srcNode = getNodeId(srcDeviceId), dstNode = getNodeId(dstDeviceId);

HsaNodeProperties props{};
if (hsaKmtGetNodeProperties(srcNode, &props) != HSAKMT_STATUS_SUCCESS || props.NumIOLinks == 0) {
return 0;
}

std::vector<HsaIoLinkProperties> links(props.NumIOLinks);
if (hsaKmtGetNodeIoLinkProperties(srcNode, props.NumIOLinks, links.data()) !=
HSAKMT_STATUS_SUCCESS) {
return 0;
}
for (const auto& link : links) {
if (link.NodeTo == dstNode) {
return link.RecSdmaEngIdMask;
}
}
return 0;
}

SdmaQueue* AnvilLib::getSdmaQueue(int srcDeviceId, int dstDeviceId, int channel_idx) {
std::lock_guard<std::mutex> lock(channels_mutex_);
auto key = std::make_pair(srcDeviceId, dstDeviceId);
Expand Down
Loading