-
Notifications
You must be signed in to change notification settings - Fork 57
SDMA engine selection #429
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 5 commits
a23aea2
698afbf
9beeb0a
62a1326
e77d684
d870d8f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,9 +29,11 @@ | |
|
|
||
| #include "mori/application/transport/sdma/anvil.hpp" | ||
|
|
||
| #include <cstdlib> | ||
| #include <cstring> | ||
| #include <fstream> | ||
| #include <iostream> | ||
| #include <stdexcept> | ||
| namespace anvil { | ||
|
|
||
| auto checkHsaError = [](hsa_status_t s, const char* msg, const char* file, int line) { | ||
|
|
@@ -160,13 +162,6 @@ SdmaQueue::SdmaQueue(int localDeviceId, int remoteDeviceId, hsa_agent_t& localAg | |
| // return status; | ||
| } | ||
|
|
||
| // std::cout << "Allocating queue for engine " << engineId << " on device " << localDeviceId << " | ||
| // to device " | ||
| // << remoteDeviceId << std::endl; | ||
| // std::cout << "original device id: " << originalDeviceId << " local " << localDeviceId << " | ||
| // remote " << remoteDeviceId | ||
| // << " local node " << localNodeId << std::endl; | ||
|
|
||
| // Allocate SDMA queue buffer on device side, requires ExecuteAccess | ||
| HsaMemFlags memFlags = {}; | ||
| memFlags.ui32.NonPaged = 1; | ||
|
|
@@ -187,10 +182,10 @@ SdmaQueue::SdmaQueue(int localDeviceId, int remoteDeviceId, hsa_agent_t& localAg | |
| // TODO needed here? | ||
| memset(&queue_, 0, sizeof(HsaQueueResource)); | ||
|
|
||
| CHECK_HSAKMT_SUCCESS(hsaKmtCreateQueueExt(localNodeId, HSA_QUEUE_SDMA_BY_ENG_ID, | ||
| DEFAULT_QUEUE_PERCENTAGE, DEFAULT_PRIORITY, engineId, | ||
| queueBuffer_, SDMA_QUEUE_SIZE, nullptr, &queue_), | ||
| "Failed"); | ||
| CHECK_HSAKMT_SUCCESS( | ||
| hsaKmtCreateQueueExt(localNodeId, HSA_QUEUE_SDMA_BY_ENG_ID, 100, HSA_QUEUE_PRIORITY_MAXIMUM, | ||
| engineId, queueBuffer_, SDMA_QUEUE_SIZE, nullptr, &queue_), | ||
| "Failed"); | ||
|
Comment on lines
-190
to
+188
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what's the motivation for bumping the SDMA queue priority from NORMAL (0) to MAXIMUM (3) here? Just want to make sure we understand the impact on other queues (compute / other SDMA) sharing the GPU. Was this something you saw help in practice?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, this only affects SDMA queues: I took it from ROCR runtime which also uses max priority. QueuePercentage = 100 Priority = HSA_QUEUE_PRIORITY_MAXIMUM
Two things worth keeping in mind: Priority is relative. It only helps you win against lower-priority queues. Since ROCr's internal copy queues are also MAX, you won't out-prioritize them — you'll be on par (which is the intended behavior). |
||
|
|
||
| // Populate Device Handle | ||
| // TODO uncached | ||
|
|
@@ -219,14 +214,16 @@ SdmaQueue::SdmaQueue(int localDeviceId, int remoteDeviceId, hsa_agent_t& localAg | |
| hipMemcpy(committedWptr_, &committedWptr, sizeof(uint64_t), hipMemcpyHostToDevice)); | ||
| } | ||
|
|
||
| SdmaQueue::~SdmaQueue() { | ||
| // TODO catch exception? | ||
| SdmaQueue::~SdmaQueue() try { | ||
| CHECK_HSAKMT_SUCCESS(hsaKmtDestroyQueue(queue_.QueueId), "Failed to destroy queue."); | ||
| CHECK_HIP_ERROR(hipFree(deviceHandle_)); | ||
| CHECK_HIP_ERROR(hipFree(cachedWptr_)); | ||
| CHECK_HIP_ERROR(hipFree(committedWptr_)); | ||
| CHECK_HSAKMT_SUCCESS(hsaKmtUnmapMemoryToGPU(queueBuffer_), "Failed"); | ||
| CHECK_HSAKMT_SUCCESS(hsaKmtFreeMemory(queueBuffer_, SDMA_QUEUE_SIZE), "Failed"); | ||
| } catch (...) { | ||
| std::cerr << "Exception in ~SdmaQueue()" << std::endl; | ||
| std::exit(EXIT_FAILURE); | ||
| } | ||
|
Comment on lines
-222
to
224
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. since both CHECK_HSAKMT_SUCCESS and CHECK_HIP_ERROR call exit() directly rather than throw, nothing in ~SdmaQueue() actually throws — so the try/catch(...) never gets hit. Was that intentional, or should the macros throw for it to be useful?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. huh, indeed, I was confused with CHECK_HSA_ERROR but these two macros never throw, so I have removed this block |
||
|
|
||
| SdmaQueueDeviceHandle* SdmaQueue::deviceHandle() const { return deviceHandle_; } | ||
|
|
@@ -265,16 +262,64 @@ void AnvilLib::init() { | |
| } | ||
|
|
||
| bool AnvilLib::connect(int srcDeviceId, int dstDeviceId, int numChannels) { | ||
| uint32_t engineId = getSdmaEngineId(srcDeviceId, dstDeviceId); | ||
| std::lock_guard<std::mutex> lock(channels_mutex_); | ||
| // Spread the channels across the engines recommended for this peer link. On | ||
| // MI350 the mask typically reports 2 engines per peer; on platforms with a | ||
| // single recommended engine all channels share it. | ||
| std::vector<uint32_t> engines; | ||
| if (srcDeviceId == dstDeviceId) { | ||
| // A loopback copy never traverses xGMI and has no self io_link, so KFD | ||
| // reports no recommended engine. Use a general (non-xGMI) SDMA engine. | ||
| engines.push_back(0); | ||
| } else { | ||
| uint32_t mask = getRecommendedEngineMask(srcDeviceId, dstDeviceId); | ||
| for (uint32_t b = 0; b < 32; ++b) { | ||
| if (mask & (1u << b)) engines.push_back(b); | ||
| } | ||
| // Fall back to the static OAM table if KFD did not report a mask. | ||
| if (engines.empty()) { | ||
| int e = getSdmaEngineId(srcDeviceId, dstDeviceId); | ||
| engines.push_back(e); | ||
| } | ||
| } | ||
| int numEngines = static_cast<int>(engines.size()); | ||
|
|
||
| auto key = std::make_pair(srcDeviceId, dstDeviceId); | ||
| for (int c = 0; c < numChannels; ++c) { | ||
| uint32_t engineId = engines[c % numEngines]; | ||
| sdma_channels_[key].emplace_back( | ||
| std::make_unique<SdmaQueue>(srcDeviceId, dstDeviceId, gpuAgents_[srcDeviceId], engineId)); | ||
| } | ||
| return true; | ||
| } | ||
|
|
||
| uint32_t AnvilLib::getNodeId(int deviceId) { | ||
| uint32_t nodeId = 0; | ||
| CHECK_HSA_ERROR(hsa_agent_get_info(gpuAgents_[deviceId], HSA_AGENT_INFO_NODE, &nodeId)); | ||
| return nodeId; | ||
| } | ||
|
|
||
| uint32_t AnvilLib::getRecommendedEngineMask(int srcDeviceId, int dstDeviceId) { | ||
| uint32_t srcNode = getNodeId(srcDeviceId), dstNode = getNodeId(dstDeviceId); | ||
|
|
||
| HsaNodeProperties props{}; | ||
| if (hsaKmtGetNodeProperties(srcNode, &props) != HSAKMT_STATUS_SUCCESS || props.NumIOLinks == 0) { | ||
| return 0; | ||
| } | ||
|
|
||
| std::vector<HsaIoLinkProperties> links(props.NumIOLinks); | ||
| if (hsaKmtGetNodeIoLinkProperties(srcNode, props.NumIOLinks, links.data()) != | ||
| HSAKMT_STATUS_SUCCESS) { | ||
| return 0; | ||
| } | ||
| for (const auto& link : links) { | ||
| if (link.NodeTo == dstNode) { | ||
| return link.RecSdmaEngIdMask; | ||
| } | ||
| } | ||
| return 0; | ||
| } | ||
|
|
||
| SdmaQueue* AnvilLib::getSdmaQueue(int srcDeviceId, int dstDeviceId, int channel_idx) { | ||
| std::lock_guard<std::mutex> lock(channels_mutex_); | ||
| auto key = std::make_pair(srcDeviceId, dstDeviceId); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.