Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Src/Base/AMReX_Gpu.H
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ namespace amrex::Cuda {}

#include <AMReX_GpuAllocators.H>
#include <AMReX_GpuContainers.H>
#include <AMReX_GpuParallelReduce.H>
#include <AMReX_CudaGraph.H>

namespace amrex::Gpu {
Expand Down
147 changes: 147 additions & 0 deletions Src/Base/AMReX_GpuParallelReduce.H
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#ifndef AMREX_GPU_PARALLEL_REDUCE_H_
#define AMREX_GPU_PARALLEL_REDUCE_H_
#include <AMReX_Config.H>

#include <AMReX_GpuContainers.H>
#include <AMReX_GpuDevice.H>
#include <AMReX_INT.H>
#include <AMReX_ParallelDescriptor.H>
#include <AMReX_ParallelReduce.H>

#include <cstddef>
#include <limits>

//
// GPU-aware MPI collectives that operate in place on a Gpu::DeviceVector.
//
// These overloads complement the pointer/scalar overloads in
// AMReX_ParallelReduce.H and AMReX_ParallelDescriptor.H. They live in a
// separate header (rather than in those low-level headers) because they need
// the Gpu container/copy machinery (AMReX_GpuContainers.H) that would bloat the
// ParallelReduce.H headers. This header is also pulled into the AMReX_Gpu.H
// umbrella for convenience.
//
// When AMReX is configured with GPU-aware MPI (ParallelDescriptor::UseGpuAwareMpi())
// the device buffer is handed to MPI directly, otherwise the data is staged
// through host (pinned) memory for the collective.
//

namespace amrex {

namespace ParallelAllReduce {

/// \ingroup amrex_mpi
/** Sum-reduce the elements of a Gpu::DeviceVector in place, on all ranks.
 *
 * When GPU-aware MPI is enabled (ParallelDescriptor::UseGpuAwareMpi()) the
 * device buffer is handed to MPI directly; otherwise the data are staged
 * through a pinned host buffer for the collective and copied back afterwards.
 *
 * The element count is forwarded to the pointer overload Sum(T*, int, comm),
 * so the vector must not hold more than INT_MAX elements; this is asserted in
 * MPI builds instead of being truncated silently.
 *
 * \param v    device vector holding this rank's contribution; overwritten
 *             with the element-wise sum over all ranks of \p comm
 * \param comm MPI communicator to reduce over
 */
template <typename T>
void Sum (Gpu::DeviceVector<T>& v, MPI_Comm comm)
{
#ifdef AMREX_USE_MPI
    // the pointer overloads take an int count: refuse to truncate silently
    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
        static_cast<std::size_t>(v.size()) <= static_cast<std::size_t>(std::numeric_limits<int>::max()),
        "ParallelAllReduce::Sum(Gpu::DeviceVector): vector size exceeds the int element count of the MPI reduction");
#endif
    auto const count = static_cast<int>(v.size());

#if defined(AMREX_USE_MPI) && defined(AMREX_USE_GPU)
    // GPU-unaware case: stage through pinned host memory
    if (!ParallelDescriptor::UseGpuAwareMpi()) {
        Gpu::PinnedVector<T> hv(v.size());
        Gpu::copy(Gpu::deviceToHost, v.begin(), v.end(), hv.begin());
        Sum(hv.data(), count, comm);
        Gpu::copy(Gpu::hostToDevice, hv.begin(), hv.end(), v.begin());
        return;
    }

    // GPU-aware case: MPI knows nothing about work still queued on the current
    // GPU stream, so wait for it to finish before MPI touches the device buffer
    Gpu::streamSynchronize();
#endif

    // GPU-aware case (or host-only build): reduce directly on v's storage
    Sum(v.data(), count, comm);
}

} // namespace ParallelAllReduce

namespace ParallelReduce {

/// \ingroup amrex_mpi
/** Sum-reduce the elements of a Gpu::DeviceVector in place, onto \p root.
 *
 * When GPU-aware MPI is enabled (ParallelDescriptor::UseGpuAwareMpi()) the
 * device buffer is handed to MPI directly; otherwise every rank stages its
 * contribution through a pinned host buffer and only \p root copies the
 * reduced result back to the device.
 *
 * The element count is forwarded to the pointer overload
 * Sum(T*, int, root, comm), so the vector must not hold more than INT_MAX
 * elements; this is asserted in MPI builds instead of being truncated silently.
 *
 * \param v    device vector holding this rank's contribution; on \p root it is
 *             overwritten with the element-wise sum over all ranks of \p comm.
 *             Callers should only rely on the contents on \p root.
 * \param root rank (within \p comm) that receives the result
 * \param comm MPI communicator to reduce over
 */
template <typename T>
void Sum (Gpu::DeviceVector<T>& v, int root, MPI_Comm comm)
{
#ifdef AMREX_USE_MPI
    // the pointer overloads take an int count: refuse to truncate silently
    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
        static_cast<std::size_t>(v.size()) <= static_cast<std::size_t>(std::numeric_limits<int>::max()),
        "ParallelReduce::Sum(Gpu::DeviceVector): vector size exceeds the int element count of the MPI reduction");
#endif
    auto const count = static_cast<int>(v.size());

#if defined(AMREX_USE_MPI) && defined(AMREX_USE_GPU)
    // GPU-unaware case: stage through pinned host memory
    if (!ParallelDescriptor::UseGpuAwareMpi()) {
        Gpu::PinnedVector<T> hv(v.size());
        // every rank stages its contribution to host for the reduction
        Gpu::copy(Gpu::deviceToHost, v.begin(), v.end(), hv.begin());
        Sum(hv.data(), count, root, comm);
        // only the root receives the reduced result, so only it copies back
        if (ParallelDescriptor::MyProc(comm) == root) {
            Gpu::copy(Gpu::hostToDevice, hv.begin(), hv.end(), v.begin());
        }
        return;
    }

    // GPU-aware case: MPI knows nothing about work still queued on the current
    // GPU stream, so wait for it to finish before MPI touches the device buffer
    Gpu::streamSynchronize();
#endif

    // GPU-aware case (or host-only build): reduce directly on v's storage
    Sum(v.data(), count, root, comm);
}

} // namespace ParallelReduce

namespace ParallelDescriptor {

/// \ingroup amrex_mpi
/** Broadcast a Gpu::DeviceVector from \p root to all ranks in \p comm.
 *
 * Contract: on every receiving rank \p v must already be allocated to the same
 * length as on \p root (the caller is responsible for sizing it, e.g. via
 * resize). Calling this with a wrong-sized or empty receiver is undefined
 * behavior. This mirrors the pointer overload Bcast(T*, n, root), which also
 * requires a correctly-sized buffer on every rank.
 *
 * In a debug build (AMREX_DEBUG) the contract is checked: the root's length is
 * broadcast first and asserted against each rank's size before the data are
 * broadcast.
 *
 * The device buffer is passed to MPI directly when GPU-aware MPI is enabled
 * (UseGpuAwareMpi()), otherwise it is staged through host (pinned) memory.
 * Without MPI this function does nothing.
 *
 * \param v    device vector: source of the data on \p root, destination on
 *             every other rank
 * \param root rank (within \p comm) whose data are broadcast
 * \param comm MPI communicator to broadcast over
 */
template <typename T>
void Bcast (Gpu::DeviceVector<T>& v, int root, MPI_Comm comm)
{
#ifdef AMREX_USE_MPI
    auto const n = v.size();

#ifdef AMREX_DEBUG
    // verify the pre-allocation contract (the length broadcast happens on every
    // rank, so it is collectively safe and cannot deadlock)
    Long n_root = static_cast<Long>(n);
    Bcast(&n_root, std::size_t(1), root, comm);
    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(n_root == static_cast<Long>(n),
        "ParallelDescriptor::Bcast(Gpu::DeviceVector): receiver must be pre-allocated to the root's length");
#endif

    // trivial case: empty vector. By contract the length is the same on every
    // rank, so all ranks skip the collective together.
    if (n == 0) { return; }

#ifdef AMREX_USE_GPU
    // GPU-unaware case: stage through pinned host memory
    if (!UseGpuAwareMpi()) {
        Gpu::PinnedVector<T> hv(n);
        const bool is_root = (MyProc(comm) == root);
        // only the root needs to stage its data to host before the broadcast
        if (is_root) {
            Gpu::copy(Gpu::deviceToHost, v.begin(), v.end(), hv.begin());
        }
        Bcast(hv.data(), static_cast<std::size_t>(n), root, comm);
        // only the receivers need to copy the broadcast result back to device
        if (!is_root) {
            Gpu::copy(Gpu::hostToDevice, hv.begin(), hv.end(), v.begin());
        }
        return;
    }

    // GPU-aware case: MPI knows nothing about work still queued on the current
    // GPU stream, so wait for it to finish before MPI reads (root) or
    // overwrites (receivers) the device buffer
    Gpu::streamSynchronize();
#endif

    // GPU-aware case (or host-only build): broadcast directly from/into v's storage
    Bcast(v.data(), static_cast<std::size_t>(n), root, comm);

#else // AMREX_USE_MPI
    amrex::ignore_unused(v, root, comm);
#endif
}

} // namespace ParallelDescriptor

} // namespace amrex

#endif /*AMREX_GPU_PARALLEL_REDUCE_H_*/
1 change: 1 addition & 0 deletions Src/Base/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ foreach(D IN LISTS AMReX_SPACEDIM)
AMReX_OpenMP.H
AMReX_OpenMP.cpp
AMReX_ParallelReduce.H
AMReX_GpuParallelReduce.H
AMReX_ForkJoin.H
AMReX_ForkJoin.cpp
AMReX_ParallelContext.H
Expand Down
1 change: 1 addition & 0 deletions Src/Base/Make.package
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ C$(AMREX_BASE)_headers += AMReX_GpuContainers.H
C$(AMREX_BASE)_headers += AMReX_TrackedVector.H

C$(AMREX_BASE)_headers += AMReX_GpuAllocators.H
C$(AMREX_BASE)_headers += AMReX_GpuParallelReduce.H

C$(AMREX_BASE)_headers += AMReX_MFParallelFor.H
C$(AMREX_BASE)_headers += AMReX_MFParallelForC.H
Expand Down
2 changes: 1 addition & 1 deletion Tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ else()
# List of subdirectories to search for CMakeLists.
#
set( AMREX_TESTS_SUBDIRS Amr ArrayND AsyncOut Base CallNoinline CLZ CommType CTOParFor DeviceGlobal
Enum HeatEquation MultiBlock MultiPeriod ParmParse Parser Parser2
Enum GpuParallelReduce HeatEquation MultiBlock MultiPeriod ParmParse Parser Parser2
ParserUserFn Reducer ReduceToPlaneGuards ReduceToPlanePatchy Reinit RoundoffDomain SIMD
SmallMatrix SumBoundary TOML)

Expand Down
9 changes: 9 additions & 0 deletions Tests/GpuParallelReduce/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Set up the GpuParallelReduce test once for every entry of AMReX_SPACEDIM.
foreach(D IN LISTS AMReX_SPACEDIM)
# the test consists of a single source file and uses no input files
set(_sources main.cpp)
set(_input_files)

# setup_test is defined elsewhere in the build system (not visible here);
# presumably it creates and registers the test for dimension D -- confirm there
setup_test(${D} _sources _input_files)

# clear the helper variables so they do not carry over past this iteration
unset(_sources)
unset(_input_files)
endforeach()
24 changes: 24 additions & 0 deletions Tests/GpuParallelReduce/GNUmakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# GNU Make build configuration for the GpuParallelReduce test.

# location of the AMReX source tree, relative to this directory
AMREX_HOME := ../..

# debug build switched off
DEBUG = FALSE

# spatial dimension to build for
DIM = 3

# compiler selection
COMP = gcc

# parallel/accelerator backends: MPI on, OpenMP and all GPU backends off
USE_MPI = TRUE
USE_OMP = FALSE
USE_CUDA = FALSE
USE_HIP = FALSE
USE_SYCL = FALSE

# presumably disables the Fortran parts of the build -- confirm in Make.defs
BL_NO_FORT = TRUE

# tiny profiler switched off
TINY_PROFILE = FALSE

# shared AMReX make definitions
include $(AMREX_HOME)/Tools/GNUMake/Make.defs

# sources of this test, followed by the AMReX Base package
include ./Make.package
include $(AMREX_HOME)/Src/Base/Make.package

# shared AMReX make rules
include $(AMREX_HOME)/Tools/GNUMake/Make.rules
1 change: 1 addition & 0 deletions Tests/GpuParallelReduce/Make.package
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# C++ source files of the GpuParallelReduce test
CEXE_sources += main.cpp
111 changes: 111 additions & 0 deletions Tests/GpuParallelReduce/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// Header under test first, to test transitive includes
#include <AMReX_GpuParallelReduce.H>

#include <AMReX.H>
#include <AMReX_Print.H>

//
// Unit test for the Gpu::DeviceVector overloads of the MPI collectives in
// AMReX_GpuParallelReduce.H:
// - ParallelAllReduce::Sum(Gpu::DeviceVector&, comm)
// - ParallelReduce::Sum (Gpu::DeviceVector&, root, comm)
// - ParallelDescriptor::Bcast(Gpu::DeviceVector&, root, comm)
//
// Each rank seeds a device vector with rank-dependent values. The test checks
// the reduced/broadcast results against the analytic expectation. It is
// meaningful with >1 MPI rank but also passes serially, and exercises the
// device<->host staging path when built for GPU without GPU-aware MPI.
//

using namespace amrex;

namespace {

/** Return a new device vector with the same length and contents as host vector \p h. */
Gpu::DeviceVector<Real> to_device (Vector<Real> const& h)
{
    auto const first = h.begin();
    auto const last  = h.end();

    Gpu::DeviceVector<Real> dev(h.size());
    Gpu::copy(Gpu::hostToDevice, first, last, dev.begin());
    return dev;
}

/** Return a new host vector with the same length and contents as device vector \p d. */
Vector<Real> to_host (Gpu::DeviceVector<Real> const& d)
{
    auto const first = d.begin();
    auto const last  = d.end();

    Vector<Real> host(d.size());
    Gpu::copy(Gpu::deviceToHost, first, last, host.begin());
    return host;
}

} // namespace

int main (int argc, char* argv[])
{
amrex::Initialize(argc, argv);
{
const int nprocs = ParallelDescriptor::NProcs();
const int myproc = ParallelDescriptor::MyProc();
const int ioproc = ParallelDescriptor::IOProcessorNumber();
MPI_Comm comm = ParallelDescriptor::Communicator();

const int n = 10;

// rank p seeds entry i with (p + i), so the sum over all ranks is
// sum_{p=0}^{nprocs-1} (p + i)
auto expected_sum = [=] (int i) -> Real {
Real s = Real(0);
for (int p = 0; p < nprocs; ++p) { s += Real(p + i); }
return s;
};

// ParallelAllReduce::Sum -> result valid on every rank
{
Vector<Real> h(n);
for (int i = 0; i < n; ++i) { h[i] = Real(myproc + i); }
auto d = to_device(h);

ParallelAllReduce::Sum(d, comm);

auto const r = to_host(d);
for (int i = 0; i < n; ++i) {
AMREX_ALWAYS_ASSERT(r[i] == expected_sum(i));
}
}

// ParallelReduce::Sum -> result valid on root only
{
Vector<Real> h(n);
for (int i = 0; i < n; ++i) { h[i] = Real(myproc + i); }
auto d = to_device(h);

ParallelReduce::Sum(d, ioproc, comm);

if (myproc == ioproc) {
auto const r = to_host(d);
for (int i = 0; i < n; ++i) {
AMREX_ALWAYS_ASSERT(r[i] == expected_sum(i));
}
}
}

// ParallelDescriptor::Bcast -> root's data to every rank.
// Contract: every rank pre-allocates the receiver to the root's length.
{
Gpu::DeviceVector<Real> d(n, Real(0));
if (myproc == ioproc) {
Vector<Real> h(n);
for (int i = 0; i < n; ++i) { h[i] = Real(100 + i); }
Gpu::copy(Gpu::hostToDevice, h.begin(), h.end(), d.begin());
}

ParallelDescriptor::Bcast(d, ioproc, comm);

auto const r = to_host(d);
for (int i = 0; i < n; ++i) {
AMREX_ALWAYS_ASSERT(r[i] == Real(100 + i));
}
}

amrex::Print() << "GpuParallelReduce: all tests passed on " << nprocs
<< " rank(s).\n";
}
amrex::Finalize();
}
Loading