AMReX-Codes · WeiqunZhang · Jun 27, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/Src/Base/AMReX_Gpu.H b/Src/Base/AMReX_Gpu.H
@@ -30,6 +30,7 @@ namespace amrex::Cuda {}
 
 #include <AMReX_GpuAllocators.H>
 #include <AMReX_GpuContainers.H>
+#include <AMReX_GpuParallelReduce.H>
 #include <AMReX_CudaGraph.H>
 
 namespace amrex::Gpu {

diff --git a/Src/Base/AMReX_GpuParallelReduce.H b/Src/Base/AMReX_GpuParallelReduce.H
@@ -0,0 +1,147 @@
+#ifndef AMREX_GPU_PARALLEL_REDUCE_H_
+#define AMREX_GPU_PARALLEL_REDUCE_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_GpuContainers.H>
+#include <AMReX_INT.H>
+#include <AMReX_ParallelDescriptor.H>
+#include <AMReX_ParallelReduce.H>
+
+#include <cstddef>
+
+//
+// GPU-aware MPI collectives that operate in place on a Gpu::DeviceVector.
+//
+// These overloads complement the pointer/scalar overloads in
+// AMReX_ParallelReduce.H and AMReX_ParallelDescriptor.H. They live in a
+// separate header (rather than in those low-level headers) because they need
+// the Gpu container/copy machinery (AMReX_GpuContainers.H) that would bloat the
+// ParallelReduce.H headers. This header is also pulled into the AMReX_Gpu.H
+// umbrella for convenience.
+//
+// When AMReX is configured with GPU-aware MPI (ParallelDescriptor::UseGpuAwareMpi())
+// the device buffer is handed to MPI directly, otherwise the data is staged
+// through host (pinned) memory for the collective.
+//
+
+namespace amrex {
+
+namespace ParallelAllReduce {
+
+/// \ingroup amrex_mpi
+/** Sum-reduce the elements of \p v in place on all ranks of \p comm; aborts if v.size() does not fit in an int. */
+template <typename T>
+void Sum (Gpu::DeviceVector<T>& v, MPI_Comm comm)
+{
+    auto const count = static_cast<int>(v.size()); // the pointer overload called below takes an int count
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(count >= 0 && static_cast<std::size_t>(count) == v.size(),
+        "ParallelAllReduce::Sum(Gpu::DeviceVector): number of elements does not fit in an int");
+#if defined(AMREX_USE_MPI) && defined(AMREX_USE_GPU)
+    if (!ParallelDescriptor::UseGpuAwareMpi()) { // GPU-unaware MPI: reduce a pinned host copy
+        Gpu::PinnedVector<T> hv(v.size());
+        Gpu::copy(Gpu::deviceToHost, v.begin(), v.end(), hv.begin());
+        Sum(hv.data(), count, comm);
+        Gpu::copy(Gpu::hostToDevice, hv.begin(), hv.end(), v.begin());
+        return;
+    }
+#endif
+    Sum(v.data(), count, comm); // GPU-aware MPI, or a build without GPU/MPI: reduce v's own buffer
+}
+
+} // namespace ParallelAllReduce
+
+namespace ParallelReduce {
+
+/// \ingroup amrex_mpi
+/** Sum-reduce the elements of \p v in place onto \p root of \p comm; aborts if v.size() does not fit in an int. */
+template <typename T>
+void Sum (Gpu::DeviceVector<T>& v, int root, MPI_Comm comm)
+{
+    auto const count = static_cast<int>(v.size()); // the pointer overload called below takes an int count
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(count >= 0 && static_cast<std::size_t>(count) == v.size(),
+        "ParallelReduce::Sum(Gpu::DeviceVector): number of elements does not fit in an int");
+#if defined(AMREX_USE_MPI) && defined(AMREX_USE_GPU)
+    if (!ParallelDescriptor::UseGpuAwareMpi()) { // GPU-unaware MPI: reduce a pinned host copy
+        Gpu::PinnedVector<T> hv(v.size());
+        // every rank stages its contribution to host for the reduction
+        Gpu::copy(Gpu::deviceToHost, v.begin(), v.end(), hv.begin());
+        Sum(hv.data(), count, root, comm);
+        // only the root receives the reduced result, so only it copies back
+        if (ParallelDescriptor::MyProc(comm) == root) {
+            Gpu::copy(Gpu::hostToDevice, hv.begin(), hv.end(), v.begin());
+        }
+        return;
+    }
+#endif
+    Sum(v.data(), count, root, comm); // GPU-aware MPI, or a build without GPU/MPI: reduce v's own buffer
+}
+
+} // namespace ParallelReduce
+
+namespace ParallelDescriptor {
+
+/// \ingroup amrex_mpi
+/** Broadcast a Gpu::DeviceVector from \p root to all ranks in \p comm.
+ *
+ * Contract: on every receiving rank \p v must already be allocated to the same
+ * length as on \p root (the caller is responsible for sizing it, e.g. via
+ * resize). Calling this with a wrong-sized or empty receiver is undefined
+ * behavior. This mirrors the pointer overload Bcast(T*, n, root), which also
+ * requires a correctly-sized buffer on every rank.
+ *
+ * In a debug build (AMREX_DEBUG) the contract is checked: the root's length is
+ * broadcast first and asserted against each rank's size before the data are
+ * broadcast. In a build without MPI (AMREX_USE_MPI undefined) the call does nothing.
+ *
+ * The device buffer is passed to MPI directly when GPU-aware MPI is enabled
+ * (UseGpuAwareMpi()), otherwise it is staged through host (pinned) memory.
+ */
+template <typename T>
+void Bcast (Gpu::DeviceVector<T>& v, int root, MPI_Comm comm)
+{
+#ifdef AMREX_USE_MPI
+    auto const n = v.size();
+
+#ifdef AMREX_DEBUG
+    // verify the pre-allocation contract (the length broadcast happens on every
+    // rank, before the empty-vector early return, so it cannot deadlock)
+    Long n_root = static_cast<Long>(n);
+    Bcast(&n_root, std::size_t(1), root, comm);
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(n_root == static_cast<Long>(n),
+        "ParallelDescriptor::Bcast(Gpu::DeviceVector): receiver must be pre-allocated to the root's length");
+#endif
+
+    // empty vector: nothing to broadcast (by the contract all ranks are empty, so all skip the collective)
+    if (n == 0) { return; }
+
+    // GPU-unaware MPI: broadcast through a pinned host staging buffer
+#ifdef AMREX_USE_GPU
+    if (!UseGpuAwareMpi()) {
+        Gpu::PinnedVector<T> hv(n);
+        const bool is_root = (MyProc(comm) == root);
+        // only the root needs to stage its data to host before the broadcast
+        if (is_root) {
+            Gpu::copy(Gpu::deviceToHost, v.begin(), v.end(), hv.begin());
+        }
+        Bcast(hv.data(), static_cast<std::size_t>(n), root, comm);
+        // only the receivers need to copy the broadcast result back to device
+        if (!is_root) {
+            Gpu::copy(Gpu::hostToDevice, hv.begin(), hv.end(), v.begin());
+        }
+        return;
+    }
+#endif
+
+    // GPU-aware MPI, or a build without GPU support: broadcast v's own buffer
+    Bcast(v.data(), static_cast<std::size_t>(n), root, comm);
+
+#else  // AMREX_USE_MPI
+    amrex::ignore_unused(v, root, comm);
+#endif
+}
+
+} // namespace ParallelDescriptor
+
+} // namespace amrex
+
+#endif /*AMREX_GPU_PARALLEL_REDUCE_H_*/
diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt
@@ -62,6 +62,7 @@ foreach(D IN LISTS AMReX_SPACEDIM)
        AMReX_OpenMP.H
        AMReX_OpenMP.cpp
        AMReX_ParallelReduce.H
+       AMReX_GpuParallelReduce.H
        AMReX_ForkJoin.H
        AMReX_ForkJoin.cpp
        AMReX_ParallelContext.H

diff --git a/Src/Base/Make.package b/Src/Base/Make.package
@@ -112,6 +112,7 @@ C$(AMREX_BASE)_headers += AMReX_GpuContainers.H
 C$(AMREX_BASE)_headers += AMReX_TrackedVector.H
 
 C$(AMREX_BASE)_headers += AMReX_GpuAllocators.H
+C$(AMREX_BASE)_headers += AMReX_GpuParallelReduce.H
 
 C$(AMREX_BASE)_headers += AMReX_MFParallelFor.H
 C$(AMREX_BASE)_headers += AMReX_MFParallelForC.H

diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
@@ -134,7 +134,7 @@ else()
    # List of subdirectories to search for CMakeLists.
    #
    set( AMREX_TESTS_SUBDIRS Amr ArrayND AsyncOut Base CallNoinline CLZ CommType CTOParFor DeviceGlobal
-                            Enum HeatEquation MultiBlock MultiPeriod ParmParse Parser Parser2
+                            Enum GpuParallelReduce HeatEquation MultiBlock MultiPeriod ParmParse Parser Parser2
                             ParserUserFn Reducer ReduceToPlaneGuards ReduceToPlanePatchy Reinit RoundoffDomain SIMD
                             SmallMatrix SumBoundary TOML)
 

diff --git a/Tests/GpuParallelReduce/CMakeLists.txt b/Tests/GpuParallelReduce/CMakeLists.txt
@@ -0,0 +1,9 @@
+foreach(D IN LISTS AMReX_SPACEDIM)  # repeat the setup for each entry of AMReX_SPACEDIM
+    set(_sources     main.cpp)
+    set(_input_files)               # no value given: the test has no input files
+
+    setup_test(${D} _sources _input_files)
+
+    unset(_sources)                 # clear the helper variables before the next iteration
+    unset(_input_files)
+endforeach()
diff --git a/Tests/GpuParallelReduce/GNUmakefile b/Tests/GpuParallelReduce/GNUmakefile
@@ -0,0 +1,24 @@
+AMREX_HOME := ../..
+# Default build options for the GpuParallelReduce unit test.
+DEBUG	= FALSE
+
+DIM	= 3
+
+COMP    = gcc
+# Backends: only MPI is enabled by default; OpenMP and the GPU backends are off.
+USE_MPI   = TRUE
+USE_OMP   = FALSE
+USE_CUDA  = FALSE
+USE_HIP   = FALSE
+USE_SYCL  = FALSE
+
+BL_NO_FORT = TRUE
+
+TINY_PROFILE = FALSE
+# AMReX GNU Make build system: definitions first, then the source lists, then the rules.
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/GpuParallelReduce/Make.package b/Tests/GpuParallelReduce/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/Tests/GpuParallelReduce/main.cpp b/Tests/GpuParallelReduce/main.cpp
@@ -0,0 +1,111 @@
+// Header under test first, to check that it is self-contained (no reliance on transitive includes)
+#include <AMReX_GpuParallelReduce.H>
+
+#include <AMReX.H>
+#include <AMReX_Print.H>
+
+//
+// Unit test for the Gpu::DeviceVector overloads of the MPI collectives in
+// AMReX_GpuParallelReduce.H:
+//   - ParallelAllReduce::Sum(Gpu::DeviceVector&, comm)
+//   - ParallelReduce::Sum   (Gpu::DeviceVector&, root, comm)
+//   - ParallelDescriptor::Bcast(Gpu::DeviceVector&, root, comm)
+//
+// Each rank seeds a device vector with rank-dependent values. The test checks
+// the reduced/broadcast results against the analytic expectation. It is
+// meaningful with >1 MPI rank but also passes serially, and exercises the
+// device<->host staging path when built for GPU without GPU-aware MPI.
+//
+
+using namespace amrex;
+
+namespace {
+// Test helper: copy a host Vector into a newly allocated device vector of equal length.
+Gpu::DeviceVector<Real> to_device (Vector<Real> const& h)
+{
+    Gpu::DeviceVector<Real> dev(h.size());
+    Gpu::copy(Gpu::hostToDevice, h.begin(), h.end(), dev.begin());
+    return dev;
+}
+// Test helper: copy a device vector back into a newly allocated host Vector of equal length.
+Vector<Real> to_host (Gpu::DeviceVector<Real> const& d)
+{
+    Vector<Real> host(d.size());
+    Gpu::copy(Gpu::deviceToHost, d.begin(), d.end(), host.begin());
+    return host;
+}
+
+}
+
+int main (int argc, char* argv[])
+{
+    amrex::Initialize(argc, argv);
+    {
+        MPI_Comm comm = ParallelDescriptor::Communicator();
+        const int ioproc = ParallelDescriptor::IOProcessorNumber();
+        const int myproc = ParallelDescriptor::MyProc();
+        const int nprocs = ParallelDescriptor::NProcs();
+        const int n = 10;
+
+        // Analytic reference for both reductions: each rank p contributes
+        // (p + i) to entry i, accumulated here in rank order.
+        auto expected_sum = [=] (int i) -> Real {
+            Real acc = Real(0);
+            int p = 0;
+            while (p < nprocs) { acc += Real(p + i); ++p; }
+            return acc;
+        };
+
+        // This rank's contribution on the device: entry i holds (myproc + i).
+        auto seeded = [=] () -> Gpu::DeviceVector<Real> {
+            Vector<Real> host(n);
+            for (int i = 0; i < n; ++i) { host[i] = Real(myproc + i); }
+            return to_device(host);
+        };
+
+        // Case 1: ParallelAllReduce::Sum. Every rank must end up holding the
+        // full sum, so every rank verifies its own copy.
+        {
+            auto d = seeded();
+            ParallelAllReduce::Sum(d, comm);
+            auto const r = to_host(d);
+            for (int i = 0; i < n; ++i) {
+                AMREX_ALWAYS_ASSERT(r[i] == expected_sum(i));
+            }
+        }
+
+        // Case 2: ParallelReduce::Sum. Only the root is checked, because only
+        // the root is expected to hold the reduced values.
+        {
+            auto d = seeded();
+            ParallelReduce::Sum(d, ioproc, comm);
+            if (myproc == ioproc) {
+                auto const r = to_host(d);
+                for (int i = 0; i < n; ++i) {
+                    AMREX_ALWAYS_ASSERT(r[i] == expected_sum(i));
+                }
+            }
+        }
+
+        // Case 3: ParallelDescriptor::Bcast. Every rank pre-allocates a zeroed
+        // receiver of the root's length (the documented contract); only the
+        // root overwrites it with the payload 100, 101, ... before the call.
+        {
+            Gpu::DeviceVector<Real> d(n, Real(0));
+            if (myproc == ioproc) {
+                Vector<Real> payload(n);
+                for (int i = 0; i < n; ++i) { payload[i] = Real(100 + i); }
+                Gpu::copy(Gpu::hostToDevice, payload.begin(), payload.end(), d.begin());
+            }
+            ParallelDescriptor::Bcast(d, ioproc, comm);
+            auto const r = to_host(d);
+            for (int i = 0; i < n; ++i) {
+                AMREX_ALWAYS_ASSERT(r[i] == Real(100 + i));
+            }
+        }
+
+        amrex::Print() << "GpuParallelReduce: all tests passed on " << nprocs
+                       << " rank(s).\n";
+    }
+    amrex::Finalize();
+}