Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmake/caches/ci/cpu.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
set(BUILD_CUDA NO CACHE BOOL "")
set(BUILD_NPU NO CACHE BOOL "")
set(BUILD_MLU NO CACHE BOOL "")
set(BUILD_GIT_VERSION YES CACHE BOOL "")
set(BUILD_TESTING YES CACHE BOOL "")
set(WITH_ONEDNN YES CACHE BOOL "")
Expand Down
1 change: 1 addition & 0 deletions cmake/caches/cn/cpu.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
set(BUILD_CUDA NO CACHE BOOL "")
set(BUILD_NPU NO CACHE BOOL "")
set(BUILD_MLU NO CACHE BOOL "")
set(BUILD_SHARED_LIBS YES CACHE BOOL "")
set(THIRD_PARTY_MIRROR aliyun CACHE STRING "")
set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "")
27 changes: 27 additions & 0 deletions cmake/oneflow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -353,10 +353,37 @@ if(BUILD_CUDA)
PROPERTIES COMPILE_FLAGS "-DCUDA_REAL_ARCHS=\"${CUDA_REAL_ARCHS}\"")
endif()

# Flag recording whether any generic (non-CUDA) device backend is enabled.
# It is read further down to decide whether WITH_DEVICES must be defined.
set(DEVICES_ENABLED FALSE)

if(BUILD_NPU)
  add_definitions(-DWITH_NPU)
  set(DEVICES_ENABLED TRUE)
  # Report inside the branch so the message only appears when NPU is really on.
  message(STATUS "NPU support enabled.")
endif()

if(BUILD_MLU)
  add_definitions(-DWITH_MLU)
  set(DEVICES_ENABLED TRUE)
  # Report inside the branch so the message only appears when MLU is really on.
  message(STATUS "MLU support enabled.")
endif()

if(DEVICES_ENABLED)
# check WITH_DEVICES defined or not
get_directory_property(EXISTING_DEFS COMPILE_DEFINITIONS)

if(NOT "WITH_DEVICES" IN_LIST EXISTING_DEFS)
add_definitions(-DWITH_DEVICES)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

当编译 npu/xpu/mlu 等设备时,DEVICES_ENABLED 即为 True,会定义一个 WITH_DEVICES 宏表示多设备支持,方便后续在代码中统一管理。

// before:
#if defined(WITH_CUDA) || defined(WITH_NPU) || defined(WITH_MLU) || defined(WITH_XPU) ...

// after: 
#if defined(WITH_CUDA) || defined(WITH_DEVICES)

message(STATUS "Added generic device support definition")
else()
message(STATUS "Generic device support already defined")
endif()
endif()

# Print a summary of the device-related build switches.
message(STATUS "BUILD_MLU: ${BUILD_MLU}")
message(STATUS "BUILD_NPU: ${BUILD_NPU}")
# The flag is named DEVICES_ENABLED; the previous ${DEVICE_ENABLED} was never set
# and therefore always expanded to an empty string.
message(STATUS "Generic device support: ${DEVICES_ENABLED}")

if(BUILD_CUDA AND WITH_CUTLASS)
if(CUDA_VERSION VERSION_GREATER_EQUAL "10.1")
Expand Down
16 changes: 16 additions & 0 deletions oneflow/api/python/flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ ONEFLOW_API_PYBIND11_MODULE("flags", m) {
#endif // WITH_CUDA
});

// Reports whether this build was compiled with the WITH_DEVICES definition.
m.def("with_devices", []() {
#if defined(WITH_DEVICES)
  constexpr bool kWithDevices = true;
#else
  constexpr bool kWithDevices = false;
#endif  // WITH_DEVICES
  return kWithDevices;
});

m.def("with_npu", []() {
#ifdef WITH_NPU
return true;
Expand All @@ -37,6 +45,14 @@ ONEFLOW_API_PYBIND11_MODULE("flags", m) {
#endif // WITH_NPU
});

// Reports whether this build was compiled with the WITH_MLU definition.
m.def("with_mlu", []() {
#if defined(WITH_MLU)
  constexpr bool kWithMlu = true;
#else
  constexpr bool kWithMlu = false;
#endif  // WITH_MLU
  return kWithMlu;
});

m.def("cuda_version", []() {
#ifdef WITH_CUDA
return CUDA_VERSION;
Expand Down
4 changes: 2 additions & 2 deletions oneflow/core/auto_parallel/boxing_collector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,7 @@ Maybe<void> BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const
return Maybe<void>::Ok();
}

#if defined(WITH_CUDA) || defined(WITH_NPU)
#if defined(WITH_CUDA) || defined(WITH_DEVICES)
// Use a general basic communication if no P in the consumer
if (((Singleton<ResourceDesc, ForSession>::Get()->nccl_use_compute_stream()
&& producer_parallel_desc == consumer_parallel_desc)
Expand All @@ -600,7 +600,7 @@ Maybe<void> BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const
// Otherwise, one-step transfer
return Maybe<void>::Ok();
}
#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_DEVICES

if (JUST(ComputeLazyCopyCostBetweenNdSbp(sbp_producer, sbp_consumer, logical_blob_desc,
producer_parallel_desc, consumer_parallel_desc,
Expand Down
3 changes: 2 additions & 1 deletion oneflow/core/common/auto_registration_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ struct AutoRegistrationFactory {
bool has_creators() const { return creators_.get() != nullptr; }

// Returns the creator map. The CHECK fires when has_creators() is false, i.e. when
// creators_ holds no map; its message names both the Key and the Base type.
const HashMap<Key, Creator>& creators() const {
  // One CHECK only; a separator keeps the two type names from running together.
  CHECK(has_creators()) << "Unregistered key type: " << typeid(Key).name()
                        << ", Base type name: " << typeid(Base).name();
  return *creators_.get();
}

Expand Down
2 changes: 1 addition & 1 deletion oneflow/core/functional/impl/nn_functor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2238,7 +2238,7 @@ class SparseSoftmaxCrossEntropyFunctor {
const std::shared_ptr<one::Tensor>& label) const {
if (!(logits->is_global() && label->is_global())) { return false; }
// npu-implementation not support ms version yet
#ifdef WITH_NPU
#if defined(WITH_DEVICES)
return false;
#endif

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,18 @@ CollectiveBoxingSubTskGphBuilder::CollectiveBoxingSubTskGphBuilder() {
if (collective_boxing_conf.nccl_enable_all_to_all()) {
#if defined(WITH_CUDA) && NCCL_VERSION_CODE > 2700
builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kCUDA));
#elif defined(WITH_NPU)
builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kNPU));
#else
LOG(WARNING) << "nccl_enable_all_to_all is unavailable unless NCCL_VERSION > 2.7.0";
#endif

#if defined(WITH_DEVICES)
#if defined(WITH_NPU)
builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kNPU));
#elif defined(WITH_MLU)
builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kMLU));
#elif defined(WITH_XPU)
builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kXPU));
#endif
#endif
}
chain_builder_.reset(new ChainSubTskGphBuilder(builders));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ class NDNcclSendRecvBoxingSubTskGphBuilder final : public HierarchicalSubTskGphB
if (in_parallel_desc.device_type() == out_parallel_desc.device_type()
&& in_parallel_desc.device_type() != DeviceType::kCPU
&& !NdSbpHasPartialParallel(out_nd_sbp)) {
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)
ParallelConf merged_parallel_conf;
MergeParallelConf(in_parallel_desc.parallel_conf(), out_parallel_desc.parallel_conf(),
&merged_parallel_conf);
Expand Down
12 changes: 11 additions & 1 deletion oneflow/core/job/resource_desc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,21 @@ CollectiveBoxingConf ResourceDesc::collective_boxing_conf() const {
// Reports the nccl_use_compute_stream setting stored in resource_.
//
// The stored value is returned only when the build has either
//   * CUDA with NCCL_VERSION_CODE > 2700, or
//   * generic device support (WITH_DEVICES) for NPU, MLU or XPU.
// Every other build configuration returns false.
bool ResourceDesc::nccl_use_compute_stream() const {
// A single #if/#else leaves exactly one return after preprocessing. The earlier
// layout returned from the CUDA #if/#else first, which made the WITH_DEVICES
// section unreachable for MLU/XPU builds.
#if (defined(WITH_CUDA) && NCCL_VERSION_CODE > 2700) \
    || (defined(WITH_DEVICES)                        \
        && (defined(WITH_NPU) || defined(WITH_MLU) || defined(WITH_XPU)))
  return resource_.nccl_use_compute_stream();
#else
  return false;
#endif
}

void ResourceDesc::DumpCudnnConf(const JobConfigProto& job_conf) {
Expand Down
4 changes: 2 additions & 2 deletions oneflow/core/job/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ Runtime::Runtime(
Singleton<RuntimeJobDescs>::Get()->AddPlan(plan);
collective_boxing_scheduler_plan_token_ =
Singleton<boxing::collective::Scheduler>::Get()->AddPlan(plan);
#if defined(WITH_CUDA) || defined(WITH_NPU)
#if defined(WITH_CUDA) || defined(WITH_DEVICES)
const auto& vaild_ccl_comm_mgr_device_types =
EagerCclCommMgrBuilder::Get().vaild_ccl_comm_mgr_device_types();
if (!vaild_ccl_comm_mgr_device_types.empty() && !Singleton<EagerCclCommMgr>::Get()) {
Singleton<EagerCclCommMgr>::SetAllocated(
EagerCclCommMgrBuilder::Get().NewCclCommMgr(vaild_ccl_comm_mgr_device_types.front()));
}
Singleton<EagerCclCommMgr>::Get()->CreateCommFromPlan(plan);
#endif // defined(WITH_CUDA) || defined(WITH_NPU)
#endif // defined(WITH_CUDA) || WITH_DEVICES
}
std::vector<const TaskProto*> source_tasks;
source_tasks.reserve(plan.task().size());
Expand Down
4 changes: 2 additions & 2 deletions oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ limitations under the License.
#include "oneflow/core/auto_parallel/auto_memory.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/job/nd_sbp_util.h"
#if defined(WITH_CUDA) || defined(WITH_NPU)
#if defined(WITH_CUDA) || defined(WITH_DEVICES)
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/framework/nd_sbp.h"
#include "oneflow/core/framework/instructions_builder.h"
Expand Down Expand Up @@ -883,4 +883,4 @@ REGISTER_JOB_PASS("InsertNcclLogicalOpPass", InsertNcclLogicalOpPass);

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_DEVICES
4 changes: 2 additions & 2 deletions oneflow/core/job_rewriter/job_completer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ Maybe<void> JobCompleter::Complete(Job* job) {
compile_tc->Count("[GraphCompile]" + job_name + " SystemOpFillJobNamePass", 1, true);
JUST(JobPass4Name("DumpBlobParallelConfPass")(job, &job_pass_ctx));
compile_tc->Count("[GraphCompile]" + job_name + " DumpBlobParallelConfPass", 1, true);
#if defined(WITH_CUDA) || defined(WITH_NPU)
#if defined(WITH_CUDA) || defined(WITH_DEVICES)
if (Singleton<ResourceDesc, ForSession>::Get()->nccl_use_compute_stream()) {
// NOTE(chengcheng): this pass need as last pass for insert correct op with nccl boxing.
JUST(JobPass4Name("InsertNcclLogicalOpPass")(job, &job_pass_ctx));
Expand All @@ -169,7 +169,7 @@ Maybe<void> JobCompleter::Complete(Job* job) {
JUST(JobPass4Name("DumpBlobParallelConfPass")(job, &job_pass_ctx));
compile_tc->Count("[GraphCompile]" + job_name + " DumpBlobParallelConfPass", 1, true);
}
#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_DEVICES
JUST(JobPass4Name("LogicalChainPass")(job, &job_pass_ctx));
JUST(JobPass4Name("DumpBlobParallelConfPass")(job, &job_pass_ctx));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#if defined(WITH_CUDA) || defined(WITH_NPU)
#if defined(WITH_CUDA) || defined(WITH_DEVICES)
#include "oneflow/core/auto_parallel/auto_memory.h"
#include "oneflow/core/job/nd_sbp_util.h"
#include "oneflow/core/framework/framework.h"
Expand Down Expand Up @@ -210,4 +210,4 @@ REGISTER_JOB_PASS("NcclLogicalChainStrictOrderPass", NcclLogicalChainStrictOrder

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_DEVICES
4 changes: 2 additions & 2 deletions oneflow/core/job_rewriter/nccl_logical_op_fusion_pass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#if defined(WITH_CUDA) || defined(WITH_NPU)
#if defined(WITH_CUDA) || defined(WITH_DEVICES)
#include "oneflow/core/auto_parallel/auto_memory.h"
#include "oneflow/core/job/nd_sbp_util.h"
#include "oneflow/core/framework/framework.h"
Expand Down Expand Up @@ -293,4 +293,4 @@ REGISTER_JOB_PASS("NcclLogicalOpFusionPass", NcclLogicalOpFusionPass);

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_DEVICES
4 changes: 2 additions & 2 deletions oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ limitations under the License.
#include "oneflow/core/operator/nccl_send_recv_boxing_op_util.h"
#include "oneflow/user/kernels/collective_communication/include/all_to_all.h"

#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)

namespace oneflow {

Expand Down Expand Up @@ -254,4 +254,4 @@ REGISTER_SYSTEM_OP_KERNEL_UNIFIED_CCL_COMM_INIT(OperatorConf::kNcclSendRecvBoxin

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_DEVICES
2 changes: 1 addition & 1 deletion oneflow/user/kernels/eager_nccl_s2s_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ limitations under the License.
#include "oneflow/core/ep/cuda/cuda_stream.h"
#include "oneflow/user/kernels/collective_communication/include/all_to_all.h"

#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU) || defined(WITH_MLU)

namespace oneflow {

Expand Down
4 changes: 2 additions & 2 deletions oneflow/user/kernels/nccl_logical_2d_sbp_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ limitations under the License.
#include "oneflow/user/kernels/collective_communication/include/all_gather.h"
#include "oneflow/user/kernels/collective_communication/include/all_to_all.h"

#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)

namespace oneflow {

Expand Down Expand Up @@ -554,4 +554,4 @@ REGISTER_USER_KERNEL_UNIFIED_CCL_COMM_INIT("_nccl_logical_2D_same_dim1_all_reduc

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_DEVICES
7 changes: 4 additions & 3 deletions oneflow/user/kernels/nccl_logical_fusion_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ limitations under the License.
#include "collective_communication/include/all_to_all.h"
#include "collective_communication/include/reduce_scatter.h"

#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)

namespace oneflow {

Expand Down Expand Up @@ -703,12 +703,13 @@ size_t InferNcclLogicalFusionKernelTmpBufferSize(user_op::InferContext* ctx) {
REGISTER_USER_KERNEL("_nccl_logical_fusion")
.SetCreateFn<CclLogicalFusionKernel>()
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
|| (user_op::HobDeviceType() == DeviceType::kNPU))
|| (user_op::HobDeviceType() == DeviceType::kNPU)
|| (user_op::HobDeviceType() == DeviceType::kMLU))
.SetInferTmpSizeFn(InferNcclLogicalFusionKernelTmpBufferSize);

// TODO: SetIsMatchedHob support multi devices(not including cpu)
} // namespace

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_DEVICES
4 changes: 2 additions & 2 deletions oneflow/user/kernels/nccl_logical_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ limitations under the License.
#include "oneflow/user/kernels/collective_communication/include/broadcast.h"
#include "oneflow/user/kernels/collective_communication/include/reduce.h"

#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)

namespace oneflow {

Expand Down Expand Up @@ -640,4 +640,4 @@ REGISTER_USER_KERNEL_UNIFIED_CCL_COMM_INIT("_nccl_logical_s2s");

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_DEVICES
7 changes: 4 additions & 3 deletions oneflow/user/kernels/nccl_logical_send_recv_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ limitations under the License.
#include "oneflow/core/operator/nccl_send_recv_boxing_op_util.h"
#include "oneflow/user/kernels/collective_communication/include/all_to_all.h"

#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)

namespace oneflow {

Expand Down Expand Up @@ -288,9 +288,10 @@ size_t InferTmpBufferSize(user_op::InferContext* ctx) {
REGISTER_USER_KERNEL("_nccl_logical_send_recv")
.SetCreateFn<CclLogicalSendRecv>()
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
|| (user_op::HobDeviceType() == DeviceType::kNPU))
|| (user_op::HobDeviceType() == DeviceType::kNPU)
|| (user_op::HobDeviceType() == DeviceType::kMLU))
.SetInferTmpSizeFn(InferTmpBufferSize);

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_DEVICES
9 changes: 9 additions & 0 deletions oneflow/user/kernels/pack_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,15 @@ class PackKernel final : public user_op::OpKernel {
(user_op::HobDeviceType() == device));

OF_PP_FOR_EACH_TUPLE(REGISTER_PACK_KERNEL, DEVICE_TYPE_SEQ)
// Additional pack-kernel registration for generic device builds (WITH_DEVICES).
// The #if/#elif chain selects at most one backend per build: NPU is checked first,
// then MLU, then XPU. If none of the three macros is defined, nothing is registered.
// NOTE(review): a build defining more than one of these macros would register only
// the first match - confirm that the backends are mutually exclusive.
#if defined(WITH_DEVICES)
#if defined(WITH_NPU)
REGISTER_PACK_KERNEL(DeviceType::kNPU)
#elif defined(WITH_MLU)
REGISTER_PACK_KERNEL(DeviceType::kMLU)
#elif defined(WITH_XPU)
REGISTER_PACK_KERNEL(DeviceType::kXPU)
#endif
#endif

#undef REGISTER_PACK_KERNEL

Expand Down
10 changes: 10 additions & 0 deletions oneflow/user/kernels/unpack_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,16 @@ class UnpackKernel final : public user_op::OpKernel {

OF_PP_FOR_EACH_TUPLE(REGISTER_UNPACK_KERNEL, DEVICE_TYPE_SEQ)

// Additional unpack-kernel registration for generic device builds (WITH_DEVICES).
// The #if/#elif chain selects at most one backend per build: NPU is checked first,
// then MLU, then XPU. If none of the three macros is defined, nothing is registered.
// NOTE(review): a build defining more than one of these macros would register only
// the first match - confirm that the backends are mutually exclusive.
#if defined(WITH_DEVICES)
#if defined(WITH_NPU)
REGISTER_UNPACK_KERNEL(DeviceType::kNPU)
#elif defined(WITH_MLU)
REGISTER_UNPACK_KERNEL(DeviceType::kMLU)
#elif defined(WITH_XPU)
REGISTER_UNPACK_KERNEL(DeviceType::kXPU)
#endif
#endif

#undef REGISTER_UNPACK_KERNEL

} // namespace
Expand Down
Loading