Oneflow-Inc · fpzh2011 · Jul 31, 2025 · Jun 11, 2025 · Jun 17, 2025 · Jul 1, 2025
diff --git a/cmake/caches/ci/cpu.cmake b/cmake/caches/ci/cpu.cmake
@@ -1,5 +1,6 @@
 set(BUILD_CUDA NO CACHE BOOL "")
 set(BUILD_NPU NO CACHE BOOL "")
+set(BUILD_MLU NO CACHE BOOL "")
 set(BUILD_GIT_VERSION YES CACHE BOOL "")
 set(BUILD_TESTING YES CACHE BOOL "")
 set(WITH_ONEDNN YES CACHE BOOL "")

diff --git a/cmake/caches/cn/cpu.cmake b/cmake/caches/cn/cpu.cmake
@@ -1,5 +1,6 @@
 set(BUILD_CUDA NO CACHE BOOL "")
 set(BUILD_NPU NO CACHE BOOL "")
+set(BUILD_MLU NO CACHE BOOL "")
 set(BUILD_SHARED_LIBS YES CACHE BOOL "")
 set(THIRD_PARTY_MIRROR aliyun CACHE STRING "")
 set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "")
diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake
@@ -353,10 +353,37 @@ if(BUILD_CUDA)
                               PROPERTIES COMPILE_FLAGS "-DCUDA_REAL_ARCHS=\"${CUDA_REAL_ARCHS}\"")
 endif()
 
+# Tracks whether any generic (non-CUDA) accelerator backend is enabled.
+set(DEVICES_ENABLED FALSE)
+
 if(BUILD_NPU)
   add_definitions(-DWITH_NPU)
+  set(DEVICES_ENABLED TRUE)
+  message(STATUS "NPU support enabled.")
+endif()
+
+if(BUILD_MLU)
+  add_definitions(-DWITH_MLU)
+  set(DEVICES_ENABLED TRUE)
+  message(STATUS "MLU support enabled.")
 endif()
+
+if(DEVICES_ENABLED)
+  # Define WITH_DEVICES only if it is not already among the directory definitions.
+  get_directory_property(EXISTING_DEFS COMPILE_DEFINITIONS)
+
+  if(NOT "WITH_DEVICES" IN_LIST EXISTING_DEFS)
+    add_definitions(-DWITH_DEVICES)
+    message(STATUS "Added generic device support definition")
+  else()
+    message(STATUS "Generic device support already defined")
+  endif()
+endif()
+
+# Report the final state of every device option.
+message(STATUS "BUILD_MLU: ${BUILD_MLU}")
 message(STATUS "BUILD_NPU: ${BUILD_NPU}")
+message(STATUS "Generic device support: ${DEVICES_ENABLED}")
 
 if(BUILD_CUDA AND WITH_CUTLASS)
   if(CUDA_VERSION VERSION_GREATER_EQUAL "10.1")

diff --git a/oneflow/api/python/flags.cpp b/oneflow/api/python/flags.cpp
@@ -29,6 +29,14 @@ ONEFLOW_API_PYBIND11_MODULE("flags", m) {
 #endif  // WITH_CUDA
   });
 
+  m.def("with_devices", []() {
+#ifdef WITH_DEVICES
+    return true;
+#else
+    return false;
+#endif  // WITH_DEVICES
+  });
+
   m.def("with_npu", []() {
 #ifdef WITH_NPU
     return true;
@@ -37,6 +45,14 @@ ONEFLOW_API_PYBIND11_MODULE("flags", m) {
 #endif  // WITH_NPU
   });
 
+  m.def("with_mlu", []() {
+#ifdef WITH_MLU
+    return true;
+#else
+    return false;
+#endif  // WITH_MLU
+  });
+
   m.def("cuda_version", []() {
 #ifdef WITH_CUDA
     return CUDA_VERSION;

diff --git a/oneflow/core/auto_parallel/boxing_collector.cpp b/oneflow/core/auto_parallel/boxing_collector.cpp
@@ -581,7 +581,7 @@ Maybe<void> BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const
     return Maybe<void>::Ok();
   }
 
-#if defined(WITH_CUDA) || defined(WITH_NPU)
+#if defined(WITH_CUDA) || defined(WITH_DEVICES)
   // Use a general basic communication if no P in the consumer
   if (((Singleton<ResourceDesc, ForSession>::Get()->nccl_use_compute_stream()
         && producer_parallel_desc == consumer_parallel_desc)
@@ -600,7 +600,7 @@ Maybe<void> BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const
     // Otherwise, one-step transfer
     return Maybe<void>::Ok();
   }
-#endif  // WITH_CUDA || WITH_NPU
+#endif  // WITH_CUDA || WITH_DEVICES
 
   if (JUST(ComputeLazyCopyCostBetweenNdSbp(sbp_producer, sbp_consumer, logical_blob_desc,
                                            producer_parallel_desc, consumer_parallel_desc,

diff --git a/oneflow/core/common/auto_registration_factory.h b/oneflow/core/common/auto_registration_factory.h
@@ -68,7 +68,8 @@ struct AutoRegistrationFactory {
   bool has_creators() const { return creators_.get() != nullptr; }
 
   const HashMap<Key, Creator>& creators() const {
-    CHECK(has_creators()) << "Unregistered key type: " << typeid(Key).name();
+    CHECK(has_creators()) << "Unregistered key type: " << typeid(Key).name()
+                          << ", Base type name: " << typeid(Base).name();
     return *creators_.get();
   }
 

@@ -2238,7 +2238,7 @@ class SparseSoftmaxCrossEntropyFunctor {
                                const std::shared_ptr<one::Tensor>& label) const {
     if (!(logits->is_global() && label->is_global())) { return false; }
     // npu-implementation not support ms version yet
-#ifdef WITH_NPU
+#if defined(WITH_DEVICES)
     return false;
 #endif
 

diff --git a/oneflow/core/graph/boxing/collective_boxing_sub_task_graph_builder.cpp b/oneflow/core/graph/boxing/collective_boxing_sub_task_graph_builder.cpp
@@ -35,10 +35,18 @@ CollectiveBoxingSubTskGphBuilder::CollectiveBoxingSubTskGphBuilder() {
   if (collective_boxing_conf.nccl_enable_all_to_all()) {
 #if defined(WITH_CUDA) && NCCL_VERSION_CODE > 2700
     builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kCUDA));
-#elif defined(WITH_NPU)
-    builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kNPU));
-#else
+#elif !defined(WITH_DEVICES)
     LOG(WARNING) << "nccl_enable_all_to_all is unavailable unless NCCL_VERSION > 2.7.0";
+#endif
+
+#if defined(WITH_DEVICES)
+#if defined(WITH_NPU)
+    builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kNPU));
+#elif defined(WITH_MLU)
+    builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kMLU));
+#elif defined(WITH_XPU)
+    builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kXPU));
+#endif
 #endif
   }
   chain_builder_.reset(new ChainSubTskGphBuilder(builders));

diff --git a/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp b/oneflow/core/graph/boxing/hierarchical_sub_task_graph_builder_impl.cpp
@@ -102,7 +102,7 @@ class NDNcclSendRecvBoxingSubTskGphBuilder final : public HierarchicalSubTskGphB
     if (in_parallel_desc.device_type() == out_parallel_desc.device_type()
         && in_parallel_desc.device_type() != DeviceType::kCPU
         && !NdSbpHasPartialParallel(out_nd_sbp)) {
-#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
+#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)
       ParallelConf merged_parallel_conf;
       MergeParallelConf(in_parallel_desc.parallel_conf(), out_parallel_desc.parallel_conf(),
                         &merged_parallel_conf);

diff --git a/oneflow/core/job/resource_desc.cpp b/oneflow/core/job/resource_desc.cpp
@@ -73,11 +73,12 @@ CollectiveBoxingConf ResourceDesc::collective_boxing_conf() const {
 bool ResourceDesc::nccl_use_compute_stream() const {
 #if defined(WITH_CUDA) && NCCL_VERSION_CODE > 2700
   return resource_.nccl_use_compute_stream();
-#elif defined(WITH_NPU)
+#elif defined(WITH_DEVICES)
+  // Generic device backends (NPU/MLU/...) honor the configured flag as well.
   return resource_.nccl_use_compute_stream();
 #else
   return false;
 #endif
 }
 
 void ResourceDesc::DumpCudnnConf(const JobConfigProto& job_conf) {

diff --git a/oneflow/core/job/runtime.cpp b/oneflow/core/job/runtime.cpp
@@ -70,15 +70,15 @@ Runtime::Runtime(
     Singleton<RuntimeJobDescs>::Get()->AddPlan(plan);
     collective_boxing_scheduler_plan_token_ =
         Singleton<boxing::collective::Scheduler>::Get()->AddPlan(plan);
-#if defined(WITH_CUDA) || defined(WITH_NPU)
+#if defined(WITH_CUDA) || defined(WITH_DEVICES)
     const auto& vaild_ccl_comm_mgr_device_types =
         EagerCclCommMgrBuilder::Get().vaild_ccl_comm_mgr_device_types();
     if (!vaild_ccl_comm_mgr_device_types.empty() && !Singleton<EagerCclCommMgr>::Get()) {
       Singleton<EagerCclCommMgr>::SetAllocated(
           EagerCclCommMgrBuilder::Get().NewCclCommMgr(vaild_ccl_comm_mgr_device_types.front()));
     }
     Singleton<EagerCclCommMgr>::Get()->CreateCommFromPlan(plan);
-#endif  // defined(WITH_CUDA) || defined(WITH_NPU)
+#endif  // defined(WITH_CUDA) || defined(WITH_DEVICES)
   }
   std::vector<const TaskProto*> source_tasks;
   source_tasks.reserve(plan.task().size());

diff --git a/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp b/oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp
@@ -16,7 +16,7 @@ limitations under the License.
 #include "oneflow/core/auto_parallel/auto_memory.h"
 #include "oneflow/core/common/util.h"
 #include "oneflow/core/job/nd_sbp_util.h"
-#if defined(WITH_CUDA) || defined(WITH_NPU)
+#if defined(WITH_CUDA) || defined(WITH_DEVICES)
 #include "oneflow/core/framework/framework.h"
 #include "oneflow/core/framework/nd_sbp.h"
 #include "oneflow/core/framework/instructions_builder.h"
@@ -883,4 +883,4 @@ REGISTER_JOB_PASS("InsertNcclLogicalOpPass", InsertNcclLogicalOpPass);
 
 }  // namespace oneflow
 
-#endif  // WITH_CUDA || WITH_NPU
+#endif  // WITH_CUDA || WITH_DEVICES
diff --git a/oneflow/core/job_rewriter/job_completer.cpp b/oneflow/core/job_rewriter/job_completer.cpp
@@ -153,7 +153,7 @@ Maybe<void> JobCompleter::Complete(Job* job) {
   compile_tc->Count("[GraphCompile]" + job_name + " SystemOpFillJobNamePass", 1, true);
   JUST(JobPass4Name("DumpBlobParallelConfPass")(job, &job_pass_ctx));
   compile_tc->Count("[GraphCompile]" + job_name + " DumpBlobParallelConfPass", 1, true);
-#if defined(WITH_CUDA) || defined(WITH_NPU)
+#if defined(WITH_CUDA) || defined(WITH_DEVICES)
   if (Singleton<ResourceDesc, ForSession>::Get()->nccl_use_compute_stream()) {
     // NOTE(chengcheng): this pass need as last pass for insert correct op with nccl boxing.
     JUST(JobPass4Name("InsertNcclLogicalOpPass")(job, &job_pass_ctx));
@@ -169,7 +169,7 @@ Maybe<void> JobCompleter::Complete(Job* job) {
     JUST(JobPass4Name("DumpBlobParallelConfPass")(job, &job_pass_ctx));
     compile_tc->Count("[GraphCompile]" + job_name + " DumpBlobParallelConfPass", 1, true);
   }
-#endif  // WITH_CUDA || WITH_NPU
+#endif  // WITH_CUDA || WITH_DEVICES
   JUST(JobPass4Name("LogicalChainPass")(job, &job_pass_ctx));
   JUST(JobPass4Name("DumpBlobParallelConfPass")(job, &job_pass_ctx));
 

diff --git a/oneflow/core/job_rewriter/nccl_logical_chain_strict_order_pass.cpp b/oneflow/core/job_rewriter/nccl_logical_chain_strict_order_pass.cpp
@@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#if defined(WITH_CUDA) || defined(WITH_NPU)
+#if defined(WITH_CUDA) || defined(WITH_DEVICES)
 #include "oneflow/core/auto_parallel/auto_memory.h"
 #include "oneflow/core/job/nd_sbp_util.h"
 #include "oneflow/core/framework/framework.h"
@@ -210,4 +210,4 @@ REGISTER_JOB_PASS("NcclLogicalChainStrictOrderPass", NcclLogicalChainStrictOrder
 
 }  // namespace oneflow
 
-#endif  // WITH_CUDA || WITH_NPU
+#endif  // WITH_CUDA || WITH_DEVICES
diff --git a/oneflow/core/job_rewriter/nccl_logical_op_fusion_pass.cpp b/oneflow/core/job_rewriter/nccl_logical_op_fusion_pass.cpp
@@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#if defined(WITH_CUDA) || defined(WITH_NPU)
+#if defined(WITH_CUDA) || defined(WITH_DEVICES)
 #include "oneflow/core/auto_parallel/auto_memory.h"
 #include "oneflow/core/job/nd_sbp_util.h"
 #include "oneflow/core/framework/framework.h"
@@ -293,4 +293,4 @@ REGISTER_JOB_PASS("NcclLogicalOpFusionPass", NcclLogicalOpFusionPass);
 
 }  // namespace oneflow
 
-#endif  // WITH_CUDA || WITH_NPU
+#endif  // WITH_CUDA || WITH_DEVICES
diff --git a/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp b/oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp
@@ -22,7 +22,7 @@ limitations under the License.
 #include "oneflow/core/operator/nccl_send_recv_boxing_op_util.h"
 #include "oneflow/user/kernels/collective_communication/include/all_to_all.h"
 
-#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
+#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)
 
 namespace oneflow {
 
@@ -254,4 +254,4 @@ REGISTER_SYSTEM_OP_KERNEL_UNIFIED_CCL_COMM_INIT(OperatorConf::kNcclSendRecvBoxin
 
 }  // namespace oneflow
 
-#endif  // WITH_CUDA || WITH_NPU
+#endif  // WITH_CUDA || WITH_DEVICES
@@ -23,7 +23,7 @@ limitations under the License.
 #include "oneflow/core/ep/cuda/cuda_stream.h"
 #include "oneflow/user/kernels/collective_communication/include/all_to_all.h"
 
-#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
+#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)
 
 namespace oneflow {
 

diff --git a/oneflow/user/kernels/nccl_logical_2d_sbp_kernels.cpp b/oneflow/user/kernels/nccl_logical_2d_sbp_kernels.cpp
@@ -25,7 +25,7 @@ limitations under the License.
 #include "oneflow/user/kernels/collective_communication/include/all_gather.h"
 #include "oneflow/user/kernels/collective_communication/include/all_to_all.h"
 
-#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
+#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)
 
 namespace oneflow {
 
@@ -554,4 +554,4 @@ REGISTER_USER_KERNEL_UNIFIED_CCL_COMM_INIT("_nccl_logical_2D_same_dim1_all_reduc
 
 }  // namespace oneflow
 
-#endif  // WITH_CUDA || WITH_NPU
+#endif  // WITH_CUDA || WITH_DEVICES
diff --git a/oneflow/user/kernels/nccl_logical_fusion_kernel.cpp b/oneflow/user/kernels/nccl_logical_fusion_kernel.cpp
@@ -28,7 +28,7 @@ limitations under the License.
 #include "collective_communication/include/all_to_all.h"
 #include "collective_communication/include/reduce_scatter.h"
 
-#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
+#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)
 
 namespace oneflow {
 
@@ -703,12 +703,13 @@ size_t InferNcclLogicalFusionKernelTmpBufferSize(user_op::InferContext* ctx) {
 REGISTER_USER_KERNEL("_nccl_logical_fusion")
     .SetCreateFn<CclLogicalFusionKernel>()
     .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
-                     || (user_op::HobDeviceType() == DeviceType::kNPU))
+                     || (user_op::HobDeviceType() == DeviceType::kNPU)
+                     || (user_op::HobDeviceType() == DeviceType::kMLU))
     .SetInferTmpSizeFn(InferNcclLogicalFusionKernelTmpBufferSize);
 
 // TODO: SetIsMatchedHob support multi devices(not including cpu)
 }  // namespace
 
 }  // namespace oneflow
 
-#endif  // WITH_CUDA || WITH_NPU
+#endif  // WITH_CUDA || WITH_DEVICES
diff --git a/oneflow/user/kernels/nccl_logical_kernels.cpp b/oneflow/user/kernels/nccl_logical_kernels.cpp
@@ -28,7 +28,7 @@ limitations under the License.
 #include "oneflow/user/kernels/collective_communication/include/broadcast.h"
 #include "oneflow/user/kernels/collective_communication/include/reduce.h"
 
-#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
+#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)
 
 namespace oneflow {
 
@@ -640,4 +640,4 @@ REGISTER_USER_KERNEL_UNIFIED_CCL_COMM_INIT("_nccl_logical_s2s");
 
 }  // namespace oneflow
 
-#endif  // WITH_CUDA || WITH_NPU
+#endif  // WITH_CUDA || WITH_DEVICES
diff --git a/oneflow/user/kernels/nccl_logical_send_recv_kernel.cpp b/oneflow/user/kernels/nccl_logical_send_recv_kernel.cpp
@@ -30,7 +30,7 @@ limitations under the License.
 #include "oneflow/core/operator/nccl_send_recv_boxing_op_util.h"
 #include "oneflow/user/kernels/collective_communication/include/all_to_all.h"
 
-#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
+#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_DEVICES)
 
 namespace oneflow {
 
@@ -288,9 +288,10 @@ size_t InferTmpBufferSize(user_op::InferContext* ctx) {
 REGISTER_USER_KERNEL("_nccl_logical_send_recv")
     .SetCreateFn<CclLogicalSendRecv>()
     .SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
-                     || (user_op::HobDeviceType() == DeviceType::kNPU))
+                     || (user_op::HobDeviceType() == DeviceType::kNPU)
+                     || (user_op::HobDeviceType() == DeviceType::kMLU))
     .SetInferTmpSizeFn(InferTmpBufferSize);
 
 }  // namespace oneflow
 
-#endif  // WITH_CUDA || WITH_NPU
+#endif  // WITH_CUDA || WITH_DEVICES
diff --git a/oneflow/user/kernels/pack_kernel.cpp b/oneflow/user/kernels/pack_kernel.cpp
@@ -69,6 +69,15 @@ class PackKernel final : public user_op::OpKernel {
       (user_op::HobDeviceType() == device));
 
 OF_PP_FOR_EACH_TUPLE(REGISTER_PACK_KERNEL, DEVICE_TYPE_SEQ)
+#if defined(WITH_DEVICES)
+#if defined(WITH_NPU)
+REGISTER_PACK_KERNEL(DeviceType::kNPU)
+#elif defined(WITH_MLU)
+REGISTER_PACK_KERNEL(DeviceType::kMLU)
+#elif defined(WITH_XPU)
+REGISTER_PACK_KERNEL(DeviceType::kXPU)
+#endif
+#endif
 
 #undef REGISTER_PACK_KERNEL
 

diff --git a/oneflow/user/kernels/unpack_kernel.cpp b/oneflow/user/kernels/unpack_kernel.cpp
@@ -63,6 +63,16 @@ class UnpackKernel final : public user_op::OpKernel {
 
 OF_PP_FOR_EACH_TUPLE(REGISTER_UNPACK_KERNEL, DEVICE_TYPE_SEQ)
 
+#if defined(WITH_DEVICES)
+#if defined(WITH_NPU)
+REGISTER_UNPACK_KERNEL(DeviceType::kNPU)
+#elif defined(WITH_MLU)
+REGISTER_UNPACK_KERNEL(DeviceType::kMLU)
+#elif defined(WITH_XPU)
+REGISTER_UNPACK_KERNEL(DeviceType::kXPU)
+#endif
+#endif
+
 #undef REGISTER_UNPACK_KERNEL
 
 }  // namespace