Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmake/caches/ci/cpu.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
set(BUILD_CUDA NO CACHE BOOL "")
set(BUILD_NPU NO CACHE BOOL "")
set(BUILD_MLU NO CACHE BOOL "")
set(BUILD_GIT_VERSION YES CACHE BOOL "")
set(BUILD_TESTING YES CACHE BOOL "")
set(WITH_ONEDNN YES CACHE BOOL "")
Expand Down
1 change: 1 addition & 0 deletions cmake/caches/cn/cpu.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
set(BUILD_CUDA NO CACHE BOOL "")
set(BUILD_NPU NO CACHE BOOL "")
set(BUILD_MLU NO CACHE BOOL "")
set(BUILD_SHARED_LIBS YES CACHE BOOL "")
set(THIRD_PARTY_MIRROR aliyun CACHE STRING "")
set(PIP_INDEX_MIRROR "https://pypi.tuna.tsinghua.edu.cn/simple" CACHE STRING "")
5 changes: 5 additions & 0 deletions cmake/oneflow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,11 @@ if(BUILD_NPU)
endif()
message(STATUS "BUILD_NPU: ${BUILD_NPU}")

# When BUILD_MLU is enabled, add the WITH_MLU compile definition so that
# sources guarded by `#ifdef WITH_MLU` / `defined(WITH_MLU)` are compiled in.
if(BUILD_MLU)
add_definitions(-DWITH_MLU)
endif()
# Always report the option's value in the configure log.
message(STATUS "BUILD_MLU: ${BUILD_MLU}")

if(BUILD_CUDA AND WITH_CUTLASS)
if(CUDA_VERSION VERSION_GREATER_EQUAL "10.1")
add_definitions(-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1)
Expand Down
8 changes: 8 additions & 0 deletions oneflow/api/python/flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ ONEFLOW_API_PYBIND11_MODULE("flags", m) {
#endif // WITH_NPU
});

// Exposes `with_mlu()` on the "flags" module: returns true when this binary
// was compiled with WITH_MLU defined, false otherwise.
m.def("with_mlu", []() {
#ifdef WITH_MLU
return true;
#else
return false;
#endif // WITH_MLU
});

m.def("cuda_version", []() {
#ifdef WITH_CUDA
return CUDA_VERSION;
Expand Down
4 changes: 2 additions & 2 deletions oneflow/core/auto_parallel/boxing_collector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,7 @@ Maybe<void> BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const
return Maybe<void>::Ok();
}

#if defined(WITH_CUDA) || defined(WITH_NPU)
#if defined(WITH_CUDA) || defined(WITH_NPU) || defined(WITH_MLU)
// Use a general basic communication if no P in the consumer
if (((Singleton<ResourceDesc, ForSession>::Get()->nccl_use_compute_stream()
&& producer_parallel_desc == consumer_parallel_desc)
Expand All @@ -600,7 +600,7 @@ Maybe<void> BoxingCollector::AskSbpCombination(const NdSbp& sbp_producer, const
// Otherwise, one-step transfer
return Maybe<void>::Ok();
}
#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_NPU || WITH_MLU

if (JUST(ComputeLazyCopyCostBetweenNdSbp(sbp_producer, sbp_consumer, logical_blob_desc,
producer_parallel_desc, consumer_parallel_desc,
Expand Down
3 changes: 2 additions & 1 deletion oneflow/core/common/auto_registration_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ struct AutoRegistrationFactory {
bool has_creators() const { return creators_.get() != nullptr; }

const HashMap<Key, Creator>& creators() const {
CHECK(has_creators()) << "Unregistered key type: " << typeid(Key).name();
// Report both the key type and the base type so the failing factory can be
// identified; the ", " separator keeps the two type names from running together.
CHECK(has_creators()) << "Unregistered key type: " << typeid(Key).name()
<< ", Base type name: " << typeid(Base).name();
return *creators_.get();
}

Expand Down
2 changes: 1 addition & 1 deletion oneflow/core/functional/impl/nn_functor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2238,7 +2238,7 @@ class SparseSoftmaxCrossEntropyFunctor {
const std::shared_ptr<one::Tensor>& label) const {
if (!(logits->is_global() && label->is_global())) { return false; }
// The NPU implementation does not support the ms version yet; MLU builds take the same early return.
#ifdef WITH_NPU
#if defined(WITH_NPU) || defined(WITH_MLU)
return false;
#endif

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ CollectiveBoxingSubTskGphBuilder::CollectiveBoxingSubTskGphBuilder() {
builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kCUDA));
#elif defined(WITH_NPU)
builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kNPU));
#elif defined(WITH_MLU)
builders.emplace_back(new CclAll2AllSubTskGphBuilder(DeviceType::kMLU));
#else
LOG(WARNING) << "nccl_enable_all_to_all is unavailable unless NCCL_VERSION > 2.7.0";
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ class NDNcclSendRecvBoxingSubTskGphBuilder final : public HierarchicalSubTskGphB
if (in_parallel_desc.device_type() == out_parallel_desc.device_type()
&& in_parallel_desc.device_type() != DeviceType::kCPU
&& !NdSbpHasPartialParallel(out_nd_sbp)) {
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU) || defined(WITH_MLU)
ParallelConf merged_parallel_conf;
MergeParallelConf(in_parallel_desc.parallel_conf(), out_parallel_desc.parallel_conf(),
&merged_parallel_conf);
Expand Down
2 changes: 2 additions & 0 deletions oneflow/core/job/resource_desc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ bool ResourceDesc::nccl_use_compute_stream() const {
return resource_.nccl_use_compute_stream();
#elif defined(WITH_NPU)
return resource_.nccl_use_compute_stream();
#elif defined(WITH_MLU)
return resource_.nccl_use_compute_stream();
#else
return false;
#endif
Expand Down
4 changes: 2 additions & 2 deletions oneflow/core/job/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,15 @@ Runtime::Runtime(
Singleton<RuntimeJobDescs>::Get()->AddPlan(plan);
collective_boxing_scheduler_plan_token_ =
Singleton<boxing::collective::Scheduler>::Get()->AddPlan(plan);
#if defined(WITH_CUDA) || defined(WITH_NPU)
#if defined(WITH_CUDA) || defined(WITH_NPU) || defined(WITH_MLU)
const auto& vaild_ccl_comm_mgr_device_types =
EagerCclCommMgrBuilder::Get().vaild_ccl_comm_mgr_device_types();
if (!vaild_ccl_comm_mgr_device_types.empty() && !Singleton<EagerCclCommMgr>::Get()) {
Singleton<EagerCclCommMgr>::SetAllocated(
EagerCclCommMgrBuilder::Get().NewCclCommMgr(vaild_ccl_comm_mgr_device_types.front()));
}
Singleton<EagerCclCommMgr>::Get()->CreateCommFromPlan(plan);
#endif // defined(WITH_CUDA) || defined(WITH_NPU)
#endif // defined(WITH_CUDA) || defined(WITH_NPU) || defined(WITH_MLU)
}
std::vector<const TaskProto*> source_tasks;
source_tasks.reserve(plan.task().size());
Expand Down
4 changes: 2 additions & 2 deletions oneflow/core/job_rewriter/insert_nccl_logical_op_pass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ limitations under the License.
#include "oneflow/core/auto_parallel/auto_memory.h"
#include "oneflow/core/common/util.h"
#include "oneflow/core/job/nd_sbp_util.h"
#if defined(WITH_CUDA) || defined(WITH_NPU)
#if defined(WITH_CUDA) || defined(WITH_NPU) || defined(WITH_MLU)
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/framework/nd_sbp.h"
#include "oneflow/core/framework/instructions_builder.h"
Expand Down Expand Up @@ -883,4 +883,4 @@ REGISTER_JOB_PASS("InsertNcclLogicalOpPass", InsertNcclLogicalOpPass);

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_NPU || WITH_MLU
4 changes: 2 additions & 2 deletions oneflow/core/job_rewriter/job_completer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ Maybe<void> JobCompleter::Complete(Job* job) {
compile_tc->Count("[GraphCompile]" + job_name + " SystemOpFillJobNamePass", 1, true);
JUST(JobPass4Name("DumpBlobParallelConfPass")(job, &job_pass_ctx));
compile_tc->Count("[GraphCompile]" + job_name + " DumpBlobParallelConfPass", 1, true);
#if defined(WITH_CUDA) || defined(WITH_NPU)
#if defined(WITH_CUDA) || defined(WITH_NPU) || defined(WITH_MLU)
if (Singleton<ResourceDesc, ForSession>::Get()->nccl_use_compute_stream()) {
// NOTE(chengcheng): this pass need as last pass for insert correct op with nccl boxing.
JUST(JobPass4Name("InsertNcclLogicalOpPass")(job, &job_pass_ctx));
Expand All @@ -169,7 +169,7 @@ Maybe<void> JobCompleter::Complete(Job* job) {
JUST(JobPass4Name("DumpBlobParallelConfPass")(job, &job_pass_ctx));
compile_tc->Count("[GraphCompile]" + job_name + " DumpBlobParallelConfPass", 1, true);
}
#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_NPU || WITH_MLU
JUST(JobPass4Name("LogicalChainPass")(job, &job_pass_ctx));
JUST(JobPass4Name("DumpBlobParallelConfPass")(job, &job_pass_ctx));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#if defined(WITH_CUDA) || defined(WITH_NPU)
#if defined(WITH_CUDA) || defined(WITH_NPU) || defined(WITH_MLU)
#include "oneflow/core/auto_parallel/auto_memory.h"
#include "oneflow/core/job/nd_sbp_util.h"
#include "oneflow/core/framework/framework.h"
Expand Down Expand Up @@ -210,4 +210,4 @@ REGISTER_JOB_PASS("NcclLogicalChainStrictOrderPass", NcclLogicalChainStrictOrder

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_NPU || WITH_MLU
4 changes: 2 additions & 2 deletions oneflow/core/job_rewriter/nccl_logical_op_fusion_pass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#if defined(WITH_CUDA) || defined(WITH_NPU)
#if defined(WITH_CUDA) || defined(WITH_NPU) || defined(WITH_MLU)
#include "oneflow/core/auto_parallel/auto_memory.h"
#include "oneflow/core/job/nd_sbp_util.h"
#include "oneflow/core/framework/framework.h"
Expand Down Expand Up @@ -293,4 +293,4 @@ REGISTER_JOB_PASS("NcclLogicalOpFusionPass", NcclLogicalOpFusionPass);

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_NPU || WITH_MLU
4 changes: 2 additions & 2 deletions oneflow/core/kernel/nccl_send_recv_boxing_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ limitations under the License.
#include "oneflow/core/operator/nccl_send_recv_boxing_op_util.h"
#include "oneflow/user/kernels/collective_communication/include/all_to_all.h"

#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU) || defined(WITH_MLU)

namespace oneflow {

Expand Down Expand Up @@ -254,4 +254,4 @@ REGISTER_SYSTEM_OP_KERNEL_UNIFIED_CCL_COMM_INIT(OperatorConf::kNcclSendRecvBoxin

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_NPU || WITH_MLU
2 changes: 1 addition & 1 deletion oneflow/user/kernels/eager_nccl_s2s_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ limitations under the License.
#include "oneflow/core/ep/cuda/cuda_stream.h"
#include "oneflow/user/kernels/collective_communication/include/all_to_all.h"

#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU) || defined(WITH_MLU)

namespace oneflow {

Expand Down
4 changes: 2 additions & 2 deletions oneflow/user/kernels/nccl_logical_2d_sbp_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ limitations under the License.
#include "oneflow/user/kernels/collective_communication/include/all_gather.h"
#include "oneflow/user/kernels/collective_communication/include/all_to_all.h"

#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU) || defined(WITH_MLU)

namespace oneflow {

Expand Down Expand Up @@ -554,4 +554,4 @@ REGISTER_USER_KERNEL_UNIFIED_CCL_COMM_INIT("_nccl_logical_2D_same_dim1_all_reduc

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_NPU || WITH_MLU
7 changes: 4 additions & 3 deletions oneflow/user/kernels/nccl_logical_fusion_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ limitations under the License.
#include "collective_communication/include/all_to_all.h"
#include "collective_communication/include/reduce_scatter.h"

#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU) || defined(WITH_MLU)

namespace oneflow {

Expand Down Expand Up @@ -703,12 +703,13 @@ size_t InferNcclLogicalFusionKernelTmpBufferSize(user_op::InferContext* ctx) {
REGISTER_USER_KERNEL("_nccl_logical_fusion")
.SetCreateFn<CclLogicalFusionKernel>()
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
|| (user_op::HobDeviceType() == DeviceType::kNPU))
|| (user_op::HobDeviceType() == DeviceType::kNPU)
|| (user_op::HobDeviceType() == DeviceType::kMLU))
.SetInferTmpSizeFn(InferNcclLogicalFusionKernelTmpBufferSize);

// TODO: SetIsMatchedHob support multi devices(not including cpu)
} // namespace

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_NPU || WITH_MLU
4 changes: 2 additions & 2 deletions oneflow/user/kernels/nccl_logical_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ limitations under the License.
#include "oneflow/user/kernels/collective_communication/include/broadcast.h"
#include "oneflow/user/kernels/collective_communication/include/reduce.h"

#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU) || defined(WITH_MLU)

namespace oneflow {

Expand Down Expand Up @@ -640,4 +640,4 @@ REGISTER_USER_KERNEL_UNIFIED_CCL_COMM_INIT("_nccl_logical_s2s");

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_NPU || WITH_MLU
7 changes: 4 additions & 3 deletions oneflow/user/kernels/nccl_logical_send_recv_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ limitations under the License.
#include "oneflow/core/operator/nccl_send_recv_boxing_op_util.h"
#include "oneflow/user/kernels/collective_communication/include/all_to_all.h"

#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU)
#if (defined(WITH_CUDA) && (NCCL_VERSION_CODE > 2700)) || defined(WITH_NPU) || defined(WITH_MLU)

namespace oneflow {

Expand Down Expand Up @@ -288,9 +288,10 @@ size_t InferTmpBufferSize(user_op::InferContext* ctx) {
REGISTER_USER_KERNEL("_nccl_logical_send_recv")
.SetCreateFn<CclLogicalSendRecv>()
.SetIsMatchedHob((user_op::HobDeviceType() == DeviceType::kCUDA)
|| (user_op::HobDeviceType() == DeviceType::kNPU))
|| (user_op::HobDeviceType() == DeviceType::kNPU)
|| (user_op::HobDeviceType() == DeviceType::kMLU))
.SetInferTmpSizeFn(InferTmpBufferSize);

} // namespace oneflow

#endif // WITH_CUDA || WITH_NPU
#endif // WITH_CUDA || WITH_NPU || WITH_MLU
4 changes: 3 additions & 1 deletion oneflow/user/kernels/pack_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ class PackKernel final : public user_op::OpKernel {
(user_op::HobDeviceType() == device));

OF_PP_FOR_EACH_TUPLE(REGISTER_PACK_KERNEL, DEVICE_TYPE_SEQ)

// Additionally register the pack kernel for the MLU device in WITH_MLU builds.
// NOTE(review): presumably DEVICE_TYPE_SEQ above does not include kMLU, hence
// the explicit registration — confirm against the DEVICE_TYPE_SEQ definition.
#if defined(WITH_MLU)
REGISTER_PACK_KERNEL(DeviceType::kMLU)
#endif
#undef REGISTER_PACK_KERNEL

} // namespace
Expand Down
4 changes: 3 additions & 1 deletion oneflow/user/kernels/unpack_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ class UnpackKernel final : public user_op::OpKernel {
(user_op::HobDeviceType() == device));

OF_PP_FOR_EACH_TUPLE(REGISTER_UNPACK_KERNEL, DEVICE_TYPE_SEQ)

// Additionally register the unpack kernel for the MLU device in WITH_MLU builds.
// NOTE(review): presumably DEVICE_TYPE_SEQ above does not include kMLU, hence
// the explicit registration — confirm against the DEVICE_TYPE_SEQ definition.
#if defined(WITH_MLU)
REGISTER_UNPACK_KERNEL(DeviceType::kMLU)
#endif
#undef REGISTER_UNPACK_KERNEL

} // namespace
Expand Down