Skip to content

Commit ebc041c

Browse files
authored
Force subgroup size 32 for cooperative matrix shaders for now (#6100)
1 parent 2ef954b commit ebc041c

3 files changed

Lines changed: 60 additions & 13 deletions

File tree

src/gpu.cpp

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1186,19 +1186,6 @@ void GpuInfoPrivate::query_extension_properties()
11861186
}
11871187
}
11881188

1189-
if (queryDriverProperties.driverID == VK_DRIVER_ID_MESA_RADV || queryDriverProperties.driverID == VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA)
1190-
{
1191-
// cooperative matrix produces wrong result on mesa vulkan drivers :(
1192-
// https://gitlab.freedesktop.org/mesa/mesa/-/issues/10847
1193-
queryCooperativeMatrixFeatures.cooperativeMatrix = VK_FALSE;
1194-
queryCooperativeMatrixFeaturesNV.cooperativeMatrix = VK_FALSE;
1195-
1196-
support_cooperative_matrix_8_8_16 = false;
1197-
support_cooperative_matrix_16_8_8 = false;
1198-
support_cooperative_matrix_16_8_16 = false;
1199-
support_cooperative_matrix_16_16_16 = false;
1200-
}
1201-
12021189
if (queryDriverProperties.driverID == VK_DRIVER_ID_MESA_TURNIP)
12031190
{
12041191
// turnip crash when compiling large shader with full subgroup

src/layer/vulkan/convolution_vulkan.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
188188
{
189189
bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0;
190190
bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0;
191+
if (vkdev->info.subgroup_size() != 32 && (!vkdev->info.support_subgroup_size_control() || vkdev->info.min_subgroup_size() > 32 || vkdev->info.max_subgroup_size() < 32))
192+
{
193+
use_cooperative_matrix_16_8_8 = false;
194+
use_cooperative_matrix_16_16_16 = false;
195+
}
191196

192197
// winograd43 transform kernel
193198
if (opt.use_winograd43_convolution)
@@ -427,10 +432,12 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
427432
pipeline_convolution_3x3s1d1_winograd43_gemm = new Pipeline(vkdev);
428433
if (use_cooperative_matrix_16_8_8)
429434
{
435+
pipeline_convolution_3x3s1d1_winograd43_gemm->set_subgroup_size(32);
430436
pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(32, 1, 1);
431437
}
432438
else if (use_cooperative_matrix_16_16_16)
433439
{
440+
pipeline_convolution_3x3s1d1_winograd43_gemm->set_subgroup_size(32);
434441
pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(32, 1, 1);
435442
}
436443
else if (opt.use_shader_local_memory)
@@ -705,10 +712,12 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
705712
pipeline_convolution_3x3s1d1_winograd23_gemm = new Pipeline(vkdev);
706713
if (use_cooperative_matrix_16_8_8)
707714
{
715+
pipeline_convolution_3x3s1d1_winograd23_gemm->set_subgroup_size(32);
708716
pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(32, 1, 1);
709717
}
710718
else if (use_cooperative_matrix_16_16_16)
711719
{
720+
pipeline_convolution_3x3s1d1_winograd23_gemm->set_subgroup_size(32);
712721
pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(32, 1, 1);
713722
}
714723
else if (opt.use_shader_local_memory)
@@ -755,6 +764,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
755764
{
756765
bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0;
757766
bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0;
767+
if (vkdev->info.subgroup_size() != 32 && (!vkdev->info.support_subgroup_size_control() || vkdev->info.min_subgroup_size() > 32 || vkdev->info.max_subgroup_size() < 32))
768+
{
769+
use_cooperative_matrix_16_8_8 = false;
770+
use_cooperative_matrix_16_16_16 = false;
771+
}
758772

759773
if (use_cooperative_matrix_16_8_8)
760774
{
@@ -846,6 +860,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
846860
{
847861
bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && is_conv1x1s1d1 && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0;
848862
bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && is_conv1x1s1d1 && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0;
863+
if (vkdev->info.subgroup_size() != 32 && (!vkdev->info.support_subgroup_size_control() || vkdev->info.min_subgroup_size() > 32 || vkdev->info.max_subgroup_size() < 32))
864+
{
865+
use_cooperative_matrix_16_8_8 = false;
866+
use_cooperative_matrix_16_16_16 = false;
867+
}
849868

850869
if (use_cooperative_matrix_16_8_8)
851870
{
@@ -948,6 +967,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
948967
{
949968
bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0;
950969
bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0;
970+
if (vkdev->info.subgroup_size() != 32 && (!vkdev->info.support_subgroup_size_control() || vkdev->info.min_subgroup_size() > 32 || vkdev->info.max_subgroup_size() < 32))
971+
{
972+
use_cooperative_matrix_16_8_8 = false;
973+
use_cooperative_matrix_16_16_16 = false;
974+
}
951975

952976
// check blob shape
953977
if (!vkdev->shape_support_image_storage(shape_bordered_packed) || !vkdev->shape_support_image_storage(out_shape_packed))
@@ -1020,10 +1044,12 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
10201044
pipeline_convolution_gemm = new Pipeline(vkdev);
10211045
if (use_cooperative_matrix_16_8_8)
10221046
{
1047+
pipeline_convolution_gemm->set_subgroup_size(32);
10231048
pipeline_convolution_gemm->set_local_size_xyz(32, 1, 1); // 16_8_8
10241049
}
10251050
else if (use_cooperative_matrix_16_16_16)
10261051
{
1052+
pipeline_convolution_gemm->set_subgroup_size(32);
10271053
pipeline_convolution_gemm->set_local_size_xyz(32, 1, 1); // 16_16_16
10281054
}
10291055
else if (opt.use_shader_local_memory)
@@ -1040,6 +1066,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
10401066
{
10411067
bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0;
10421068
bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0;
1069+
if (vkdev->info.subgroup_size() != 32 && (!vkdev->info.support_subgroup_size_control() || vkdev->info.min_subgroup_size() > 32 || vkdev->info.max_subgroup_size() < 32))
1070+
{
1071+
use_cooperative_matrix_16_8_8 = false;
1072+
use_cooperative_matrix_16_16_16 = false;
1073+
}
10431074

10441075
std::vector<vk_specialization_type> specializations(4 + 8);
10451076
specializations[0].i = bias_term;
@@ -1084,10 +1115,12 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
10841115
pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
10851116
if (use_cooperative_matrix_16_8_8)
10861117
{
1118+
pipeline_convolution_1x1s1d1->set_subgroup_size(32);
10871119
pipeline_convolution_1x1s1d1->set_local_size_xyz(32, 1, 1); // 16_8_8
10881120
}
10891121
else if (use_cooperative_matrix_16_16_16)
10901122
{
1123+
pipeline_convolution_1x1s1d1->set_subgroup_size(32);
10911124
pipeline_convolution_1x1s1d1->set_local_size_xyz(32, 1, 1); // 16_16_16
10921125
}
10931126
else if (opt.use_shader_local_memory)
@@ -1404,6 +1437,11 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
14041437
{
14051438
bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0;
14061439
bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 16 == 0 && num_output % 16 == 0;
1440+
if (vkdev->info.subgroup_size() != 32 && (!vkdev->info.support_subgroup_size_control() || vkdev->info.min_subgroup_size() > 32 || vkdev->info.max_subgroup_size() < 32))
1441+
{
1442+
use_cooperative_matrix_16_8_8 = false;
1443+
use_cooperative_matrix_16_16_16 = false;
1444+
}
14071445

14081446
bool pre_winograd43 = opt.use_winograd43_convolution;
14091447
if (opt.use_winograd23_convolution)
@@ -1630,6 +1668,11 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
16301668
{
16311669
bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0;
16321670
bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 16 == 0 && num_output % 16 == 0;
1671+
if (vkdev->info.subgroup_size() != 32 && (!vkdev->info.support_subgroup_size_control() || vkdev->info.min_subgroup_size() > 32 || vkdev->info.max_subgroup_size() < 32))
1672+
{
1673+
use_cooperative_matrix_16_8_8 = false;
1674+
use_cooperative_matrix_16_16_16 = false;
1675+
}
16331676

16341677
// gemm
16351678
top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
@@ -1678,6 +1721,11 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
16781721
{
16791722
bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0;
16801723
bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 16 == 0 && num_output % 16 == 0;
1724+
if (vkdev->info.subgroup_size() != 32 && (!vkdev->info.support_subgroup_size_control() || vkdev->info.min_subgroup_size() > 32 || vkdev->info.max_subgroup_size() < 32))
1725+
{
1726+
use_cooperative_matrix_16_8_8 = false;
1727+
use_cooperative_matrix_16_16_16 = false;
1728+
}
16811729

16821730
top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
16831731
if (top_blob.empty())

src/layer/vulkan/deconvolution_vulkan.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,11 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
161161
{
162162
bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0;
163163
bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0;
164+
if (vkdev->info.subgroup_size() != 32 && (!vkdev->info.support_subgroup_size_control() || vkdev->info.min_subgroup_size() > 32 || vkdev->info.max_subgroup_size() < 32))
165+
{
166+
use_cooperative_matrix_16_8_8 = false;
167+
use_cooperative_matrix_16_16_16 = false;
168+
}
164169

165170
// src = kw-kh-inch-outch
166171
// dst = pa-pb-inch/pa-kw-kh-outch/pb (sgemm)
@@ -312,10 +317,12 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
312317
pipeline_deconvolution_gemm = new Pipeline(vkdev);
313318
if (use_cooperative_matrix_16_8_8)
314319
{
320+
pipeline_deconvolution_gemm->set_subgroup_size(32);
315321
pipeline_deconvolution_gemm->set_local_size_xyz(32, 1, 1); // 16_8_8
316322
}
317323
else if (use_cooperative_matrix_16_16_16)
318324
{
325+
pipeline_deconvolution_gemm->set_subgroup_size(32);
319326
pipeline_deconvolution_gemm->set_local_size_xyz(32, 1, 1); // 16_16_16
320327
}
321328
else if (opt.use_shader_local_memory)
@@ -573,6 +580,11 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
573580
{
574581
bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0;
575582
bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 16 == 0 && num_output % 16 == 0;
583+
if (vkdev->info.subgroup_size() != 32 && (!vkdev->info.support_subgroup_size_control() || vkdev->info.min_subgroup_size() > 32 || vkdev->info.max_subgroup_size() < 32))
584+
{
585+
use_cooperative_matrix_16_8_8 = false;
586+
use_cooperative_matrix_16_16_16 = false;
587+
}
576588

577589
const int maxk = kernel_w * kernel_h;
578590

0 commit comments

Comments
 (0)