Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 17 additions & 9 deletions src/command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ VkCompute::~VkCompute()

void VkCompute::record_upload(const Mat& src, VkMat& dst, const Option& opt)
{
// NCNN_LOGE("record_upload buffer");
// NCNN_LOGE("record_upload buffer");

Mat src_fp16;
if (src.elemsize == src.elempack * 4u)
Expand Down Expand Up @@ -410,12 +410,20 @@ void VkCompute::record_upload(const Mat& src, VkMat& dst, const Option& opt)
dst_elempack = elemcount % 4 == 0 ? 4 : 1;

// gpu cast to fp16 on the fly (integrated gpu)
vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt);
int cast_type_to = 0;
if (vkdev->info.type() != 0)
{
if (opt.use_fp16_storage || opt.use_fp16_packed)
cast_type_to = 2;
else
cast_type_to = 1;
Comment thread
nihui marked this conversation as resolved.
}
vkdev->convert_packing(dst_staging, dst, dst_elempack, cast_type_to, *this, opt);
}

void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt)
{
// NCNN_LOGE("record_download buffer");
// NCNN_LOGE("record_download buffer");

// resolve dst_elempack
int dims = src.dims;
Expand All @@ -432,18 +440,18 @@ void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt)

// gpu cast to fp32 on the fly (integrated gpu)
Option opt_staging = opt;
if (vkdev->info.type() != 0)
{
opt_staging.use_fp16_packed = false;
opt_staging.use_fp16_storage = false;
}
if (!opt_staging.blob_vkallocator->mappable)
{
opt_staging.blob_vkallocator = opt.staging_vkallocator;
}
int cast_type_to = 0;
if (vkdev->info.type() != 0)
{
cast_type_to = 1;
Comment thread
nihui marked this conversation as resolved.
}

VkMat dst_staging;
vkdev->convert_packing(src, dst_staging, dst_elempack, *this, opt_staging);
vkdev->convert_packing(src, dst_staging, dst_elempack, cast_type_to, *this, opt_staging);

// barrier device any @ compute to host-read @ compute
if (dst_staging.data->access_flags & VK_ACCESS_HOST_WRITE_BIT || dst_staging.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT)
Expand Down
94 changes: 32 additions & 62 deletions src/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@

#include "command.h"
#include "layer.h"
#include "layer/vulkan/packing_vulkan.h"
#include "layer_type.h"
#include "mat.h"
#include "pipelinecache.h"
Expand Down Expand Up @@ -2989,7 +2988,7 @@ class VulkanDevicePrivate
void destroy_dummy_buffer_image();

// utility operator
const ncnn::Packing_vulkan* get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const;
const ncnn::Layer* get_utility_operator(int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const;
void destroy_utility_operator();

VkDevice device;
Expand Down Expand Up @@ -3029,10 +3028,10 @@ class VulkanDevicePrivate
PipelineCache* pipeline_cache;

// utility operator
// from fp32-b/i | fp16p-b/i | fp16s-b/i
// to fp32-b/i | fp16p-b/i | fp16s-b/i
// from fp32 | fp16
// to fp32 | fp16
// to pack1 | pack4 | pack8
mutable ncnn::Packing_vulkan* uop_packing[3][3][3];
mutable ncnn::Layer* uop_packing[2][2][3];
mutable Mutex uop_lock;

// device is valid and successfully initialized
Expand Down Expand Up @@ -3095,36 +3094,20 @@ void VulkanDevicePrivate::destroy_dummy_buffer_image()
}
}

const ncnn::Packing_vulkan* VulkanDevicePrivate::get_utility_operator(int storage_type_from, int storage_type_to, int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const
const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const
{
MutexLockGuard lock(uop_lock);

const ncnn::Packing_vulkan* cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index];
const ncnn::Layer* cached_uop = uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index];
if (cached_uop)
return cached_uop;

if ((cast_type_from_index == 1 && cast_type_to_index == 2) || (cast_type_from_index == 2 && cast_type_to_index == 1))
{
NCNN_LOGE("no fp16p to/from fp16s conversion");
return 0;
}
bool use_fp16 = (cast_type_from_index == 1 || cast_type_to_index == 1);

// create uop
Option opt;
opt.use_fp16_packed = (cast_type_from_index == 1 || cast_type_to_index == 1);
opt.use_fp16_storage = (cast_type_from_index == 2 || cast_type_to_index == 2);

if (!vkdev->info.support_fp16_packed() && opt.use_fp16_packed)
{
NCNN_LOGE("cannot create uop with use_fp16_packed if not support_fp16_packed");
return 0;
}

if (!vkdev->info.support_fp16_storage() && opt.use_fp16_storage)
{
NCNN_LOGE("cannot create uop with use_fp16_storage if not support_fp16_storage");
return 0;
}
opt.use_fp16_packed = use_fp16; // fp16p is always supported
opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage();

// fp16/int8 arithmetic are not necessary for packing
// and may conflict with storage options
Expand All @@ -3144,15 +3127,13 @@ const ncnn::Packing_vulkan* VulkanDevicePrivate::get_utility_operator(int storag

opt.vulkan_device_index = vkdev->info.device_index();

ncnn::Packing_vulkan* uop = new ncnn::Packing_vulkan;
ncnn::Layer* uop = ncnn::create_layer_vulkan(LayerType::Packing);
uop->vkdev = vkdev;

ncnn::ParamDict pd;
pd.set(0, packing_type_to_index == 0 ? 1 : packing_type_to_index == 1 ? 4 : 8); // out_elempack
pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16p 3=fp16s
pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16
pd.set(3, cast_type_to_index + 1);
Comment thread
nihui marked this conversation as resolved.
pd.set(4, storage_type_from); // 0=buffer 1=image
pd.set(5, storage_type_to);

uop->load_param(pd);

Expand All @@ -3173,26 +3154,16 @@ void VulkanDevicePrivate::destroy_utility_operator()
opt.pipeline_cache = 0;
opt.vulkan_device_index = vkdev->info.device_index();

// from fp32-b/i | fp16p-b/i | fp16s-b/i
// to fp32-b/i | fp16p-b/i | fp16s-b/i
for (int j0 = 0; j0 < 3; j0++)
// from fp32 | fp16
for (int j0 = 0; j0 < 2; j0++)
{
for (int j1 = 0; j1 < 3; j1++)
// to fp32 | fp16
for (int j1 = 0; j1 < 2; j1++)
{
if ((j0 == 1 && j1 == 2) || (j0 == 2 && j1 == 1))
{
// no fp16p to/from fp16s conversion
continue;
}
bool use_fp16 = (j0 == 1 || j1 == 1);

opt.use_fp16_packed = (j0 == 1 || j1 == 1);
opt.use_fp16_storage = (j0 == 2 || j1 == 2);

if (!vkdev->info.support_fp16_packed() && opt.use_fp16_packed)
continue;

if (!vkdev->info.support_fp16_storage() && opt.use_fp16_storage)
continue;
opt.use_fp16_packed = use_fp16;
opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage();

// to pack1 | pack4 | pack8
for (int k = 0; k < 3; k++)
Expand Down Expand Up @@ -4249,7 +4220,11 @@ uint32_t VulkanDevice::get_heap_budget() const

void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const
{
int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed ? 1 : 0;
convert_packing(src, dst, dst_elempack, 0, cmd, opt);
}

void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const
{
int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;

int cast_type_from_index;
Expand All @@ -4259,24 +4234,19 @@ void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempac
}
else // if (src.elembits() == 16)
{
if (cast_type_to_index != 0)
{
cast_type_from_index = cast_type_to_index;
}
else if (info.support_fp16_storage())
{
cast_type_from_index = 2;
}
else // if (info.support_fp16_packed())
{
cast_type_from_index = 1;
}
cast_type_from_index = 1;
}

int cast_type_to_index = cast_type_to ? cast_type_to - 1 : cast_type_from_index;

// NCNN_LOGE("convert_packing b2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);

const ncnn::Packing_vulkan* uop = d->get_utility_operator(0, 0, cast_type_from_index, cast_type_to_index, packing_type_to_index);
uop->forward(src, dst, cmd, opt);
Option opt2 = opt;
opt2.use_fp16_packed = (cast_type_from_index == 1 || cast_type_to_index == 1);
opt2.use_fp16_storage = (cast_type_from_index == 1 || cast_type_to_index == 1) && info.support_fp16_storage();

const ncnn::Layer* uop = d->get_utility_operator(cast_type_from_index, cast_type_to_index, packing_type_to_index);
uop->forward(src, dst, cmd, opt2);
}

int VulkanDevice::init_device_extension()
Expand Down
2 changes: 2 additions & 0 deletions src/gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,8 @@ class NCNN_EXPORT VulkanDevice

// utility operator
void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
// cast_type_to 0=auto(same as src) 1=fp32 2=fp16
void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const;

// VK_KHR_bind_memory2
PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
Expand Down
3 changes: 0 additions & 3 deletions src/layer/packing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,6 @@ int Packing::load_param(const ParamDict& pd)
cast_type_from = pd.get(2, 0);
cast_type_to = pd.get(3, 0);

storage_type_from = pd.get(4, 0);
storage_type_to = pd.get(5, 0);

return 0;
}

Expand Down
9 changes: 1 addition & 8 deletions src/layer/packing.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,9 @@ class Packing : public Layer
// element type
// 0 = auto
// 1 = fp32
// 2 = fp16p
// 3 = fp16s
// 2 = fp16
int cast_type_from;
int cast_type_to;

// storage type
// 0 = buffer
// 1 = image
int storage_type_from;
int storage_type_to;
};

} // namespace ncnn
Expand Down
Loading
Loading