2018-12-12 10:24:33 +01:00
|
|
|
|
#pragma once
|
2018-06-12 17:46:59 +02:00
|
|
|
|
#include "VKHelpers.h"
|
2019-06-09 09:03:27 +02:00
|
|
|
|
#include "Utilities/StrUtil.h"
|
2018-06-12 17:46:59 +02:00
|
|
|
|
|
2019-11-05 15:00:07 +01:00
|
|
|
|
#define VK_MAX_COMPUTE_TASKS 4096 // Max number of jobs per frame
|
2018-12-12 10:24:33 +01:00
|
|
|
|
|
2018-06-12 17:46:59 +02:00
|
|
|
|
namespace vk
|
|
|
|
|
|
{
|
|
|
|
|
|
struct compute_task
|
|
|
|
|
|
{
|
|
|
|
|
|
std::string m_src;
|
|
|
|
|
|
vk::glsl::shader m_shader;
|
|
|
|
|
|
std::unique_ptr<vk::glsl::program> m_program;
|
2019-04-02 14:16:52 +02:00
|
|
|
|
std::unique_ptr<vk::buffer> m_param_buffer;
|
2018-06-12 17:46:59 +02:00
|
|
|
|
|
|
|
|
|
|
vk::descriptor_pool m_descriptor_pool;
|
|
|
|
|
|
VkDescriptorSet m_descriptor_set = nullptr;
|
|
|
|
|
|
VkDescriptorSetLayout m_descriptor_layout = nullptr;
|
|
|
|
|
|
VkPipelineLayout m_pipeline_layout = nullptr;
|
|
|
|
|
|
u32 m_used_descriptors = 0;
|
|
|
|
|
|
|
|
|
|
|
|
bool initialized = false;
|
2018-06-23 14:15:55 +02:00
|
|
|
|
bool unroll_loops = true;
|
2019-10-29 13:21:53 +01:00
|
|
|
|
bool use_push_constants = false;
|
2019-10-29 13:13:10 +01:00
|
|
|
|
u32 ssbo_count = 1;
|
2019-10-29 13:21:53 +01:00
|
|
|
|
u32 push_constants_size = 0;
|
2018-06-23 14:15:55 +02:00
|
|
|
|
u32 optimal_group_size = 1;
|
|
|
|
|
|
u32 optimal_kernel_size = 1;
|
2019-11-05 15:03:25 +01:00
|
|
|
|
u32 max_invocations_x = 65535;
|
2018-06-12 17:46:59 +02:00
|
|
|
|
|
2019-05-30 17:38:18 +02:00
|
|
|
|
virtual std::vector<std::pair<VkDescriptorType, u8>> get_descriptor_layout()
|
2018-06-12 17:46:59 +02:00
|
|
|
|
{
|
2019-05-30 17:38:18 +02:00
|
|
|
|
std::vector<std::pair<VkDescriptorType, u8>> result;
|
2019-10-29 13:13:10 +01:00
|
|
|
|
result.emplace_back(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ssbo_count);
|
2019-05-30 17:38:18 +02:00
|
|
|
|
return result;
|
|
|
|
|
|
}
|
2018-06-12 17:46:59 +02:00
|
|
|
|
|
2019-05-30 17:38:18 +02:00
|
|
|
|
void init_descriptors()
|
|
|
|
|
|
{
|
|
|
|
|
|
std::vector<VkDescriptorPoolSize> descriptor_pool_sizes;
|
|
|
|
|
|
std::vector<VkDescriptorSetLayoutBinding> bindings;
|
2018-06-12 17:46:59 +02:00
|
|
|
|
|
2019-05-30 17:38:18 +02:00
|
|
|
|
const auto layout = get_descriptor_layout();
|
|
|
|
|
|
for (const auto &e : layout)
|
|
|
|
|
|
{
|
|
|
|
|
|
descriptor_pool_sizes.push_back({e.first, u32(VK_MAX_COMPUTE_TASKS * e.second)});
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned n = 0; n < e.second; ++n)
|
|
|
|
|
|
{
|
|
|
|
|
|
bindings.push_back
|
|
|
|
|
|
({
|
|
|
|
|
|
uint32_t(bindings.size()),
|
|
|
|
|
|
e.first,
|
|
|
|
|
|
1,
|
|
|
|
|
|
VK_SHADER_STAGE_COMPUTE_BIT,
|
|
|
|
|
|
nullptr
|
|
|
|
|
|
});
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2018-06-12 17:46:59 +02:00
|
|
|
|
|
2019-05-30 17:38:18 +02:00
|
|
|
|
// Reserve descriptor pools
|
2019-12-03 23:34:23 +01:00
|
|
|
|
m_descriptor_pool.create(*get_current_renderer(), descriptor_pool_sizes.data(), ::size32(descriptor_pool_sizes), VK_MAX_COMPUTE_TASKS, 2);
|
2019-04-02 14:16:52 +02:00
|
|
|
|
|
2018-06-12 17:46:59 +02:00
|
|
|
|
VkDescriptorSetLayoutCreateInfo infos = {};
|
|
|
|
|
|
infos.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
|
|
|
|
|
|
infos.pBindings = bindings.data();
|
2019-12-03 23:34:23 +01:00
|
|
|
|
infos.bindingCount = ::size32(bindings);
|
2018-06-12 17:46:59 +02:00
|
|
|
|
|
|
|
|
|
|
CHECK_RESULT(vkCreateDescriptorSetLayout(*get_current_renderer(), &infos, nullptr, &m_descriptor_layout));
|
|
|
|
|
|
|
|
|
|
|
|
VkPipelineLayoutCreateInfo layout_info = {};
|
|
|
|
|
|
layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
|
|
|
|
|
|
layout_info.setLayoutCount = 1;
|
|
|
|
|
|
layout_info.pSetLayouts = &m_descriptor_layout;
|
|
|
|
|
|
|
2019-10-29 13:21:53 +01:00
|
|
|
|
VkPushConstantRange push_constants{};
|
|
|
|
|
|
if (use_push_constants)
|
|
|
|
|
|
{
|
|
|
|
|
|
push_constants.size = push_constants_size;
|
|
|
|
|
|
push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
|
|
|
|
|
|
|
|
|
|
|
|
layout_info.pushConstantRangeCount = 1;
|
|
|
|
|
|
layout_info.pPushConstantRanges = &push_constants;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2018-06-12 17:46:59 +02:00
|
|
|
|
CHECK_RESULT(vkCreatePipelineLayout(*get_current_renderer(), &layout_info, nullptr, &m_pipeline_layout));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void create()
|
|
|
|
|
|
{
|
|
|
|
|
|
if (!initialized)
|
|
|
|
|
|
{
|
|
|
|
|
|
init_descriptors();
|
|
|
|
|
|
|
|
|
|
|
|
switch (vk::get_driver_vendor())
|
|
|
|
|
|
{
|
|
|
|
|
|
case vk::driver_vendor::unknown:
|
2019-05-04 15:56:57 +02:00
|
|
|
|
case vk::driver_vendor::INTEL:
|
2019-08-30 13:46:48 +02:00
|
|
|
|
// Intel hw has 8 threads, but LDS allocation behavior makes optimal group size between 64 and 256
|
|
|
|
|
|
// Based on intel's own OpenCL recommended settings
|
|
|
|
|
|
unroll_loops = true;
|
|
|
|
|
|
optimal_kernel_size = 1;
|
|
|
|
|
|
optimal_group_size = 128;
|
|
|
|
|
|
break;
|
2018-06-12 17:46:59 +02:00
|
|
|
|
case vk::driver_vendor::NVIDIA:
|
2019-08-30 13:46:48 +02:00
|
|
|
|
// Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample)
|
2018-06-23 14:15:55 +02:00
|
|
|
|
unroll_loops = true;
|
2018-06-12 17:46:59 +02:00
|
|
|
|
optimal_group_size = 32;
|
2019-08-30 13:46:48 +02:00
|
|
|
|
optimal_kernel_size = 1;
|
2018-06-23 14:15:55 +02:00
|
|
|
|
break;
|
|
|
|
|
|
case vk::driver_vendor::AMD:
|
|
|
|
|
|
case vk::driver_vendor::RADV:
|
2019-08-30 13:46:48 +02:00
|
|
|
|
// Wavefronts are multiples of 64
|
2018-06-23 14:15:55 +02:00
|
|
|
|
unroll_loops = false;
|
|
|
|
|
|
optimal_kernel_size = 1;
|
|
|
|
|
|
optimal_group_size = 64;
|
2018-06-12 17:46:59 +02:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-11-05 15:03:25 +01:00
|
|
|
|
const auto& gpu = vk::get_current_renderer()->gpu();
|
|
|
|
|
|
max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0];
|
|
|
|
|
|
|
2018-06-12 17:46:59 +02:00
|
|
|
|
initialized = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void destroy()
|
|
|
|
|
|
{
|
|
|
|
|
|
if (initialized)
|
|
|
|
|
|
{
|
|
|
|
|
|
m_shader.destroy();
|
|
|
|
|
|
m_program.reset();
|
2019-04-02 14:16:52 +02:00
|
|
|
|
m_param_buffer.reset();
|
2018-06-12 17:46:59 +02:00
|
|
|
|
|
|
|
|
|
|
vkDestroyDescriptorSetLayout(*get_current_renderer(), m_descriptor_layout, nullptr);
|
|
|
|
|
|
vkDestroyPipelineLayout(*get_current_renderer(), m_pipeline_layout, nullptr);
|
|
|
|
|
|
m_descriptor_pool.destroy();
|
|
|
|
|
|
|
|
|
|
|
|
initialized = false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void free_resources()
|
|
|
|
|
|
{
|
|
|
|
|
|
if (m_used_descriptors == 0)
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
2019-05-21 19:17:48 +02:00
|
|
|
|
m_descriptor_pool.reset(0);
|
2018-06-12 17:46:59 +02:00
|
|
|
|
m_used_descriptors = 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
virtual void bind_resources()
|
|
|
|
|
|
{}
|
|
|
|
|
|
|
2019-05-30 17:38:18 +02:00
|
|
|
|
virtual void declare_inputs()
|
|
|
|
|
|
{}
|
|
|
|
|
|
|
2018-06-22 21:09:20 +02:00
|
|
|
|
void load_program(VkCommandBuffer cmd)
|
2018-06-12 17:46:59 +02:00
|
|
|
|
{
|
|
|
|
|
|
if (!m_program)
|
|
|
|
|
|
{
|
|
|
|
|
|
m_shader.create(::glsl::program_domain::glsl_compute_program, m_src);
|
|
|
|
|
|
auto handle = m_shader.compile();
|
|
|
|
|
|
|
|
|
|
|
|
VkPipelineShaderStageCreateInfo shader_stage{};
|
|
|
|
|
|
shader_stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
|
|
|
|
|
|
shader_stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
|
|
|
|
|
|
shader_stage.module = handle;
|
|
|
|
|
|
shader_stage.pName = "main";
|
|
|
|
|
|
|
|
|
|
|
|
VkComputePipelineCreateInfo info{};
|
|
|
|
|
|
info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
|
|
|
|
|
|
info.stage = shader_stage;
|
|
|
|
|
|
info.layout = m_pipeline_layout;
|
|
|
|
|
|
info.basePipelineIndex = -1;
|
|
|
|
|
|
info.basePipelineHandle = VK_NULL_HANDLE;
|
|
|
|
|
|
|
|
|
|
|
|
VkPipeline pipeline;
|
|
|
|
|
|
vkCreateComputePipelines(*get_current_renderer(), nullptr, 1, &info, nullptr, &pipeline);
|
|
|
|
|
|
|
2019-05-30 17:38:18 +02:00
|
|
|
|
m_program = std::make_unique<vk::glsl::program>(*get_current_renderer(), pipeline);
|
|
|
|
|
|
declare_inputs();
|
2018-06-12 17:46:59 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
2018-12-12 10:24:33 +01:00
|
|
|
|
verify(HERE), m_used_descriptors < VK_MAX_COMPUTE_TASKS;
|
2018-06-12 17:46:59 +02:00
|
|
|
|
|
|
|
|
|
|
VkDescriptorSetAllocateInfo alloc_info = {};
|
|
|
|
|
|
alloc_info.descriptorPool = m_descriptor_pool;
|
|
|
|
|
|
alloc_info.descriptorSetCount = 1;
|
|
|
|
|
|
alloc_info.pSetLayouts = &m_descriptor_layout;
|
|
|
|
|
|
alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
|
|
|
|
|
|
|
|
|
|
|
|
CHECK_RESULT(vkAllocateDescriptorSets(*get_current_renderer(), &alloc_info, &m_descriptor_set));
|
|
|
|
|
|
m_used_descriptors++;
|
|
|
|
|
|
|
|
|
|
|
|
bind_resources();
|
|
|
|
|
|
|
|
|
|
|
|
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_program->pipeline);
|
|
|
|
|
|
vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout, 0, 1, &m_descriptor_set, 0, nullptr);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-12-15 11:38:42 +01:00
|
|
|
|
void run(VkCommandBuffer cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z)
|
2018-06-12 17:46:59 +02:00
|
|
|
|
{
|
|
|
|
|
|
load_program(cmd);
|
2019-10-29 13:13:10 +01:00
|
|
|
|
vkCmdDispatch(cmd, invocations_x, invocations_y, invocations_z);
|
2019-05-30 17:38:18 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
2019-12-15 11:38:42 +01:00
|
|
|
|
void run(VkCommandBuffer cmd, u32 num_invocations)
|
2019-05-30 17:38:18 +02:00
|
|
|
|
{
|
2019-11-05 15:03:25 +01:00
|
|
|
|
u32 invocations_x, invocations_y;
|
|
|
|
|
|
if (num_invocations > max_invocations_x)
|
|
|
|
|
|
{
|
|
|
|
|
|
// AMD hw reports an annoyingly small maximum number of invocations in the X dimension
|
|
|
|
|
|
// Split the 1D job into 2 dimensions to accomodate this
|
2019-12-03 23:34:23 +01:00
|
|
|
|
invocations_x = static_cast<u32>(floor(std::sqrt(num_invocations)));
|
2019-11-05 15:03:25 +01:00
|
|
|
|
invocations_y = invocations_x;
|
|
|
|
|
|
|
|
|
|
|
|
if (num_invocations % invocations_x) invocations_y++;
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
invocations_x = num_invocations;
|
|
|
|
|
|
invocations_y = 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
run(cmd, invocations_x, invocations_y, 1);
|
2018-06-12 17:46:59 +02:00
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct cs_shuffle_base : compute_task
|
|
|
|
|
|
{
|
2019-04-02 14:16:52 +02:00
|
|
|
|
const vk::buffer* m_data;
|
2018-06-22 21:09:20 +02:00
|
|
|
|
u32 m_data_offset = 0;
|
|
|
|
|
|
u32 m_data_length = 0;
|
2018-06-12 17:46:59 +02:00
|
|
|
|
u32 kernel_size = 1;
|
|
|
|
|
|
|
2019-04-02 14:16:52 +02:00
|
|
|
|
std::string variables, work_kernel, loop_advance, suffix;
|
2019-10-29 13:13:10 +01:00
|
|
|
|
std::string method_declarations;
|
2019-04-02 14:16:52 +02:00
|
|
|
|
|
|
|
|
|
|
cs_shuffle_base()
|
|
|
|
|
|
{
|
|
|
|
|
|
work_kernel =
|
|
|
|
|
|
" value = data[index];\n"
|
2019-04-06 08:48:58 +02:00
|
|
|
|
" data[index] = %f(value);\n";
|
2019-04-02 14:16:52 +02:00
|
|
|
|
|
|
|
|
|
|
loop_advance =
|
2019-04-06 08:48:58 +02:00
|
|
|
|
" index++;\n";
|
2019-04-02 14:16:52 +02:00
|
|
|
|
|
|
|
|
|
|
suffix =
|
2019-04-06 08:48:58 +02:00
|
|
|
|
"}\n";
|
2019-04-02 14:16:52 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
2018-06-23 14:15:55 +02:00
|
|
|
|
void build(const char* function_name, u32 _kernel_size = 0)
|
2018-06-12 17:46:59 +02:00
|
|
|
|
{
|
2018-06-23 14:15:55 +02:00
|
|
|
|
// Initialize to allow detecting optimal settings
|
|
|
|
|
|
create();
|
|
|
|
|
|
|
|
|
|
|
|
kernel_size = _kernel_size? _kernel_size : optimal_kernel_size;
|
2018-06-12 17:46:59 +02:00
|
|
|
|
|
|
|
|
|
|
m_src =
|
|
|
|
|
|
"#version 430\n"
|
|
|
|
|
|
"layout(local_size_x=%ws, local_size_y=1, local_size_z=1) in;\n"
|
2019-04-02 14:16:52 +02:00
|
|
|
|
"layout(std430, set=0, binding=0) buffer ssbo{ uint data[]; };\n"
|
|
|
|
|
|
"%ub"
|
2018-06-12 17:46:59 +02:00
|
|
|
|
"\n"
|
|
|
|
|
|
"#define KERNEL_SIZE %ks\n"
|
2018-06-22 21:09:20 +02:00
|
|
|
|
"\n"
|
|
|
|
|
|
"// Generic swap routines\n"
|
2018-06-12 17:46:59 +02:00
|
|
|
|
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
|
|
|
|
|
|
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n"
|
|
|
|
|
|
"#define bswap_u16_u32(bits) (bits & 0xFFFF) << 16 | (bits & 0xFFFF0000) >> 16\n"
|
|
|
|
|
|
"\n"
|
2018-06-22 21:09:20 +02:00
|
|
|
|
"// Depth format conversions\n"
|
2019-04-02 14:16:52 +02:00
|
|
|
|
"#define d24_to_f32(bits) floatBitsToUint(float(bits) / 16777215.f)\n"
|
|
|
|
|
|
"#define f32_to_d24(bits) uint(uintBitsToFloat(bits) * 16777215.f)\n"
|
|
|
|
|
|
"#define d24x8_to_f32(bits) d24_to_f32(bits >> 8)\n"
|
2018-06-24 00:37:24 +02:00
|
|
|
|
"#define d24x8_to_d24x8_swapped(bits) (bits & 0xFF00) | (bits & 0xFF0000) >> 16 | (bits & 0xFF) << 16\n"
|
2019-04-02 14:16:52 +02:00
|
|
|
|
"#define f32_to_d24x8_swapped(bits) d24x8_to_d24x8_swapped(f32_to_d24(bits))\n"
|
2018-06-22 21:09:20 +02:00
|
|
|
|
"\n"
|
2018-06-12 17:46:59 +02:00
|
|
|
|
"void main()\n"
|
|
|
|
|
|
"{\n"
|
2019-11-05 15:03:25 +01:00
|
|
|
|
" uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
|
2020-01-14 14:32:13 +01:00
|
|
|
|
" uint invocation_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
|
|
|
|
|
|
" uint index = invocation_id * KERNEL_SIZE;\n"
|
2018-06-23 14:15:55 +02:00
|
|
|
|
" uint value;\n"
|
2019-04-02 14:16:52 +02:00
|
|
|
|
" %vars"
|
2019-04-06 08:48:58 +02:00
|
|
|
|
"\n";
|
2018-06-23 14:15:55 +02:00
|
|
|
|
|
2019-11-02 19:15:19 +01:00
|
|
|
|
const auto parameters_size = align(push_constants_size, 16) / 16;
|
2018-06-12 17:46:59 +02:00
|
|
|
|
const std::pair<std::string, std::string> syntax_replace[] =
|
|
|
|
|
|
{
|
|
|
|
|
|
{ "%ws", std::to_string(optimal_group_size) },
|
|
|
|
|
|
{ "%ks", std::to_string(kernel_size) },
|
2019-04-02 14:16:52 +02:00
|
|
|
|
{ "%vars", variables },
|
|
|
|
|
|
{ "%f", function_name },
|
2019-11-02 19:15:19 +01:00
|
|
|
|
{ "%ub", use_push_constants? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : "" },
|
2018-06-12 17:46:59 +02:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
m_src = fmt::replace_all(m_src, syntax_replace);
|
2018-06-23 14:15:55 +02:00
|
|
|
|
work_kernel = fmt::replace_all(work_kernel, syntax_replace);
|
|
|
|
|
|
|
|
|
|
|
|
if (kernel_size <= 1)
|
|
|
|
|
|
{
|
|
|
|
|
|
m_src += " {\n" + work_kernel + " }\n";
|
|
|
|
|
|
}
|
|
|
|
|
|
else if (unroll_loops)
|
|
|
|
|
|
{
|
|
|
|
|
|
work_kernel += loop_advance + "\n";
|
|
|
|
|
|
|
|
|
|
|
|
m_src += std::string
|
|
|
|
|
|
(
|
|
|
|
|
|
" //Unrolled loop\n"
|
|
|
|
|
|
" {\n"
|
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
// Assemble body with manual loop unroll to try loweing GPR usage
|
|
|
|
|
|
for (u32 n = 0; n < kernel_size; ++n)
|
|
|
|
|
|
{
|
|
|
|
|
|
m_src += work_kernel;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
m_src += " }\n";
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n";
|
|
|
|
|
|
m_src += " {\n";
|
|
|
|
|
|
m_src += work_kernel;
|
|
|
|
|
|
m_src += loop_advance;
|
|
|
|
|
|
m_src += " }\n";
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
m_src += suffix;
|
2018-06-12 17:46:59 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void bind_resources() override
|
|
|
|
|
|
{
|
2018-06-22 21:09:20 +02:00
|
|
|
|
m_program->bind_buffer({ m_data->value, m_data_offset, m_data_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
2019-04-02 14:16:52 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void set_parameters(VkCommandBuffer cmd, const u32* params, u8 count)
|
|
|
|
|
|
{
|
2019-11-02 19:15:19 +01:00
|
|
|
|
verify(HERE), use_push_constants;
|
|
|
|
|
|
vkCmdPushConstants(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, count * 4, params);
|
2018-06-12 17:46:59 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
2019-04-02 14:16:52 +02:00
|
|
|
|
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_length, u32 data_offset = 0)
|
2018-06-12 17:46:59 +02:00
|
|
|
|
{
|
|
|
|
|
|
m_data = data;
|
2018-06-22 21:09:20 +02:00
|
|
|
|
m_data_offset = data_offset;
|
|
|
|
|
|
m_data_length = data_length;
|
2018-06-12 17:46:59 +02:00
|
|
|
|
|
|
|
|
|
|
const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4;
|
2020-01-14 14:40:29 +01:00
|
|
|
|
const auto num_bytes_to_process = rsx::align2(data_length, num_bytes_per_invocation);
|
2018-06-25 21:23:00 +02:00
|
|
|
|
const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation;
|
|
|
|
|
|
|
2019-04-02 14:16:52 +02:00
|
|
|
|
if ((num_bytes_to_process + data_offset) > data->size())
|
2018-06-25 21:23:00 +02:00
|
|
|
|
{
|
|
|
|
|
|
// Technically robust buffer access should keep the driver from crashing in OOB situations
|
|
|
|
|
|
LOG_ERROR(RSX, "Inadequate buffer length submitted for a compute operation."
|
|
|
|
|
|
"Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size());
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2018-06-12 17:46:59 +02:00
|
|
|
|
compute_task::run(cmd, num_invocations);
|
|
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct cs_shuffle_16 : cs_shuffle_base
|
|
|
|
|
|
{
|
|
|
|
|
|
// byteswap ushort
|
|
|
|
|
|
cs_shuffle_16()
|
|
|
|
|
|
{
|
2018-06-23 14:15:55 +02:00
|
|
|
|
cs_shuffle_base::build("bswap_u16");
|
2018-06-12 17:46:59 +02:00
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct cs_shuffle_32 : cs_shuffle_base
|
|
|
|
|
|
{
|
|
|
|
|
|
// byteswap_ulong
|
|
|
|
|
|
cs_shuffle_32()
|
|
|
|
|
|
{
|
2018-06-23 14:15:55 +02:00
|
|
|
|
cs_shuffle_base::build("bswap_u32");
|
2018-06-12 17:46:59 +02:00
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct cs_shuffle_32_16 : cs_shuffle_base
|
|
|
|
|
|
{
|
|
|
|
|
|
// byteswap_ulong + byteswap_ushort
|
|
|
|
|
|
cs_shuffle_32_16()
|
|
|
|
|
|
{
|
2018-06-23 14:15:55 +02:00
|
|
|
|
cs_shuffle_base::build("bswap_u16_u32");
|
2018-06-12 17:46:59 +02:00
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2018-06-22 21:09:20 +02:00
|
|
|
|
struct cs_shuffle_d24x8_f32 : cs_shuffle_base
|
|
|
|
|
|
{
|
|
|
|
|
|
// convert d24x8 to f32
|
|
|
|
|
|
cs_shuffle_d24x8_f32()
|
|
|
|
|
|
{
|
2018-06-23 14:15:55 +02:00
|
|
|
|
cs_shuffle_base::build("d24x8_to_f32");
|
2018-06-22 21:09:20 +02:00
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct cs_shuffle_se_f32_d24x8 : cs_shuffle_base
|
|
|
|
|
|
{
|
|
|
|
|
|
// convert f32 to d24x8 and swap endianness
|
|
|
|
|
|
cs_shuffle_se_f32_d24x8()
|
|
|
|
|
|
{
|
2018-06-23 14:15:55 +02:00
|
|
|
|
cs_shuffle_base::build("f32_to_d24x8_swapped");
|
2018-06-22 21:09:20 +02:00
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct cs_shuffle_se_d24x8 : cs_shuffle_base
|
|
|
|
|
|
{
|
|
|
|
|
|
// swap endianness of d24x8
|
|
|
|
|
|
cs_shuffle_se_d24x8()
|
|
|
|
|
|
{
|
2018-06-23 14:15:55 +02:00
|
|
|
|
cs_shuffle_base::build("d24x8_to_d24x8_swapped");
|
2018-06-22 21:09:20 +02:00
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2019-04-02 14:16:52 +02:00
|
|
|
|
// NOTE: D24S8 layout has the stencil in the MSB! Its actually S8|D24|S8|D24 starting at offset 0
|
|
|
|
|
|
struct cs_interleave_task : cs_shuffle_base
|
|
|
|
|
|
{
|
|
|
|
|
|
u32 m_ssbo_length = 0;
|
|
|
|
|
|
|
|
|
|
|
|
cs_interleave_task()
|
|
|
|
|
|
{
|
2019-11-02 19:15:19 +01:00
|
|
|
|
use_push_constants = true;
|
|
|
|
|
|
push_constants_size = 16;
|
2019-04-02 14:16:52 +02:00
|
|
|
|
|
|
|
|
|
|
variables =
|
|
|
|
|
|
" uint block_length = params[0].x >> 2;\n"
|
|
|
|
|
|
" uint z_offset = params[0].y >> 2;\n"
|
|
|
|
|
|
" uint s_offset = params[0].z >> 2;\n"
|
|
|
|
|
|
" uint depth;\n"
|
|
|
|
|
|
" uint stencil;\n"
|
|
|
|
|
|
" uint stencil_shift;\n"
|
2019-04-06 08:48:58 +02:00
|
|
|
|
" uint stencil_offset;\n";
|
2019-04-02 14:16:52 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void bind_resources() override
|
|
|
|
|
|
{
|
|
|
|
|
|
m_program->bind_buffer({ m_data->value, m_data_offset, m_ssbo_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void run(VkCommandBuffer cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset)
|
|
|
|
|
|
{
|
2019-11-02 19:15:19 +01:00
|
|
|
|
u32 parameters[4] = { data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0 };
|
|
|
|
|
|
set_parameters(cmd, parameters, 4);
|
2019-04-02 14:16:52 +02:00
|
|
|
|
|
|
|
|
|
|
m_ssbo_length = stencil_offset + (data_length / 4) - data_offset;
|
|
|
|
|
|
cs_shuffle_base::run(cmd, data, data_length, data_offset);
|
|
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2019-09-04 21:19:58 +02:00
|
|
|
|
template<bool _SwapBytes = false>
|
2019-04-02 14:16:52 +02:00
|
|
|
|
struct cs_gather_d24x8 : cs_interleave_task
|
|
|
|
|
|
{
|
|
|
|
|
|
cs_gather_d24x8()
|
|
|
|
|
|
{
|
|
|
|
|
|
work_kernel =
|
|
|
|
|
|
" if (index >= block_length)\n"
|
|
|
|
|
|
" return;\n"
|
|
|
|
|
|
"\n"
|
|
|
|
|
|
" depth = data[index + z_offset] & 0x00FFFFFF;\n"
|
|
|
|
|
|
" stencil_offset = (index / 4);\n"
|
|
|
|
|
|
" stencil_shift = (index % 4) * 8;\n"
|
|
|
|
|
|
" stencil = data[stencil_offset + s_offset];\n"
|
|
|
|
|
|
" stencil = (stencil >> stencil_shift) & 0xFF;\n"
|
2019-09-04 21:19:58 +02:00
|
|
|
|
" value = (depth << 8) | stencil;\n";
|
|
|
|
|
|
|
|
|
|
|
|
if constexpr (!_SwapBytes)
|
|
|
|
|
|
{
|
|
|
|
|
|
work_kernel +=
|
2019-04-06 08:48:58 +02:00
|
|
|
|
" data[index] = value;\n";
|
2019-09-04 21:19:58 +02:00
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
work_kernel +=
|
|
|
|
|
|
" data[index] = bswap_u32(value);\n";
|
|
|
|
|
|
}
|
2019-04-02 14:16:52 +02:00
|
|
|
|
|
|
|
|
|
|
cs_shuffle_base::build("");
|
|
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2019-09-04 21:19:58 +02:00
|
|
|
|
template<bool _SwapBytes = false>
|
2019-04-02 14:16:52 +02:00
|
|
|
|
struct cs_gather_d32x8 : cs_interleave_task
|
|
|
|
|
|
{
|
|
|
|
|
|
cs_gather_d32x8()
|
|
|
|
|
|
{
|
|
|
|
|
|
work_kernel =
|
|
|
|
|
|
" if (index >= block_length)\n"
|
|
|
|
|
|
" return;\n"
|
|
|
|
|
|
"\n"
|
|
|
|
|
|
" depth = f32_to_d24(data[index + z_offset]);\n"
|
|
|
|
|
|
" stencil_offset = (index / 4);\n"
|
|
|
|
|
|
" stencil_shift = (index % 4) * 8;\n"
|
|
|
|
|
|
" stencil = data[stencil_offset + s_offset];\n"
|
|
|
|
|
|
" stencil = (stencil >> stencil_shift) & 0xFF;\n"
|
2019-09-04 21:19:58 +02:00
|
|
|
|
" value = (depth << 8) | stencil;\n";
|
|
|
|
|
|
|
|
|
|
|
|
if constexpr (!_SwapBytes)
|
|
|
|
|
|
{
|
|
|
|
|
|
work_kernel +=
|
2019-04-06 08:48:58 +02:00
|
|
|
|
" data[index] = value;\n";
|
2019-09-04 21:19:58 +02:00
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
work_kernel +=
|
|
|
|
|
|
" data[index] = bswap_u32(value);\n";
|
|
|
|
|
|
}
|
2019-04-02 14:16:52 +02:00
|
|
|
|
|
|
|
|
|
|
cs_shuffle_base::build("");
|
|
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct cs_scatter_d24x8 : cs_interleave_task
|
|
|
|
|
|
{
|
|
|
|
|
|
cs_scatter_d24x8()
|
|
|
|
|
|
{
|
|
|
|
|
|
work_kernel =
|
|
|
|
|
|
" if (index >= block_length)\n"
|
|
|
|
|
|
" return;\n"
|
|
|
|
|
|
"\n"
|
|
|
|
|
|
" value = data[index];\n"
|
|
|
|
|
|
" data[index + z_offset] = (value >> 8);\n"
|
|
|
|
|
|
" stencil_offset = (index / 4);\n"
|
|
|
|
|
|
" stencil_shift = (index % 4) * 8;\n"
|
|
|
|
|
|
" stencil = (value & 0xFF) << stencil_shift;\n"
|
2019-04-06 08:48:58 +02:00
|
|
|
|
" data[stencil_offset + s_offset] |= stencil;\n";
|
2019-04-02 14:16:52 +02:00
|
|
|
|
|
|
|
|
|
|
cs_shuffle_base::build("");
|
|
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct cs_scatter_d32x8 : cs_interleave_task
|
|
|
|
|
|
{
|
|
|
|
|
|
cs_scatter_d32x8()
|
|
|
|
|
|
{
|
|
|
|
|
|
work_kernel =
|
|
|
|
|
|
" if (index >= block_length)\n"
|
|
|
|
|
|
" return;\n"
|
|
|
|
|
|
"\n"
|
|
|
|
|
|
" value = data[index];\n"
|
|
|
|
|
|
" data[index + z_offset] = d24_to_f32(value >> 8);\n"
|
|
|
|
|
|
" stencil_offset = (index / 4);\n"
|
|
|
|
|
|
" stencil_shift = (index % 4) * 8;\n"
|
|
|
|
|
|
" stencil = (value & 0xFF) << stencil_shift;\n"
|
2019-04-06 08:48:58 +02:00
|
|
|
|
" data[stencil_offset + s_offset] |= stencil;\n";
|
2019-04-02 14:16:52 +02:00
|
|
|
|
|
|
|
|
|
|
cs_shuffle_base::build("");
|
|
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2019-10-29 13:13:10 +01:00
|
|
|
|
// Reverse morton-order block arrangement
|
2019-10-29 13:21:53 +01:00
|
|
|
|
struct cs_deswizzle_base : compute_task
|
|
|
|
|
|
{
|
2019-11-05 15:00:07 +01:00
|
|
|
|
virtual void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 data_length, u32 width, u32 height, u32 depth, u32 mipmaps) = 0;
|
2019-10-29 13:21:53 +01:00
|
|
|
|
};
|
|
|
|
|
|
|
2019-10-29 13:13:10 +01:00
|
|
|
|
template <typename _BlockType, typename _BaseType, bool _SwapBytes>
|
2019-10-29 13:21:53 +01:00
|
|
|
|
struct cs_deswizzle_3d : cs_deswizzle_base
|
2019-10-29 13:13:10 +01:00
|
|
|
|
{
|
|
|
|
|
|
union params_t
|
|
|
|
|
|
{
|
2019-11-05 15:00:07 +01:00
|
|
|
|
u32 data[7];
|
2019-10-29 13:13:10 +01:00
|
|
|
|
|
|
|
|
|
|
struct
|
|
|
|
|
|
{
|
|
|
|
|
|
u32 width;
|
|
|
|
|
|
u32 height;
|
2019-10-29 13:21:53 +01:00
|
|
|
|
u32 depth;
|
2019-10-29 13:13:10 +01:00
|
|
|
|
u32 logw;
|
|
|
|
|
|
u32 logh;
|
2019-10-29 13:21:53 +01:00
|
|
|
|
u32 logd;
|
2019-11-05 15:00:07 +01:00
|
|
|
|
u32 mipmaps;
|
2019-10-29 13:13:10 +01:00
|
|
|
|
};
|
|
|
|
|
|
}
|
|
|
|
|
|
params;
|
|
|
|
|
|
|
|
|
|
|
|
const vk::buffer* src_buffer = nullptr;
|
|
|
|
|
|
const vk::buffer* dst_buffer = nullptr;
|
|
|
|
|
|
u32 in_offset = 0;
|
|
|
|
|
|
u32 out_offset = 0;
|
|
|
|
|
|
u32 block_length = 0;
|
2019-12-03 23:34:23 +01:00
|
|
|
|
|
2019-10-29 13:13:10 +01:00
|
|
|
|
cs_deswizzle_3d()
|
|
|
|
|
|
{
|
|
|
|
|
|
verify("Unsupported block type" HERE), (sizeof(_BlockType) & 3) == 0;
|
|
|
|
|
|
|
|
|
|
|
|
ssbo_count = 2;
|
2019-10-29 13:21:53 +01:00
|
|
|
|
use_push_constants = true;
|
2019-11-05 15:00:07 +01:00
|
|
|
|
push_constants_size = 28;
|
2019-10-29 13:21:53 +01:00
|
|
|
|
|
2019-10-29 13:13:10 +01:00
|
|
|
|
create();
|
|
|
|
|
|
|
|
|
|
|
|
m_src =
|
|
|
|
|
|
"#version 450\n"
|
2019-11-05 15:00:07 +01:00
|
|
|
|
"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"
|
2019-10-29 13:13:10 +01:00
|
|
|
|
|
2019-10-29 13:21:53 +01:00
|
|
|
|
"layout(set=0, binding=0, std430) buffer ssbo0{ uint data_in[]; };\n"
|
|
|
|
|
|
"layout(set=0, binding=1, std430) buffer ssbo1{ uint data_out[]; };\n"
|
|
|
|
|
|
"layout(push_constant) uniform parameters\n"
|
2019-10-29 13:13:10 +01:00
|
|
|
|
"{\n"
|
|
|
|
|
|
" uint image_width;\n"
|
|
|
|
|
|
" uint image_height;\n"
|
2019-10-29 13:21:53 +01:00
|
|
|
|
" uint image_depth;\n"
|
2019-10-29 13:13:10 +01:00
|
|
|
|
" uint image_logw;\n"
|
|
|
|
|
|
" uint image_logh;\n"
|
2019-10-29 13:21:53 +01:00
|
|
|
|
" uint image_logd;\n"
|
2019-11-05 15:00:07 +01:00
|
|
|
|
" uint lod_count;\n"
|
|
|
|
|
|
"};\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
"struct invocation_properties\n"
|
|
|
|
|
|
"{\n"
|
|
|
|
|
|
" uint data_offset;\n"
|
|
|
|
|
|
" uvec3 size;\n"
|
|
|
|
|
|
" uvec3 size_log2;\n"
|
2019-10-29 13:13:10 +01:00
|
|
|
|
"};\n\n"
|
|
|
|
|
|
|
2019-10-29 13:21:53 +01:00
|
|
|
|
"#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8\n"
|
2019-11-05 15:00:07 +01:00
|
|
|
|
"#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
"invocation_properties invocation;\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
"bool init_invocation_properties(const in uint offset)\n"
|
|
|
|
|
|
"{\n"
|
|
|
|
|
|
" invocation.data_offset = 0;\n"
|
|
|
|
|
|
" invocation.size.x = image_width;\n"
|
|
|
|
|
|
" invocation.size.y = image_height;\n"
|
|
|
|
|
|
" invocation.size.z = image_depth;\n"
|
|
|
|
|
|
" invocation.size_log2.x = image_logw;\n"
|
|
|
|
|
|
" invocation.size_log2.y = image_logh;\n"
|
|
|
|
|
|
" invocation.size_log2.z = image_logd;\n"
|
|
|
|
|
|
" uint level_end = image_width * image_height * image_depth;\n"
|
|
|
|
|
|
" uint level = 1;\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
" while (offset >= level_end && level < lod_count)\n"
|
|
|
|
|
|
" {\n"
|
|
|
|
|
|
" invocation.data_offset = level_end;\n"
|
|
|
|
|
|
" invocation.size.xy /= 2;\n"
|
|
|
|
|
|
" invocation.size.xy = max(invocation.size.xy, uvec2(1));\n"
|
|
|
|
|
|
" invocation.size_log2.xy = max(invocation.size_log2.xy, uvec2(1));\n"
|
|
|
|
|
|
" invocation.size_log2.xy --;\n"
|
|
|
|
|
|
" level_end += (invocation.size.x * invocation.size.y * image_depth);\n"
|
|
|
|
|
|
" level++;"
|
|
|
|
|
|
" }\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
" return (offset < level_end);\n"
|
|
|
|
|
|
"}\n\n"
|
2019-10-29 13:13:10 +01:00
|
|
|
|
|
2019-11-02 19:15:19 +01:00
|
|
|
|
"uint get_z_index(const in uint x_, const in uint y_, const in uint z_)\n"
|
2019-10-29 13:13:10 +01:00
|
|
|
|
"{\n"
|
|
|
|
|
|
" uint offset = 0;\n"
|
|
|
|
|
|
" uint shift = 0;\n"
|
2019-11-02 19:15:19 +01:00
|
|
|
|
" uint x = x_;\n"
|
|
|
|
|
|
" uint y = y_;\n"
|
|
|
|
|
|
" uint z = z_;\n"
|
2019-11-05 15:00:07 +01:00
|
|
|
|
" uint log2w = invocation.size_log2.x;\n"
|
|
|
|
|
|
" uint log2h = invocation.size_log2.y;\n"
|
|
|
|
|
|
" uint log2d = invocation.size_log2.z;\n"
|
2019-10-29 13:13:10 +01:00
|
|
|
|
"\n"
|
|
|
|
|
|
" do\n"
|
|
|
|
|
|
" {\n"
|
|
|
|
|
|
" if (log2w > 0)\n"
|
|
|
|
|
|
" {\n"
|
|
|
|
|
|
" offset |= (x & 1) << shift;\n"
|
|
|
|
|
|
" shift++;\n"
|
|
|
|
|
|
" x >>= 1;\n"
|
|
|
|
|
|
" log2w--;\n"
|
|
|
|
|
|
" }\n"
|
|
|
|
|
|
"\n"
|
|
|
|
|
|
" if (log2h > 0)\n"
|
|
|
|
|
|
" {\n"
|
|
|
|
|
|
" offset |= (y & 1) << shift;\n"
|
|
|
|
|
|
" shift++;\n"
|
|
|
|
|
|
" y >>= 1;\n"
|
|
|
|
|
|
" log2h--;\n"
|
|
|
|
|
|
" }\n"
|
|
|
|
|
|
"\n"
|
|
|
|
|
|
" if (log2d > 0)\n"
|
|
|
|
|
|
" {\n"
|
|
|
|
|
|
" offset |= (z & 1) << shift;\n"
|
|
|
|
|
|
" shift++;\n"
|
|
|
|
|
|
" z >>= 1;\n"
|
|
|
|
|
|
" log2d--;\n"
|
|
|
|
|
|
" }\n"
|
|
|
|
|
|
" }\n"
|
2019-10-29 13:21:53 +01:00
|
|
|
|
" while(x > 0 || y > 0 || z > 0);\n"
|
2019-10-29 13:13:10 +01:00
|
|
|
|
"\n"
|
|
|
|
|
|
" return offset;\n"
|
|
|
|
|
|
"}\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
"void main()\n"
|
|
|
|
|
|
"{\n"
|
2019-11-05 15:00:07 +01:00
|
|
|
|
" uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);"
|
|
|
|
|
|
" uint texel_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;\n"
|
|
|
|
|
|
" uint word_count = %_wordcount;\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
" if (!init_invocation_properties(texel_id))\n"
|
2019-10-29 13:13:10 +01:00
|
|
|
|
" return;\n\n"
|
|
|
|
|
|
|
2019-11-05 15:00:07 +01:00
|
|
|
|
" // Calculations done in texels, not bytes\n"
|
|
|
|
|
|
" uint row_length = invocation.size.x;\n"
|
|
|
|
|
|
" uint slice_length = (invocation.size.y * row_length);\n"
|
|
|
|
|
|
" uint level_offset = (texel_id - invocation.data_offset);\n"
|
|
|
|
|
|
" uint slice_offset = (level_offset % slice_length);\n"
|
|
|
|
|
|
" uint z = (level_offset / slice_length);\n"
|
|
|
|
|
|
" uint y = (slice_offset / row_length);\n"
|
|
|
|
|
|
" uint x = (slice_offset % row_length);\n\n"
|
2019-10-29 13:21:53 +01:00
|
|
|
|
|
2019-11-05 15:00:07 +01:00
|
|
|
|
" uint src_texel_id = get_z_index(x, y, z);\n"
|
|
|
|
|
|
" uint dst_id = (texel_id * word_count);\n"
|
|
|
|
|
|
" uint src_id = (src_texel_id + invocation.data_offset) * word_count;\n\n"
|
2019-10-29 13:13:10 +01:00
|
|
|
|
|
|
|
|
|
|
" for (uint i = 0; i < word_count; ++i)\n"
|
|
|
|
|
|
" {\n"
|
2019-10-29 13:21:53 +01:00
|
|
|
|
" uint value = data_in[src_id++];\n"
|
|
|
|
|
|
" data_out[dst_id++] = %f(value);\n"
|
2019-10-29 13:13:10 +01:00
|
|
|
|
" }\n\n"
|
2019-10-29 13:21:53 +01:00
|
|
|
|
|
2019-10-29 13:13:10 +01:00
|
|
|
|
"}\n";
|
|
|
|
|
|
|
|
|
|
|
|
std::string transform;
|
|
|
|
|
|
if constexpr (_SwapBytes)
|
|
|
|
|
|
{
|
|
|
|
|
|
if constexpr (sizeof(_BaseType) == 4)
|
|
|
|
|
|
{
|
|
|
|
|
|
transform = "bswap_u32";
|
|
|
|
|
|
}
|
|
|
|
|
|
else if constexpr (sizeof(_BaseType) == 2)
|
|
|
|
|
|
{
|
|
|
|
|
|
transform = "bswap_u16";
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
|
|
|
{
|
|
|
|
|
|
fmt::throw_exception("Unreachable" HERE);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const std::pair<std::string, std::string> syntax_replace[] =
|
|
|
|
|
|
{
|
2019-11-05 15:00:07 +01:00
|
|
|
|
{ "%ws", std::to_string(optimal_group_size) },
|
2019-10-29 13:13:10 +01:00
|
|
|
|
{ "%_wordcount", std::to_string(sizeof(_BlockType) / 4) },
|
|
|
|
|
|
{ "%f", transform }
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
m_src = fmt::replace_all(m_src, syntax_replace);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void bind_resources() override
|
|
|
|
|
|
{
|
|
|
|
|
|
m_program->bind_buffer({ src_buffer->value, in_offset, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
|
|
|
|
|
m_program->bind_buffer({ dst_buffer->value, out_offset, block_length }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Records a push-constant update into cmd: push_constants_size bytes taken
// from params.data are written at offset 0 for the compute stage, using this
// task's pipeline layout.
void set_parameters(VkCommandBuffer cmd)
{
	const auto stage = VK_SHADER_STAGE_COMPUTE_BIT;
	const auto* payload = params.data;

	vkCmdPushConstants(
		cmd,
		m_pipeline_layout,
		stage,
		0,
		push_constants_size,
		payload);
}
|
|
|
|
|
|
|
2019-11-05 15:00:07 +01:00
|
|
|
|
// Records one deswizzle job into cmd.
//
// dst / out_offset : destination buffer and the offset bound for it
// src / in_offset  : source buffer and the offset bound for it
// data_length      : length of the block; stored as block_length and used as
//                    the bound range of both buffers
// width, height, depth, mipmaps : copied into the push-constant block together
//                    with rsx::ceil_log2() of each of the three dimensions
//
// A data_length of 0 is treated as an empty job and nothing is recorded.
void run(VkCommandBuffer cmd, const vk::buffer* dst, u32 out_offset, const vk::buffer* src, u32 in_offset, u32 data_length, u32 width, u32 height, u32 depth, u32 mipmaps) override
{
	// State consumed later by bind_resources()
	dst_buffer = dst;
	src_buffer = src;

	this->in_offset = in_offset;
	this->out_offset = out_offset;
	this->block_length = data_length;

	// Push-constant payload consumed by set_parameters()
	params.width = width;
	params.height = height;
	params.depth = depth;
	params.mipmaps = mipmaps;
	params.logw = rsx::ceil_log2(width);
	params.logh = rsx::ceil_log2(height);
	params.logd = rsx::ceil_log2(depth);

	if (!data_length)
	{
		// Empty job. Proceeding would bind both storage buffers with a range of 0,
		// which Vulkan does not allow for a descriptor range other than VK_WHOLE_SIZE,
		// and would request zero workgroups anyway.
		return;
	}

	set_parameters(cmd);

	// The shader copies sizeof(_BlockType) / 4 words per invocation, so a group of
	// optimal_group_size invocations covers this many bytes.
	// NOTE(review): aligned_div presumably rounds the quotient up - confirm.
	const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
	const u32 linear_invocations = aligned_div(data_length, num_bytes_per_invocation);
	compute_task::run(cmd, linear_invocations);
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2019-12-10 05:56:44 +01:00
|
|
|
|
struct cs_aggregator : compute_task
|
|
|
|
|
|
{
|
|
|
|
|
|
const buffer* src = nullptr;
|
|
|
|
|
|
const buffer* dst = nullptr;
|
|
|
|
|
|
u32 block_length = 0;
|
|
|
|
|
|
u32 word_count = 0;
|
|
|
|
|
|
|
|
|
|
|
|
cs_aggregator()
|
|
|
|
|
|
{
|
|
|
|
|
|
ssbo_count = 2;
|
|
|
|
|
|
|
|
|
|
|
|
create();
|
|
|
|
|
|
|
|
|
|
|
|
m_src =
|
|
|
|
|
|
"#version 450\n"
|
|
|
|
|
|
"layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
"layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n"
|
|
|
|
|
|
"layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
"void main()\n"
|
|
|
|
|
|
"{\n"
|
|
|
|
|
|
" if (gl_GlobalInvocationID.x < src.length())\n"
|
|
|
|
|
|
" {\n"
|
|
|
|
|
|
" atomicAdd(result, src[gl_GlobalInvocationID.x]);\n"
|
|
|
|
|
|
" }\n"
|
|
|
|
|
|
"}\n";
|
|
|
|
|
|
|
|
|
|
|
|
const std::pair<std::string, std::string> syntax_replace[] =
|
|
|
|
|
|
{
|
|
|
|
|
|
{ "%ws", std::to_string(optimal_group_size) },
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
m_src = fmt::replace_all(m_src, syntax_replace);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void bind_resources() override
|
|
|
|
|
|
{
|
|
|
|
|
|
m_program->bind_buffer({ src->value, 0, block_length }, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
|
|
|
|
|
m_program->bind_buffer({ dst->value, 0, 4 }, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void run(VkCommandBuffer cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words)
|
|
|
|
|
|
{
|
|
|
|
|
|
this->dst = dst;
|
|
|
|
|
|
this->src = src;
|
|
|
|
|
|
word_count = num_words;
|
|
|
|
|
|
block_length = num_words * 4;
|
|
|
|
|
|
|
|
|
|
|
|
const u32 linear_invocations = aligned_div(word_count, optimal_group_size);
|
|
|
|
|
|
compute_task::run(cmd, linear_invocations);
|
|
|
|
|
|
}
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2018-06-12 17:46:59 +02:00
|
|
|
|
// TODO: Replace with a proper manager
|
|
|
|
|
|
// Registry of compute task instances. get_compute_task<T>() indexes it with
// id_manager::typeinfo::get_index<T>() and fills an empty slot on first use.
extern std::unordered_map<u32, std::unique_ptr<vk::compute_task>> g_compute_tasks;
|
|
|
|
|
|
|
|
|
|
|
|
template<class T>
|
|
|
|
|
|
T* get_compute_task()
|
|
|
|
|
|
{
|
|
|
|
|
|
u32 index = id_manager::typeinfo::get_index<T>();
|
|
|
|
|
|
auto &e = g_compute_tasks[index];
|
|
|
|
|
|
|
|
|
|
|
|
if (!e)
|
|
|
|
|
|
{
|
|
|
|
|
|
e = std::make_unique<T>();
|
|
|
|
|
|
e->create();
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return static_cast<T*>(e.get());
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Declaration only; the definition is not visible from here.
// NOTE(review): presumably tears down the tasks held in g_compute_tasks -
// confirm against the definition.
void reset_compute_tasks();
|
|
|
|
|
|
}
|