#include "VKCompute.h" #include "VKHelpers.h" #include "VKRenderPass.h" #include "vkutils/buffer_object.h" #include "VKPipelineCompiler.h" #include "rx/align.hpp" #define VK_MAX_COMPUTE_TASKS 8192 // Max number of jobs per frame namespace vk { std::vector> compute_task::get_descriptor_layout() { std::vector> result; result.emplace_back(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ssbo_count); return result; } void compute_task::init_descriptors() { rsx::simple_array descriptor_pool_sizes; rsx::simple_array bindings; const auto layout = get_descriptor_layout(); for (const auto& e : layout) { descriptor_pool_sizes.push_back({e.first, e.second}); for (unsigned n = 0; n < e.second; ++n) { bindings.push_back({u32(bindings.size()), e.first, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); } } // Reserve descriptor pools m_descriptor_pool.create(*g_render_device, descriptor_pool_sizes); m_descriptor_layout = vk::descriptors::create_layout(bindings); VkPipelineLayoutCreateInfo layout_info = {}; layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; layout_info.setLayoutCount = 1; layout_info.pSetLayouts = &m_descriptor_layout; VkPushConstantRange push_constants{}; if (use_push_constants) { push_constants.size = push_constants_size; push_constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; layout_info.pushConstantRangeCount = 1; layout_info.pPushConstantRanges = &push_constants; } CHECK_RESULT(VK_GET_SYMBOL(vkCreatePipelineLayout)(*g_render_device, &layout_info, nullptr, &m_pipeline_layout)); } void compute_task::create() { if (!initialized) { init_descriptors(); switch (vk::get_driver_vendor()) { case vk::driver_vendor::unknown: case vk::driver_vendor::INTEL: case vk::driver_vendor::ANV: // Intel hw has 8 threads, but LDS allocation behavior makes optimal group size between 64 and 256 // Based on intel's own OpenCL recommended settings unroll_loops = true; optimal_kernel_size = 1; optimal_group_size = 128; break; case vk::driver_vendor::LAVAPIPE: case vk::driver_vendor::V3DV: case vk::driver_vendor::PANVK: case vk::driver_vendor::ARM_MALI: // TODO: Actually bench this. Using 32 for now to match other common configurations. case vk::driver_vendor::DOZEN: // Actual optimal size depends on the D3D device. Use 32 since it should work well on both AMD and NVIDIA case vk::driver_vendor::NVIDIA: case vk::driver_vendor::NVK: // Warps are multiples of 32. Increasing kernel depth seems to hurt performance (Nier, Big Duck sample) unroll_loops = true; optimal_kernel_size = 1; optimal_group_size = 32; break; case vk::driver_vendor::AMD: case vk::driver_vendor::RADV: // Wavefronts are multiples of 64. (RDNA also supports wave32) unroll_loops = false; optimal_kernel_size = 1; optimal_group_size = 64; break; case vk::driver_vendor::MVK: case vk::driver_vendor::HONEYKRISP: unroll_loops = true; optimal_kernel_size = 1; optimal_group_size = 256; break; } const auto& gpu = vk::g_render_device->gpu(); max_invocations_x = gpu.get_limits().maxComputeWorkGroupCount[0]; initialized = true; } } void compute_task::destroy() { if (initialized) { m_shader.destroy(); m_program.reset(); m_param_buffer.reset(); VK_GET_SYMBOL(vkDestroyDescriptorSetLayout)(*g_render_device, m_descriptor_layout, nullptr); VK_GET_SYMBOL(vkDestroyPipelineLayout)(*g_render_device, m_pipeline_layout, nullptr); m_descriptor_pool.destroy(); initialized = false; } } void compute_task::load_program(const vk::command_buffer& cmd) { if (!m_program) { m_shader.create(::glsl::program_domain::glsl_compute_program, m_src); auto handle = m_shader.compile(); VkPipelineShaderStageCreateInfo shader_stage{}; shader_stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; shader_stage.stage = VK_SHADER_STAGE_COMPUTE_BIT; shader_stage.module = handle; shader_stage.pName = "main"; VkComputePipelineCreateInfo info{}; info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; info.stage = shader_stage; info.layout = m_pipeline_layout; info.basePipelineIndex = -1; info.basePipelineHandle = VK_NULL_HANDLE; auto compiler = vk::get_pipe_compiler(); m_program = compiler->compile(info, m_pipeline_layout, vk::pipe_compiler::COMPILE_INLINE); declare_inputs(); } ensure(m_used_descriptors < VK_MAX_COMPUTE_TASKS); m_descriptor_set = m_descriptor_pool.allocate(m_descriptor_layout, VK_TRUE); bind_resources(); VK_GET_SYMBOL(vkCmdBindPipeline)(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_program->pipeline); m_descriptor_set.bind(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, m_pipeline_layout); } void compute_task::run(const vk::command_buffer& cmd, u32 invocations_x, u32 invocations_y, u32 invocations_z) { // CmdDispatch is outside renderpass scope only if (vk::is_renderpass_open(cmd)) { vk::end_renderpass(cmd); } load_program(cmd); VK_GET_SYMBOL(vkCmdDispatch)(cmd, invocations_x, invocations_y, invocations_z); } void compute_task::run(const vk::command_buffer& cmd, u32 num_invocations) { u32 invocations_x, invocations_y; if (num_invocations > max_invocations_x) { // AMD hw reports an annoyingly small maximum number of invocations in the X dimension // Split the 1D job into 2 dimensions to accomodate this invocations_x = static_cast(floor(std::sqrt(num_invocations))); invocations_y = invocations_x; if (num_invocations % invocations_x) invocations_y++; } else { invocations_x = num_invocations; invocations_y = 1; } run(cmd, invocations_x, invocations_y, 1); } cs_shuffle_base::cs_shuffle_base() { work_kernel = " value = data[index];\n" " data[index] = %f(value);\n"; loop_advance = " index++;\n"; suffix = "}\n"; } void cs_shuffle_base::build(const char* function_name, u32 _kernel_size) { // Initialize to allow detecting optimal settings create(); kernel_size = _kernel_size ? _kernel_size : optimal_kernel_size; m_src = #include "../Program/GLSLSnippets/ShuffleBytes.glsl" ; const auto parameters_size = rx::alignUp(push_constants_size, 16) / 16; const std::pair syntax_replace[] = { {"%loc", "0"}, {"%set", "set = 0"}, {"%ws", std::to_string(optimal_group_size)}, {"%ks", std::to_string(kernel_size)}, {"%vars", variables}, {"%f", function_name}, {"%md", method_declarations}, {"%ub", use_push_constants ? "layout(push_constant) uniform ubo{ uvec4 params[" + std::to_string(parameters_size) + "]; };\n" : ""}, }; m_src = fmt::replace_all(m_src, syntax_replace); work_kernel = fmt::replace_all(work_kernel, syntax_replace); if (kernel_size <= 1) { m_src += " {\n" + work_kernel + " }\n"; } else if (unroll_loops) { work_kernel += loop_advance + "\n"; m_src += std::string( " //Unrolled loop\n" " {\n"); // Assemble body with manual loop unroll to try loweing GPR usage for (u32 n = 0; n < kernel_size; ++n) { m_src += work_kernel; } m_src += " }\n"; } else { m_src += " for (int loop = 0; loop < KERNEL_SIZE; ++loop)\n"; m_src += " {\n"; m_src += work_kernel; m_src += loop_advance; m_src += " }\n"; } m_src += suffix; } void cs_shuffle_base::bind_resources() { m_program->bind_buffer({m_data->value, m_data_offset, m_data_length}, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); } void cs_shuffle_base::set_parameters(const vk::command_buffer& cmd, const u32* params, u8 count) { ensure(use_push_constants); VK_GET_SYMBOL(vkCmdPushConstants)(cmd, m_pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, count * 4, params); } void cs_shuffle_base::run(const vk::command_buffer& cmd, const vk::buffer* data, u32 data_length, u32 data_offset) { m_data = data; m_data_offset = data_offset; m_data_length = data_length; const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4; const auto num_bytes_to_process = rsx::align2(data_length, num_bytes_per_invocation); const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation; if ((num_bytes_to_process + data_offset) > data->size()) { // Technically robust buffer access should keep the driver from crashing in OOB situations rsx_log.error("Inadequate buffer length submitted for a compute operation." "Required=%d bytes, Available=%d bytes", num_bytes_to_process, data->size()); } compute_task::run(cmd, num_invocations); } cs_interleave_task::cs_interleave_task() { use_push_constants = true; push_constants_size = 16; variables = " uint block_length = params[0].x >> 2;\n" " uint z_offset = params[0].y >> 2;\n" " uint s_offset = params[0].z >> 2;\n" " uint depth;\n" " uint stencil;\n" " uint stencil_shift;\n" " uint stencil_offset;\n"; } void cs_interleave_task::bind_resources() { m_program->bind_buffer({m_data->value, m_data_offset, m_ssbo_length}, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); } void cs_interleave_task::run(const vk::command_buffer& cmd, const vk::buffer* data, u32 data_offset, u32 data_length, u32 zeta_offset, u32 stencil_offset) { u32 parameters[4] = {data_length, zeta_offset - data_offset, stencil_offset - data_offset, 0}; set_parameters(cmd, parameters, 4); ensure(stencil_offset > data_offset); m_ssbo_length = stencil_offset + (data_length / 4) - data_offset; cs_shuffle_base::run(cmd, data, data_length, data_offset); } cs_scatter_d24x8::cs_scatter_d24x8() { work_kernel = " if (index >= block_length)\n" " return;\n" "\n" " value = data[index];\n" " data[index + z_offset] = (value >> 8);\n" " stencil_offset = (index / 4);\n" " stencil_shift = (index % 4) * 8;\n" " stencil = (value & 0xFF) << stencil_shift;\n" " atomicOr(data[stencil_offset + s_offset], stencil);\n"; cs_shuffle_base::build(""); } cs_aggregator::cs_aggregator() { ssbo_count = 2; create(); m_src = "#version 450\n" "layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;\n\n" "layout(set=0, binding=0, std430) readonly buffer ssbo0{ uint src[]; };\n" "layout(set=0, binding=1, std430) writeonly buffer ssbo1{ uint result; };\n\n" "void main()\n" "{\n" " if (gl_GlobalInvocationID.x < src.length())\n" " {\n" " atomicAdd(result, src[gl_GlobalInvocationID.x]);\n" " }\n" "}\n"; const std::pair syntax_replace[] = { {"%ws", std::to_string(optimal_group_size)}, }; m_src = fmt::replace_all(m_src, syntax_replace); } void cs_aggregator::bind_resources() { m_program->bind_buffer({src->value, 0, block_length}, 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); m_program->bind_buffer({dst->value, 0, 4}, 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_descriptor_set); } void cs_aggregator::run(const vk::command_buffer& cmd, const vk::buffer* dst, const vk::buffer* src, u32 num_words) { this->dst = dst; this->src = src; word_count = num_words; block_length = num_words * 4; const u32 linear_invocations = rx::aligned_div(word_count, optimal_group_size); compute_task::run(cmd, linear_invocations); } } // namespace vk