From e2c7b0fe2de5b0fb412a02bd88b3e4a07b0cb7a0 Mon Sep 17 00:00:00 2001 From: DH Date: Wed, 2 Oct 2024 02:08:09 +0300 Subject: [PATCH] tiler: use push constant to deliver configuration --- rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler.glsl | 4 +- .../lib/amdgpu-tiler/src/tiler_vulkan.cpp | 261 +++++------------- 2 files changed, 73 insertions(+), 192 deletions(-) diff --git a/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler.glsl b/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler.glsl index 10507d5c3..29be1b02d 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler.glsl +++ b/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler.glsl @@ -990,7 +990,7 @@ uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTil } -layout(binding=0) uniform Config { +layout(push_constant) uniform Config { uint64_t srcAddress; uint64_t srcEndAddress; uint64_t dstAddress; @@ -1003,6 +1003,4 @@ layout(binding=0) uniform Config { uint32_t bitsPerElement; uint32_t tiledSurfaceSize; uint32_t linearSurfaceSize; - uint32_t padding0; - uint32_t padding1; } config; diff --git a/rpcsx-gpu/lib/amdgpu-tiler/src/tiler_vulkan.cpp b/rpcsx-gpu/lib/amdgpu-tiler/src/tiler_vulkan.cpp index 43ae86e0e..e3ad4dc8e 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/src/tiler_vulkan.cpp +++ b/rpcsx-gpu/lib/amdgpu-tiler/src/tiler_vulkan.cpp @@ -3,7 +3,6 @@ #include "amdgpu/tiler.hpp" #include #include -#include #include #include @@ -13,38 +12,33 @@ #include #include -struct TilerDecriptorSetLayout { - VkDescriptorSetLayout layout; +#include - TilerDecriptorSetLayout() { - std::vector bindings{{ - .binding = 0, - .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - }}; - - VkDescriptorSetLayoutCreateInfo layoutInfo{ - .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, - .bindingCount = static_cast(bindings.size()), - .pBindings = bindings.data(), - }; - - VK_VERIFY(vkCreateDescriptorSetLayout(vk::context->device, &layoutInfo, - nullptr, &layout)); - } - - ~TilerDecriptorSetLayout() { - vkDestroyDescriptorSetLayout(vk::context->device, layout, - vk::context->allocator); - } +struct Config { + uint64_t srcAddress; + uint64_t srcEndAddress; + uint64_t dstAddress; + uint64_t dstEndAddress; + uint32_t dataWidth; + uint32_t dataHeight; + uint32_t tileMode; + uint32_t macroTileMode; + uint32_t dfmt; + uint32_t numFragments; + uint32_t bitsPerElement; + uint32_t tiledSurfaceSize; + uint32_t linearSurfaceSize; }; struct TilerShader { VkShaderEXT shader; - TilerShader(TilerDecriptorSetLayout &setLayout, - std::span spirv) { + TilerShader(std::span spirv) { + VkPushConstantRange pushConstantRange = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = sizeof(Config), + }; VkShaderCreateInfoEXT shaderInfo{ .sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT, @@ -55,10 +49,8 @@ struct TilerShader { .codeSize = spirv.size_bytes(), .pCode = spirv.data(), .pName = "main", - .setLayoutCount = 1, - .pSetLayouts = &setLayout.layout, - .pushConstantRangeCount = 0, - .pPushConstantRanges = 0, + .pushConstantRangeCount = 1, + .pPushConstantRanges = &pushConstantRange, .pSpecializationInfo = 0, }; @@ -72,96 +64,35 @@ struct TilerShader { }; struct amdgpu::GpuTiler::Impl { - static constexpr auto kDescriptorSetCount = 32; - TilerDecriptorSetLayout descriptorSetLayout; - rx::ConcurrentBitPool descriptorSetPool; - VkDescriptorSet descriptorSets[kDescriptorSetCount]{}; - VkDescriptorPool descriptorPool; - - vk::Buffer configData; - TilerShader detilerLinear{descriptorSetLayout, spirv_detilerLinear_comp}; - TilerShader detiler1d{descriptorSetLayout, spirv_detiler1d_comp}; - TilerShader detiler2d{descriptorSetLayout, spirv_detilerLinear_comp}; - TilerShader tilerLinear{descriptorSetLayout, spirv_tiler2d_comp}; - TilerShader tiler1d{descriptorSetLayout, spirv_tiler1d_comp}; - TilerShader tiler2d{descriptorSetLayout, spirv_tiler2d_comp}; + TilerShader detilerLinear{spirv_detilerLinear_comp}; + TilerShader detiler1d{spirv_detiler1d_comp}; + TilerShader detiler2d{spirv_detilerLinear_comp}; + TilerShader tilerLinear{spirv_tiler2d_comp}; + TilerShader tiler1d{spirv_tiler1d_comp}; + TilerShader tiler2d{spirv_tiler2d_comp}; VkPipelineLayout pipelineLayout; - struct alignas(64) Config { - uint64_t srcAddress; - uint64_t srcEndAddress; - uint64_t dstAddress; - uint64_t dstEndAddress; - uint32_t dataWidth; - uint32_t dataHeight; - uint32_t tileMode; - uint32_t macroTileMode; - uint32_t dfmt; - uint32_t numFragments; - uint32_t bitsPerElement; - uint32_t tiledSurfaceSize; - uint32_t linearSurfaceSize; - }; - Impl() { - std::size_t count = 256; - - configData = vk::Buffer::Allocate( - vk::getHostVisibleMemory(), sizeof(Config) * count, - VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); + VkPushConstantRange pushConstantRange = { + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .offset = 0, + .size = sizeof(Config), + }; VkPipelineLayoutCreateInfo piplineLayoutInfo{ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, - .setLayoutCount = 1, - .pSetLayouts = &descriptorSetLayout.layout, + .pushConstantRangeCount = 1, + .pPushConstantRanges = &pushConstantRange, }; VK_VERIFY(vkCreatePipelineLayout(vk::context->device, &piplineLayoutInfo, nullptr, &pipelineLayout)); - - { - VkDescriptorPoolSize poolSizes[]{{ - .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - .descriptorCount = - static_cast(std::size(descriptorSets)) * 2, - }}; - - VkDescriptorPoolCreateInfo info{ - .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, - .maxSets = static_cast(std::size(descriptorSets)) * 2, - .poolSizeCount = static_cast(std::size(poolSizes)), - .pPoolSizes = poolSizes, - }; - - VK_VERIFY(vkCreateDescriptorPool( - vk::context->device, &info, vk::context->allocator, &descriptorPool)); - } - - VkDescriptorSetAllocateInfo info{ - .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, - .descriptorPool = descriptorPool, - .descriptorSetCount = 1, - .pSetLayouts = &descriptorSetLayout.layout, - }; - for (std::size_t i = 0; i < std::size(descriptorSets); ++i) { - VK_VERIFY(vkAllocateDescriptorSets(vk::context->device, &info, - descriptorSets + i)); - } } ~Impl() { - vkDestroyDescriptorPool(vk::context->device, descriptorPool, - vk::context->allocator); vkDestroyPipelineLayout(vk::context->device, pipelineLayout, vk::context->allocator); } - - std::uint32_t allocateDescriptorSlot() { return descriptorSetPool.acquire(); } - - void releaseDescriptorSlot(std::uint32_t slot) { - descriptorSetPool.release(slot); - } }; amdgpu::GpuTiler::GpuTiler() { mImpl = std::make_unique(); } @@ -176,32 +107,28 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler, std::uint64_t dstSize, int mipLevel, int baseArray, int arrayCount) { auto commandBuffer = scheduler.getCommandBuffer(); - auto slot = mImpl->allocateDescriptorSlot(); - - auto configOffset = slot * sizeof(Impl::Config); - auto config = reinterpret_cast(mImpl->configData.getData() + - configOffset); + Config config{}; auto &subresource = info.getSubresourceInfo(mipLevel); - config->srcAddress = srcTiledAddress + subresource.offset; - config->srcEndAddress = srcTiledAddress + srcSize; - config->dstAddress = dstLinearAddress; - config->dstEndAddress = dstLinearAddress + dstSize; - config->dataWidth = subresource.dataWidth; - config->dataHeight = subresource.dataHeight; - config->tileMode = tileMode.raw; - config->dfmt = dfmt; - config->numFragments = info.numFragments; - config->bitsPerElement = info.bitsPerElement; + config.srcAddress = srcTiledAddress + subresource.offset; + config.srcEndAddress = srcTiledAddress + srcSize; + config.dstAddress = dstLinearAddress; + config.dstEndAddress = dstLinearAddress + dstSize; + config.dataWidth = subresource.dataWidth; + config.dataHeight = subresource.dataHeight; + config.tileMode = tileMode.raw; + config.dfmt = dfmt; + config.numFragments = info.numFragments; + config.bitsPerElement = info.bitsPerElement; uint32_t groupCountZ = subresource.dataDepth; if (arrayCount > 1) { - config->tiledSurfaceSize = subresource.tiledSize; - config->linearSurfaceSize = subresource.linearSize; + config.tiledSurfaceSize = subresource.tiledSize; + config.linearSurfaceSize = subresource.linearSize; groupCountZ = arrayCount; } else { - config->tiledSurfaceSize = 0; - config->linearSurfaceSize = 0; + config.tiledSurfaceSize = 0; + config.linearSurfaceSize = 0; } VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT}; @@ -230,7 +157,7 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler, case amdgpu::kArrayMode3dTiledThick: case amdgpu::kArrayMode3dTiledXThick: case amdgpu::kArrayMode3dTiledThickPrt: - config->macroTileMode = + config.macroTileMode = getDefaultMacroTileModes()[computeMacroTileIndex( tileMode, info.bitsPerElement, 1 << info.numFragments)] @@ -240,31 +167,10 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler, break; } - VkDescriptorBufferInfo bufferInfo{ - .buffer = mImpl->configData.getHandle(), - .offset = configOffset, - .range = sizeof(Impl::Config), - }; - - VkWriteDescriptorSet writeDescSet{ - .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, - .dstSet = mImpl->descriptorSets[slot], - .dstBinding = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - .pBufferInfo = &bufferInfo, - }; - - vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr); - - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - mImpl->pipelineLayout, 0, 1, - &mImpl->descriptorSets[slot], 0, nullptr); - + vkCmdPushConstants(commandBuffer, mImpl->pipelineLayout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(config), &config); vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight, groupCountZ); - - scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); }); } void amdgpu::GpuTiler::tile(Scheduler &scheduler, @@ -276,32 +182,29 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler, std::uint64_t dstSize, int mipLevel, int baseArray, int arrayCount) { auto commandBuffer = scheduler.getCommandBuffer(); - auto slot = mImpl->allocateDescriptorSlot(); - auto configOffset = slot * sizeof(Impl::Config); - auto config = reinterpret_cast(mImpl->configData.getData() + - configOffset); + Config config{}; auto &subresource = info.getSubresourceInfo(mipLevel); - config->srcAddress = srcLinearAddress; - config->srcEndAddress = srcLinearAddress + srcSize; - config->dstAddress = dstTiledAddress + subresource.offset; - config->dstEndAddress = dstTiledAddress + dstSize; - config->dataWidth = subresource.dataWidth; - config->dataHeight = subresource.dataHeight; - config->tileMode = tileMode.raw; - config->dfmt = dfmt; - config->numFragments = info.numFragments; - config->bitsPerElement = info.bitsPerElement; + config.srcAddress = srcLinearAddress; + config.srcEndAddress = srcLinearAddress + srcSize; + config.dstAddress = dstTiledAddress + subresource.offset; + config.dstEndAddress = dstTiledAddress + dstSize; + config.dataWidth = subresource.dataWidth; + config.dataHeight = subresource.dataHeight; + config.tileMode = tileMode.raw; + config.dfmt = dfmt; + config.numFragments = info.numFragments; + config.bitsPerElement = info.bitsPerElement; uint32_t groupCountZ = subresource.dataDepth; if (arrayCount > 1) { - config->tiledSurfaceSize = subresource.tiledSize; - config->linearSurfaceSize = subresource.linearSize; + config.tiledSurfaceSize = subresource.tiledSize; + config.linearSurfaceSize = subresource.linearSize; groupCountZ = arrayCount; } else { - config->tiledSurfaceSize = 0; - config->linearSurfaceSize = 0; + config.tiledSurfaceSize = 0; + config.linearSurfaceSize = 0; } VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT}; @@ -329,7 +232,7 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler, case amdgpu::kArrayMode3dTiledThick: case amdgpu::kArrayMode3dTiledXThick: case amdgpu::kArrayMode3dTiledThickPrt: - config->macroTileMode = + config.macroTileMode = getDefaultMacroTileModes()[computeMacroTileIndex( tileMode, info.bitsPerElement, 1 << info.numFragments)] @@ -338,29 +241,9 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler, break; } - VkDescriptorBufferInfo bufferInfo{ - .buffer = mImpl->configData.getHandle(), - .offset = configOffset, - .range = sizeof(Impl::Config), - }; - - VkWriteDescriptorSet writeDescSet{ - .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, - .dstSet = mImpl->descriptorSets[slot], - .dstBinding = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - .pBufferInfo = &bufferInfo, - }; - - vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr); - - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, - mImpl->pipelineLayout, 0, 1, - &mImpl->descriptorSets[slot], 0, nullptr); + vkCmdPushConstants(commandBuffer, mImpl->pipelineLayout, + VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(config), &config); vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight, groupCountZ); - - scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); }); }