tiler: use push constant to deliver configuration

This commit is contained in:
DH 2024-10-02 02:08:09 +03:00
parent 835e3f3417
commit e2c7b0fe2d
2 changed files with 73 additions and 192 deletions

View file

@@ -990,7 +990,7 @@ uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTil
}
layout(binding=0) uniform Config {
layout(push_constant) uniform Config {
uint64_t srcAddress;
uint64_t srcEndAddress;
uint64_t dstAddress;
@@ -1003,6 +1003,4 @@ layout(binding=0) uniform Config {
uint32_t bitsPerElement;
uint32_t tiledSurfaceSize;
uint32_t linearSurfaceSize;
uint32_t padding0;
uint32_t padding1;
} config;

View file

@@ -3,7 +3,6 @@
#include "amdgpu/tiler.hpp"
#include <cstring>
#include <memory>
#include <rx/ConcurrentBitPool.hpp>
#include <vk.hpp>
#include <shaders/detiler1d.comp.h>
@@ -13,38 +12,33 @@
#include <shaders/tiler2d.comp.h>
#include <shaders/tilerLinear.comp.h>
struct TilerDecriptorSetLayout {
VkDescriptorSetLayout layout;
#include <vulkan/vulkan.h>
TilerDecriptorSetLayout() {
std::vector<VkDescriptorSetLayoutBinding> bindings{{
.binding = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
}};
VkDescriptorSetLayoutCreateInfo layoutInfo{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
.bindingCount = static_cast<uint32_t>(bindings.size()),
.pBindings = bindings.data(),
};
VK_VERIFY(vkCreateDescriptorSetLayout(vk::context->device, &layoutInfo,
nullptr, &layout));
}
~TilerDecriptorSetLayout() {
vkDestroyDescriptorSetLayout(vk::context->device, layout,
vk::context->allocator);
}
struct Config {
uint64_t srcAddress;
uint64_t srcEndAddress;
uint64_t dstAddress;
uint64_t dstEndAddress;
uint32_t dataWidth;
uint32_t dataHeight;
uint32_t tileMode;
uint32_t macroTileMode;
uint32_t dfmt;
uint32_t numFragments;
uint32_t bitsPerElement;
uint32_t tiledSurfaceSize;
uint32_t linearSurfaceSize;
};
struct TilerShader {
VkShaderEXT shader;
TilerShader(TilerDecriptorSetLayout &setLayout,
std::span<const std::uint32_t> spirv) {
TilerShader(std::span<const std::uint32_t> spirv) {
VkPushConstantRange pushConstantRange = {
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.offset = 0,
.size = sizeof(Config),
};
VkShaderCreateInfoEXT shaderInfo{
.sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT,
@@ -55,10 +49,8 @@ struct TilerShader {
.codeSize = spirv.size_bytes(),
.pCode = spirv.data(),
.pName = "main",
.setLayoutCount = 1,
.pSetLayouts = &setLayout.layout,
.pushConstantRangeCount = 0,
.pPushConstantRanges = 0,
.pushConstantRangeCount = 1,
.pPushConstantRanges = &pushConstantRange,
.pSpecializationInfo = 0,
};
@@ -72,96 +64,35 @@
};
struct amdgpu::GpuTiler::Impl {
static constexpr auto kDescriptorSetCount = 32;
TilerDecriptorSetLayout descriptorSetLayout;
rx::ConcurrentBitPool<kDescriptorSetCount, std::uint32_t> descriptorSetPool;
VkDescriptorSet descriptorSets[kDescriptorSetCount]{};
VkDescriptorPool descriptorPool;
vk::Buffer configData;
TilerShader detilerLinear{descriptorSetLayout, spirv_detilerLinear_comp};
TilerShader detiler1d{descriptorSetLayout, spirv_detiler1d_comp};
TilerShader detiler2d{descriptorSetLayout, spirv_detilerLinear_comp};
TilerShader tilerLinear{descriptorSetLayout, spirv_tiler2d_comp};
TilerShader tiler1d{descriptorSetLayout, spirv_tiler1d_comp};
TilerShader tiler2d{descriptorSetLayout, spirv_tiler2d_comp};
TilerShader detilerLinear{spirv_detilerLinear_comp};
TilerShader detiler1d{spirv_detiler1d_comp};
TilerShader detiler2d{spirv_detilerLinear_comp};
TilerShader tilerLinear{spirv_tiler2d_comp};
TilerShader tiler1d{spirv_tiler1d_comp};
TilerShader tiler2d{spirv_tiler2d_comp};
VkPipelineLayout pipelineLayout;
struct alignas(64) Config {
uint64_t srcAddress;
uint64_t srcEndAddress;
uint64_t dstAddress;
uint64_t dstEndAddress;
uint32_t dataWidth;
uint32_t dataHeight;
uint32_t tileMode;
uint32_t macroTileMode;
uint32_t dfmt;
uint32_t numFragments;
uint32_t bitsPerElement;
uint32_t tiledSurfaceSize;
uint32_t linearSurfaceSize;
};
Impl() {
std::size_t count = 256;
configData = vk::Buffer::Allocate(
vk::getHostVisibleMemory(), sizeof(Config) * count,
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
VkPushConstantRange pushConstantRange = {
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.offset = 0,
.size = sizeof(Config),
};
VkPipelineLayoutCreateInfo piplineLayoutInfo{
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 1,
.pSetLayouts = &descriptorSetLayout.layout,
.pushConstantRangeCount = 1,
.pPushConstantRanges = &pushConstantRange,
};
VK_VERIFY(vkCreatePipelineLayout(vk::context->device, &piplineLayoutInfo,
nullptr, &pipelineLayout));
{
VkDescriptorPoolSize poolSizes[]{{
.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.descriptorCount =
static_cast<std::uint32_t>(std::size(descriptorSets)) * 2,
}};
VkDescriptorPoolCreateInfo info{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
.maxSets = static_cast<std::uint32_t>(std::size(descriptorSets)) * 2,
.poolSizeCount = static_cast<uint32_t>(std::size(poolSizes)),
.pPoolSizes = poolSizes,
};
VK_VERIFY(vkCreateDescriptorPool(
vk::context->device, &info, vk::context->allocator, &descriptorPool));
}
VkDescriptorSetAllocateInfo info{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
.descriptorPool = descriptorPool,
.descriptorSetCount = 1,
.pSetLayouts = &descriptorSetLayout.layout,
};
for (std::size_t i = 0; i < std::size(descriptorSets); ++i) {
VK_VERIFY(vkAllocateDescriptorSets(vk::context->device, &info,
descriptorSets + i));
}
}
~Impl() {
vkDestroyDescriptorPool(vk::context->device, descriptorPool,
vk::context->allocator);
vkDestroyPipelineLayout(vk::context->device, pipelineLayout,
vk::context->allocator);
}
std::uint32_t allocateDescriptorSlot() { return descriptorSetPool.acquire(); }
void releaseDescriptorSlot(std::uint32_t slot) {
descriptorSetPool.release(slot);
}
};
amdgpu::GpuTiler::GpuTiler() { mImpl = std::make_unique<Impl>(); }
@@ -176,32 +107,28 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler,
std::uint64_t dstSize, int mipLevel,
int baseArray, int arrayCount) {
auto commandBuffer = scheduler.getCommandBuffer();
auto slot = mImpl->allocateDescriptorSlot();
auto configOffset = slot * sizeof(Impl::Config);
auto config = reinterpret_cast<Impl::Config *>(mImpl->configData.getData() +
configOffset);
Config config{};
auto &subresource = info.getSubresourceInfo(mipLevel);
config->srcAddress = srcTiledAddress + subresource.offset;
config->srcEndAddress = srcTiledAddress + srcSize;
config->dstAddress = dstLinearAddress;
config->dstEndAddress = dstLinearAddress + dstSize;
config->dataWidth = subresource.dataWidth;
config->dataHeight = subresource.dataHeight;
config->tileMode = tileMode.raw;
config->dfmt = dfmt;
config->numFragments = info.numFragments;
config->bitsPerElement = info.bitsPerElement;
config.srcAddress = srcTiledAddress + subresource.offset;
config.srcEndAddress = srcTiledAddress + srcSize;
config.dstAddress = dstLinearAddress;
config.dstEndAddress = dstLinearAddress + dstSize;
config.dataWidth = subresource.dataWidth;
config.dataHeight = subresource.dataHeight;
config.tileMode = tileMode.raw;
config.dfmt = dfmt;
config.numFragments = info.numFragments;
config.bitsPerElement = info.bitsPerElement;
uint32_t groupCountZ = subresource.dataDepth;
if (arrayCount > 1) {
config->tiledSurfaceSize = subresource.tiledSize;
config->linearSurfaceSize = subresource.linearSize;
config.tiledSurfaceSize = subresource.tiledSize;
config.linearSurfaceSize = subresource.linearSize;
groupCountZ = arrayCount;
} else {
config->tiledSurfaceSize = 0;
config->linearSurfaceSize = 0;
config.tiledSurfaceSize = 0;
config.linearSurfaceSize = 0;
}
VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
@@ -230,7 +157,7 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler,
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayMode3dTiledThickPrt:
config->macroTileMode =
config.macroTileMode =
getDefaultMacroTileModes()[computeMacroTileIndex(
tileMode, info.bitsPerElement,
1 << info.numFragments)]
@@ -240,31 +167,10 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler,
break;
}
VkDescriptorBufferInfo bufferInfo{
.buffer = mImpl->configData.getHandle(),
.offset = configOffset,
.range = sizeof(Impl::Config),
};
VkWriteDescriptorSet writeDescSet{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = mImpl->descriptorSets[slot],
.dstBinding = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.pBufferInfo = &bufferInfo,
};
vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
mImpl->pipelineLayout, 0, 1,
&mImpl->descriptorSets[slot], 0, nullptr);
vkCmdPushConstants(commandBuffer, mImpl->pipelineLayout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(config), &config);
vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight,
groupCountZ);
scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); });
}
void amdgpu::GpuTiler::tile(Scheduler &scheduler,
@@ -276,32 +182,29 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler,
std::uint64_t dstSize, int mipLevel, int baseArray,
int arrayCount) {
auto commandBuffer = scheduler.getCommandBuffer();
auto slot = mImpl->allocateDescriptorSlot();
auto configOffset = slot * sizeof(Impl::Config);
auto config = reinterpret_cast<Impl::Config *>(mImpl->configData.getData() +
configOffset);
Config config{};
auto &subresource = info.getSubresourceInfo(mipLevel);
config->srcAddress = srcLinearAddress;
config->srcEndAddress = srcLinearAddress + srcSize;
config->dstAddress = dstTiledAddress + subresource.offset;
config->dstEndAddress = dstTiledAddress + dstSize;
config->dataWidth = subresource.dataWidth;
config->dataHeight = subresource.dataHeight;
config->tileMode = tileMode.raw;
config->dfmt = dfmt;
config->numFragments = info.numFragments;
config->bitsPerElement = info.bitsPerElement;
config.srcAddress = srcLinearAddress;
config.srcEndAddress = srcLinearAddress + srcSize;
config.dstAddress = dstTiledAddress + subresource.offset;
config.dstEndAddress = dstTiledAddress + dstSize;
config.dataWidth = subresource.dataWidth;
config.dataHeight = subresource.dataHeight;
config.tileMode = tileMode.raw;
config.dfmt = dfmt;
config.numFragments = info.numFragments;
config.bitsPerElement = info.bitsPerElement;
uint32_t groupCountZ = subresource.dataDepth;
if (arrayCount > 1) {
config->tiledSurfaceSize = subresource.tiledSize;
config->linearSurfaceSize = subresource.linearSize;
config.tiledSurfaceSize = subresource.tiledSize;
config.linearSurfaceSize = subresource.linearSize;
groupCountZ = arrayCount;
} else {
config->tiledSurfaceSize = 0;
config->linearSurfaceSize = 0;
config.tiledSurfaceSize = 0;
config.linearSurfaceSize = 0;
}
VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
@@ -329,7 +232,7 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler,
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayMode3dTiledThickPrt:
config->macroTileMode =
config.macroTileMode =
getDefaultMacroTileModes()[computeMacroTileIndex(
tileMode, info.bitsPerElement,
1 << info.numFragments)]
@@ -338,29 +241,9 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler,
break;
}
VkDescriptorBufferInfo bufferInfo{
.buffer = mImpl->configData.getHandle(),
.offset = configOffset,
.range = sizeof(Impl::Config),
};
VkWriteDescriptorSet writeDescSet{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = mImpl->descriptorSets[slot],
.dstBinding = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.pBufferInfo = &bufferInfo,
};
vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
mImpl->pipelineLayout, 0, 1,
&mImpl->descriptorSets[slot], 0, nullptr);
vkCmdPushConstants(commandBuffer, mImpl->pipelineLayout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(config), &config);
vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight,
groupCountZ);
scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); });
}