gpu2: move shader resource management to cache

fixed descriptor set binding
fixed 5_6_5 format swizzling
fix rect calculation
fix possible crash in scheduler
implement lock-free bit pool utility
This commit is contained in:
DH 2024-09-29 03:22:39 +03:00
parent 4e83c9e121
commit 0877d3f1cd
12 changed files with 1175 additions and 967 deletions

View file

@ -1,8 +1,6 @@
#include "Renderer.hpp"
#include "Device.hpp"
#include "gnm/descriptors.hpp"
#include "gnm/gnm.hpp"
#include "rx/MemoryTable.hpp"
#include <amdgpu/tiler.hpp>
#include <gnm/constants.hpp>
@ -14,7 +12,6 @@
#include <shaders/fill_red.frag.h>
#include <shaders/rect_list.geom.h>
#include <bit>
#include <vulkan/vulkan_core.h>
using namespace shader;
@ -35,39 +32,26 @@ VkRect2D toVkRect2D(amdgpu::PaScRect rect) {
};
}
// Intersects two scissor rectangles. An invalid rect acts as the identity
// (i.e. "no scissor"), so intersecting with it returns the other operand
// unchanged. The result is normalized so that left <= right and
// top <= bottom even when the rectangles do not overlap.
amdgpu::PaScRect intersection(amdgpu::PaScRect lhs, amdgpu::PaScRect rhs) {
  if (!lhs.isValid()) {
    return rhs;
  }
  if (!rhs.isValid()) {
    return lhs;
  }

  amdgpu::PaScRect result{
      .left = std::max(lhs.left, rhs.left),
      .top = std::max(lhs.top, rhs.top),
      .right = std::min(lhs.right, rhs.right),
      .bottom = std::min(lhs.bottom, rhs.bottom),
  };

  // Collapse empty intersections into a well-formed (possibly zero-area)
  // rect instead of one with inverted edges.
  result.top = std::min(result.top, result.bottom);
  result.bottom = std::max(result.top, result.bottom);
  result.left = std::min(result.left, result.right);
  result.right = std::max(result.left, result.right);
  return result;
}
} // namespace gnm
// One entry of the GPU-visible memory table: maps a guest address range to
// the device address of the buffer that backs it.
struct MemoryTableSlot {
  std::uint64_t address; // guest start address of the mapped range
  union {
    struct {
      std::uint64_t size : 40;  // range size in bytes (40-bit field, up to 1 TiB)
      std::uint64_t flags : 4;  // access flags for the range (filled from Access in buildMemoryTable)
    };
    std::uint64_t sizeAndFlags; // packed view of size+flags as one word
  };
  std::uint64_t deviceAddress; // device address of the backing Vulkan buffer
};

// Header plus flexible array of slots, written directly into a GPU buffer.
struct MemoryTable {
  std::uint32_t count; // number of valid entries in slots[]
  std::uint32_t pad;   // keeps slots[] 8-byte aligned
  // NOTE(review): flexible array member is a compiler extension in C++.
  MemoryTableSlot slots[];
};
static VkShaderEXT getPrimTypeRectGeomShader(amdgpu::Cache &cache) {
static VkShaderEXT shader = VK_NULL_HANDLE;
if (shader != VK_NULL_HANDLE) {
@ -151,282 +135,6 @@ static VkPrimitiveTopology toVkPrimitiveType(gnm::PrimitiveType type) {
}
}
// Evaluates the resource definitions of a GCN shader (pointers, V# buffer
// descriptors, T# texture descriptors, S# sampler descriptors) on the CPU
// against guest memory, registering each resolved resource with the cache
// tag and recording the bookkeeping needed to later build the GPU memory
// table and patch resource-slot config values.
struct ShaderResources : eval::Evaluator {
  amdgpu::Cache::Tag *cacheTag; // non-owning; must be set by the caller before use
  shader::eval::Evaluator evaluator;
  // resource slot id -> index into the matching resource array / memory table
  std::map<std::uint32_t, std::uint32_t> slotResources;
  std::span<const std::uint32_t> userSgprs; // user SGPR values of the stage being loaded
  std::uint32_t slotOffset = 0; // running base so slots of successive stages don't collide
  // union of guest address ranges touched by buffers/pointers, with access flags
  rx::MemoryTableWithPayload<Access> bufferMemoryTable;
  // (slot, guest address) pairs, resolved to memory-table indices in buildMemoryTable
  std::vector<std::pair<std::uint32_t, std::uint64_t>> resourceSlotToAddress;
  std::vector<amdgpu::Cache::Sampler> samplerResources;
  // image views bucketed by dimensionality: [0]=1D, [1]=2D/cube/MSAA, [2]=3D
  std::vector<amdgpu::Cache::ImageView> imageResources[3];
  using Evaluator::eval;

  ShaderResources() = default;

  // Evaluates every resource declared in `res` using `userSgprs` and guest
  // memory. Dies (with a dump of `res`) if any descriptor word cannot be
  // evaluated. Advances slotOffset by res.slots so the next stage's slots
  // land after this one's.
  void loadResources(shader::gcn::Resources &res,
                     std::span<const std::uint32_t> userSgprs) {
    this->userSgprs = userSgprs;

    // Raw pointer resources: map [base, base+offset+size) as readable and
    // remember which slot refers to that address.
    for (auto &pointer : res.pointers) {
      auto pointerBase = eval(pointer.base).zExtScalar();
      auto pointerOffset = eval(pointer.offset).zExtScalar();

      if (!pointerBase || !pointerOffset) {
        res.dump();
        rx::die("failed to evaluate pointer");
      }

      bufferMemoryTable.map(*pointerBase,
                            *pointerBase + *pointerOffset + pointer.size,
                            Access::Read);
      resourceSlotToAddress.push_back(
          {slotOffset + pointer.resourceSlot, *pointerBase});
    }

    // V# buffer descriptors: reassemble the 128-bit descriptor from its
    // four evaluated words, then map its address range.
    for (auto &bufferRes : res.buffers) {
      auto word0 = eval(bufferRes.words[0]).zExtScalar();
      auto word1 = eval(bufferRes.words[1]).zExtScalar();
      auto word2 = eval(bufferRes.words[2]).zExtScalar();
      auto word3 = eval(bufferRes.words[3]).zExtScalar();

      if (!word0 || !word1 || !word2 || !word3) {
        res.dump();
        rx::die("failed to evaluate V#");
      }

      gnm::VBuffer buffer{};
      // memcpy word-by-word to avoid aliasing/strict-type issues.
      std::memcpy(reinterpret_cast<std::uint32_t *>(&buffer), &*word0,
                  sizeof(std::uint32_t));
      std::memcpy(reinterpret_cast<std::uint32_t *>(&buffer) + 1, &*word1,
                  sizeof(std::uint32_t));
      std::memcpy(reinterpret_cast<std::uint32_t *>(&buffer) + 2, &*word2,
                  sizeof(std::uint32_t));
      std::memcpy(reinterpret_cast<std::uint32_t *>(&buffer) + 3, &*word3,
                  sizeof(std::uint32_t));

      bufferMemoryTable.map(buffer.address(), buffer.address() + buffer.size(),
                            bufferRes.access);
      resourceSlotToAddress.push_back(
          {slotOffset + bufferRes.resourceSlot, buffer.address()});
    }

    // T# texture descriptors: 128-bit base descriptor, optionally extended
    // to 256 bits when words[4..7] are present.
    for (auto &texture : res.textures) {
      auto word0 = eval(texture.words[0]).zExtScalar();
      auto word1 = eval(texture.words[1]).zExtScalar();
      auto word2 = eval(texture.words[2]).zExtScalar();
      auto word3 = eval(texture.words[3]).zExtScalar();

      if (!word0 || !word1 || !word2 || !word3) {
        res.dump();
        rx::die("failed to evaluate 128 bit T#");
      }

      gnm::TBuffer buffer{};
      std::memcpy(reinterpret_cast<std::uint32_t *>(&buffer), &*word0,
                  sizeof(std::uint32_t));
      std::memcpy(reinterpret_cast<std::uint32_t *>(&buffer) + 1, &*word1,
                  sizeof(std::uint32_t));
      std::memcpy(reinterpret_cast<std::uint32_t *>(&buffer) + 2, &*word2,
                  sizeof(std::uint32_t));
      std::memcpy(reinterpret_cast<std::uint32_t *>(&buffer) + 3, &*word3,
                  sizeof(std::uint32_t));

      if (texture.words[4] != nullptr) {
        auto word4 = eval(texture.words[4]).zExtScalar();
        auto word5 = eval(texture.words[5]).zExtScalar();
        auto word6 = eval(texture.words[6]).zExtScalar();
        auto word7 = eval(texture.words[7]).zExtScalar();

        if (!word4 || !word5 || !word6 || !word7) {
          res.dump();
          rx::die("failed to evaluate 256 bit T#");
        }

        std::memcpy(reinterpret_cast<std::uint32_t *>(&buffer) + 4, &*word4,
                    sizeof(std::uint32_t));
        std::memcpy(reinterpret_cast<std::uint32_t *>(&buffer) + 5, &*word5,
                    sizeof(std::uint32_t));
        std::memcpy(reinterpret_cast<std::uint32_t *>(&buffer) + 6, &*word6,
                    sizeof(std::uint32_t));
        std::memcpy(reinterpret_cast<std::uint32_t *>(&buffer) + 7, &*word7,
                    sizeof(std::uint32_t));
      }

      // Bucket the view by dimensionality; the descriptor-set layout has
      // one sampled-image binding per dimension.
      std::vector<amdgpu::Cache::ImageView> *resources = nullptr;
      switch (buffer.type) {
      case gnm::TextureType::Array1D:
      case gnm::TextureType::Dim1D:
        resources = &imageResources[0];
        break;

      case gnm::TextureType::Dim2D:
      case gnm::TextureType::Array2D:
      case gnm::TextureType::Msaa2D:
      case gnm::TextureType::MsaaArray2D:
      case gnm::TextureType::Cube:
        resources = &imageResources[1];
        break;

      case gnm::TextureType::Dim3D:
        resources = &imageResources[2];
        break;
      }

      rx::dieIf(resources == nullptr,
                "ShaderResources: unexpected texture type %u",
                static_cast<unsigned>(buffer.type));

      slotResources[slotOffset + texture.resourceSlot] = resources->size();
      resources->push_back(cacheTag->getImageView(
          amdgpu::ImageViewKey::createFrom(buffer), texture.access));
    }

    // S# sampler descriptors.
    for (auto &sampler : res.samplers) {
      auto word0 = eval(sampler.words[0]).zExtScalar();
      auto word1 = eval(sampler.words[1]).zExtScalar();
      auto word2 = eval(sampler.words[2]).zExtScalar();
      auto word3 = eval(sampler.words[3]).zExtScalar();

      if (!word0 || !word1 || !word2 || !word3) {
        res.dump();
        rx::die("failed to evaluate S#");
      }

      gnm::SSampler sSampler{};
      std::memcpy(reinterpret_cast<std::uint32_t *>(&sSampler), &*word0,
                  sizeof(std::uint32_t));
      std::memcpy(reinterpret_cast<std::uint32_t *>(&sSampler) + 1, &*word1,
                  sizeof(std::uint32_t));
      std::memcpy(reinterpret_cast<std::uint32_t *>(&sSampler) + 2, &*word2,
                  sizeof(std::uint32_t));
      std::memcpy(reinterpret_cast<std::uint32_t *>(&sSampler) + 3, &*word3,
                  sizeof(std::uint32_t));

      // The shader may require unnormalized coordinates regardless of what
      // the descriptor says.
      if (sampler.unorm) {
        sSampler.force_unorm_coords = true;
      }

      slotResources[slotOffset + sampler.resourceSlot] =
          samplerResources.size();
      samplerResources.push_back(
          cacheTag->getSampler(amdgpu::SamplerKey::createFrom(sSampler)));
    }

    slotOffset += res.slots;
  }

  // Fills `memoryTable` with one slot per mapped guest range (fetching the
  // backing buffer from the cache), and resolves each recorded
  // (slot, address) pair to the memory-table index covering that address.
  // NOTE(review): no bounds check on memoryTable.count — assumes the
  // destination buffer is large enough for all ranges; confirm at call site.
  void buildMemoryTable(MemoryTable &memoryTable) {
    memoryTable.count = 0;

    for (auto p : bufferMemoryTable) {
      auto size = p.endAddress - p.beginAddress;
      auto buffer = cacheTag->getBuffer(p.beginAddress, size, p.payload);
      auto memoryTableSlot = memoryTable.count;
      memoryTable.slots[memoryTable.count++] = {
          .address = p.beginAddress,
          .size = size,
          .flags = static_cast<uint8_t>(p.payload),
          .deviceAddress = buffer.deviceAddress,
      };

      for (auto [slot, address] : resourceSlotToAddress) {
        if (address >= p.beginAddress && address < p.endAddress) {
          slotResources[slot] = memoryTableSlot;
        }
      }
    }
  }

  // Returns the resource index previously recorded for slot `id`, or
  // 0xffffffff (-1 wrapped to unsigned) when the slot is unknown.
  std::uint32_t getResourceSlot(std::uint32_t id) {
    if (auto it = slotResources.find(id); it != slotResources.end()) {
      return it->second;
    }
    return -1;
  }

  // Reads a T-sized value from guest memory through the cache tag.
  template <typename T> T readPointer(std::uint64_t address) {
    T result{};
    cacheTag->readMemory(&result, address, sizeof(result));
    return result;
  }

  // Evaluator hook: resolves amdgpu-specific IR instructions that require
  // guest-memory or register context; everything else falls through to the
  // base evaluator.
  eval::Value eval(ir::InstructionId instId,
                   std::span<const ir::Operand> operands) override {
    if (instId == ir::amdgpu::POINTER) {
      // Dereference a pointer: operands are (type, loadSize, base, offset).
      auto type = operands[0].getAsValue();
      auto loadSize = *operands[1].getAsInt32();
      auto base = eval(operands[2]).zExtScalar();
      auto offset = eval(operands[3]).zExtScalar();

      if (!base || !offset) {
        rx::die("failed to evaluate pointer dependency");
      }

      eval::Value result;
      auto address = *base + *offset;

      // Load the requested number of bytes from guest memory.
      switch (loadSize) {
      case 1:
        result = readPointer<std::uint8_t>(address);
        break;
      case 2:
        result = readPointer<std::uint16_t>(address);
        break;
      case 4:
        result = readPointer<std::uint32_t>(address);
        break;
      case 8:
        result = readPointer<std::uint64_t>(address);
        break;
      case 12:
        result = readPointer<u32vec3>(address);
        break;
      case 16:
        result = readPointer<u32vec4>(address);
        break;
      case 32:
        result = readPointer<std::array<std::uint32_t, 8>>(address);
        break;
      default:
        rx::die("unexpected pointer load size");
      }

      return result;
    }

    // Descriptor values themselves must never feed back into resource
    // evaluation — that would mean a resource defined in terms of another
    // resource's runtime contents.
    if (instId == ir::amdgpu::VBUFFER) {
      rx::die("resource depends on buffer value");
    }
    if (instId == ir::amdgpu::TBUFFER) {
      rx::die("resource depends on texture value");
    }
    if (instId == ir::amdgpu::SAMPLER) {
      rx::die("resource depends on sampler value");
    }
    if (instId == ir::amdgpu::USER_SGPR) {
      // Read a user SGPR of the current stage by index.
      auto index = static_cast<std::uint32_t>(*operands[1].getAsInt32());
      rx::dieIf(index >= userSgprs.size(), "out of user sgprs");
      return userSgprs[index];
    }
    if (instId == ir::amdgpu::IMM) {
      // Immediate located in guest memory: read 32 bits at the address.
      auto address = static_cast<std::uint64_t>(*operands[1].getAsInt64());

      std::uint32_t result;
      cacheTag->readMemory(&result, address, sizeof(result));
      return result;
    }

    return Evaluator::eval(instId, operands);
  }
};
void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
std::uint32_t vertexCount, std::uint32_t firstInstance,
std::uint32_t instanceCount, std::uint64_t indiciesAddress,
@ -449,7 +157,7 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
return;
}
auto cacheTag = pipe.device->getCacheTag(vmId, pipe.scheduler);
auto cacheTag = pipe.device->getGraphicsTag(vmId, pipe.scheduler);
auto targetMask = pipe.context.cbTargetMask.raw;
VkRenderingAttachmentInfo colorAttachments[8]{};
@ -460,8 +168,12 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
VkRect2D viewPortScissors[8]{};
unsigned renderTargets = 0;
VkRenderingAttachmentInfo depthAttachment{};
VkRenderingAttachmentInfo stencilAttachment{};
VkRenderingAttachmentInfo depthAttachment{
.sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
};
VkRenderingAttachmentInfo stencilAttachment{
.sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
};
auto depthAccess = Access::None;
auto stencilAccess = Access::None;
@ -484,12 +196,15 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
}
}
// FIXME
stencilAccess = Access::None;
if (depthAccess != Access::None) {
auto viewPortScissor = pipe.context.paScScreenScissor;
auto viewPortRect = gnm::toVkRect2D(viewPortScissor);
auto imageView = cacheTag.getImageView(
{{
{
.readAddress = pipe.context.dbZReadBase,
.writeAddress = pipe.context.dbZWriteBase,
.dfmt = gnm::getDataFormat(pipe.context.dbZInfo.format),
@ -502,7 +217,7 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
},
.pitch = viewPortRect.extent.width,
.kind = ImageKind::Depth,
}},
},
depthAccess);
depthAttachment = {
@ -533,12 +248,12 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
}
auto viewPortScissor = pipe.context.paScScreenScissor;
// viewPortScissor = gnm::intersection(
// viewPortScissor, pipe.context.paScVportScissor[renderTargets]);
// viewPortScissor =
// gnm::intersection(viewPortScissor, pipe.context.paScWindowScissor);
// viewPortScissor =
// gnm::intersection(viewPortScissor, pipe.context.paScGenericScissor);
viewPortScissor = gnm::intersection(
viewPortScissor, pipe.context.paScVportScissor[renderTargets]);
viewPortScissor =
gnm::intersection(viewPortScissor, pipe.context.paScWindowScissor);
viewPortScissor =
gnm::intersection(viewPortScissor, pipe.context.paScGenericScissor);
auto viewPortRect = gnm::toVkRect2D(viewPortScissor);
@ -554,7 +269,7 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
auto vkViewPortScissor = gnm::toVkRect2D(viewPortScissor);
viewPortScissors[renderTargets] = vkViewPortScissor;
ImageViewKey renderTargetInfo{};
ImageKey renderTargetInfo{};
renderTargetInfo.type = gnm::TextureType::Dim2D;
renderTargetInfo.pitch = vkViewPortScissor.extent.width;
renderTargetInfo.readAddress = static_cast<std::uint64_t>(cbColor.base)
@ -572,7 +287,6 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
cbColor.info.linearGeneral
? TileMode{.raw = 0}
: getDefaultTileModes()[cbColor.attrib.tileModeIndex];
// std::printf("draw to %lx\n", renderTargetInfo.address);
auto access = Access::None;
@ -640,13 +354,6 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
if (renderTargets == 0) {
return;
}
// if (pipe.context.cbTargetMask == 0) {
// return;
// }
// auto cache = pipe.device->getCache(vmId);
if (indiciesAddress == 0) {
indexCount = vertexCount;
}
@ -659,244 +366,34 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
VkShaderEXT shaders[stages.size()]{};
auto pipelineLayout = cacheTag.getGraphicsPipelineLayout();
auto descriptorSets = cacheTag.createGraphicsDescriptorSets();
std::vector<std::uint32_t *> descriptorBuffers;
auto &memoryTableBuffer = cacheTag.getCache()->getMemoryTableBuffer();
std::uint64_t memoryTableAddress = memoryTableBuffer.getAddress();
auto memoryTable = std::bit_cast<MemoryTable *>(memoryTableBuffer.getData());
std::uint64_t gdsAddress = cacheTag.getCache()->getGdsBuffer().getAddress();
ShaderResources shaderResources;
shaderResources.cacheTag = &cacheTag;
struct MemoryTableConfigSlot {
std::uint32_t bufferIndex;
std::uint32_t configIndex;
std::uint32_t resourceSlot;
};
std::vector<MemoryTableConfigSlot> memoryTableConfigSlots;
auto addShader = [&](const SpiShaderPgm &pgm, shader::gcn::Stage stage) {
shader::gcn::Environment env{
.vgprCount = pgm.rsrc1.getVGprCount(),
.sgprCount = pgm.rsrc1.getSGprCount(),
.userSgprs = std::span(pgm.userData.data(), pgm.rsrc2.userSgpr),
.supportsBarycentric = vk::context->supportsBarycentric,
.supportsInt8 = vk::context->supportsInt8,
.supportsInt64Atomics = vk::context->supportsInt64Atomics,
};
auto shader = cacheTag.getShader({
.address = pgm.address << 8,
.stage = stage,
.env = env,
});
std::uint32_t slotOffset = shaderResources.slotOffset;
shaderResources.loadResources(
shader.info->resources,
std::span(pgm.userData.data(), pgm.rsrc2.userSgpr));
const auto &configSlots = shader.info->configSlots;
auto configSize = configSlots.size() * sizeof(std::uint32_t);
auto configBuffer = cacheTag.getInternalBuffer(configSize);
auto configPtr = reinterpret_cast<std::uint32_t *>(configBuffer.data);
shader::gcn::PsVGprInput
psVgprInput[static_cast<std::size_t>(shader::gcn::PsVGprInput::Count)];
std::size_t psVgprInputs = 0;
if (stage == shader::gcn::Stage::Ps) {
SpiPsInput spiInputAddr = pipe.context.spiPsInputAddr;
if (spiInputAddr.perspSampleEna) {
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::IPerspSample;
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JPerspSample;
}
if (spiInputAddr.perspCenterEna) {
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::IPerspCenter;
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JPerspCenter;
}
if (spiInputAddr.perspCentroidEna) {
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::IPerspCentroid;
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JPerspCentroid;
}
if (spiInputAddr.perspPullModelEna) {
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::IW;
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JW;
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::_1W;
}
if (spiInputAddr.linearSampleEna) {
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::ILinearSample;
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JLinearSample;
}
if (spiInputAddr.linearCenterEna) {
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::ILinearCenter;
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JLinearCenter;
}
if (spiInputAddr.linearCentroidEna) {
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::ILinearCentroid;
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JLinearCentroid;
}
if (spiInputAddr.posXFloatEna) {
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::X;
}
if (spiInputAddr.posYFloatEna) {
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::Y;
}
if (spiInputAddr.posZFloatEna) {
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::Z;
}
if (spiInputAddr.posWFloatEna) {
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::W;
}
if (spiInputAddr.frontFaceEna) {
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::FrontFace;
}
if (spiInputAddr.ancillaryEna) {
rx::die("unimplemented ancillary fs input");
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::Ancillary;
}
if (spiInputAddr.sampleCoverageEna) {
rx::die("unimplemented sample coverage fs input");
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::SampleCoverage;
}
if (spiInputAddr.posFixedPtEna) {
rx::die("unimplemented pos fixed fs input");
psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::PosFixed;
}
}
for (std::size_t index = 0; const auto &slot : configSlots) {
switch (slot.type) {
case shader::gcn::ConfigType::Imm:
cacheTag.readMemory(&configPtr[index], slot.data,
sizeof(std::uint32_t));
break;
case shader::gcn::ConfigType::UserSgpr:
configPtr[index] = pgm.userData[slot.data];
break;
case shader::gcn::ConfigType::ViewPortOffsetX:
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.paClVports[slot.data].xOffset /
(viewPorts[0].width / 2.f) -
1);
break;
case shader::gcn::ConfigType::ViewPortOffsetY:
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.paClVports[slot.data].yOffset /
(viewPorts[slot.data].height / 2.f) -
1);
break;
case shader::gcn::ConfigType::ViewPortOffsetZ:
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.paClVports[slot.data].zOffset);
break;
case shader::gcn::ConfigType::ViewPortScaleX:
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.paClVports[slot.data].xScale /
(viewPorts[slot.data].width / 2.f));
break;
case shader::gcn::ConfigType::ViewPortScaleY:
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.paClVports[slot.data].yScale /
(viewPorts[slot.data].height / 2.f));
break;
case shader::gcn::ConfigType::ViewPortScaleZ:
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.paClVports[slot.data].zScale);
break;
case shader::gcn::ConfigType::PsInputVGpr:
if (slot.data > psVgprInputs) {
configPtr[index] = ~0;
} else {
configPtr[index] =
std::bit_cast<std::uint32_t>(psVgprInput[slot.data]);
}
break;
case shader::gcn::ConfigType::VsPrimType:
if (indexBuffer.handle == VK_NULL_HANDLE &&
pipe.uConfig.vgtPrimitiveType != indexBuffer.primType) {
configPtr[index] =
static_cast<std::uint32_t>(pipe.uConfig.vgtPrimitiveType.value);
} else {
configPtr[index] = 0;
}
break;
case shader::gcn::ConfigType::ResourceSlot:
memoryTableConfigSlots.push_back({
.bufferIndex = static_cast<std::uint32_t>(descriptorBuffers.size()),
.configIndex = static_cast<std::uint32_t>(index),
.resourceSlot = static_cast<std::uint32_t>(slotOffset + slot.data),
});
break;
case shader::gcn::ConfigType::MemoryTable:
if (slot.data == 0) {
configPtr[index] = static_cast<std::uint32_t>(memoryTableAddress);
} else {
configPtr[index] =
static_cast<std::uint32_t>(memoryTableAddress >> 32);
}
break;
case shader::gcn::ConfigType::Gds:
if (slot.data == 0) {
configPtr[index] = static_cast<std::uint32_t>(gdsAddress);
} else {
configPtr[index] = static_cast<std::uint32_t>(gdsAddress >> 32);
}
break;
case shader::gcn::ConfigType::CbCompSwap:
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.cbColor[slot.data].info.compSwap);
break;
}
++index;
}
VkDescriptorBufferInfo bufferInfo{
.buffer = configBuffer.handle,
.offset = configBuffer.offset,
.range = configSize,
};
auto stageIndex = Cache::getStageIndex(shader.stage);
VkWriteDescriptorSet writeDescSet{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = descriptorSets[stageIndex],
.dstBinding = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.pBufferInfo = &bufferInfo,
};
vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
shaders[stageIndex] = shader.handle
? shader.handle
: getFillRedFragShader(*cacheTag.getCache());
descriptorBuffers.push_back(configPtr);
};
auto descriptorSets = cacheTag.getDescriptorSets();
Cache::Shader vertexShader;
if (pipe.context.vgtShaderStagesEn.vsEn == amdgpu::VsStage::VsReal) {
addShader(pipe.sh.spiShaderPgmVs, shader::gcn::Stage::VsVs);
gnm::PrimitiveType vsPrimType = {};
if (indexBuffer.handle == VK_NULL_HANDLE &&
pipe.uConfig.vgtPrimitiveType != indexBuffer.primType) {
vsPrimType = pipe.uConfig.vgtPrimitiveType.value;
}
vertexShader =
cacheTag.getVertexShader(gcn::Stage::VsVs, pipe.sh.spiShaderPgmVs,
pipe.context, vsPrimType, viewPorts);
}
if (true) {
addShader(pipe.sh.spiShaderPgmPs, shader::gcn::Stage::Ps);
} else {
auto pixelShader =
cacheTag.getPixelShader(pipe.sh.spiShaderPgmPs, pipe.context, viewPorts);
if (pixelShader.handle == nullptr) {
shaders[Cache::getStageIndex(VK_SHADER_STAGE_FRAGMENT_BIT)] =
getFillRedFragShader(*cacheTag.getCache());
}
shaders[Cache::getStageIndex(VK_SHADER_STAGE_VERTEX_BIT)] =
vertexShader.handle;
shaders[Cache::getStageIndex(VK_SHADER_STAGE_FRAGMENT_BIT)] =
pixelShader.handle;
if (pipe.uConfig.vgtPrimitiveType == gnm::PrimitiveType::RectList) {
shaders[Cache::getStageIndex(VK_SHADER_STAGE_GEOMETRY_BIT)] =
getPrimTypeRectGeomShader(*cacheTag.getCache());
@ -906,18 +403,24 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
vertexCount = indexBuffer.indexCount;
}
auto commandBuffer = pipe.scheduler.getCommandBuffer();
VkRenderingInfo renderInfo{
.sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
.renderArea = gnm::toVkRect2D(pipe.context.paScScreenScissor),
.layerCount = 1,
.colorAttachmentCount = renderTargets,
.pColorAttachments = colorAttachments,
.pDepthAttachment = &depthAttachment,
// .pStencilAttachment = &stencilAttachment,
.pDepthAttachment =
depthAccess != Access::None ? &depthAttachment : nullptr,
.pStencilAttachment =
stencilAccess != Access::None ? &stencilAttachment : nullptr,
};
cacheTag.buildDescriptors(descriptorSets[0]);
pipe.scheduler.afterSubmit([cacheTag = std::move(cacheTag)] {});
auto commandBuffer = pipe.scheduler.getCommandBuffer();
vkCmdBeginRendering(commandBuffer, &renderInfo);
vkCmdSetRasterizerDiscardEnable(commandBuffer, VK_FALSE);
@ -991,57 +494,6 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
vk::CmdBindShadersEXT(commandBuffer, stages.size(), stages.data(), shaders);
shaderResources.buildMemoryTable(*memoryTable);
for (auto &sampler : shaderResources.samplerResources) {
uint32_t index = &sampler - shaderResources.samplerResources.data();
VkDescriptorImageInfo samplerInfo{.sampler = sampler.handle};
VkWriteDescriptorSet writeDescSet{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = descriptorSets[0],
.dstBinding = Cache::getDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLER),
.dstArrayElement = index,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER,
.pImageInfo = &samplerInfo,
};
vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
}
for (auto &imageResources : shaderResources.imageResources) {
auto dim = (&imageResources - shaderResources.imageResources) + 1;
for (auto &image : imageResources) {
uint32_t index = &image - imageResources.data();
VkDescriptorImageInfo imageInfo{
.imageView = image.handle,
.imageLayout = VK_IMAGE_LAYOUT_GENERAL,
};
VkWriteDescriptorSet writeDescSet{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = descriptorSets[0],
.dstBinding = static_cast<uint32_t>(Cache::getDescriptorBinding(
VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, dim)),
.dstArrayElement = index,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
.pImageInfo = &imageInfo,
};
vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
}
}
for (auto &mtConfig : memoryTableConfigSlots) {
auto config = descriptorBuffers[mtConfig.bufferIndex];
config[mtConfig.configIndex] =
shaderResources.getResourceSlot(mtConfig.resourceSlot);
}
if (indexBuffer.handle != VK_NULL_HANDLE) {
vkCmdBindIndexBuffer(commandBuffer, indexBuffer.handle, indexBuffer.offset,
gnm::toVkIndexType(indexBuffer.indexType));
@ -1054,10 +506,19 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
vkCmdEndRendering(commandBuffer);
pipe.scheduler.submit();
pipe.scheduler.then([=, cacheTag = std::move(cacheTag),
shaderResources = std::move(shaderResources)] {});
}
// void amdgpu::dispatch(Scheduler &sched,
// amdgpu::Registers::ComputeConfig &computeConfig, int
// vmId, std::uint32_t groupCountX, std::uint32_t
// groupCountY, std::uint32_t groupCountZ) {
// vkCmdDispatch(sched.getCommandBuffer(), groupCountX, groupCountY,
// groupCountZ);
// sched.submit();
// }
static void
transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
VkImageLayout oldLayout, VkImageLayout newLayout,
@ -1115,10 +576,10 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
void amdgpu::flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer,
VkExtent2D targetExtent, std::uint64_t address,
VkImageView target, VkExtent2D imageExtent,
FlipType type, TileMode tileMode, gnm::DataFormat dfmt,
VkImageView target, VkExtent2D imageExtent, FlipType type,
TileMode tileMode, gnm::DataFormat dfmt,
gnm::NumericFormat nfmt) {
ImageViewKey framebuffer{};
ImageKey framebuffer{};
framebuffer.readAddress = address;
framebuffer.type = gnm::TextureType::Dim2D;
framebuffer.dfmt = dfmt;
@ -1181,7 +642,8 @@ void amdgpu::flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer,
vkCmdBeginRendering(commandBuffer, &renderInfo);
cacheTag.getDevice()->flipPipeline.bind(cacheTag.getScheduler(), type, imageView.handle, sampler.handle);
cacheTag.getDevice()->flipPipeline.bind(cacheTag.getScheduler(), type,
imageView.handle, sampler.handle);
vkCmdSetViewportWithCount(commandBuffer, 1, viewPorts);
vkCmdSetScissorWithCount(commandBuffer, 1, viewPortScissors);