gpu2: initial dispatch implementation

DH 2024-09-30 21:43:34 +03:00
parent 424ce5cf68
commit 239a0645bc
10 changed files with 338 additions and 130 deletions


@@ -762,8 +762,6 @@ Cache::Shader Cache::Tag::getShader(const ShaderKey &key,
std::shared_ptr<Cache::Entry>
Cache::Tag::findShader(const ShaderKey &key, const ShaderKey *dependedKey) {
auto data = RemoteMemory{mParent->mVmIm}.getPointer(key.address);
auto cacheIt = mParent->mShaders.queryArea(key.address);
if (cacheIt == mParent->mShaders.end() ||
@@ -1088,10 +1086,9 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) {
key.mipCount, key.pow2pad);
VkImageUsageFlags usage =
VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
VK_IMAGE_USAGE_SAMPLED_BIT // | VK_IMAGE_USAGE_STORAGE_BIT
;
VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
if (key.kind == ImageKind::Color) {
usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
bool isCompressed =
key.dfmt == gnm::kDataFormatBc1 || key.dfmt == gnm::kDataFormatBc2 ||
key.dfmt == gnm::kDataFormatBc3 || key.dfmt == gnm::kDataFormatBc4 ||
@@ -1101,6 +1098,9 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) {
if (!isCompressed) {
usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
}
} else {
usage |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
}
auto image = vk::Image::Allocate(
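The usage-flag split above follows the usual Vulkan constraint: block-compressed color formats are not color-renderable, so only uncompressed color images get VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, while depth images always receive the depth-stencil attachment bit. A minimal standalone sketch of that selection; ImageKind::Depth and isBcCompressed are illustrative stand-ins for the engine's gnm format checks, not names from this codebase.

#include <vulkan/vulkan.h>

// Sketch only: mirrors the usage selection above outside the cache code.
enum class ImageKind { Color, Depth };

static VkImageUsageFlags chooseImageUsage(ImageKind kind, bool isBcCompressed) {
  VkImageUsageFlags usage =
      VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
  if (kind == ImageKind::Color) {
    usage |= VK_IMAGE_USAGE_SAMPLED_BIT;
    if (!isBcCompressed) {
      // BC formats cannot be rendered to, so they stay sample/transfer only.
      usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
    }
  } else {
    usage |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
  }
  return usage;
}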
@@ -1151,25 +1151,6 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) {
.depth = std::max(key.extent.depth >> mipLevel, 1u),
},
});
regions.push_back({
.bufferOffset = info.offset,
.bufferRowLength =
mipLevel > 0 ? 0 : std::max(key.pitch >> mipLevel, 1u),
.imageSubresource =
{
.aspectMask = toAspect(key.kind),
.mipLevel = mipLevel,
.baseArrayLayer = key.baseArrayLayer,
.layerCount = key.arrayLayerCount,
},
.imageExtent =
{
.width = std::max(key.extent.width >> mipLevel, 1u),
.height = std::max(key.extent.height >> mipLevel, 1u),
.depth = std::max(key.extent.depth >> mipLevel, 1u),
},
});
}
} else {
auto &tiler = mParent->mDevice->tiler;
@@ -1434,10 +1415,10 @@ Cache::GraphicsTag::getShader(gcn::Stage stage, const SpiShaderPgm &pgm,
gcn::Environment env{
.vgprCount = pgm.rsrc1.getVGprCount(),
.sgprCount = pgm.rsrc1.getSGprCount(),
.userSgprs = std::span(pgm.userData.data(), pgm.rsrc2.userSgpr),
.supportsBarycentric = vk::context->supportsBarycentric,
.supportsInt8 = vk::context->supportsInt8,
.supportsInt64Atomics = vk::context->supportsInt64Atomics,
.userSgprs = std::span(pgm.userData.data(), pgm.rsrc2.userSgpr),
};
auto shader = Tag::getShader({
@@ -1545,6 +1526,10 @@ Cache::GraphicsTag::getShader(gcn::Stage stage, const SpiShaderPgm &pgm,
configPtr[index] = std::bit_cast<std::uint32_t>(
context.cbColor[slot.data].info.compSwap);
break;
default:
rx::die("unexpected resource slot in graphics shader %u, stage %u",
int(slot.type), int(stage));
}
++index;
@@ -1575,7 +1560,140 @@ Cache::GraphicsTag::getShader(gcn::Stage stage, const SpiShaderPgm &pgm,
Cache::Shader
Cache::ComputeTag::getShader(const Registers::ComputeConfig &pgm) {
return {};
auto descriptorSet = getDescriptorSet();
gcn::Environment env{
.vgprCount = pgm.rsrc1.getVGprCount(),
.sgprCount = pgm.rsrc1.getSGprCount(),
.numThreadX = static_cast<std::uint8_t>(pgm.numThreadX),
.numThreadY = static_cast<std::uint8_t>(pgm.numThreadY),
.numThreadZ = static_cast<std::uint8_t>(pgm.numThreadZ),
.supportsBarycentric = vk::context->supportsBarycentric,
.supportsInt8 = vk::context->supportsInt8,
.supportsInt64Atomics = vk::context->supportsInt64Atomics,
.userSgprs = std::span(pgm.userData.data(), pgm.rsrc2.userSgpr),
};
auto shader = Tag::getShader({
.address = pgm.address << 8,
.stage = gcn::Stage::Cs,
.env = env,
});
if (!shader.handle) {
return shader;
}
std::uint64_t memoryTableAddress = getMemoryTable().deviceAddress;
std::uint64_t gdsAddress = mParent->getGdsBuffer().getAddress();
mStorage->shaderResources.cacheTag = this;
std::uint32_t slotOffset = mStorage->shaderResources.slotOffset;
mStorage->shaderResources.loadResources(
shader.info->resources,
std::span(pgm.userData.data(), pgm.rsrc2.userSgpr));
const auto &configSlots = shader.info->configSlots;
auto configSize = configSlots.size() * sizeof(std::uint32_t);
auto configBuffer = getInternalHostVisibleBuffer(configSize);
auto configPtr = reinterpret_cast<std::uint32_t *>(configBuffer.data);
std::uint32_t sgprInput[static_cast<std::size_t>(gcn::CsSGprInput::Count)];
std::uint32_t sgprInputCount = 0;
if (pgm.rsrc2.tgIdXEn) {
sgprInput[sgprInputCount++] = static_cast<std::uint32_t>(gcn::CsSGprInput::ThreadGroupIdX);
}
if (pgm.rsrc2.tgIdYEn) {
sgprInput[sgprInputCount++] = static_cast<std::uint32_t>(gcn::CsSGprInput::ThreadGroupIdY);
}
if (pgm.rsrc2.tgIdZEn) {
sgprInput[sgprInputCount++] = static_cast<std::uint32_t>(gcn::CsSGprInput::ThreadGroupIdZ);
}
if (pgm.rsrc2.tgSizeEn) {
sgprInput[sgprInputCount++] = static_cast<std::uint32_t>(gcn::CsSGprInput::ThreadGroupSize);
}
if (pgm.rsrc2.scratchEn) {
sgprInput[sgprInputCount++] = static_cast<std::uint32_t>(gcn::CsSGprInput::Scratch);
}
for (std::size_t index = 0; const auto &slot : configSlots) {
switch (slot.type) {
case gcn::ConfigType::Imm:
readMemory(&configPtr[index], slot.data, sizeof(std::uint32_t));
break;
case gcn::ConfigType::UserSgpr:
configPtr[index] = pgm.userData[slot.data];
break;
case gcn::ConfigType::ResourceSlot:
mStorage->memoryTableConfigSlots.push_back({
.bufferIndex =
static_cast<std::uint32_t>(mStorage->descriptorBuffers.size()),
.configIndex = static_cast<std::uint32_t>(index),
.resourceSlot = static_cast<std::uint32_t>(slotOffset + slot.data),
});
break;
case gcn::ConfigType::MemoryTable:
if (slot.data == 0) {
configPtr[index] = static_cast<std::uint32_t>(memoryTableAddress);
} else {
configPtr[index] = static_cast<std::uint32_t>(memoryTableAddress >> 32);
}
break;
case gcn::ConfigType::Gds:
if (slot.data == 0) {
configPtr[index] = static_cast<std::uint32_t>(gdsAddress);
} else {
configPtr[index] = static_cast<std::uint32_t>(gdsAddress >> 32);
}
break;
case gcn::ConfigType::CsTgIdCompCnt:
configPtr[index] = pgm.rsrc2.tidIgCompCount;
break;
case gcn::ConfigType::CsInputSGpr:
if (slot.data < sgprInputCount) {
configPtr[index] = sgprInput[slot.data];
} else {
configPtr[index] = -1;
}
break;
default:
rx::die("unexpected resource slot in compute shader %u", int(slot.type));
}
++index;
}
mStorage->descriptorBuffers.push_back(configPtr);
VkDescriptorBufferInfo bufferInfo{
.buffer = configBuffer.handle,
.offset = configBuffer.offset,
.range = configSize,
};
VkWriteDescriptorSet writeDescSet{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = descriptorSet,
.dstBinding = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.pBufferInfo = &bufferInfo,
};
vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
return shader;
}
Cache::Cache(Device *device, int vmId) : mDevice(device), mVmIm(vmId) {
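The new ComputeTag::getShader path packs each config slot into a host-visible buffer and writes it to binding 0 of the compute descriptor set as a uniform buffer. A minimal sketch of the descriptor set layout such a write presupposes (one uniform buffer at binding 0, visible to the compute stage); createConfigSetLayout is an illustrative name, not one of the project's helpers.

#include <vulkan/vulkan.h>

// Sketch: a descriptor set layout compatible with the write above
// (binding 0, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, compute stage).
static VkDescriptorSetLayout createConfigSetLayout(VkDevice device) {
  VkDescriptorSetLayoutBinding binding{
      .binding = 0,
      .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
      .descriptorCount = 1,
      .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
  };
  VkDescriptorSetLayoutCreateInfo createInfo{
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
      .bindingCount = 1,
      .pBindings = &binding,
  };
  VkDescriptorSetLayout layout = VK_NULL_HANDLE;
  vkCreateDescriptorSetLayout(device, &createInfo, nullptr, &layout);
  return layout;
}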


@@ -157,7 +157,7 @@ struct Cache {
VkImageSubresourceRange subresource;
};
class Tag;
struct Tag;
private:
struct MemoryTableSlot {


@@ -3,8 +3,7 @@
#include "shaders/flip_alt.frag.h"
#include "shaders/flip_std.frag.h"
#include "vk.hpp"
#include <atomic>
#include <vulkan/vulkan_core.h>
#include <vulkan/vulkan.h>
FlipPipeline::~FlipPipeline() {
vkDestroyPipeline(vk::context->device, pipelines[0], vk::context->allocator);


@@ -485,7 +485,8 @@ bool GraphicsPipe::dispatchDirect(Queue &queue) {
auto dispatchInitiator = queue.rptr[4];
sh.compute.computeDispatchInitiator = dispatchInitiator;
// FIXME
amdgpu::dispatch(device->caches[queue.vmId], scheduler, sh.compute, dimX,
dimY, dimZ);
return true;
}
bool GraphicsPipe::dispatchIndirect(Queue &queue) {
@@ -500,7 +501,8 @@ bool GraphicsPipe::dispatchIndirect(Queue &queue) {
auto dimY = buffer[1];
auto dimZ = buffer[2];
// FIXME
amdgpu::dispatch(device->caches[queue.vmId], scheduler, sh.compute, dimX,
dimY, dimZ);
return true;
}


@@ -551,20 +551,71 @@ struct Registers {
std::uint32_t computeDispatchInitiator;
std::uint32_t _pad0[6];
std::uint32_t computeNumThreadX;
std::uint32_t computeNumThreadY;
std::uint32_t computeNumThreadZ;
std::uint32_t numThreadX;
std::uint32_t numThreadY;
std::uint32_t numThreadZ;
std::uint32_t _pad1[2];
std::uint32_t computePgmLo;
std::uint32_t computePgmHi;
std::uint64_t address;
std::uint32_t _pad2[4];
std::uint32_t computePgmRsrc1;
std::uint32_t computePgmRsrc2;
struct {
union {
std::uint32_t raw;
struct {
std::uint32_t vgprs : 6;
std::uint32_t sgprs : 4;
std::uint32_t priority : 2;
std::uint32_t floatMode : 8;
std::uint32_t priv : 1;
std::uint32_t dx10Clamp : 1;
std::uint32_t debugMode : 1;
std::uint32_t ieeeMode : 1;
};
};
std::uint8_t getVGprCount() const { return (vgprs + 1) * 4; }
std::uint8_t getSGprCount() const { return (sgprs + 1) * 8; }
} rsrc1;
struct {
union {
std::uint32_t raw;
struct {
bool scratchEn : 1;
std::uint32_t userSgpr : 5;
bool trapPresent : 1;
bool tgIdXEn : 1;
bool tgIdYEn : 1;
bool tgIdZEn : 1;
bool tgSizeEn : 1;
std::uint32_t tidIgCompCount : 2;
std::uint32_t : 2;
std::uint32_t ldsSize : 9;
std::uint32_t excpEn : 7;
};
};
std::uint32_t getLdsDwordsCount() const { return ldsSize * 64; }
} rsrc2;
std::uint32_t _pad3[1];
std::uint32_t computeResourceLimits;
std::uint32_t computeStaticThreadMgmtSe0;
std::uint32_t computeStaticThreadMgmtSe1;
std::uint32_t computeTmpRingSize;
struct {
union {
std::uint32_t raw;
struct {
std::uint32_t wavesPerSh : 6;
std::uint32_t : 6;
std::uint32_t tgPerCu : 4;
std::uint32_t lockThreshold: 6;
std::uint32_t simdDestCntl : 1;
};
};
std::uint32_t getWavesPerSh() const { return wavesPerSh << 4; }
} resourceLimits;
std::uint32_t staticThreadMgmtSe0;
std::uint32_t staticThreadMgmtSe1;
std::uint32_t tmpRingSize;
std::uint32_t _pad4[39];
std::array<std::uint32_t, 16> userData;
};
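The rsrc1/rsrc2 overlays decode the COMPUTE_PGM_RSRC1/RSRC2 dwords in place; GCN allocates VGPRs in groups of 4 and SGPRs in groups of 8, which is where (vgprs + 1) * 4 and (sgprs + 1) * 8 come from. A small self-contained check of that decoding, using a stand-alone copy of the low bits of the rsrc1 layout; it relies on the same anonymous-struct and bit-field layout assumptions as the register struct above.

#include <cstdint>
#include <cstdio>

// Illustrative mirror of the rsrc1 overlay above (bit-field layout is
// compiler-dependent, as in the original).
union Rsrc1 {
  std::uint32_t raw;
  struct {
    std::uint32_t vgprs : 6; // allocated in units of 4 VGPRs, minus one
    std::uint32_t sgprs : 4; // allocated in units of 8 SGPRs, minus one
    std::uint32_t priority : 2;
    std::uint32_t floatMode : 8;
  };
};

int main() {
  Rsrc1 rsrc1{.raw = 0x0000008f}; // vgprs = 15, sgprs = 2
  std::printf("VGPRs: %u\n", unsigned((rsrc1.vgprs + 1) * 4)); // 64
  std::printf("SGPRs: %u\n", unsigned((rsrc1.sgprs + 1) * 8)); // 24
  return 0;
}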


@@ -512,70 +512,23 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
pipe.scheduler.submit();
}
// void amdgpu::dispatch(Scheduler &sched,
// amdgpu::Registers::ComputeConfig &computeConfig, int
// vmId, std::uint32_t groupCountX, std::uint32_t
// groupCountY, std::uint32_t groupCountZ) {
void amdgpu::dispatch(Cache &cache, Scheduler &sched,
Registers::ComputeConfig &computeConfig,
std::uint32_t groupCountX, std::uint32_t groupCountY,
std::uint32_t groupCountZ) {
auto tag = cache.createComputeTag(sched);
auto descriptorSet = tag.getDescriptorSet();
auto shader = tag.getShader(computeConfig);
auto pipelineLayout = tag.getComputePipelineLayout();
tag.buildDescriptors(descriptorSet);
// vkCmdDispatch(sched.getCommandBuffer(), groupCountX, groupCountY,
// groupCountZ);
// sched.submit();
// }
static void
transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
VkImageLayout oldLayout, VkImageLayout newLayout,
const VkImageSubresourceRange &subresourceRange) {
VkImageMemoryBarrier barrier{};
barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
barrier.oldLayout = oldLayout;
barrier.newLayout = newLayout;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.image = image;
barrier.subresourceRange = subresourceRange;
auto layoutToStageAccess = [](VkImageLayout layout)
-> std::pair<VkPipelineStageFlags, VkAccessFlags> {
switch (layout) {
case VK_IMAGE_LAYOUT_UNDEFINED:
case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR:
case VK_IMAGE_LAYOUT_GENERAL:
return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0};
case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL:
return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT};
case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL:
return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT};
case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL:
return {VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT};
case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL:
return {VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT,
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT};
case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL:
return {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT};
default:
std::abort();
}
};
auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout);
auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout);
barrier.srcAccessMask = sourceAccess;
barrier.dstAccessMask = destinationAccess;
vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0,
nullptr, 0, nullptr, 1, &barrier);
auto commandBuffer = sched.getCommandBuffer();
VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &shader.handle);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);
vkCmdDispatch(commandBuffer, groupCountX, groupCountY, groupCountZ);
sched.submit();
}
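The rewritten dispatch path leans on VK_EXT_shader_object: the compute shader is bound with vkCmdBindShadersEXT instead of a VkPipeline, descriptors go to the compute bind point (vkCmdDispatch only sees sets bound there), and the group counts decoded from the PM4 packet feed vkCmdDispatch directly. A minimal sketch of that recording sequence, assuming the extension is enabled and all handles were created elsewhere; the extension entry point is resolved here via vkGetDeviceProcAddr rather than the project's vk:: wrappers.

#include <cstdint>
#include <vulkan/vulkan.h>

// Sketch of the shader-object compute path used above.
static void recordDispatch(VkDevice device, VkCommandBuffer cmd,
                           VkShaderEXT shader, VkPipelineLayout layout,
                           VkDescriptorSet set, std::uint32_t x,
                           std::uint32_t y, std::uint32_t z) {
  auto cmdBindShadersEXT = reinterpret_cast<PFN_vkCmdBindShadersEXT>(
      vkGetDeviceProcAddr(device, "vkCmdBindShadersEXT"));
  const VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
  cmdBindShadersEXT(cmd, 1, stages, &shader);
  // Descriptors for a dispatch must target the compute bind point.
  vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, 1,
                          &set, 0, nullptr);
  vkCmdDispatch(cmd, x, y, z);
}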
void amdgpu::flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer,
@@ -604,12 +557,6 @@ void amdgpu::flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer,
auto imageView = cacheTag.getImageView(framebuffer, Access::Read);
auto sampler = cacheTag.getSampler(framebufferSampler);
VkDescriptorImageInfo imageInfo{
.sampler = sampler.handle,
.imageView = imageView.handle,
.imageLayout = VK_IMAGE_LAYOUT_GENERAL,
};
VkRenderingAttachmentInfo colorAttachments[1]{{
.sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
.imageView = target,


@@ -11,6 +11,10 @@ void draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
std::uint32_t vertexCount, std::uint32_t firstInstance,
std::uint32_t instanceCount, std::uint64_t indiciesAddress,
std::uint32_t indexCount);
void dispatch(Cache &cache, Scheduler &sched,
Registers::ComputeConfig &computeConfig,
std::uint32_t groupCountX, std::uint32_t groupCountY,
std::uint32_t groupCountZ);
void flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer,
VkExtent2D targetExtent, std::uint64_t address, VkImageView target,
VkExtent2D imageExtent, FlipType type, TileMode tileMode,


@@ -7,6 +7,70 @@
#include <vector>
namespace shader::gcn {
enum class VsSGprInput {
State,
StreamOutWriteIndex,
StreamOutBaseOffset0,
StreamOutBaseOffset1,
StreamOutBaseOffset2,
StreamOutBaseOffset3,
OffchipLds,
WaveId,
Scratch,
Count,
};
enum class PsSGprInput {
State,
WaveCount,
Scratch,
Count,
};
enum class GsSGprInput {
GsVsOffset,
GsWaveId,
Scratch,
Count,
};
enum class EsSGprInput {
OffchipLds,
IsOffchip,
EsGsOffset,
Scratch,
Count,
};
enum class HsSGprInput {
OffchipLds,
ThreadGroupSize,
TesselationFactorBase,
Scratch,
Count,
};
enum class LsSGprInput {
Scratch,
Count,
};
enum class CsSGprInput {
ThreadGroupIdX,
ThreadGroupIdY,
ThreadGroupIdZ,
ThreadGroupSize,
Scratch,
Count,
};
enum class PsVGprInput {
IPerspSample,
JPerspSample,
@@ -34,6 +98,7 @@ enum class PsVGprInput {
Count
};
enum class ConfigType {
Imm,
UserSgpr,
@@ -41,7 +106,20 @@ enum class ConfigType {
MemoryTable,
Gds,
PsInputVGpr,
VsInputSGpr,
PsInputSGpr,
GsInputSGpr,
EsInputSGpr,
HsInputSGpr,
LsInputSGpr,
CsInputSGpr,
GsPrimType,
GsInstanceEn,
InstanceEn,
VsPrimType,
PsPrimType,
CsTgIdCompCnt,
VsInputVgprCount,
CbCompSwap,
ViewPortOffsetX,
ViewPortOffsetY,


@@ -27,6 +27,23 @@ enum class Stage {
Invalid,
};
enum RegId {
Sgpr,
Vgpr,
M0,
Scc,
Vcc,
Exec,
VccZ,
ExecZ,
LdsDirect,
SgprCount,
VgprCount,
ThreadId,
MemoryTable,
Gds,
};
struct Import : spv::Import {
ir::Node getOrCloneImpl(ir::Context &context, ir::Node node,
bool isOperand) override;
@@ -55,23 +72,6 @@ struct InstructionRegion : ir::RegionLikeImpl {
}
};
enum RegId {
Sgpr,
Vgpr,
M0,
Scc,
Vcc,
Exec,
VccZ,
ExecZ,
LdsDirect,
SgprCount,
VgprCount,
ThreadId,
MemoryTable,
Gds,
};
struct Context : spv::Context {
ir::Region body;
rx::MemoryAreaTable<> memoryMap;
@@ -113,10 +113,13 @@ struct Context : spv::Context {
struct Environment {
std::uint8_t vgprCount;
std::uint8_t sgprCount;
std::span<const std::uint32_t> userSgprs;
std::uint8_t numThreadX;
std::uint8_t numThreadY;
std::uint8_t numThreadZ;
bool supportsBarycentric = true;
bool supportsInt8 = false;
bool supportsInt64Atomics = false;
std::span<const std::uint32_t> userSgprs;
};
ir::Region deserialize(Context &context, const Environment &environment,


@@ -10,6 +10,9 @@ void rx::die(const char *message, ...) {
std::vfprintf(stderr, message, args);
std::fprintf(stderr, "\n");
va_end(args);
std::fflush(stdout);
std::fflush(stderr);
std::abort();
}
@@ -20,6 +23,9 @@ void rx::dieIf(bool condition, const char *message, ...) {
std::vfprintf(stderr, message, args);
std::fprintf(stderr, "\n");
va_end(args);
std::fflush(stdout);
std::fflush(stderr);
std::abort();
}
}
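Both hunks above add the same flushes before std::abort(): abort() does not run atexit handlers and is not required to flush open streams, so anything still sitting in stdio buffers would otherwise be lost. A self-contained sketch of the full pattern (the diff only shows the tail of each function), under the signature shown above; dieExample is an illustrative name.

#include <cstdarg>
#include <cstdio>
#include <cstdlib>

// Stand-alone version of the die pattern above: format the message to
// stderr, flush both streams so buffered output survives, then abort.
[[noreturn]] static void dieExample(const char *message, ...) {
  std::va_list args;
  va_start(args, message);
  std::vfprintf(stderr, message, args);
  std::fprintf(stderr, "\n");
  va_end(args);
  std::fflush(stdout);
  std::fflush(stderr);
  std::abort();
}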