From 4185b1aa40f88d59bc5b7dab02e215c18ce60040 Mon Sep 17 00:00:00 2001
From: DH
Date: Sat, 28 Sep 2024 03:12:12 +0300
Subject: [PATCH] gpu2: implement depth textures

initial 2d tiler implementation
fixed mtbuf index order
simplify v_mac_*_f32 instructions
---
 rpcsx-gpu2/Cache.cpp                          |  52 +++-
 rpcsx-gpu2/Cache.hpp                          |  10 +-
 rpcsx-gpu2/Device.cpp                         |  14 +-
 rpcsx-gpu2/Registers.hpp                      |  16 +-
 rpcsx-gpu2/Renderer.cpp                       | 114 +++++--
 .../lib/amdgpu-tiler/include/amdgpu/tiler.hpp |  24 +-
 .../include/amdgpu/tiler_vulkan.hpp           |  13 +-
 .../amdgpu-tiler/shaders/detiler2d.comp.glsl  |  13 +-
 .../lib/amdgpu-tiler/shaders/tiler.glsl       | 290 ++++++++++++++++++
 .../amdgpu-tiler/shaders/tiler2d.comp.glsl    |  12 +-
 rpcsx-gpu2/lib/amdgpu-tiler/src/tiler.cpp     | 167 +++++++++-
 rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_cpu.cpp |  10 +-
 .../lib/amdgpu-tiler/src/tiler_vulkan.cpp     |  29 +-
 rpcsx-gpu2/lib/gcn-shader/shaders/rdna.glsl   |  13 +-
 .../lib/gcn-shader/src/GcnConverter.cpp       |  12 +-
 .../lib/gcn-shader/src/GcnInstruction.cpp     |  12 +-
 rpcsx-gpu2/lib/gcn-shader/src/gcn.cpp         |  35 ++-
 rpcsx-gpu2/lib/gnm/include/gnm/gnm.hpp        |  53 +++-
 18 files changed, 790 insertions(+), 99 deletions(-)

diff --git a/rpcsx-gpu2/Cache.cpp b/rpcsx-gpu2/Cache.cpp
index de343adc6..ef57179e1 100644
--- a/rpcsx-gpu2/Cache.cpp
+++ b/rpcsx-gpu2/Cache.cpp
@@ -232,10 +232,25 @@ struct CachedIndexBuffer : Cache::Entry {
   gnm::PrimitiveType primType;
 };
 
+constexpr VkImageAspectFlags toAspect(ImageKind kind) {
+  switch (kind) {
+  case ImageKind::Color:
+    return VK_IMAGE_ASPECT_COLOR_BIT;
+  case ImageKind::Depth:
+    return VK_IMAGE_ASPECT_DEPTH_BIT;
+  case ImageKind::Stencil:
+    return VK_IMAGE_ASPECT_STENCIL_BIT;
+  }
+
+  return VK_IMAGE_ASPECT_NONE;
+}
+
 struct CachedImage : Cache::Entry {
   vk::Image image;
+  ImageKind kind;
   SurfaceInfo info;
   TileMode acquiredTileMode;
+  gnm::DataFormat acquiredDfmt{};
 
   void flush(Cache::Tag &tag, Scheduler &scheduler, std::uint64_t beginAddress,
              std::uint64_t endAddress) override {
@@ -246,7 +261,7 @@ struct CachedImage : Cache::Entry {
     // std::printf("writing image to buffer to %lx\n", baseAddress);
 
     VkImageSubresourceRange subresourceRange{
-        .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+        .aspectMask = toAspect(kind),
         .baseMipLevel = 0,
         .levelCount = image.getMipLevels(),
         .baseArrayLayer = 0,
@@ -270,7 +285,7 @@ struct CachedImage : Cache::Entry {
              mipLevel > 0 ?
0 : std::max(info.pitch >> mipLevel, 1u), .imageSubresource = { - .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .aspectMask = toAspect(kind), .mipLevel = mipLevel, .baseArrayLayer = 0, .layerCount = image.getArrayLayers(), @@ -287,9 +302,9 @@ struct CachedImage : Cache::Entry { VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, transferBuffer.getHandle(), 1, ®ion); - tiler.tile(scheduler, info, acquiredTileMode, transferBuffer.getAddress(), - tiledBuffer.deviceAddress, mipLevel, 0, - image.getArrayLayers()); + tiler.tile(scheduler, info, acquiredTileMode, acquiredDfmt, + transferBuffer.getAddress(), tiledBuffer.deviceAddress, + mipLevel, 0, image.getArrayLayers()); } transitionImageLayout(scheduler.getCommandBuffer(), image, @@ -307,7 +322,8 @@ struct CachedImageView : Cache::Entry { ImageKey ImageKey::createFrom(const gnm::TBuffer &buffer) { return { - .address = buffer.address(), + .readAddress = buffer.address(), + .writeAddress = buffer.address(), .type = buffer.type, .dfmt = buffer.dfmt, .nfmt = buffer.nfmt, @@ -324,6 +340,7 @@ ImageKey ImageKey::createFrom(const gnm::TBuffer &buffer) { .mipCount = buffer.last_level - buffer.base_level + 1u, .baseArrayLayer = static_cast(buffer.base_array), .arrayLayerCount = buffer.last_array - buffer.base_array + 1u, + .kind = ImageKind::Color, .pow2pad = buffer.pow2pad != 0, }; } @@ -714,7 +731,7 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { if ((access & Access::Read) != Access::None) { auto tiledBuffer = - getBuffer(key.address, surfaceInfo.totalSize, Access::Read); + getBuffer(key.readAddress, surfaceInfo.totalSize, Access::Read); auto &tiler = mParent->mDevice->tiler; auto detiledBuffer = @@ -722,7 +739,7 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { VK_BUFFER_USAGE_2_TRANSFER_DST_BIT_KHR | VK_BUFFER_USAGE_2_TRANSFER_SRC_BIT_KHR); VkImageSubresourceRange subresourceRange{ - .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .aspectMask = toAspect(key.kind), .baseMipLevel = key.baseMipLevel, .levelCount = key.mipCount, .baseArrayLayer = key.baseArrayLayer, @@ -756,8 +773,8 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { .size = info.linearSize * key.arrayLayerCount, }); } else { - tiler.detile(*mScheduler, surfaceInfo, key.tileMode, srcAddress, - dstAddress, mipLevel, 0, key.arrayLayerCount); + tiler.detile(*mScheduler, surfaceInfo, key.tileMode, key.dfmt, + srcAddress, dstAddress, mipLevel, 0, key.arrayLayerCount); } regions.push_back({ @@ -766,7 +783,7 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { mipLevel > 0 ? 0 : std::max(key.pitch >> mipLevel, 1u), .imageSubresource = { - .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .aspectMask = toAspect(key.kind), .mipLevel = mipLevel, .baseArrayLayer = key.baseArrayLayer, .layerCount = key.arrayLayerCount, @@ -808,8 +825,13 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { auto cached = std::make_shared(); cached->image = std::move(image); cached->info = std::move(surfaceInfo); - cached->baseAddress = key.address; + cached->baseAddress = (access & Access::Write) != Access::None + ? 
key.writeAddress + : key.readAddress; + cached->kind = key.kind; cached->acquiredAccess = access; + cached->acquiredTileMode = key.tileMode; + cached->acquiredDfmt = key.dfmt; mAcquiredResources.push_back(cached); return {.handle = cached->image.getHandle()}; @@ -827,14 +849,16 @@ Cache::ImageView Cache::Tag::getImageView(const ImageViewKey &key, .a = gnm::toVkComponentSwizzle(key.A), }, { - .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .aspectMask = toAspect(key.kind), .baseMipLevel = key.baseMipLevel, .levelCount = key.mipCount, .baseArrayLayer = key.baseArrayLayer, .layerCount = key.arrayLayerCount, }); auto cached = std::make_shared(); - cached->baseAddress = key.address; + cached->baseAddress = (access & Access::Write) != Access::None + ? key.writeAddress + : key.readAddress; cached->acquiredAccess = access; cached->view = std::move(result); diff --git a/rpcsx-gpu2/Cache.hpp b/rpcsx-gpu2/Cache.hpp index 4ae2c46e1..be197226a 100644 --- a/rpcsx-gpu2/Cache.hpp +++ b/rpcsx-gpu2/Cache.hpp @@ -22,8 +22,15 @@ struct ShaderKey { shader::gcn::Environment env; }; +enum class ImageKind { + Color, + Depth, + Stencil +}; + struct ImageKey { - std::uint64_t address; + std::uint64_t readAddress; + std::uint64_t writeAddress; gnm::TextureType type; gnm::DataFormat dfmt; gnm::NumericFormat nfmt; @@ -35,6 +42,7 @@ struct ImageKey { unsigned mipCount = 1; unsigned baseArrayLayer = 0; unsigned arrayLayerCount = 1; + ImageKind kind = ImageKind::Color; bool pow2pad = false; static ImageKey createFrom(const gnm::TBuffer &tbuffer); diff --git a/rpcsx-gpu2/Device.cpp b/rpcsx-gpu2/Device.cpp index 6fe886a6f..ebdd3b06a 100644 --- a/rpcsx-gpu2/Device.cpp +++ b/rpcsx-gpu2/Device.cpp @@ -301,10 +301,10 @@ bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg, .layerCount = 1, }); - amdgpu::flip(cacheTag, commandBuffer, vk::context->swapchainExtent, - buffer.address, swapchainImageView, - {bufferAttr.width, bufferAttr.height}, compSwap, - getDefaultTileModes()[13], dfmt, nfmt); + amdgpu::flip( + cacheTag, commandBuffer, vk::context->swapchainExtent, buffer.address, + swapchainImageView, {bufferAttr.width, bufferAttr.height}, compSwap, + getDefaultTileModes()[bufferAttr.tilingMode == 1 ? 10 : 8], dfmt, nfmt); transitionImageLayout(commandBuffer, swapchainImage, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, @@ -316,11 +316,11 @@ bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg, }); } else { ImageKey frameKey{ - .address = buffer.address, + .readAddress = buffer.address, .type = gnm::TextureType::Dim2D, .dfmt = dfmt, .nfmt = nfmt, - .tileMode = getDefaultTileModes()[13], + .tileMode = getDefaultTileModes()[bufferAttr.tilingMode == 1 ? 
10 : 8], .extent = { .width = bufferAttr.width, @@ -429,7 +429,7 @@ bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg, }; vkQueueSubmit2(vk::context->presentQueue, 1, &submitInfo, fence); - // vkQueueWaitIdle(queue); + vkQueueWaitIdle(vk::context->presentQueue); } scheduler.then([=, this, cacheTag = std::move(cacheTag)] { diff --git a/rpcsx-gpu2/Registers.hpp b/rpcsx-gpu2/Registers.hpp index b1c13765b..cddc737fd 100644 --- a/rpcsx-gpu2/Registers.hpp +++ b/rpcsx-gpu2/Registers.hpp @@ -405,6 +405,20 @@ struct DbRenderControl { }; }; +struct DbDepthView { + union { + struct { + std::uint32_t sliceStart : 11; + std::uint32_t : 2; + std::uint32_t sliceMax : 11; + bool zReadOnly : 1; + bool stencilReadOnly : 1; + }; + + std::uint32_t raw; + }; +}; + struct CbBlendControl { union { struct { @@ -574,7 +588,7 @@ struct Registers { union { Register<0x0, DbRenderControl> dbRenderControl; Register<0x1> dbCountControl; - Register<0x2> dbDepthView; + Register<0x2, DbDepthView> dbDepthView; Register<0x3> dbRenderOverride; Register<0x4> dbRenderOverride2; Register<0x5> dbHTileDataBase; diff --git a/rpcsx-gpu2/Renderer.cpp b/rpcsx-gpu2/Renderer.cpp index ed5391ec9..064dea5c8 100644 --- a/rpcsx-gpu2/Renderer.cpp +++ b/rpcsx-gpu2/Renderer.cpp @@ -1,6 +1,7 @@ #include "Renderer.hpp" #include "Device.hpp" #include "gnm/descriptors.hpp" +#include "gnm/gnm.hpp" #include "rx/MemoryTable.hpp" #include @@ -227,7 +228,8 @@ struct ShaderResources : eval::Evaluator { bufferMemoryTable.map(*pointerBase, *pointerBase + *pointerOffset + pointer.size, Access::Read); - resourceSlotToAddress.push_back({slotOffset + pointer.resourceSlot, *pointerBase}); + resourceSlotToAddress.push_back( + {slotOffset + pointer.resourceSlot, *pointerBase}); } for (auto &bufferRes : res.buffers) { @@ -352,7 +354,8 @@ struct ShaderResources : eval::Evaluator { sSampler.force_unorm_coords = true; } - slotResources[slotOffset + sampler.resourceSlot] = samplerResources.size(); + slotResources[slotOffset + sampler.resourceSlot] = + samplerResources.size(); samplerResources.push_back( cacheTag->getSampler(amdgpu::SamplerKey::createFrom(sSampler))); } @@ -503,11 +506,78 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, VkRect2D viewPortScissors[8]{}; unsigned renderTargets = 0; + VkRenderingAttachmentInfo depthAttachment{}; + VkRenderingAttachmentInfo stencilAttachment{}; + + auto depthAccess = Access::None; + auto stencilAccess = Access::None; + + if (pipe.context.dbDepthControl.depthEnable) { + if (!pipe.context.dbRenderControl.depthClearEnable) { + depthAccess |= Access::Read; + } + if (!pipe.context.dbDepthView.zReadOnly) { + depthAccess |= Access::Write; + } + } + + if (pipe.context.dbDepthControl.stencilEnable) { + if (!pipe.context.dbRenderControl.stencilClearEnable) { + stencilAccess |= Access::Read; + } + if (!pipe.context.dbDepthView.stencilReadOnly) { + stencilAccess |= Access::Write; + } + } + + if (depthAccess != Access::None) { + auto viewPortScissor = pipe.context.paScScreenScissor; + auto viewPortRect = gnm::toVkRect2D(viewPortScissor); + + auto imageView = cacheTag.getImageView( + {{ + .readAddress = pipe.context.dbZReadBase, + .writeAddress = pipe.context.dbZWriteBase, + .dfmt = gnm::getDataFormat(pipe.context.dbZInfo.format), + .nfmt = gnm::getNumericFormat(pipe.context.dbZInfo.format), + .extent = + { + .width = viewPortRect.extent.width, + .height = viewPortRect.extent.height, + .depth = 1, + }, + .pitch = viewPortRect.extent.width, + .kind = ImageKind::Depth, + }}, + 
depthAccess); + + depthAttachment = { + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + .imageView = imageView.handle, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + }; + + if ((depthAccess & Access::Read) == Access::None) { + depthAttachment.clearValue.depthStencil.depth = pipe.context.dbDepthClear; + depthAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + } + + if ((depthAccess & Access::Write) == Access::None) { + depthAttachment.storeOp = VK_ATTACHMENT_STORE_OP_NONE; + } + } + for (auto &cbColor : pipe.context.cbColor) { if (targetMask == 0) { break; } + if (cbColor.info.dfmt == gnm::kDataFormatInvalid) { + continue; + } + auto viewPortScissor = pipe.context.paScScreenScissor; // viewPortScissor = gnm::intersection( // viewPortScissor, pipe.context.paScVportScissor[renderTargets]); @@ -533,7 +603,9 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, ImageViewKey renderTargetInfo{}; renderTargetInfo.type = gnm::TextureType::Dim2D; renderTargetInfo.pitch = vkViewPortScissor.extent.width; - renderTargetInfo.address = static_cast(cbColor.base) << 8; + renderTargetInfo.readAddress = static_cast(cbColor.base) + << 8; + renderTargetInfo.writeAddress = renderTargetInfo.readAddress; renderTargetInfo.extent.width = vkViewPortScissor.extent.width; renderTargetInfo.extent.height = vkViewPortScissor.extent.height; renderTargetInfo.extent.depth = 1; @@ -545,9 +617,7 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, renderTargetInfo.tileMode = cbColor.info.linearGeneral ? TileMode{.raw = 0} - : getDefaultTileModes()[/*cbColor.attrib.tileModeIndex*/ - 13]; - + : getDefaultTileModes()[cbColor.attrib.tileModeIndex]; // std::printf("draw to %lx\n", renderTargetInfo.address); auto access = Access::None; @@ -613,6 +683,10 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, targetMask >>= 4; } + if (renderTargets == 0) { + return; + } + // if (pipe.context.cbTargetMask == 0) { // return; // } @@ -654,7 +728,7 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, .vgprCount = pgm.rsrc1.getVGprCount(), .sgprCount = pgm.rsrc1.getSGprCount(), .userSgprs = std::span(pgm.userData.data(), pgm.rsrc2.userSgpr), - .supportsBarycentric = vk::context->supportsBarycentric, + // .supportsBarycentric = vk::context->supportsBarycentric, .supportsInt8 = vk::context->supportsInt8, .supportsInt64Atomics = vk::context->supportsInt64Atomics, }; @@ -754,29 +828,33 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, break; case shader::gcn::ConfigType::ViewPortOffsetX: configPtr[index] = std::bit_cast( - pipe.context.paClVports[0].xOffset / (viewPorts[0].width / 2.f) - + pipe.context.paClVports[slot.data].xOffset / + (viewPorts[0].width / 2.f) - 1); break; case shader::gcn::ConfigType::ViewPortOffsetY: configPtr[index] = std::bit_cast( - pipe.context.paClVports[0].yOffset / (viewPorts[0].height / 2.f) - + pipe.context.paClVports[slot.data].yOffset / + (viewPorts[slot.data].height / 2.f) - 1); break; case shader::gcn::ConfigType::ViewPortOffsetZ: - configPtr[index] = - std::bit_cast(pipe.context.paClVports[0].zOffset); + configPtr[index] = std::bit_cast( + pipe.context.paClVports[slot.data].zOffset); break; case shader::gcn::ConfigType::ViewPortScaleX: configPtr[index] = std::bit_cast( - pipe.context.paClVports[0].xScale / (viewPorts[0].width / 2.f)); + pipe.context.paClVports[slot.data].xScale / + 
(viewPorts[slot.data].width / 2.f)); break; case shader::gcn::ConfigType::ViewPortScaleY: configPtr[index] = std::bit_cast( - pipe.context.paClVports[0].yScale / (viewPorts[0].height / 2.f)); + pipe.context.paClVports[slot.data].yScale / + (viewPorts[slot.data].height / 2.f)); break; case shader::gcn::ConfigType::ViewPortScaleZ: - configPtr[index] = - std::bit_cast(pipe.context.paClVports[0].zScale); + configPtr[index] = std::bit_cast( + pipe.context.paClVports[slot.data].zScale); break; case shader::gcn::ConfigType::PsInputVGpr: if (slot.data > psVgprInputs) { @@ -882,8 +960,8 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, .layerCount = 1, .colorAttachmentCount = renderTargets, .pColorAttachments = colorAttachments, - // .pDepthAttachment = &depthAttachment, - // .pStencilAttachment = &stencilAttachment, + .pDepthAttachment = &depthAttachment, + // .pStencilAttachment = &stencilAttachment, }; vkCmdBeginRendering(commandBuffer, &renderInfo); @@ -1092,7 +1170,7 @@ void amdgpu::flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer, ImageViewKey framebuffer{}; framebuffer.type = gnm::TextureType::Dim2D; framebuffer.pitch = imageExtent.width; - framebuffer.address = address; + framebuffer.readAddress = address; framebuffer.extent.width = imageExtent.width; framebuffer.extent.height = imageExtent.height; framebuffer.extent.depth = 1; diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler.hpp b/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler.hpp index 64d0d4668..9d875c4de 100644 --- a/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler.hpp +++ b/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler.hpp @@ -1,11 +1,11 @@ #pragma once #include +#include #include #include #include #include -#include namespace amdgpu { inline constexpr uint32_t kMicroTileWidth = 8; @@ -496,6 +496,28 @@ constexpr std::uint32_t getPipeCount(PipeConfig pipeConfig) { } } +constexpr int computeMacroTileIndex(amdgpu::TileMode tileMode, + uint32_t bitsPerElement, + uint32_t numFragmentsPerPixel) { + auto arrayMode = tileMode.arrayMode(); + auto microTileMode = tileMode.microTileMode(); + auto sampleSplitHw = tileMode.sampleSplit(); + auto tileSplitHw = tileMode.tileSplit(); + + uint32_t tileThickness = getMicroTileThickness(arrayMode); + uint32_t tileBytes1x = + bitsPerElement * kMicroTileWidth * kMicroTileHeight * tileThickness / 8; + uint32_t sampleSplit = 1 << sampleSplitHw; + uint32_t colorTileSplit = std::max(256U, sampleSplit * tileBytes1x); + uint32_t tileSplit = (microTileMode == amdgpu::kMicroTileModeDepth) + ? (64UL << tileSplitHw) + : colorTileSplit; + uint32_t tileSplitC = std::min(kDramRowSize, tileSplit); + uint32_t tileBytes = std::min(tileSplitC, numFragmentsPerPixel * tileBytes1x); + uint32_t mtmIndex = std::countr_zero(tileBytes / 64); + return isPrt(arrayMode) ? 
mtmIndex + 8 : mtmIndex; +} + SurfaceInfo computeSurfaceInfo(TileMode tileMode, gnm::TextureType type, gnm::DataFormat dfmt, std::uint32_t width, std::uint32_t height, std::uint32_t depth, diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler_vulkan.hpp b/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler_vulkan.hpp index 658dc7d5a..718fb15c1 100644 --- a/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler_vulkan.hpp +++ b/rpcsx-gpu2/lib/amdgpu-tiler/include/amdgpu/tiler_vulkan.hpp @@ -1,4 +1,5 @@ #pragma once +#include "gnm/constants.hpp" #include "tiler.hpp" #include #include @@ -10,13 +11,13 @@ struct GpuTiler { ~GpuTiler(); void detile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info, - amdgpu::TileMode tileMode, std::uint64_t srcTiledAddress, - std::uint64_t dstLinearAddress, int mipLevel, int baseArray, - int arrayCount); + amdgpu::TileMode tileMode, gnm::DataFormat dfmt, + std::uint64_t srcTiledAddress, std::uint64_t dstLinearAddress, + int mipLevel, int baseArray, int arrayCount); void tile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info, - amdgpu::TileMode tileMode, std::uint64_t srcLinearAddress, - std::uint64_t dstTiledAddress, int mipLevel, int baseArray, - int arrayCount); + amdgpu::TileMode tileMode, gnm::DataFormat dfmt, + std::uint64_t srcLinearAddress, std::uint64_t dstTiledAddress, + int mipLevel, int baseArray, int arrayCount); private: std::unique_ptr mImpl; diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detiler2d.comp.glsl b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detiler2d.comp.glsl index 2f65b404e..ea01560ba 100644 --- a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detiler2d.comp.glsl +++ b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/detiler2d.comp.glsl @@ -18,17 +18,24 @@ void main() { uvec3 pos = gl_GlobalInvocationID; uint64_t tiledSliceOffset = 0; uint64_t linearSliceOffset = 0; + int arraySlice = 0; + int fragmentIndex = 0; + if (config.tiledSurfaceSize != 0) { tiledSliceOffset = pos.z * config.tiledSurfaceSize; linearSliceOffset = pos.z * config.linearSurfaceSize; pos.z = 0; } - uint64_t tiledByteOffset = getTiledBitOffset1D( + uint64_t tiledByteOffset = getTiledBitOffset2D( + config.dfmt, config.tileMode, - pos, + config.macroTileMode, config.dataSize, - config.bitsPerElement + arraySlice, + config.numFragments, + pos, + fragmentIndex ) / 8; tiledByteOffset += tiledSliceOffset; diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler.glsl b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler.glsl index 04c9dbd04..bcb6f70ff 100644 --- a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler.glsl +++ b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler.glsl @@ -537,6 +537,86 @@ uint32_t tileMode_getSampleSplit(uint32_t tileMode) { return (tileMode & 0x06000000) >> 25; } +uint32_t macroTileMode_getBankWidth(uint32_t tileMode) { + return (tileMode & 0x00000003) >> 0; +} +uint32_t macroTileMode_getBankHeight(uint32_t tileMode) { + return (tileMode & 0x0000000c) >> 2; +} +uint32_t macroTileMode_getMacroTileAspect(uint32_t tileMode) { + return (tileMode & 0x00000030) >> 4; +} +uint32_t macroTileMode_getNumBanks(uint32_t tileMode) { + return (tileMode & 0x000000c0) >> 6; +} + +uint32_t getPipeCount(uint32_t pipeConfig) { + switch (pipeConfig) { + case kPipeConfigP8_32x32_8x16: + case kPipeConfigP8_32x32_16x16: + return 8; + case kPipeConfigP16: + return 16; + default: + return 0; + } +} + +uint32_t getPipeIndex(uint32_t x, uint32_t y, uint32_t pipeCfg) { + uint32_t pipe = 0; + switch (pipeCfg) { + case kPipeConfigP8_32x32_8x16: + pipe |= (((x >> 4) ^ (y >> 3) ^ (x >> 5)) & 0x1) << 0; + pipe |= (((x >> 3) ^ 
(y >> 4)) & 0x1) << 1; + pipe |= (((x >> 5) ^ (y >> 5)) & 0x1) << 2; + break; + case kPipeConfigP8_32x32_16x16: + pipe |= (((x >> 3) ^ (y >> 3) ^ (x >> 4)) & 0x1) << 0; + pipe |= (((x >> 4) ^ (y >> 4)) & 0x1) << 1; + pipe |= (((x >> 5) ^ (y >> 5)) & 0x1) << 2; + break; + case kPipeConfigP16: + pipe |= (((x >> 3) ^ (y >> 3) ^ (x >> 4)) & 0x1) << 0; + pipe |= (((x >> 4) ^ (y >> 4)) & 0x1) << 1; + pipe |= (((x >> 5) ^ (y >> 5)) & 0x1) << 2; + pipe |= (((x >> 6) ^ (y >> 5)) & 0x1) << 3; + break; + } + return pipe; +} + +uint32_t getBankIndex(uint32_t x, uint32_t y, uint32_t bank_width, uint32_t bank_height, uint32_t num_banks, uint32_t num_pipes) { + uint32_t x_shift_offset = findLSB(bank_width * num_pipes); + uint32_t y_shift_offset = findLSB(bank_height); + uint32_t xs = x >> x_shift_offset; + uint32_t ys = y >> y_shift_offset; + uint32_t bank = 0; + switch (num_banks) { + case 2: + bank |= (((xs >> 3) ^ (ys >> 3)) & 0x1) << 0; + break; + case 4: + bank |= (((xs >> 3) ^ (ys >> 4)) & 0x1) << 0; + bank |= (((xs >> 4) ^ (ys >> 3)) & 0x1) << 1; + break; + case 8: + bank |= (((xs >> 3) ^ (ys >> 5)) & 0x1) << 0; + bank |= (((xs >> 4) ^ (ys >> 4) ^ (ys >> 5)) & 0x1) << 1; + bank |= (((xs >> 5) ^ (ys >> 3)) & 0x1) << 2; + break; + case 16: + bank |= (((xs >> 3) ^ (ys >> 6)) & 0x1) << 0; + bank |= (((xs >> 4) ^ (ys >> 5) ^ (ys >> 6)) & 0x1) << 1; + bank |= (((xs >> 5) ^ (ys >> 4)) & 0x1) << 2; + bank |= (((xs >> 6) ^ (ys >> 3)) & 0x1) << 3; + break; + default: + break; + } + + return bank; +} + uint32_t bit_ceil(uint32_t x) { x = x - 1; x |= x >> 1; @@ -704,13 +784,223 @@ uint64_t getTiledBitOffset1D(uint32_t tileMode, uvec3 pos, uvec2 dataSize, uint3 return (sliceOffset + tileOffset) * 8 + elementOffset; } + +uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTileMode, + uvec2 dataSize, int arraySlice, uint32_t numFragments, u32vec3 pos, int fragmentIndex) { + uint32_t bitsPerFragment = getBitsPerElement(dfmt); + + bool isBlockCompressed = getTexelsPerElement(dfmt) > 1; + uint32_t tileSwizzleMask = 0; + uint32_t numFragmentsPerPixel = 1 << numFragments; + uint32_t arrayMode = tileMode_getArrayMode(tileMode); + + uint32_t tileThickness = 1; + + switch (arrayMode) { + case kArrayMode2dTiledThin: + case kArrayMode3dTiledThin: + case kArrayModeTiledThinPrt: + case kArrayMode2dTiledThinPrt: + case kArrayMode3dTiledThinPrt: + tileThickness = 1; + break; + case kArrayMode1dTiledThick: + case kArrayMode2dTiledThick: + case kArrayMode3dTiledThick: + case kArrayModeTiledThickPrt: + case kArrayMode2dTiledThickPrt: + case kArrayMode3dTiledThickPrt: + tileThickness = 4; + break; + case kArrayMode2dTiledXThick: + case kArrayMode3dTiledXThick: + tileThickness = 8; + break; + default: + break; + } + + uint32_t bitsPerElement = bitsPerFragment; + uint32_t paddedWidth = dataSize.x; + uint32_t paddedHeight = dataSize.y; + + uint32_t bankWidthHW = macroTileMode_getBankWidth(macroTileMode); + uint32_t bankHeightHW = macroTileMode_getBankHeight(macroTileMode); + uint32_t macroAspectHW = macroTileMode_getMacroTileAspect(macroTileMode); + uint32_t numBanksHW = macroTileMode_getNumBanks(macroTileMode); + + uint32_t bankWidth = 1 << bankWidthHW; + uint32_t bankHeight = 1 << bankHeightHW; + uint32_t numBanks = 2 << numBanksHW; + uint32_t macroTileAspect = 1 << macroAspectHW; + + uint32_t tileBytes1x = + (tileThickness * bitsPerElement * kMicroTileWidth * kMicroTileHeight + + 7) / + 8; + + uint32_t sampleSplitHw = tileMode_getSampleSplit(tileMode); + uint32_t tileSplitHw = 
tileMode_getTileSplit(tileMode); + uint32_t sampleSplit = 1 << sampleSplitHw; + uint32_t tileSplitC = + (tileMode_getMicroTileMode(tileMode) == kMicroTileModeDepth) + ? (64 << tileSplitHw) + : max(256U, tileBytes1x * sampleSplit); + + uint32_t tileSplitBytes = min(kDramRowSize, tileSplitC); + + uint32_t numPipes = getPipeCount(tileMode_getPipeConfig(tileMode)); + uint32_t pipeInterleaveBits = findLSB(kPipeInterleaveBytes); + uint32_t pipeInterleaveMask = (1 << pipeInterleaveBits) - 1; + uint32_t pipeBits = findLSB(numPipes); + uint32_t bankBits = findLSB(numBanks); + uint32_t bankSwizzleMask = tileSwizzleMask; + uint32_t pipeSwizzleMask = 0; + uint32_t macroTileWidth = + (kMicroTileWidth * bankWidth * numPipes) * macroTileAspect; + uint32_t macroTileHeight = + (kMicroTileHeight * bankHeight * numBanks) / macroTileAspect; + + uint32_t microTileMode = tileMode_getMicroTileMode(tileMode); + + uint64_t elementIndex = + getElementIndex(pos, bitsPerElement, microTileMode, arrayMode); + + uint32_t xh = pos.x; + uint32_t yh = pos.y; + if (arrayMode == kArrayModeTiledThinPrt || + arrayMode == kArrayModeTiledThickPrt) { + xh %= macroTileWidth; + yh %= macroTileHeight; + } + uint64_t pipe = getPipeIndex(xh, yh, tileMode_getPipeConfig(tileMode)); + uint64_t bank = + getBankIndex(xh, yh, bankWidth, bankHeight, numBanks, numPipes); + + uint32_t tileBytes = (kMicroTileWidth * kMicroTileHeight * tileThickness * + bitsPerElement * numFragmentsPerPixel + + 7) / + 8; + + uint64_t elementOffset = 0; + if (microTileMode == kMicroTileModeDepth) { + uint64_t pixelOffset = elementIndex * bitsPerElement * numFragmentsPerPixel; + elementOffset = pixelOffset + (fragmentIndex * bitsPerElement); + } else { + uint64_t fragmentOffset = + fragmentIndex * (tileBytes / numFragmentsPerPixel) * 8; + elementOffset = fragmentOffset + (elementIndex * bitsPerElement); + } + + uint64_t slicesPerTile = 1; + uint64_t tileSplitSlice = 0; + if (tileBytes > tileSplitBytes && tileThickness == 1) { + slicesPerTile = tileBytes / tileSplitBytes; + tileSplitSlice = elementOffset / (tileSplitBytes * 8); + elementOffset %= (tileSplitBytes * 8); + tileBytes = tileSplitBytes; + } + + uint64_t macroTileBytes = (macroTileWidth / kMicroTileWidth) * + (macroTileHeight / kMicroTileHeight) * tileBytes / + (numPipes * numBanks); + uint64_t macroTilesPerRow = paddedWidth / macroTileWidth; + uint64_t macroTileRowIndex = pos.y / macroTileHeight; + uint64_t macroTileColumnIndex = pos.x / macroTileWidth; + uint64_t macroTileIndex = + (macroTileRowIndex * macroTilesPerRow) + macroTileColumnIndex; + uint64_t macro_tile_offset = macroTileIndex * macroTileBytes; + uint64_t macroTilesPerSlice = + macroTilesPerRow * (paddedHeight / macroTileHeight); + uint64_t sliceBytes = macroTilesPerSlice * macroTileBytes; + + uint32_t slice = pos.z; + uint64_t sliceOffset = + (tileSplitSlice + slicesPerTile * slice / tileThickness) * sliceBytes; + if (arraySlice != 0) { + slice = arraySlice; + } + + uint64_t tileRowIndex = (pos.y / kMicroTileHeight) % bankHeight; + uint64_t tileColumnIndex = ((pos.x / kMicroTileWidth) / numPipes) % bankWidth; + uint64_t tileIndex = (tileRowIndex * bankWidth) + tileColumnIndex; + uint64_t tileOffset = tileIndex * tileBytes; + + uint64_t bankSwizzle = bankSwizzleMask; + uint64_t pipeSwizzle = pipeSwizzleMask; + + uint64_t pipeSliceRotation = 0; + switch (arrayMode) { + case kArrayMode3dTiledThin: + case kArrayMode3dTiledThick: + case kArrayMode3dTiledXThick: + pipeSliceRotation = + max(1UL, (numPipes / 2UL) - 1UL) * (slice / tileThickness); + 
break; + default: + break; + } + pipeSwizzle += pipeSliceRotation; + pipeSwizzle &= (numPipes - 1); + pipe = pipe ^ pipeSwizzle; + + uint64_t sliceRotation = 0; + switch (arrayMode) { + case kArrayMode2dTiledThin: + case kArrayMode2dTiledThick: + case kArrayMode2dTiledXThick: + sliceRotation = ((numBanks / 2) - 1) * (slice / tileThickness); + break; + case kArrayMode3dTiledThin: + case kArrayMode3dTiledThick: + case kArrayMode3dTiledXThick: + sliceRotation = max(1UL, (numPipes / 2UL) - 1UL) * (slice / tileThickness) / numPipes; + break; + default: + break; + } + uint64_t tileSplitSliceRotation = 0; + switch (arrayMode) { + case kArrayMode2dTiledThin: + case kArrayMode3dTiledThin: + case kArrayMode2dTiledThinPrt: + case kArrayMode3dTiledThinPrt: + tileSplitSliceRotation = ((numBanks / 2) + 1) * tileSplitSlice; + break; + default: + break; + } + bank ^= bankSwizzle + sliceRotation; + bank ^= tileSplitSliceRotation; + bank &= (numBanks - 1); + + uint64_t totalOffset = + (sliceOffset + macro_tile_offset + tileOffset) * 8 + elementOffset; + uint64_t bitOffset = totalOffset & 0x7; + totalOffset /= 8; + + uint64_t pipeInterleaveOffset = totalOffset & pipeInterleaveMask; + uint64_t offset = totalOffset >> pipeInterleaveBits; + + uint64_t finalByteOffset = + pipeInterleaveOffset | (pipe << (pipeInterleaveBits)) | + (bank << (pipeInterleaveBits + pipeBits)) | + (offset << (pipeInterleaveBits + pipeBits + bankBits)); + return (finalByteOffset << 3) | bitOffset; +} + + layout(binding=0) uniform Config { uint64_t srcAddress; uint64_t dstAddress; uvec2 dataSize; uint32_t tileMode; + uint32_t macroTileMode; + uint32_t dfmt; uint32_t numFragments; uint32_t bitsPerElement; uint32_t tiledSurfaceSize; uint32_t linearSurfaceSize; + uint32_t padding0; + uint32_t padding1; } config; diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler2d.comp.glsl b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler2d.comp.glsl index db92aae0b..6fc258307 100644 --- a/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler2d.comp.glsl +++ b/rpcsx-gpu2/lib/amdgpu-tiler/shaders/tiler2d.comp.glsl @@ -18,17 +18,23 @@ void main() { uvec3 pos = gl_GlobalInvocationID; uint64_t tiledSliceOffset = 0; uint64_t linearSliceOffset = 0; + int arraySlice = 0; + int fragmentIndex = 0; if (config.tiledSurfaceSize != 0) { tiledSliceOffset = pos.z * config.tiledSurfaceSize; linearSliceOffset = pos.z * config.linearSurfaceSize; pos.z = 0; } - uint64_t tiledByteOffset = getTiledBitOffset1D( + uint64_t tiledByteOffset = getTiledBitOffset2D( + config.dfmt, config.tileMode, - pos, + config.macroTileMode, config.dataSize, - config.bitsPerElement + arraySlice, + config.numFragments, + pos, + fragmentIndex ) / 8; tiledByteOffset += tiledSliceOffset; diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler.cpp b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler.cpp index eadae2b84..e99f0518c 100644 --- a/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler.cpp +++ b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler.cpp @@ -5,6 +5,169 @@ using namespace amdgpu; +// FIXME: should be properly implemented +static SurfaceInfo +computeTexture2dInfo(ArrayMode arrayMode, gnm::TextureType type, + gnm::DataFormat dfmt, std::uint32_t width, + std::uint32_t height, std::uint32_t depth, + std::uint32_t pitch, int baseArrayLayer, int arrayCount, + int baseMipLevel, int mipCount, bool pow2pad) { + bool isCubemap = type == gnm::TextureType::Cube; + bool isVolume = type == gnm::TextureType::Dim3D; + + auto bitsPerFragment = getBitsPerElement(dfmt); + std::uint32_t arraySliceCount = depth; + + if (isCubemap) { + arraySliceCount *= 6; + } 
else if (isVolume) { + arraySliceCount = 1; + } + + int numFragments = (type == gnm::TextureType::Msaa2D || + type == gnm::TextureType::MsaaArray2D) + ? (baseArrayLayer + arrayCount - 1) + : 0; + + auto numFragmentsPerPixel = 1 << numFragments; + auto isBlockCompressed = getTexelsPerElement(dfmt) > 1; + + auto bitsPerElement = bitsPerFragment; + depth = isVolume ? depth : 1; + + if (isBlockCompressed) { + switch (bitsPerFragment) { + case 1: + bitsPerElement *= 8; + break; + case 4: + case 8: + bitsPerElement *= 16; + break; + case 16: + std::abort(); + break; + + default: + std::abort(); + break; + } + } + + if (pow2pad) { + arraySliceCount = std::bit_ceil(arraySliceCount); + } + + std::uint64_t surfaceOffset = 0; + std::uint64_t surfaceSize = 0; + + SurfaceInfo result; + result.width = width; + result.height = height; + result.depth = depth; + result.pitch = pitch; + result.numFragments = numFragments; + result.bitsPerElement = bitsPerElement; + result.arrayLayerCount = arraySliceCount; + + auto thickness = getMicroTileThickness(arrayMode); + + for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) { + std::uint32_t elemWidth = std::max(width >> mipLevel, 1); + std::uint32_t elemPitch = std::max(pitch >> mipLevel, 1); + std::uint32_t elemHeight = std::max(height >> mipLevel, 1); + std::uint32_t elemDepth = std::max(depth >> mipLevel, 1); + + std::uint32_t linearPitch = elemPitch; + std::uint32_t linearWidth = elemWidth; + std::uint32_t linearHeight = elemHeight; + std::uint32_t linearDepth = elemDepth; + + if (isBlockCompressed) { + switch (bitsPerFragment) { + case 1: + linearWidth = std::max((linearWidth + 7) / 8, 1); + linearPitch = std::max((linearPitch + 7) / 8, 1); + break; + case 4: + case 8: + linearWidth = std::max((linearWidth + 3) / 4, 1); + linearPitch = std::max((linearPitch + 3) / 4, 1); + linearHeight = std::max((linearHeight + 3) / 4, 1); + break; + case 16: + std::abort(); + break; + + default: + std::abort(); + break; + } + } + + if (pow2pad) { + linearPitch = std::bit_ceil(linearPitch); + linearWidth = std::bit_ceil(linearWidth); + linearHeight = std::bit_ceil(linearHeight); + linearDepth = std::bit_ceil(linearDepth); + } + + if (mipLevel > 0 && pitch > 0) { + linearPitch = linearWidth; + } + + std::uint32_t paddedPitch = + (linearPitch + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1); + std::uint32_t paddedHeight = + (linearHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1); + std::uint32_t paddedDepth = linearDepth; + + if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) { + if (isCubemap) { + linearDepth = std::bit_ceil(linearDepth); + } + + paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1); + } + + std::uint32_t tempPitch = paddedPitch; + std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) * + paddedHeight * bitsPerElement * + numFragmentsPerPixel; + logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8; + + uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness; + while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) { + tempPitch += kMicroTileWidth; + logicalSliceSizeBytes = std::uint64_t(tempPitch) * paddedHeight * + bitsPerElement * numFragmentsPerPixel; + logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8; + physicalSliceSizeBytes = logicalSliceSizeBytes * thickness; + } + + surfaceSize = logicalSliceSizeBytes * paddedDepth; + auto linearSize = + linearDepth * + (linearPitch * linearHeight * bitsPerElement * numFragmentsPerPixel + + 7) / + 8; + + result.setSubresourceInfo(mipLevel, { + 
.dataWidth = linearPitch, + .dataHeight = linearHeight, + .dataDepth = linearDepth, + .offset = surfaceOffset, + .tiledSize = surfaceSize, + .linearSize = linearSize, + }); + + surfaceOffset += arraySliceCount * surfaceSize; + } + + result.totalSize = surfaceOffset; + return result; +} + static SurfaceInfo computeTexture1dInfo(ArrayMode arrayMode, gnm::TextureType type, gnm::DataFormat dfmt, std::uint32_t width, @@ -370,7 +533,9 @@ SurfaceInfo amdgpu::computeSurfaceInfo( case kArrayMode2dTiledThickPrt: case kArrayMode3dTiledThinPrt: case kArrayMode3dTiledThickPrt: - std::abort(); + return computeTexture2dInfo(tileMode.arrayMode(), type, dfmt, width, height, + depth, pitch, baseArrayLayer, arrayCount, + baseMipLevel, mipCount, pow2pad); } std::abort(); diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_cpu.cpp b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_cpu.cpp index 206def23f..33c287ba9 100644 --- a/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_cpu.cpp +++ b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_cpu.cpp @@ -175,9 +175,9 @@ getTiledOffset2D(gnm::TextureType texType, bool isPow2Padded, bool isCubemap = texType == gnm::TextureType::Cube; bool isVolume = texType == gnm::TextureType::Dim3D; - auto m_bitsPerFragment = getBitsPerElement(dfmt); + auto bitsPerFragment = getBitsPerElement(dfmt); - auto m_isBlockCompressed = getTexelsPerElement(dfmt) > 1; + auto isBlockCompressed = getTexelsPerElement(dfmt) > 1; auto tileSwizzleMask = 0; auto numFragmentsPerPixel = 1 << numFragments; auto arrayMode = tileMode.arrayMode(); @@ -208,12 +208,12 @@ getTiledOffset2D(gnm::TextureType texType, bool isPow2Padded, break; } - auto bitsPerElement = m_bitsPerFragment; + auto bitsPerElement = bitsPerFragment; auto paddedWidth = pitch; auto paddedHeight = height; - if (m_isBlockCompressed) { - switch (m_bitsPerFragment) { + if (isBlockCompressed) { + switch (bitsPerFragment) { case 1: bitsPerElement *= 8; paddedWidth = std::max((paddedWidth + 7) / 8, 1); diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp index 587b45f43..12c8a7399 100644 --- a/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp +++ b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp @@ -93,10 +93,13 @@ struct amdgpu::GpuTiler::Impl { uint32_t dataWidth; uint32_t dataHeight; uint32_t tileMode; + uint32_t macroTileMode; + uint32_t dfmt; uint32_t numFragments; uint32_t bitsPerElement; uint32_t tiledSurfaceSize; uint32_t linearSurfaceSize; + uint32_t padding[2]; }; Impl() { @@ -119,7 +122,8 @@ struct amdgpu::GpuTiler::Impl { { VkDescriptorPoolSize poolSizes[]{{ .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - .descriptorCount = static_cast(std::size(descriptorSets)) * 2, + .descriptorCount = + static_cast(std::size(descriptorSets)) * 2, }}; VkDescriptorPoolCreateInfo info{ @@ -174,7 +178,7 @@ amdgpu::GpuTiler::~GpuTiler() = default; void amdgpu::GpuTiler::detile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info, - amdgpu::TileMode tileMode, + amdgpu::TileMode tileMode, gnm::DataFormat dfmt, std::uint64_t srcTiledAddress, std::uint64_t dstLinearAddress, int mipLevel, int baseArray, int arrayCount) { @@ -192,6 +196,7 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler, config->dataWidth = subresource.dataWidth; config->dataHeight = subresource.dataHeight; config->tileMode = tileMode.raw; + config->dfmt = dfmt; config->numFragments = info.numFragments; config->bitsPerElement = info.bitsPerElement; uint32_t groupCountZ = subresource.dataDepth; @@ -231,8 +236,13 @@ void amdgpu::GpuTiler::detile(Scheduler 
&scheduler, case amdgpu::kArrayMode3dTiledThick: case amdgpu::kArrayMode3dTiledXThick: case amdgpu::kArrayMode3dTiledThickPrt: - std::abort(); - vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler2d.shader); + config->macroTileMode = + getDefaultMacroTileModes()[computeMacroTileIndex( + tileMode, info.bitsPerElement, + 1 << info.numFragments)] + .raw; + + vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler1d.shader); break; } @@ -265,7 +275,7 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler, void amdgpu::GpuTiler::tile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info, - amdgpu::TileMode tileMode, + amdgpu::TileMode tileMode, gnm::DataFormat dfmt, std::uint64_t srcLinearAddress, std::uint64_t dstTiledAddress, int mipLevel, int baseArray, int arrayCount) { @@ -283,6 +293,7 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler, config->dataWidth = subresource.dataWidth; config->dataHeight = subresource.dataHeight; config->tileMode = tileMode.raw; + config->dfmt = dfmt; config->numFragments = info.numFragments; config->bitsPerElement = info.bitsPerElement; uint32_t groupCountZ = subresource.dataDepth; @@ -321,8 +332,12 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler, case amdgpu::kArrayMode3dTiledThick: case amdgpu::kArrayMode3dTiledXThick: case amdgpu::kArrayMode3dTiledThickPrt: - std::abort(); - vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler2d.shader); + config->macroTileMode = + getDefaultMacroTileModes()[computeMacroTileIndex( + tileMode, info.bitsPerElement, + 1 << info.numFragments)] + .raw; + vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler1d.shader); break; } diff --git a/rpcsx-gpu2/lib/gcn-shader/shaders/rdna.glsl b/rpcsx-gpu2/lib/gcn-shader/shaders/rdna.glsl index 8862a8a0c..6fe9f8f69 100644 --- a/rpcsx-gpu2/lib/gcn-shader/shaders/rdna.glsl +++ b/rpcsx-gpu2/lib/gcn-shader/shaders/rdna.glsl @@ -15,6 +15,7 @@ #extension GL_EXT_shader_atomic_float2 : require #extension GL_EXT_nonuniform_qualifier: require #extension GL_EXT_samplerless_texture_functions : require +#extension GL_EXT_debug_printf : enable #define FLT_MAX 3.402823466e+38 #define FLT_MIN 1.175494351e-38 @@ -235,6 +236,8 @@ float32_t ps_input_vgpr(int32_t index, f32vec4 fragCoord, bool frontFace) { case kPsVGprInputPosFixed: return 0; } + + // debugPrintfEXT("ps_input_vgpr: invalid index %d", index); return 0; } @@ -385,8 +388,10 @@ uint32_t v_cndmask_b32(uint32_t x, uint32_t y, uint64_t mask) { float32_t v_add_f32(float32_t x, float32_t y) { return x + y; } float32_t v_sub_f32(float32_t x, float32_t y) { return x - y; } float32_t v_subrev_f32(float32_t x, float32_t y) { return y - x; } -float32_t v_mac_legacy_f32(float32_t x, float32_t y, float32_t dst) { - return x == 0 || y == 0 ? dst : fma(x, y, dst); +void v_mac_legacy_f32(inout float32_t dst, float32_t x, float32_t y) { + if (!(x == 0 || y == 0)) { + dst = fma(x, y, dst); + } } float32_t v_mul_legacy_f32(float32_t x, float32_t y) { return x == 0 || y == 0 ? 
0 : x * y; @@ -425,7 +430,7 @@ uint32_t v_and_b32(uint32_t x, uint32_t y) { return x & y; } uint32_t v_or_b32(uint32_t x, uint32_t y) { return x | y; } uint32_t v_xor_b32(uint32_t x, uint32_t y) { return x ^ y; } uint32_t v_bfm_b32(uint32_t x, uint32_t y) { return ((1 << (x & 0x1f)) - 1) << (y & 0x1f); } -float32_t v_mac_f32(float32_t x, float32_t y, float32_t dst) { return fma(x, y, dst); } +void v_mac_f32(inout float32_t dst, float32_t x, float32_t y) { dst = fma(x, y, dst); } float32_t v_madmk_f32(float32_t x, float32_t y, float32_t k) { return fma(x, k, y); } float32_t v_madak_f32(float32_t x, float32_t y, float32_t k) { return fma(x, y, k); } uint32_t v_bcnt_u32_b32(uint32_t x) { return bitCount(x); } @@ -2575,6 +2580,8 @@ void image_sample(inout f32vec4 vdata, f32vec3 vaddr, int32_t textureIndexHint, return; } + // debugPrintfEXT("image_sample: textureType: %u, coord: %v3f, result: %v4f, dmask: %u", textureType, vaddr, result, dmask); + int vdataIndex = 0; for (int i = 0; i < 4; ++i) { if ((dmask & (1 << i)) != 0) { diff --git a/rpcsx-gpu2/lib/gcn-shader/src/GcnConverter.cpp b/rpcsx-gpu2/lib/gcn-shader/src/GcnConverter.cpp index 25a108cb6..35d80443c 100644 --- a/rpcsx-gpu2/lib/gcn-shader/src/GcnConverter.cpp +++ b/rpcsx-gpu2/lib/gcn-shader/src/GcnConverter.cpp @@ -1422,12 +1422,6 @@ static void createInitialValues(GcnConverter &converter, if (stage != gcn::Stage::Cs) { context.writeReg(loc, builder, gcn::RegId::Exec, 0, context.imm64(1)); - // context.writeReg(loc, builder, gcn::RegId::ThreadId, 0, - // context.imm32(0)); - - replaceVariableWithConstant( - context.getOrCreateRegisterVariable(gcn::RegId::ThreadId), - context.imm32(0)); } if (stage == gcn::Stage::VsVs || stage == gcn::Stage::GsVs || @@ -1561,6 +1555,12 @@ gcn::convertToSpv(Context &context, ir::Region body, createInitialValues(converter, env, stage, result.info, body); instructionsToSpv(converter, importer, stage, env, semanticInfo, result.info, body); + if (stage != gcn::Stage::Cs) { + replaceVariableWithConstant( + context.getOrCreateRegisterVariable(gcn::RegId::ThreadId), + context.imm32(0)); + } + createEntryPoint(context, stage, std::move(body)); for (int userSgpr = std::countr_zero(context.requiredUserSgprs); diff --git a/rpcsx-gpu2/lib/gcn-shader/src/GcnInstruction.cpp b/rpcsx-gpu2/lib/gcn-shader/src/GcnInstruction.cpp index 9bd91d276..3e945fc2f 100644 --- a/rpcsx-gpu2/lib/gcn-shader/src/GcnInstruction.cpp +++ b/rpcsx-gpu2/lib/gcn-shader/src/GcnInstruction.cpp @@ -127,8 +127,6 @@ readVop2Inst(GcnInstruction &inst, std::uint64_t &address, if (op == ir::vop2::MADMK_F32 || op == ir::vop2::MADAK_F32) { inst.addOperand(createImmediateGcnOperand(address)); - } else if (op == ir::vop2::MAC_F32) { - inst.addOperand(createVgprGcnOperand(vdst).withR()); } } @@ -343,8 +341,6 @@ readVop3Inst(GcnInstruction &inst, std::uint64_t &address, .withNeg(((neg >> 2) & 1) != 0)); } else if (op == ir::vop3::MADMK_F32 || op == ir::vop3::MADAK_F32) { inst.addOperand(createImmediateGcnOperand(address)); - } else if (op == ir::vop3::MAC_F32) { - inst.addOperand(createSgprGcnOperand(address, vdst).withRW()); } } else if (op >= 384 && op < ir::vop1::OpCount + 384) { // vop1 @@ -527,14 +523,14 @@ readMtbufInst(GcnInstruction &inst, std::uint64_t &address, inst.op = op; inst.addOperand(createVgprGcnOperand(vdata).withAccess(dataAccess)); - if (idxen) { - inst.addOperand(createVgprGcnOperand(vaddr).withR()); + if (offen) { + inst.addOperand(createVgprGcnOperand(vaddr + (idxen ? 
diff --git a/rpcsx-gpu2/lib/gcn-shader/src/gcn.cpp b/rpcsx-gpu2/lib/gcn-shader/src/gcn.cpp
index 938a75e72..0fb854fbb 100644
--- a/rpcsx-gpu2/lib/gcn-shader/src/gcn.cpp
+++ b/rpcsx-gpu2/lib/gcn-shader/src/gcn.cpp
@@ -1081,6 +1081,23 @@ static ir::Value deserializeGcnRegion(
     auto instSem =
         semInfo.findSemantic(ir::getInstructionId(isaInst.kind, isaInst.op));
 
+    auto createExecTest = [&] {
+      auto mergeBlock = builder.createSpvLabel(loc);
+      gcn::Builder::createInsertBefore(converter, mergeBlock)
+          .createSpvBranch(loc, mergeBlock);
+      auto instBlock = gcn::Builder::createInsertAfter(converter, instrBegin)
+                           .createSpvLabel(loc);
+      auto prependInstBuilder =
+          gcn::Builder::createInsertBefore(converter, instBlock);
+      auto exec = prependInstBuilder.createValue(
+          loc, ir::amdgpu::EXEC_TEST,
+          converter.getType(execTestSem->returnType));
+      prependInstBuilder.createSpvSelectionMerge(
+          loc, mergeBlock, ir::spv::SelectionControl::None);
+      prependInstBuilder.createSpvBranchConditional(loc, exec, instBlock,
+                                                    mergeBlock);
+    };
+
     if (instSem == nullptr) {
       if (isaInst == ir::sopp::BRANCH) {
         auto target =
@@ -1268,6 +1285,9 @@ static ir::Value deserializeGcnRegion(
         inst.addOperand(createOperandRead(loc, paramBuilder, uint32TV, op));
       }
 
+      if (isaInst == ir::exp::EXP) {
+        createExecTest();
+      }
       continue;
     }
 
@@ -1400,20 +1420,7 @@
     }
 
     if (!hasDestination && injectExecTest) {
-      auto mergeBlock = builder.createSpvLabel(loc);
-      gcn::Builder::createInsertBefore(converter, mergeBlock)
-          .createSpvBranch(loc, mergeBlock);
-      auto instBlock = gcn::Builder::createInsertAfter(converter, instrBegin)
-                           .createSpvLabel(loc);
-      auto prependInstBuilder =
-          gcn::Builder::createInsertBefore(converter, instBlock);
-      auto exec = prependInstBuilder.createValue(
-          loc, ir::amdgpu::EXEC_TEST,
-          converter.getType(execTestSem->returnType));
-      prependInstBuilder.createSpvSelectionMerge(
-          loc, mergeBlock, ir::spv::SelectionControl::None);
-      prependInstBuilder.createSpvBranchConditional(loc, exec, instBlock,
-                                                    mergeBlock);
+      createExecTest();
     }
   }
 
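For the v_mac_*_f32 simplification earlier in this patch (the rdna.glsl hunks plus the readVop2Inst/readVop3Inst changes), the point is that V_MAC reads and writes its destination, so modelling it as an inout parameter removes the extra destination-read operand the decoder used to append. A host-side reference of the intended semantics, as a hedged sketch in plain C++ (not the emitter's actual code):

  #include <cassert>
  #include <cmath>

  // V_MAC_* read-modify-write their destination, which is why the GLSL
  // helpers now take it as `inout`.
  void v_mac_f32(float &dst, float x, float y) {
    dst = std::fma(x, y, dst); // dst += x * y, single rounding
  }

  void v_mac_legacy_f32(float &dst, float x, float y) {
    // "Legacy" multiply treats 0 * anything as 0, so a zero operand leaves
    // the accumulator untouched.
    if (!(x == 0.0f || y == 0.0f)) {
      dst = std::fma(x, y, dst);
    }
  }

  int main() {
    float acc = 1.0f;
    v_mac_f32(acc, 2.0f, 3.0f); // acc = 2 * 3 + 1
    assert(acc == 7.0f);
    v_mac_legacy_f32(acc, 0.0f, 5.0f); // zero operand: acc unchanged
    assert(acc == 7.0f);
  }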
diff --git a/rpcsx-gpu2/lib/gnm/include/gnm/gnm.hpp b/rpcsx-gpu2/lib/gnm/include/gnm/gnm.hpp
index 2593e6a1c..275df90e5 100644
--- a/rpcsx-gpu2/lib/gnm/include/gnm/gnm.hpp
+++ b/rpcsx-gpu2/lib/gnm/include/gnm/gnm.hpp
@@ -252,5 +252,56 @@ constexpr ZFormat getZFormat(DataFormat dfmt) {
 
 constexpr StencilFormat getStencilFormat(DataFormat dfmt) {
   return dfmt == kDataFormat8 ? kStencil8 : kStencilInvalid;
 }
-} // namespace gnm
+constexpr DataFormat getDataFormat(ZFormat format) {
+  switch (format) {
+  case kZFormat32Float:
+    return kDataFormat32;
+  case kZFormat16:
+    return kDataFormat16;
+
+  case kZFormatInvalid:
+    break;
+  }
+
+  return kDataFormatInvalid;
+}
+
+constexpr NumericFormat getNumericFormat(ZFormat format) {
+  switch (format) {
+  case kZFormat32Float:
+    return kNumericFormatFloat;
+  case kZFormat16:
+    return kNumericFormatUInt;
+
+  case kZFormatInvalid:
+    break;
+  }
+
+  return kNumericFormatUNorm;
+}
+
+constexpr DataFormat getDataFormat(StencilFormat format) {
+  switch (format) {
+  case kStencil8:
+    return kDataFormat8;
+
+  case kStencilInvalid:
+    break;
+  }
+
+  return kDataFormatInvalid;
+}
+
+constexpr NumericFormat getNumericFormat(StencilFormat format) {
+  switch (format) {
+  case kStencil8:
+    return kNumericFormatSInt;
+
+  case kStencilInvalid:
+    break;
+  }
+
+  return kNumericFormatUNorm;
+}
+} // namespace gnm
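Because the new ZFormat/StencilFormat mappings are constexpr, they can be exercised at compile time from any translation unit that sees the header. A minimal sketch, assuming the usual "gnm/gnm.hpp" include path used elsewhere in this patch and that the enumerators shown above are plain (unscoped) members of namespace gnm:

  // check_gnm_formats.cpp -- illustrative compile-time checks only
  #include "gnm/gnm.hpp"

  static_assert(gnm::getDataFormat(gnm::kZFormat32Float) == gnm::kDataFormat32);
  static_assert(gnm::getNumericFormat(gnm::kZFormat32Float) == gnm::kNumericFormatFloat);
  static_assert(gnm::getDataFormat(gnm::kZFormat16) == gnm::kDataFormat16);
  static_assert(gnm::getDataFormat(gnm::kStencil8) == gnm::kDataFormat8);
  static_assert(gnm::getDataFormat(gnm::kZFormatInvalid) == gnm::kDataFormatInvalid);

  int main() { return 0; }

This is the same dfmt/nfmt selection Renderer.cpp performs when it builds the depth ImageViewKey from dbZInfo.format with kind = ImageKind::Depth.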