diff --git a/rpcsx-gpu/Cache.cpp b/rpcsx-gpu/Cache.cpp index 161a8edb2..87e9b34b0 100644 --- a/rpcsx-gpu/Cache.cpp +++ b/rpcsx-gpu/Cache.cpp @@ -534,14 +534,14 @@ struct CachedImage : Cache::Entry { regions.reserve(image.getMipLevels()); auto tiledBuffer = - tag.getBuffer(baseAddress, info.totalSize, Access::Write); + tag.getBuffer(baseAddress, info.totalTiledSize, Access::Write); if (isLinear) { for (unsigned mipLevel = 0; mipLevel < image.getMipLevels(); ++mipLevel) { auto ®ionInfo = info.getSubresourceInfo(mipLevel); regions.push_back({ - .bufferOffset = regionInfo.offset, + .bufferOffset = regionInfo.tiledOffset, .bufferRowLength = mipLevel > 0 ? 0 : std::max(info.pitch >> mipLevel, 1u), .imageSubresource = @@ -565,14 +565,11 @@ struct CachedImage : Cache::Entry { tiledBuffer.handle, regions.size(), regions.data()); } else { - auto tiledSize = info.totalSize; - std::uint64_t linearOffset = 0; for (unsigned mipLevel = 0; mipLevel < image.getMipLevels(); ++mipLevel) { auto ®ionInfo = info.getSubresourceInfo(mipLevel); regions.push_back({ - .bufferOffset = linearOffset, - .bufferRowLength = - mipLevel > 0 ? 0 : std::max(info.pitch >> mipLevel, 1u), + .bufferOffset = regionInfo.linearOffset, + .bufferRowLength = regionInfo.linearPitch, .imageSubresource = { .aspectMask = toAspect(kind), @@ -582,18 +579,15 @@ struct CachedImage : Cache::Entry { }, .imageExtent = { - .width = std::max(image.getWidth() >> mipLevel, 1u), - .height = std::max(image.getHeight() >> mipLevel, 1u), - .depth = std::max(image.getDepth() >> mipLevel, 1u), + .width = regionInfo.linearWidth, + .height = regionInfo.linearHeight, + .depth = regionInfo.linearDepth, }, }); - - linearOffset += regionInfo.linearSize * image.getArrayLayers(); } - auto linearSize = linearOffset; auto transferBuffer = vk::Buffer::Allocate( - vk::getDeviceLocalMemory(), linearOffset, + vk::getDeviceLocalMemory(), info.totalLinearSize, VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT); vkCmdCopyImageToBuffer(scheduler.getCommandBuffer(), image.getHandle(), @@ -603,14 +597,11 @@ struct CachedImage : Cache::Entry { auto &tiler = tag.getDevice()->tiler; - linearOffset = 0; for (unsigned mipLevel = 0; mipLevel < image.getMipLevels(); ++mipLevel) { - auto ®ionInfo = info.getSubresourceInfo(mipLevel); - tiler.tile(scheduler, info, acquiredTileMode, acquiredDfmt, - transferBuffer.getAddress() + linearOffset, - linearSize - linearOffset, tiledBuffer.deviceAddress, - tiledSize, mipLevel, 0, image.getArrayLayers()); - linearOffset += regionInfo.linearSize * image.getArrayLayers(); + tiler.tile(scheduler, info, acquiredTileMode, + transferBuffer.getAddress(), info.totalLinearSize, + tiledBuffer.deviceAddress, info.totalTiledSize, mipLevel, 0, + image.getArrayLayers()); } scheduler.afterSubmit([transferBuffer = std::move(transferBuffer)] {}); @@ -1157,7 +1148,7 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { VkBuffer sourceBuffer; auto tiledBuffer = - getBuffer(key.readAddress, surfaceInfo.totalSize, Access::Read); + getBuffer(key.readAddress, surfaceInfo.totalTiledSize, Access::Read); if (isLinear) { sourceBuffer = tiledBuffer.handle; @@ -1165,7 +1156,7 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { mipLevel < key.baseMipLevel + key.mipCount; ++mipLevel) { auto &info = surfaceInfo.getSubresourceInfo(mipLevel); regions.push_back({ - .bufferOffset = info.offset, + .bufferOffset = info.tiledOffset, .bufferRowLength = mipLevel > 0 ? 0 : std::max(key.pitch >> mipLevel, 1u), .imageSubresource = @@ -1186,15 +1177,13 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { } else { auto &tiler = mParent->mDevice->tiler; - std::uint64_t linearOffset = 0; for (unsigned mipLevel = key.baseMipLevel; mipLevel < key.baseMipLevel + key.mipCount; ++mipLevel) { auto &info = surfaceInfo.getSubresourceInfo(mipLevel); regions.push_back({ - .bufferOffset = linearOffset, - .bufferRowLength = - mipLevel > 0 ? 0 : std::max(key.pitch >> mipLevel, 1u), + .bufferOffset = info.linearOffset, + .bufferRowLength = info.linearPitch, .imageSubresource = { .aspectMask = toAspect(key.kind), @@ -1204,37 +1193,29 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { }, .imageExtent = { - .width = std::max(key.extent.width >> mipLevel, 1u), - .height = std::max(key.extent.height >> mipLevel, 1u), - .depth = std::max(key.extent.depth >> mipLevel, 1u), + .width = info.linearWidth, + .height = info.linearHeight, + .depth = info.linearDepth, }, }); - - linearOffset += info.linearSize * key.arrayLayerCount; } - auto detiledSize = linearOffset; - - auto detiledBuffer = - vk::Buffer::Allocate(vk::getDeviceLocalMemory(), detiledSize, - VK_BUFFER_USAGE_2_TRANSFER_DST_BIT_KHR | - VK_BUFFER_USAGE_2_TRANSFER_SRC_BIT_KHR); + auto detiledBuffer = vk::Buffer::Allocate( + vk::getDeviceLocalMemory(), surfaceInfo.totalLinearSize, + VK_BUFFER_USAGE_2_TRANSFER_DST_BIT_KHR | + VK_BUFFER_USAGE_2_TRANSFER_SRC_BIT_KHR); sourceBuffer = detiledBuffer.getHandle(); - std::uint64_t dstAddress = detiledBuffer.getAddress(); + auto linearAddress = detiledBuffer.getAddress(); mScheduler->afterSubmit([detiledBuffer = std::move(detiledBuffer)] {}); for (unsigned mipLevel = key.baseMipLevel; mipLevel < key.baseMipLevel + key.mipCount; ++mipLevel) { - auto &info = surfaceInfo.getSubresourceInfo(mipLevel); - - tiler.detile(*mScheduler, surfaceInfo, key.tileMode, key.dfmt, - tiledBuffer.deviceAddress, surfaceInfo.totalSize, - dstAddress, detiledSize, mipLevel, 0, key.arrayLayerCount); - - detiledSize -= info.linearSize * key.arrayLayerCount; - dstAddress += info.linearSize * key.arrayLayerCount; + tiler.detile(*mScheduler, surfaceInfo, key.tileMode, + tiledBuffer.deviceAddress, surfaceInfo.totalTiledSize, + linearAddress, surfaceInfo.totalLinearSize, mipLevel, 0, + key.arrayLayerCount); } } diff --git a/rpcsx-gpu/lib/amdgpu-tiler/include/amdgpu/tiler.hpp b/rpcsx-gpu/lib/amdgpu-tiler/include/amdgpu/tiler.hpp index 9d875c4de..2c745f6c2 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/include/amdgpu/tiler.hpp +++ b/rpcsx-gpu/lib/amdgpu-tiler/include/amdgpu/tiler.hpp @@ -115,27 +115,27 @@ struct TileMode { constexpr TileMode &arrayMode(ArrayMode mode) { raw = (raw & ~0x0000003c) | - (static_cast(mode) << 2) & 0x0000003c; + ((static_cast(mode) << 2) & 0x0000003c); return *this; } constexpr TileMode &pipeConfig(PipeConfig mode) { raw = (raw & ~0x000007c0) | - (static_cast(mode) << 6) & 0x000007c0; + ((static_cast(mode) << 6) & 0x000007c0); return *this; } constexpr TileMode &tileSplit(TileSplit mode) { raw = (raw & ~0x00003800) | - (static_cast(mode) << 11) & 0x00003800; + ((static_cast(mode) << 11) & 0x00003800); return *this; } constexpr TileMode µTileMode(MicroTileMode mode) { raw = (raw & ~0x01c00000) | - (static_cast(mode) << 22) & 0x01c00000; + ((static_cast(mode) << 22) & 0x01c00000); return *this; } constexpr TileMode &sampleSplit(SampleSplit mode) { raw = (raw & ~0x06000000) | - (static_cast(mode) << 25) & 0x06000000; + ((static_cast(mode) << 25) & 0x06000000); return *this; } }; @@ -166,17 +166,24 @@ struct SurfaceInfo { std::uint32_t height; std::uint32_t depth; std::uint32_t pitch; + MacroTileMode macroTileMode; int arrayLayerCount; int numFragments; int bitsPerElement; - std::uint64_t totalSize; + std::uint64_t totalTiledSize; + std::uint64_t totalLinearSize; struct SubresourceInfo { - std::uint32_t dataWidth; - std::uint32_t dataHeight; - std::uint32_t dataDepth; - std::uint64_t offset; + std::uint32_t tiledWidth; + std::uint32_t tiledHeight; + std::uint32_t tiledDepth; + std::uint64_t tiledOffset; std::uint64_t tiledSize; + std::uint32_t linearPitch; + std::uint32_t linearWidth; + std::uint32_t linearHeight; + std::uint32_t linearDepth; + std::uint64_t linearOffset; std::uint64_t linearSize; }; @@ -524,5 +531,4 @@ SurfaceInfo computeSurfaceInfo(TileMode tileMode, gnm::TextureType type, std::uint32_t pitch, int baseArrayLayer, int arrayCount, int baseMipLevel, int mipCount, bool pow2pad); -SurfaceInfo computeSurfaceInfo(const gnm::TBuffer &tbuffer, TileMode tileMode); } // namespace amdgpu diff --git a/rpcsx-gpu/lib/amdgpu-tiler/include/amdgpu/tiler_vulkan.hpp b/rpcsx-gpu/lib/amdgpu-tiler/include/amdgpu/tiler_vulkan.hpp index 8fae711e3..bf830c96a 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/include/amdgpu/tiler_vulkan.hpp +++ b/rpcsx-gpu/lib/amdgpu-tiler/include/amdgpu/tiler_vulkan.hpp @@ -11,15 +11,14 @@ struct GpuTiler { ~GpuTiler(); void detile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info, - amdgpu::TileMode tileMode, gnm::DataFormat dfmt, - std::uint64_t srcTiledAddress, std::uint64_t srcSize, - std::uint64_t dstLinearAddress, std::uint64_t dstSize, - int mipLevel, int baseArray, int arrayCount); + amdgpu::TileMode tileMode, std::uint64_t srcTiledAddress, + std::uint64_t srcSize, std::uint64_t dstLinearAddress, + std::uint64_t dstSize, int mipLevel, int baseArray, + int arrayCount); void tile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info, - amdgpu::TileMode tileMode, gnm::DataFormat dfmt, - std::uint64_t srcLinearAddress, std::uint64_t srcSize, - std::uint64_t dstTiledAddress, std::uint64_t dstSize, int mipLevel, - int baseArray, int arrayCount); + amdgpu::TileMode tileMode, std::uint64_t srcLinearAddress, + std::uint64_t srcSize, std::uint64_t dstTiledAddress, + std::uint64_t dstSize, int mipLevel, int baseArray, int arrayCount); private: std::unique_ptr mImpl; diff --git a/rpcsx-gpu/lib/amdgpu-tiler/shaders/detiler1d.comp.glsl b/rpcsx-gpu/lib/amdgpu-tiler/shaders/detiler1d.comp.glsl index bc4eaa265..9af4667c6 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/shaders/detiler1d.comp.glsl +++ b/rpcsx-gpu/lib/amdgpu-tiler/shaders/detiler1d.comp.glsl @@ -37,8 +37,8 @@ void main() { uint64_t linearByteOffset = computeLinearElementByteOffset( pos, 0, - config.dataSize.x, - config.dataSize.x * config.dataSize.y, + config.linearDataSize.x, + config.linearDataSize.x * config.linearDataSize.y, config.bitsPerElement, 1 << config.numFragments ); @@ -63,6 +63,9 @@ void main() { switch (bpp) { case 1: + // buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data; + // break; + case 2: buffer_reference_uint16_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint16_t(config.srcAddress + tiledByteOffset).data; break; diff --git a/rpcsx-gpu/lib/amdgpu-tiler/shaders/detiler2d.comp.glsl b/rpcsx-gpu/lib/amdgpu-tiler/shaders/detiler2d.comp.glsl index d6eb6a44f..d24d12647 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/shaders/detiler2d.comp.glsl +++ b/rpcsx-gpu/lib/amdgpu-tiler/shaders/detiler2d.comp.glsl @@ -1,5 +1,7 @@ #version 460 +#define DEBUG + #extension GL_GOOGLE_include_directive : enable #extension GL_EXT_shader_explicit_arithmetic_types : enable #extension GL_EXT_shader_atomic_int64 : enable @@ -32,12 +34,12 @@ void main() { } uint64_t tiledByteOffset = getTiledBitOffset2D( - config.dfmt, config.tileMode, config.macroTileMode, config.dataSize, arraySlice, config.numFragments, + config.bitsPerElement, pos, fragmentIndex ) / 8; @@ -47,8 +49,8 @@ void main() { uint64_t linearByteOffset = computeLinearElementByteOffset( pos, 0, - config.dataSize.x, - config.dataSize.x * config.dataSize.y, + config.linearDataSize.x, + config.linearDataSize.x * config.linearDataSize.y, config.bitsPerElement, 1 << config.numFragments ); @@ -57,6 +59,10 @@ void main() { uint32_t bpp = (config.bitsPerElement + 7) / 8; + if (bpp == 1 && (linearByteOffset & 1) != 0) { + return; + } + #ifdef DEBUG if (config.srcAddress + tiledByteOffset + bpp > config.srcEndAddress) { debugPrintfEXT("detiler2d: out of src buffer %d x %d x %d", pos.x, pos.y, pos.z); @@ -71,9 +77,8 @@ void main() { switch (bpp) { case 1: - buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data; - break; - + // buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data; + // break; case 2: buffer_reference_uint16_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint16_t(config.srcAddress + tiledByteOffset).data; break; diff --git a/rpcsx-gpu/lib/amdgpu-tiler/shaders/detilerLinear.comp.glsl b/rpcsx-gpu/lib/amdgpu-tiler/shaders/detilerLinear.comp.glsl index c27806826..aa366ecc1 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/shaders/detilerLinear.comp.glsl +++ b/rpcsx-gpu/lib/amdgpu-tiler/shaders/detilerLinear.comp.glsl @@ -36,19 +36,22 @@ void main() { uint64_t linearByteOffset = computeLinearElementByteOffset( pos, 0, - config.dataSize.x, - config.dataSize.x * config.dataSize.y, + config.linearDataSize.x, + config.linearDataSize.x * config.linearDataSize.y, config.bitsPerElement, 1 << config.numFragments ); linearByteOffset += linearSliceOffset; - switch ((config.bitsPerElement + 7) / 8) { - case 1: - buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data; - break; + uint32_t bpp = (config.bitsPerElement + 7) / 8; + if (bpp == 1 && (linearByteOffset & 1) != 0) { + return; + } + + switch (bpp) { + case 1: case 2: buffer_reference_uint16_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint16_t(config.srcAddress + tiledByteOffset).data; break; diff --git a/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler.glsl b/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler.glsl index 29be1b02d..8f78a8da4 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler.glsl +++ b/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler.glsl @@ -1,7 +1,5 @@ #define FOR_ALL_BASE_TYPES(OP) \ - OP(int8_t) \ - OP(uint8_t) \ OP(int16_t) \ OP(uint16_t) \ OP(float16_t) \ @@ -785,11 +783,8 @@ uint64_t getTiledBitOffset1D(uint32_t tileMode, uvec3 pos, uvec2 dataSize, uint3 } -uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTileMode, - uvec2 dataSize, int arraySlice, uint32_t numFragments, u32vec3 pos, int fragmentIndex) { - uint32_t bitsPerFragment = getBitsPerElement(dfmt); - - bool isBlockCompressed = getTexelsPerElement(dfmt) > 1; +uint64_t getTiledBitOffset2D(uint32_t tileMode, uint32_t macroTileMode, + uvec2 dataSize, int arraySlice, uint32_t numFragments, uint32_t bitsPerElement, u32vec3 pos, int fragmentIndex) { uint32_t tileSwizzleMask = 0; uint32_t numFragmentsPerPixel = 1 << numFragments; uint32_t arrayMode = tileMode_getArrayMode(tileMode); @@ -820,7 +815,6 @@ uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTil break; } - uint32_t bitsPerElement = bitsPerFragment; uint32_t paddedWidth = dataSize.x; uint32_t paddedHeight = dataSize.y; @@ -849,7 +843,8 @@ uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTil uint32_t tileSplitBytes = min(kDramRowSize, tileSplitC); - uint32_t numPipes = getPipeCount(tileMode_getPipeConfig(tileMode)); + uint32_t pipeConfig = tileMode_getPipeConfig(tileMode); + uint32_t numPipes = getPipeCount(pipeConfig); uint32_t pipeInterleaveBits = findLSB(kPipeInterleaveBytes); uint32_t pipeInterleaveMask = (1 << pipeInterleaveBits) - 1; uint32_t pipeBits = findLSB(numPipes); @@ -873,7 +868,7 @@ uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTil xh %= macroTileWidth; yh %= macroTileHeight; } - uint64_t pipe = getPipeIndex(xh, yh, tileMode_getPipeConfig(tileMode)); + uint64_t pipe = getPipeIndex(xh, yh, pipeConfig); uint64_t bank = getBankIndex(xh, yh, bankWidth, bankHeight, numBanks, numPipes); @@ -989,16 +984,15 @@ uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTil return (finalByteOffset << 3) | bitOffset; } - layout(push_constant) uniform Config { uint64_t srcAddress; uint64_t srcEndAddress; uint64_t dstAddress; uint64_t dstEndAddress; uvec2 dataSize; + uvec2 linearDataSize; uint32_t tileMode; uint32_t macroTileMode; - uint32_t dfmt; uint32_t numFragments; uint32_t bitsPerElement; uint32_t tiledSurfaceSize; diff --git a/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler1d.comp.glsl b/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler1d.comp.glsl index 30a092f90..439b5dbdf 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler1d.comp.glsl +++ b/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler1d.comp.glsl @@ -40,8 +40,8 @@ void main() { uint64_t linearByteOffset = computeLinearElementByteOffset( pos, 0, - config.dataSize.x, - config.dataSize.x * config.dataSize.y, + config.linearDataSize.x, + config.linearDataSize.x * config.linearDataSize.y, config.bitsPerElement, 1 << config.numFragments ); @@ -69,6 +69,8 @@ void main() { switch (bpp) { case 1: + // buffer_reference_uint8_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint8_t(config.srcAddress + linearByteOffset).data; + // break; case 2: buffer_reference_uint16_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint16_t(config.srcAddress + linearByteOffset).data; break; diff --git a/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler2d.comp.glsl b/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler2d.comp.glsl index 074ff0aed..75223be6b 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler2d.comp.glsl +++ b/rpcsx-gpu/lib/amdgpu-tiler/shaders/tiler2d.comp.glsl @@ -1,4 +1,5 @@ #version 460 +#define DEBUG #extension GL_GOOGLE_include_directive : enable #extension GL_EXT_shader_explicit_arithmetic_types : enable @@ -31,12 +32,12 @@ void main() { } uint64_t tiledByteOffset = getTiledBitOffset2D( - config.dfmt, config.tileMode, config.macroTileMode, config.dataSize, arraySlice, config.numFragments, + config.bitsPerElement, pos, fragmentIndex ) / 8; @@ -46,8 +47,8 @@ void main() { uint64_t linearByteOffset = computeLinearElementByteOffset( pos, 0, - config.dataSize.x, - config.dataSize.x * config.dataSize.y, + config.linearDataSize.x, + config.linearDataSize.x * config.linearDataSize.y, config.bitsPerElement, 1 << config.numFragments ); @@ -55,24 +56,27 @@ void main() { linearByteOffset += linearSliceOffset; uint32_t bpp = (config.bitsPerElement + 7) / 8; + if (bpp == 1 && (linearByteOffset & 1) != 0) { + return; + } #ifdef DEBUG if (config.srcAddress + linearByteOffset + bpp > config.srcEndAddress) { - debugPrintfEXT("tiler2d: out of src buffer %d x %d x %d", pos.x, pos.y, pos.z); + debugPrintfEXT("tiler2d: out of src buffer %d x %d x %d, src offset: %lu, src size: %lu", pos.x, pos.y, pos.z, + linearByteOffset, config.srcEndAddress - config.srcAddress); return; } if (config.dstAddress + tiledByteOffset + bpp > config.dstEndAddress) { - debugPrintfEXT("tiler2d: out of dst buffer %d x %d x %d", pos.x, pos.y, pos.z); + debugPrintfEXT("tiler2d: out of dst buffer %d x %d x %d, offset %lx, size %lx", pos.x, pos.y, pos.z, tiledByteOffset, config.dstEndAddress - config.dstAddress); return; } #endif switch (bpp) { case 1: - buffer_reference_uint8_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint8_t(config.srcAddress + linearByteOffset).data; - break; - + // buffer_reference_uint8_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint8_t(config.srcAddress + linearByteOffset).data; + // break; case 2: buffer_reference_uint16_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint16_t(config.srcAddress + linearByteOffset).data; break; diff --git a/rpcsx-gpu/lib/amdgpu-tiler/shaders/tilerLinear.comp.glsl b/rpcsx-gpu/lib/amdgpu-tiler/shaders/tilerLinear.comp.glsl index 35013d57b..f55b630ec 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/shaders/tilerLinear.comp.glsl +++ b/rpcsx-gpu/lib/amdgpu-tiler/shaders/tilerLinear.comp.glsl @@ -36,19 +36,21 @@ void main() { uint64_t linearByteOffset = computeLinearElementByteOffset( pos, 0, - config.dataSize.x, - config.dataSize.x * config.dataSize.y, + config.linearDataSize.x, + config.linearDataSize.x * config.linearDataSize.y, config.bitsPerElement, 1 << config.numFragments ); linearByteOffset += linearSliceOffset; - switch ((config.bitsPerElement + 7) / 8) { - case 1: - buffer_reference_uint8_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint8_t(config.srcAddress + linearByteOffset).data; - break; + uint32_t bpp = (config.bitsPerElement + 7) / 8; + if (bpp == 1 && (linearByteOffset & 1) != 0) { + return; + } + switch (bpp) { + case 1: case 2: buffer_reference_uint16_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint16_t(config.srcAddress + linearByteOffset).data; break; diff --git a/rpcsx-gpu/lib/amdgpu-tiler/src/tiler.cpp b/rpcsx-gpu/lib/amdgpu-tiler/src/tiler.cpp index e99f0518c..5091a3707 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/src/tiler.cpp +++ b/rpcsx-gpu/lib/amdgpu-tiler/src/tiler.cpp @@ -5,9 +5,8 @@ using namespace amdgpu; -// FIXME: should be properly implemented static SurfaceInfo -computeTexture2dInfo(ArrayMode arrayMode, gnm::TextureType type, +computeTexture2dInfo(TileMode tileMode, gnm::TextureType type, gnm::DataFormat dfmt, std::uint32_t width, std::uint32_t height, std::uint32_t depth, std::uint32_t pitch, int baseArrayLayer, int arrayCount, @@ -32,7 +31,7 @@ computeTexture2dInfo(ArrayMode arrayMode, gnm::TextureType type, auto numFragmentsPerPixel = 1 << numFragments; auto isBlockCompressed = getTexelsPerElement(dfmt) > 1; - auto bitsPerElement = bitsPerFragment; + std::uint32_t bitsPerElement = bitsPerFragment; depth = isVolume ? depth : 1; if (isBlockCompressed) { @@ -60,6 +59,10 @@ computeTexture2dInfo(ArrayMode arrayMode, gnm::TextureType type, std::uint64_t surfaceOffset = 0; std::uint64_t surfaceSize = 0; + std::uint64_t linearOffset = 0; + + auto macroTileMode = getDefaultMacroTileModes()[computeMacroTileIndex( + tileMode, bitsPerElement, 1 << numFragments)]; SurfaceInfo result; result.width = width; @@ -69,8 +72,10 @@ computeTexture2dInfo(ArrayMode arrayMode, gnm::TextureType type, result.numFragments = numFragments; result.bitsPerElement = bitsPerElement; result.arrayLayerCount = arraySliceCount; + result.macroTileMode = macroTileMode; - auto thickness = getMicroTileThickness(arrayMode); + auto arrayMode = tileMode.arrayMode(); + auto numPipes = getPipeCount(tileMode.pipeConfig()); for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) { std::uint32_t elemWidth = std::max(width >> mipLevel, 1); @@ -116,55 +121,91 @@ computeTexture2dInfo(ArrayMode arrayMode, gnm::TextureType type, linearPitch = linearWidth; } - std::uint32_t paddedPitch = - (linearPitch + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1); - std::uint32_t paddedHeight = - (linearHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1); - std::uint32_t paddedDepth = linearDepth; + auto thickness = getMicroTileThickness(arrayMode); - if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) { - if (isCubemap) { - linearDepth = std::bit_ceil(linearDepth); - } + uint32_t numBanks = 2 << macroTileMode.numBanks(); + uint32_t macroAspect = 1 << macroTileMode.macroTileAspect(); + uint32_t tileBytes1x = + (thickness * bitsPerElement * kMicroTileWidth * kMicroTileHeight + 7) / + 8; + auto microTileMode = tileMode.microTileMode(); + uint32_t tileSplit = + (microTileMode == kMicroTileModeDepth) + ? (64 << tileMode.sampleSplit()) + : std::max(256U, (1 << tileMode.sampleSplit()) * tileBytes1x); + uint32_t tileSplitC = std::min(kDramRowSize, tileSplit); + uint32_t bankWidth = 1 << macroTileMode.bankWidth(); + uint32_t bankHeight = 1 << macroTileMode.bankHeight(); - paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1); + uint32_t tileSize = std::min( + tileSplitC, (thickness * bitsPerElement * numFragmentsPerPixel * + kMicroTileWidth * kMicroTileHeight + + 7) / + 8); + uint32_t bankHeightAlign = + std::max(1U, kPipeInterleaveBytes / (tileSize * bankWidth)); + + bankHeight = (bankHeight + bankHeightAlign - 1) & ~(bankHeightAlign - 1); + + if (numFragmentsPerPixel == 1) { + uint32_t macroAspectAlign = std::max( + 1U, kPipeInterleaveBytes / (tileSize * numPipes * bankWidth)); + macroAspect = + (macroAspect + macroAspectAlign - 1) & ~(macroAspectAlign - 1); } - std::uint32_t tempPitch = paddedPitch; - std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) * - paddedHeight * bitsPerElement * - numFragmentsPerPixel; + auto depthAlign = thickness; + + // FIXME: rotate tile mode for mipLevel > 0 + + uint32_t outPitch = linearPitch; + uint32_t outHeight = linearHeight; + uint32_t outDepth = linearDepth; + + uint32_t macroTileWidth = + kMicroTileWidth * bankWidth * numPipes * macroAspect; + uint32_t macroTileHeight = + kMicroTileHeight * bankHeight * numBanks / macroAspect; + + uint32_t heightAlign = macroTileHeight; + auto pitchAlign = macroTileWidth; + + outPitch = (outPitch + pitchAlign - 1) & ~(pitchAlign - 1); + outDepth = (outDepth + depthAlign - 1) & ~(depthAlign - 1); + outHeight = (outHeight + heightAlign - 1) & ~(heightAlign - 1); + + std::uint64_t logicalSliceSizeBytes = std::uint64_t(outPitch) * outHeight * + bitsPerElement * numFragmentsPerPixel; logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8; - uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness; - while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) { - tempPitch += kMicroTileWidth; - logicalSliceSizeBytes = std::uint64_t(tempPitch) * paddedHeight * - bitsPerElement * numFragmentsPerPixel; - logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8; - physicalSliceSizeBytes = logicalSliceSizeBytes * thickness; - } + surfaceSize = static_cast(outPitch) * outHeight * + std::bit_ceil(bitsPerElement) * numFragmentsPerPixel; + surfaceSize = (surfaceSize + 7) / 8; - surfaceSize = logicalSliceSizeBytes * paddedDepth; - auto linearSize = - linearDepth * - (linearPitch * linearHeight * bitsPerElement * numFragmentsPerPixel + - 7) / - 8; + auto linearSize = uint64_t(linearPitch) * linearHeight * bitsPerElement * + numFragmentsPerPixel; + linearSize = linearDepth * ((linearSize + 7) / 8); result.setSubresourceInfo(mipLevel, { - .dataWidth = linearPitch, - .dataHeight = linearHeight, - .dataDepth = linearDepth, - .offset = surfaceOffset, + .tiledWidth = outPitch, + .tiledHeight = outHeight, + .tiledDepth = outDepth, + .tiledOffset = surfaceOffset, .tiledSize = surfaceSize, + .linearPitch = linearPitch, + .linearWidth = linearWidth, + .linearHeight = linearHeight, + .linearDepth = linearDepth, + .linearOffset = linearOffset, .linearSize = linearSize, }); + linearOffset += arraySliceCount * linearSize; surfaceOffset += arraySliceCount * surfaceSize; } - result.totalSize = surfaceOffset; + result.totalTiledSize = surfaceOffset; + result.totalLinearSize = linearOffset; return result; } @@ -222,6 +263,7 @@ computeTexture1dInfo(ArrayMode arrayMode, gnm::TextureType type, std::uint64_t surfaceOffset = 0; std::uint64_t surfaceSize = 0; + std::uint64_t linearOffset = 0; SurfaceInfo result; result.width = width; @@ -308,25 +350,30 @@ computeTexture1dInfo(ArrayMode arrayMode, gnm::TextureType type, } surfaceSize = logicalSliceSizeBytes * paddedDepth; - auto linearSize = - linearDepth * - (linearPitch * linearHeight * bitsPerElement * numFragmentsPerPixel + - 7) / - 8; + auto linearSize = uint64_t(linearPitch) * linearHeight * bitsPerElement * + numFragmentsPerPixel; + linearSize = linearDepth * ((linearSize + 7) / 8); result.setSubresourceInfo(mipLevel, { - .dataWidth = linearPitch, - .dataHeight = linearHeight, - .dataDepth = linearDepth, - .offset = surfaceOffset, + .tiledWidth = linearPitch, + .tiledHeight = linearHeight, + .tiledDepth = linearDepth, + .tiledOffset = surfaceOffset, .tiledSize = surfaceSize, + .linearPitch = linearPitch, + .linearWidth = linearWidth, + .linearHeight = linearHeight, + .linearDepth = linearDepth, + .linearOffset = linearOffset, .linearSize = linearSize, }); surfaceOffset += arraySliceCount * surfaceSize; + linearOffset += arraySliceCount * linearSize; } - result.totalSize = surfaceOffset; + result.totalTiledSize = surfaceOffset; + result.totalLinearSize = linearOffset; return result; } @@ -383,6 +430,7 @@ static SurfaceInfo computeTextureLinearInfo( std::uint64_t surfaceOffset = 0; std::uint64_t surfaceSize = 0; + std::uint64_t linearOffset = 0; SurfaceInfo result; result.width = width; @@ -437,20 +485,25 @@ static SurfaceInfo computeTextureLinearInfo( linearPitch = linearWidth; } - if (arrayMode == kArrayModeLinearGeneral) { - surfaceSize = (static_cast(linearPitch) * - (linearHeight)*bitsPerElement * numFragmentsPerPixel + - 7) / - 8; - surfaceSize *= linearDepth; + auto linearSize = static_cast(linearPitch) * + (linearHeight)*bitsPerElement * numFragmentsPerPixel; + linearSize = linearDepth * ((linearSize + 7) / 8); + + if (arrayMode == kArrayModeLinearGeneral) { + surfaceSize = linearSize; result.setSubresourceInfo(mipLevel, { - .dataWidth = linearPitch, - .dataHeight = linearHeight, - .dataDepth = linearDepth, - .offset = surfaceOffset, + .tiledWidth = linearPitch, + .tiledHeight = linearHeight, + .tiledDepth = linearDepth, + .tiledOffset = surfaceOffset, .tiledSize = surfaceSize, - .linearSize = surfaceSize, + .linearPitch = linearPitch, + .linearWidth = linearWidth, + .linearHeight = linearHeight, + .linearDepth = linearDepth, + .linearOffset = linearOffset, + .linearSize = linearSize, }); } else { if (mipLevel > 0 && pitch > 0) { @@ -487,19 +540,26 @@ static SurfaceInfo computeTextureLinearInfo( surfaceSize = (pixelsPerSlice * bitsPerElement + 7) / 8 * paddedDepth; result.setSubresourceInfo(mipLevel, { - .dataWidth = paddedPitch, - .dataHeight = paddedHeight, - .dataDepth = paddedDepth, - .offset = surfaceOffset, + .tiledWidth = paddedPitch, + .tiledHeight = paddedHeight, + .tiledDepth = paddedDepth, + .tiledOffset = surfaceOffset, .tiledSize = surfaceSize, - .linearSize = surfaceSize, + .linearPitch = linearPitch, + .linearWidth = linearWidth, + .linearHeight = linearHeight, + .linearDepth = linearDepth, + .linearOffset = linearOffset, + .linearSize = linearSize, }); } surfaceOffset += arraySliceCount * surfaceSize; + surfaceOffset += arraySliceCount * linearSize; } - result.totalSize = surfaceOffset; + result.totalTiledSize = surfaceOffset; + result.totalLinearSize = linearOffset; return result; } @@ -533,20 +593,10 @@ SurfaceInfo amdgpu::computeSurfaceInfo( case kArrayMode2dTiledThickPrt: case kArrayMode3dTiledThinPrt: case kArrayMode3dTiledThickPrt: - return computeTexture2dInfo(tileMode.arrayMode(), type, dfmt, width, height, - depth, pitch, baseArrayLayer, arrayCount, - baseMipLevel, mipCount, pow2pad); + return computeTexture2dInfo(tileMode, type, dfmt, width, height, depth, + pitch, baseArrayLayer, arrayCount, baseMipLevel, + mipCount, pow2pad); } std::abort(); } - -SurfaceInfo amdgpu::computeSurfaceInfo(const gnm::TBuffer &tbuffer, - TileMode tileMode) { - return computeSurfaceInfo( - tileMode, tbuffer.type, tbuffer.dfmt, tbuffer.width + 1, - tbuffer.height + 1, tbuffer.depth + 1, tbuffer.pitch + 1, - tbuffer.base_array, tbuffer.last_array - tbuffer.base_array + 1, - tbuffer.base_level, tbuffer.last_level - tbuffer.base_level + 1, - tbuffer.pow2pad != 0); -} diff --git a/rpcsx-gpu/lib/amdgpu-tiler/src/tiler_vulkan.cpp b/rpcsx-gpu/lib/amdgpu-tiler/src/tiler_vulkan.cpp index e3ad4dc8e..706ab8b9b 100644 --- a/rpcsx-gpu/lib/amdgpu-tiler/src/tiler_vulkan.cpp +++ b/rpcsx-gpu/lib/amdgpu-tiler/src/tiler_vulkan.cpp @@ -21,9 +21,10 @@ struct Config { uint64_t dstEndAddress; uint32_t dataWidth; uint32_t dataHeight; + uint32_t linearDataWidth; + uint32_t linearDataHeight; uint32_t tileMode; uint32_t macroTileMode; - uint32_t dfmt; uint32_t numFragments; uint32_t bitsPerElement; uint32_t tiledSurfaceSize; @@ -66,7 +67,7 @@ struct TilerShader { struct amdgpu::GpuTiler::Impl { TilerShader detilerLinear{spirv_detilerLinear_comp}; TilerShader detiler1d{spirv_detiler1d_comp}; - TilerShader detiler2d{spirv_detilerLinear_comp}; + TilerShader detiler2d{spirv_detiler2d_comp}; TilerShader tilerLinear{spirv_tiler2d_comp}; TilerShader tiler1d{spirv_tiler1d_comp}; TilerShader tiler2d{spirv_tiler2d_comp}; @@ -98,29 +99,28 @@ struct amdgpu::GpuTiler::Impl { amdgpu::GpuTiler::GpuTiler() { mImpl = std::make_unique(); } amdgpu::GpuTiler::~GpuTiler() = default; -void amdgpu::GpuTiler::detile(Scheduler &scheduler, - const amdgpu::SurfaceInfo &info, - amdgpu::TileMode tileMode, gnm::DataFormat dfmt, - std::uint64_t srcTiledAddress, - std::uint64_t srcSize, - std::uint64_t dstLinearAddress, - std::uint64_t dstSize, int mipLevel, - int baseArray, int arrayCount) { +void amdgpu::GpuTiler::detile( + Scheduler &scheduler, const amdgpu::SurfaceInfo &info, + amdgpu::TileMode tileMode, std::uint64_t srcTiledAddress, + std::uint64_t srcSize, std::uint64_t dstLinearAddress, + std::uint64_t dstSize, int mipLevel, int baseArray, int arrayCount) { auto commandBuffer = scheduler.getCommandBuffer(); Config config{}; auto &subresource = info.getSubresourceInfo(mipLevel); - config.srcAddress = srcTiledAddress + subresource.offset; + config.srcAddress = srcTiledAddress + subresource.tiledOffset + + baseArray * subresource.tiledSize; config.srcEndAddress = srcTiledAddress + srcSize; - config.dstAddress = dstLinearAddress; + config.dstAddress = dstLinearAddress + subresource.linearOffset + + baseArray * subresource.linearSize; config.dstEndAddress = dstLinearAddress + dstSize; - config.dataWidth = subresource.dataWidth; - config.dataHeight = subresource.dataHeight; + config.dataWidth = subresource.tiledWidth; + config.dataHeight = subresource.tiledHeight; config.tileMode = tileMode.raw; - config.dfmt = dfmt; + config.macroTileMode = info.macroTileMode.raw; config.numFragments = info.numFragments; config.bitsPerElement = info.bitsPerElement; - uint32_t groupCountZ = subresource.dataDepth; + uint32_t groupCountZ = subresource.tiledDepth; if (arrayCount > 1) { config.tiledSurfaceSize = subresource.tiledSize; @@ -131,6 +131,9 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler, config.linearSurfaceSize = 0; } + config.linearDataWidth = subresource.linearPitch; + config.linearDataHeight = subresource.linearHeight; + VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT}; switch (tileMode.arrayMode()) { @@ -157,46 +160,39 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler, case amdgpu::kArrayMode3dTiledThick: case amdgpu::kArrayMode3dTiledXThick: case amdgpu::kArrayMode3dTiledThickPrt: - config.macroTileMode = - getDefaultMacroTileModes()[computeMacroTileIndex( - tileMode, info.bitsPerElement, - 1 << info.numFragments)] - .raw; - - vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler1d.shader); + vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler2d.shader); break; } vkCmdPushConstants(commandBuffer, mImpl->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(config), &config); - vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight, - groupCountZ); + vkCmdDispatch(commandBuffer, subresource.linearWidth, + subresource.linearHeight, groupCountZ); } -void amdgpu::GpuTiler::tile(Scheduler &scheduler, - const amdgpu::SurfaceInfo &info, - amdgpu::TileMode tileMode, gnm::DataFormat dfmt, - std::uint64_t srcLinearAddress, - std::uint64_t srcSize, - std::uint64_t dstTiledAddress, - std::uint64_t dstSize, int mipLevel, int baseArray, - int arrayCount) { +void amdgpu::GpuTiler::tile( + Scheduler &scheduler, const amdgpu::SurfaceInfo &info, + amdgpu::TileMode tileMode, std::uint64_t srcLinearAddress, + std::uint64_t srcSize, std::uint64_t dstTiledAddress, std::uint64_t dstSize, + int mipLevel, int baseArray, int arrayCount) { auto commandBuffer = scheduler.getCommandBuffer(); Config config{}; auto &subresource = info.getSubresourceInfo(mipLevel); - config.srcAddress = srcLinearAddress; + config.srcAddress = srcLinearAddress + subresource.linearOffset + + baseArray * subresource.linearSize; config.srcEndAddress = srcLinearAddress + srcSize; - config.dstAddress = dstTiledAddress + subresource.offset; + config.dstAddress = dstTiledAddress + subresource.tiledOffset + + baseArray * subresource.tiledSize; config.dstEndAddress = dstTiledAddress + dstSize; - config.dataWidth = subresource.dataWidth; - config.dataHeight = subresource.dataHeight; + config.dataWidth = subresource.tiledWidth; + config.dataHeight = subresource.tiledHeight; config.tileMode = tileMode.raw; - config.dfmt = dfmt; + config.macroTileMode = info.macroTileMode.raw; config.numFragments = info.numFragments; config.bitsPerElement = info.bitsPerElement; - uint32_t groupCountZ = subresource.dataDepth; + uint32_t groupCountZ = subresource.tiledDepth; if (arrayCount > 1) { config.tiledSurfaceSize = subresource.tiledSize; @@ -207,6 +203,9 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler, config.linearSurfaceSize = 0; } + config.linearDataWidth = subresource.linearPitch; + config.linearDataHeight = subresource.linearHeight; + VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT}; switch (tileMode.arrayMode()) { @@ -232,18 +231,13 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler, case amdgpu::kArrayMode3dTiledThick: case amdgpu::kArrayMode3dTiledXThick: case amdgpu::kArrayMode3dTiledThickPrt: - config.macroTileMode = - getDefaultMacroTileModes()[computeMacroTileIndex( - tileMode, info.bitsPerElement, - 1 << info.numFragments)] - .raw; - vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler1d.shader); + vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler2d.shader); break; } vkCmdPushConstants(commandBuffer, mImpl->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(config), &config); - vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight, - groupCountZ); + vkCmdDispatch(commandBuffer, subresource.linearWidth, + subresource.linearHeight, groupCountZ); }