rpcsx-gpu: implement 2d tiler

This commit is contained in:
DH 2024-10-05 20:31:03 +03:00
parent 5f23121d33
commit 7bea1e354f
12 changed files with 275 additions and 232 deletions

View file

@ -534,14 +534,14 @@ struct CachedImage : Cache::Entry {
regions.reserve(image.getMipLevels());
auto tiledBuffer =
tag.getBuffer(baseAddress, info.totalSize, Access::Write);
tag.getBuffer(baseAddress, info.totalTiledSize, Access::Write);
if (isLinear) {
for (unsigned mipLevel = 0; mipLevel < image.getMipLevels(); ++mipLevel) {
auto &regionInfo = info.getSubresourceInfo(mipLevel);
regions.push_back({
.bufferOffset = regionInfo.offset,
.bufferOffset = regionInfo.tiledOffset,
.bufferRowLength =
mipLevel > 0 ? 0 : std::max(info.pitch >> mipLevel, 1u),
.imageSubresource =
@ -565,14 +565,11 @@ struct CachedImage : Cache::Entry {
tiledBuffer.handle, regions.size(),
regions.data());
} else {
auto tiledSize = info.totalSize;
std::uint64_t linearOffset = 0;
for (unsigned mipLevel = 0; mipLevel < image.getMipLevels(); ++mipLevel) {
auto &regionInfo = info.getSubresourceInfo(mipLevel);
regions.push_back({
.bufferOffset = linearOffset,
.bufferRowLength =
mipLevel > 0 ? 0 : std::max(info.pitch >> mipLevel, 1u),
.bufferOffset = regionInfo.linearOffset,
.bufferRowLength = regionInfo.linearPitch,
.imageSubresource =
{
.aspectMask = toAspect(kind),
@ -582,18 +579,15 @@ struct CachedImage : Cache::Entry {
},
.imageExtent =
{
.width = std::max(image.getWidth() >> mipLevel, 1u),
.height = std::max(image.getHeight() >> mipLevel, 1u),
.depth = std::max(image.getDepth() >> mipLevel, 1u),
.width = regionInfo.linearWidth,
.height = regionInfo.linearHeight,
.depth = regionInfo.linearDepth,
},
});
linearOffset += regionInfo.linearSize * image.getArrayLayers();
}
auto linearSize = linearOffset;
auto transferBuffer = vk::Buffer::Allocate(
vk::getDeviceLocalMemory(), linearOffset,
vk::getDeviceLocalMemory(), info.totalLinearSize,
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT);
vkCmdCopyImageToBuffer(scheduler.getCommandBuffer(), image.getHandle(),
@ -603,14 +597,11 @@ struct CachedImage : Cache::Entry {
auto &tiler = tag.getDevice()->tiler;
linearOffset = 0;
for (unsigned mipLevel = 0; mipLevel < image.getMipLevels(); ++mipLevel) {
auto &regionInfo = info.getSubresourceInfo(mipLevel);
tiler.tile(scheduler, info, acquiredTileMode, acquiredDfmt,
transferBuffer.getAddress() + linearOffset,
linearSize - linearOffset, tiledBuffer.deviceAddress,
tiledSize, mipLevel, 0, image.getArrayLayers());
linearOffset += regionInfo.linearSize * image.getArrayLayers();
tiler.tile(scheduler, info, acquiredTileMode,
transferBuffer.getAddress(), info.totalLinearSize,
tiledBuffer.deviceAddress, info.totalTiledSize, mipLevel, 0,
image.getArrayLayers());
}
scheduler.afterSubmit([transferBuffer = std::move(transferBuffer)] {});
@ -1157,7 +1148,7 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) {
VkBuffer sourceBuffer;
auto tiledBuffer =
getBuffer(key.readAddress, surfaceInfo.totalSize, Access::Read);
getBuffer(key.readAddress, surfaceInfo.totalTiledSize, Access::Read);
if (isLinear) {
sourceBuffer = tiledBuffer.handle;
@ -1165,7 +1156,7 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) {
mipLevel < key.baseMipLevel + key.mipCount; ++mipLevel) {
auto &info = surfaceInfo.getSubresourceInfo(mipLevel);
regions.push_back({
.bufferOffset = info.offset,
.bufferOffset = info.tiledOffset,
.bufferRowLength =
mipLevel > 0 ? 0 : std::max(key.pitch >> mipLevel, 1u),
.imageSubresource =
@ -1186,15 +1177,13 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) {
} else {
auto &tiler = mParent->mDevice->tiler;
std::uint64_t linearOffset = 0;
for (unsigned mipLevel = key.baseMipLevel;
mipLevel < key.baseMipLevel + key.mipCount; ++mipLevel) {
auto &info = surfaceInfo.getSubresourceInfo(mipLevel);
regions.push_back({
.bufferOffset = linearOffset,
.bufferRowLength =
mipLevel > 0 ? 0 : std::max(key.pitch >> mipLevel, 1u),
.bufferOffset = info.linearOffset,
.bufferRowLength = info.linearPitch,
.imageSubresource =
{
.aspectMask = toAspect(key.kind),
@ -1204,37 +1193,29 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) {
},
.imageExtent =
{
.width = std::max(key.extent.width >> mipLevel, 1u),
.height = std::max(key.extent.height >> mipLevel, 1u),
.depth = std::max(key.extent.depth >> mipLevel, 1u),
.width = info.linearWidth,
.height = info.linearHeight,
.depth = info.linearDepth,
},
});
linearOffset += info.linearSize * key.arrayLayerCount;
}
auto detiledSize = linearOffset;
auto detiledBuffer =
vk::Buffer::Allocate(vk::getDeviceLocalMemory(), detiledSize,
VK_BUFFER_USAGE_2_TRANSFER_DST_BIT_KHR |
VK_BUFFER_USAGE_2_TRANSFER_SRC_BIT_KHR);
auto detiledBuffer = vk::Buffer::Allocate(
vk::getDeviceLocalMemory(), surfaceInfo.totalLinearSize,
VK_BUFFER_USAGE_2_TRANSFER_DST_BIT_KHR |
VK_BUFFER_USAGE_2_TRANSFER_SRC_BIT_KHR);
sourceBuffer = detiledBuffer.getHandle();
std::uint64_t dstAddress = detiledBuffer.getAddress();
auto linearAddress = detiledBuffer.getAddress();
mScheduler->afterSubmit([detiledBuffer = std::move(detiledBuffer)] {});
for (unsigned mipLevel = key.baseMipLevel;
mipLevel < key.baseMipLevel + key.mipCount; ++mipLevel) {
auto &info = surfaceInfo.getSubresourceInfo(mipLevel);
tiler.detile(*mScheduler, surfaceInfo, key.tileMode, key.dfmt,
tiledBuffer.deviceAddress, surfaceInfo.totalSize,
dstAddress, detiledSize, mipLevel, 0, key.arrayLayerCount);
detiledSize -= info.linearSize * key.arrayLayerCount;
dstAddress += info.linearSize * key.arrayLayerCount;
tiler.detile(*mScheduler, surfaceInfo, key.tileMode,
tiledBuffer.deviceAddress, surfaceInfo.totalTiledSize,
linearAddress, surfaceInfo.totalLinearSize, mipLevel, 0,
key.arrayLayerCount);
}
}

View file

@ -115,27 +115,27 @@ struct TileMode {
// Setter for the array-mode field of `raw`: clears the bits under mask
// 0x0000003c (bits 2..5), ORs in `mode` shifted left by 2 and masked with the
// same constant, and returns *this so calls can be chained.
// NOTE(review): this span comes from a rendered diff without +/- markers. The
// two operand lines below look like the before/after forms of a single line;
// they differ only by an outer pair of parentheses, which does not change the
// value because `&` already binds tighter than `|`.
constexpr TileMode &arrayMode(ArrayMode mode) {
raw = (raw & ~0x0000003c) |
(static_cast<std::uint32_t>(mode) << 2) & 0x0000003c;
((static_cast<std::uint32_t>(mode) << 2) & 0x0000003c);
return *this;
}
// Setter for the pipe-config field of `raw`: clears the bits under mask
// 0x000007c0 (bits 6..10), ORs in `mode` shifted left by 6 and masked with the
// same constant, and returns *this so calls can be chained.
// NOTE(review): this span comes from a rendered diff without +/- markers. The
// two operand lines below look like the before/after forms of a single line;
// they differ only by an outer pair of parentheses, which does not change the
// value because `&` already binds tighter than `|`.
constexpr TileMode &pipeConfig(PipeConfig mode) {
raw = (raw & ~0x000007c0) |
(static_cast<std::uint32_t>(mode) << 6) & 0x000007c0;
((static_cast<std::uint32_t>(mode) << 6) & 0x000007c0);
return *this;
}
// Setter for the tile-split field of `raw`: clears the bits under mask
// 0x00003800 (bits 11..13), ORs in `mode` shifted left by 11 and masked with
// the same constant, and returns *this so calls can be chained.
// NOTE(review): this span comes from a rendered diff without +/- markers. The
// two operand lines below look like the before/after forms of a single line;
// they differ only by an outer pair of parentheses, which does not change the
// value because `&` already binds tighter than `|`.
constexpr TileMode &tileSplit(TileSplit mode) {
raw = (raw & ~0x00003800) |
(static_cast<std::uint32_t>(mode) << 11) & 0x00003800;
((static_cast<std::uint32_t>(mode) << 11) & 0x00003800);
return *this;
}
// Setter for the micro-tile-mode field of `raw`: clears the bits under mask
// 0x01c00000 (bits 22..24), ORs in `mode` shifted left by 22 and masked with
// the same constant, and returns *this so calls can be chained.
// NOTE(review): this span comes from a rendered diff without +/- markers. The
// two operand lines below look like the before/after forms of a single line;
// they differ only by an outer pair of parentheses, which does not change the
// value because `&` already binds tighter than `|`.
constexpr TileMode &microTileMode(MicroTileMode mode) {
raw = (raw & ~0x01c00000) |
(static_cast<std::uint32_t>(mode) << 22) & 0x01c00000;
((static_cast<std::uint32_t>(mode) << 22) & 0x01c00000);
return *this;
}
// Setter for the sample-split field of `raw`: clears the bits under mask
// 0x06000000 (bits 25..26), ORs in `mode` shifted left by 25 and masked with
// the same constant, and returns *this so calls can be chained.
// NOTE(review): this span comes from a rendered diff without +/- markers. The
// two operand lines below look like the before/after forms of a single line;
// they differ only by an outer pair of parentheses, which does not change the
// value because `&` already binds tighter than `|`.
constexpr TileMode &sampleSplit(SampleSplit mode) {
raw = (raw & ~0x06000000) |
(static_cast<std::uint32_t>(mode) << 25) & 0x06000000;
((static_cast<std::uint32_t>(mode) << 25) & 0x06000000);
return *this;
}
};
@ -166,17 +166,24 @@ struct SurfaceInfo {
std::uint32_t height;
std::uint32_t depth;
std::uint32_t pitch;
MacroTileMode macroTileMode;
int arrayLayerCount;
int numFragments;
int bitsPerElement;
std::uint64_t totalSize;
std::uint64_t totalTiledSize;
std::uint64_t totalLinearSize;
// Per-mip-level layout record kept inside SurfaceInfo. It describes the same
// level twice: once as it is laid out in the tiled surface and once as it is
// laid out in a linear staging buffer.
// NOTE(review): this span comes from a rendered diff without +/- markers, so
// fields from before and after the change appear side by side. The
// dataWidth/dataHeight/dataDepth/offset fields look like the pre-change names
// of tiledWidth/tiledHeight/tiledDepth/tiledOffset — confirm against the
// repository before assuming both sets exist.
struct SubresourceInfo {
// Presumably pre-change fields (see note above).
std::uint32_t dataWidth;
std::uint32_t dataHeight;
std::uint32_t dataDepth;
std::uint64_t offset;
// Dimensions of the level in the tiled layout; the 2D path fills these with
// the pitch/height/depth after rounding up to its alignment values.
std::uint32_t tiledWidth;
std::uint32_t tiledHeight;
std::uint32_t tiledDepth;
// Offset of this level from the start of the tiled surface; the tiler adds it
// to the tiled base address.
std::uint64_t tiledOffset;
// Size of one array slice of this level in the tiled layout; the tiler
// multiplies it by the base array index to reach a slice.
std::uint64_t tiledSize;
// Row pitch of the level in the linear buffer; used as the Vulkan
// bufferRowLength by the image copy code.
std::uint32_t linearPitch;
// Extent of the level in the linear layout; used as the Vulkan image extent.
std::uint32_t linearWidth;
std::uint32_t linearHeight;
std::uint32_t linearDepth;
// Offset of this level from the start of the linear buffer.
std::uint64_t linearOffset;
// Size of one array slice of this level in the linear layout.
std::uint64_t linearSize;
};
@ -524,5 +531,4 @@ SurfaceInfo computeSurfaceInfo(TileMode tileMode, gnm::TextureType type,
std::uint32_t pitch, int baseArrayLayer,
int arrayCount, int baseMipLevel, int mipCount,
bool pow2pad);
SurfaceInfo computeSurfaceInfo(const gnm::TBuffer &tbuffer, TileMode tileMode);
} // namespace amdgpu

View file

@ -11,15 +11,14 @@ struct GpuTiler {
~GpuTiler();
// Declaration of the tiled -> linear conversion for one mip level of `info`.
// The source is addressed by srcTiledAddress/srcSize and the destination by
// dstLinearAddress/dstSize; baseArray/arrayCount select the array slices.
// NOTE(review): this span comes from a rendered diff without +/- markers and
// therefore holds two parameter lists back to back. It is not a valid
// declaration as shown; confirm the real signature against the repository.
void detile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info,
// Presumably the pre-change parameter list (still takes `dfmt`).
amdgpu::TileMode tileMode, gnm::DataFormat dfmt,
std::uint64_t srcTiledAddress, std::uint64_t srcSize,
std::uint64_t dstLinearAddress, std::uint64_t dstSize,
int mipLevel, int baseArray, int arrayCount);
// Presumably the post-change parameter list (no `dfmt`).
amdgpu::TileMode tileMode, std::uint64_t srcTiledAddress,
std::uint64_t srcSize, std::uint64_t dstLinearAddress,
std::uint64_t dstSize, int mipLevel, int baseArray,
int arrayCount);
// Declaration of the linear -> tiled conversion for one mip level of `info`.
// The source is addressed by srcLinearAddress/srcSize and the destination by
// dstTiledAddress/dstSize; baseArray/arrayCount select the array slices.
// NOTE(review): this span comes from a rendered diff without +/- markers and
// therefore holds two parameter lists back to back. It is not a valid
// declaration as shown; confirm the real signature against the repository.
void tile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info,
// Presumably the pre-change parameter list (still takes `dfmt`).
amdgpu::TileMode tileMode, gnm::DataFormat dfmt,
std::uint64_t srcLinearAddress, std::uint64_t srcSize,
std::uint64_t dstTiledAddress, std::uint64_t dstSize, int mipLevel,
int baseArray, int arrayCount);
// Presumably the post-change parameter list (no `dfmt`).
amdgpu::TileMode tileMode, std::uint64_t srcLinearAddress,
std::uint64_t srcSize, std::uint64_t dstTiledAddress,
std::uint64_t dstSize, int mipLevel, int baseArray, int arrayCount);
private:
std::unique_ptr<Impl> mImpl;

View file

@ -37,8 +37,8 @@ void main() {
uint64_t linearByteOffset = computeLinearElementByteOffset(
pos,
0,
config.dataSize.x,
config.dataSize.x * config.dataSize.y,
config.linearDataSize.x,
config.linearDataSize.x * config.linearDataSize.y,
config.bitsPerElement,
1 << config.numFragments
);
@ -63,6 +63,9 @@ void main() {
switch (bpp) {
case 1:
// buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data;
// break;
case 2:
buffer_reference_uint16_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint16_t(config.srcAddress + tiledByteOffset).data;
break;

View file

@ -1,5 +1,7 @@
#version 460
#define DEBUG
#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
#extension GL_EXT_shader_atomic_int64 : enable
@ -32,12 +34,12 @@ void main() {
}
uint64_t tiledByteOffset = getTiledBitOffset2D(
config.dfmt,
config.tileMode,
config.macroTileMode,
config.dataSize,
arraySlice,
config.numFragments,
config.bitsPerElement,
pos,
fragmentIndex
) / 8;
@ -47,8 +49,8 @@ void main() {
uint64_t linearByteOffset = computeLinearElementByteOffset(
pos,
0,
config.dataSize.x,
config.dataSize.x * config.dataSize.y,
config.linearDataSize.x,
config.linearDataSize.x * config.linearDataSize.y,
config.bitsPerElement,
1 << config.numFragments
);
@ -57,6 +59,10 @@ void main() {
uint32_t bpp = (config.bitsPerElement + 7) / 8;
if (bpp == 1 && (linearByteOffset & 1) != 0) {
return;
}
#ifdef DEBUG
if (config.srcAddress + tiledByteOffset + bpp > config.srcEndAddress) {
debugPrintfEXT("detiler2d: out of src buffer %d x %d x %d", pos.x, pos.y, pos.z);
@ -71,9 +77,8 @@ void main() {
switch (bpp) {
case 1:
buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data;
break;
// buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data;
// break;
case 2:
buffer_reference_uint16_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint16_t(config.srcAddress + tiledByteOffset).data;
break;

View file

@ -36,19 +36,22 @@ void main() {
uint64_t linearByteOffset = computeLinearElementByteOffset(
pos,
0,
config.dataSize.x,
config.dataSize.x * config.dataSize.y,
config.linearDataSize.x,
config.linearDataSize.x * config.linearDataSize.y,
config.bitsPerElement,
1 << config.numFragments
);
linearByteOffset += linearSliceOffset;
switch ((config.bitsPerElement + 7) / 8) {
case 1:
buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data;
break;
uint32_t bpp = (config.bitsPerElement + 7) / 8;
if (bpp == 1 && (linearByteOffset & 1) != 0) {
return;
}
switch (bpp) {
case 1:
case 2:
buffer_reference_uint16_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint16_t(config.srcAddress + tiledByteOffset).data;
break;

View file

@ -1,7 +1,5 @@
#define FOR_ALL_BASE_TYPES(OP) \
OP(int8_t) \
OP(uint8_t) \
OP(int16_t) \
OP(uint16_t) \
OP(float16_t) \
@ -785,11 +783,8 @@ uint64_t getTiledBitOffset1D(uint32_t tileMode, uvec3 pos, uvec2 dataSize, uint3
}
uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTileMode,
uvec2 dataSize, int arraySlice, uint32_t numFragments, u32vec3 pos, int fragmentIndex) {
uint32_t bitsPerFragment = getBitsPerElement(dfmt);
bool isBlockCompressed = getTexelsPerElement(dfmt) > 1;
uint64_t getTiledBitOffset2D(uint32_t tileMode, uint32_t macroTileMode,
uvec2 dataSize, int arraySlice, uint32_t numFragments, uint32_t bitsPerElement, u32vec3 pos, int fragmentIndex) {
uint32_t tileSwizzleMask = 0;
uint32_t numFragmentsPerPixel = 1 << numFragments;
uint32_t arrayMode = tileMode_getArrayMode(tileMode);
@ -820,7 +815,6 @@ uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTil
break;
}
uint32_t bitsPerElement = bitsPerFragment;
uint32_t paddedWidth = dataSize.x;
uint32_t paddedHeight = dataSize.y;
@ -849,7 +843,8 @@ uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTil
uint32_t tileSplitBytes = min(kDramRowSize, tileSplitC);
uint32_t numPipes = getPipeCount(tileMode_getPipeConfig(tileMode));
uint32_t pipeConfig = tileMode_getPipeConfig(tileMode);
uint32_t numPipes = getPipeCount(pipeConfig);
uint32_t pipeInterleaveBits = findLSB(kPipeInterleaveBytes);
uint32_t pipeInterleaveMask = (1 << pipeInterleaveBits) - 1;
uint32_t pipeBits = findLSB(numPipes);
@ -873,7 +868,7 @@ uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTil
xh %= macroTileWidth;
yh %= macroTileHeight;
}
uint64_t pipe = getPipeIndex(xh, yh, tileMode_getPipeConfig(tileMode));
uint64_t pipe = getPipeIndex(xh, yh, pipeConfig);
uint64_t bank =
getBankIndex(xh, yh, bankWidth, bankHeight, numBanks, numPipes);
@ -989,16 +984,15 @@ uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTil
return (finalByteOffset << 3) | bitOffset;
}
layout(push_constant) uniform Config {
uint64_t srcAddress;
uint64_t srcEndAddress;
uint64_t dstAddress;
uint64_t dstEndAddress;
uvec2 dataSize;
uvec2 linearDataSize;
uint32_t tileMode;
uint32_t macroTileMode;
uint32_t dfmt;
uint32_t numFragments;
uint32_t bitsPerElement;
uint32_t tiledSurfaceSize;

View file

@ -40,8 +40,8 @@ void main() {
uint64_t linearByteOffset = computeLinearElementByteOffset(
pos,
0,
config.dataSize.x,
config.dataSize.x * config.dataSize.y,
config.linearDataSize.x,
config.linearDataSize.x * config.linearDataSize.y,
config.bitsPerElement,
1 << config.numFragments
);
@ -69,6 +69,8 @@ void main() {
switch (bpp) {
case 1:
// buffer_reference_uint8_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint8_t(config.srcAddress + linearByteOffset).data;
// break;
case 2:
buffer_reference_uint16_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint16_t(config.srcAddress + linearByteOffset).data;
break;

View file

@ -1,4 +1,5 @@
#version 460
#define DEBUG
#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
@ -31,12 +32,12 @@ void main() {
}
uint64_t tiledByteOffset = getTiledBitOffset2D(
config.dfmt,
config.tileMode,
config.macroTileMode,
config.dataSize,
arraySlice,
config.numFragments,
config.bitsPerElement,
pos,
fragmentIndex
) / 8;
@ -46,8 +47,8 @@ void main() {
uint64_t linearByteOffset = computeLinearElementByteOffset(
pos,
0,
config.dataSize.x,
config.dataSize.x * config.dataSize.y,
config.linearDataSize.x,
config.linearDataSize.x * config.linearDataSize.y,
config.bitsPerElement,
1 << config.numFragments
);
@ -55,24 +56,27 @@ void main() {
linearByteOffset += linearSliceOffset;
uint32_t bpp = (config.bitsPerElement + 7) / 8;
if (bpp == 1 && (linearByteOffset & 1) != 0) {
return;
}
#ifdef DEBUG
if (config.srcAddress + linearByteOffset + bpp > config.srcEndAddress) {
debugPrintfEXT("tiler2d: out of src buffer %d x %d x %d", pos.x, pos.y, pos.z);
debugPrintfEXT("tiler2d: out of src buffer %d x %d x %d, src offset: %lu, src size: %lu", pos.x, pos.y, pos.z,
linearByteOffset, config.srcEndAddress - config.srcAddress);
return;
}
if (config.dstAddress + tiledByteOffset + bpp > config.dstEndAddress) {
debugPrintfEXT("tiler2d: out of dst buffer %d x %d x %d", pos.x, pos.y, pos.z);
debugPrintfEXT("tiler2d: out of dst buffer %d x %d x %d, offset %lx, size %lx", pos.x, pos.y, pos.z, tiledByteOffset, config.dstEndAddress - config.dstAddress);
return;
}
#endif
switch (bpp) {
case 1:
buffer_reference_uint8_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint8_t(config.srcAddress + linearByteOffset).data;
break;
// buffer_reference_uint8_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint8_t(config.srcAddress + linearByteOffset).data;
// break;
case 2:
buffer_reference_uint16_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint16_t(config.srcAddress + linearByteOffset).data;
break;

View file

@ -36,19 +36,21 @@ void main() {
uint64_t linearByteOffset = computeLinearElementByteOffset(
pos,
0,
config.dataSize.x,
config.dataSize.x * config.dataSize.y,
config.linearDataSize.x,
config.linearDataSize.x * config.linearDataSize.y,
config.bitsPerElement,
1 << config.numFragments
);
linearByteOffset += linearSliceOffset;
switch ((config.bitsPerElement + 7) / 8) {
case 1:
buffer_reference_uint8_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint8_t(config.srcAddress + linearByteOffset).data;
break;
uint32_t bpp = (config.bitsPerElement + 7) / 8;
if (bpp == 1 && (linearByteOffset & 1) != 0) {
return;
}
switch (bpp) {
case 1:
case 2:
buffer_reference_uint16_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint16_t(config.srcAddress + linearByteOffset).data;
break;

View file

@ -5,9 +5,8 @@
using namespace amdgpu;
// FIXME: should be properly implemented
static SurfaceInfo
computeTexture2dInfo(ArrayMode arrayMode, gnm::TextureType type,
computeTexture2dInfo(TileMode tileMode, gnm::TextureType type,
gnm::DataFormat dfmt, std::uint32_t width,
std::uint32_t height, std::uint32_t depth,
std::uint32_t pitch, int baseArrayLayer, int arrayCount,
@ -32,7 +31,7 @@ computeTexture2dInfo(ArrayMode arrayMode, gnm::TextureType type,
auto numFragmentsPerPixel = 1 << numFragments;
auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
auto bitsPerElement = bitsPerFragment;
std::uint32_t bitsPerElement = bitsPerFragment;
depth = isVolume ? depth : 1;
if (isBlockCompressed) {
@ -60,6 +59,10 @@ computeTexture2dInfo(ArrayMode arrayMode, gnm::TextureType type,
std::uint64_t surfaceOffset = 0;
std::uint64_t surfaceSize = 0;
std::uint64_t linearOffset = 0;
auto macroTileMode = getDefaultMacroTileModes()[computeMacroTileIndex(
tileMode, bitsPerElement, 1 << numFragments)];
SurfaceInfo result;
result.width = width;
@ -69,8 +72,10 @@ computeTexture2dInfo(ArrayMode arrayMode, gnm::TextureType type,
result.numFragments = numFragments;
result.bitsPerElement = bitsPerElement;
result.arrayLayerCount = arraySliceCount;
result.macroTileMode = macroTileMode;
auto thickness = getMicroTileThickness(arrayMode);
auto arrayMode = tileMode.arrayMode();
auto numPipes = getPipeCount(tileMode.pipeConfig());
for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) {
std::uint32_t elemWidth = std::max<std::uint64_t>(width >> mipLevel, 1);
@ -116,55 +121,91 @@ computeTexture2dInfo(ArrayMode arrayMode, gnm::TextureType type,
linearPitch = linearWidth;
}
std::uint32_t paddedPitch =
(linearPitch + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1);
std::uint32_t paddedHeight =
(linearHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1);
std::uint32_t paddedDepth = linearDepth;
auto thickness = getMicroTileThickness(arrayMode);
if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) {
if (isCubemap) {
linearDepth = std::bit_ceil(linearDepth);
}
uint32_t numBanks = 2 << macroTileMode.numBanks();
uint32_t macroAspect = 1 << macroTileMode.macroTileAspect();
uint32_t tileBytes1x =
(thickness * bitsPerElement * kMicroTileWidth * kMicroTileHeight + 7) /
8;
auto microTileMode = tileMode.microTileMode();
uint32_t tileSplit =
(microTileMode == kMicroTileModeDepth)
? (64 << tileMode.sampleSplit())
: std::max(256U, (1 << tileMode.sampleSplit()) * tileBytes1x);
uint32_t tileSplitC = std::min(kDramRowSize, tileSplit);
uint32_t bankWidth = 1 << macroTileMode.bankWidth();
uint32_t bankHeight = 1 << macroTileMode.bankHeight();
paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1);
uint32_t tileSize = std::min(
tileSplitC, (thickness * bitsPerElement * numFragmentsPerPixel *
kMicroTileWidth * kMicroTileHeight +
7) /
8);
uint32_t bankHeightAlign =
std::max(1U, kPipeInterleaveBytes / (tileSize * bankWidth));
bankHeight = (bankHeight + bankHeightAlign - 1) & ~(bankHeightAlign - 1);
if (numFragmentsPerPixel == 1) {
uint32_t macroAspectAlign = std::max(
1U, kPipeInterleaveBytes / (tileSize * numPipes * bankWidth));
macroAspect =
(macroAspect + macroAspectAlign - 1) & ~(macroAspectAlign - 1);
}
std::uint32_t tempPitch = paddedPitch;
std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) *
paddedHeight * bitsPerElement *
numFragmentsPerPixel;
auto depthAlign = thickness;
// FIXME: rotate tile mode for mipLevel > 0
uint32_t outPitch = linearPitch;
uint32_t outHeight = linearHeight;
uint32_t outDepth = linearDepth;
uint32_t macroTileWidth =
kMicroTileWidth * bankWidth * numPipes * macroAspect;
uint32_t macroTileHeight =
kMicroTileHeight * bankHeight * numBanks / macroAspect;
uint32_t heightAlign = macroTileHeight;
auto pitchAlign = macroTileWidth;
outPitch = (outPitch + pitchAlign - 1) & ~(pitchAlign - 1);
outDepth = (outDepth + depthAlign - 1) & ~(depthAlign - 1);
outHeight = (outHeight + heightAlign - 1) & ~(heightAlign - 1);
std::uint64_t logicalSliceSizeBytes = std::uint64_t(outPitch) * outHeight *
bitsPerElement * numFragmentsPerPixel;
logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) {
tempPitch += kMicroTileWidth;
logicalSliceSizeBytes = std::uint64_t(tempPitch) * paddedHeight *
bitsPerElement * numFragmentsPerPixel;
logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
}
surfaceSize = static_cast<uint64_t>(outPitch) * outHeight *
std::bit_ceil(bitsPerElement) * numFragmentsPerPixel;
surfaceSize = (surfaceSize + 7) / 8;
surfaceSize = logicalSliceSizeBytes * paddedDepth;
auto linearSize =
linearDepth *
(linearPitch * linearHeight * bitsPerElement * numFragmentsPerPixel +
7) /
8;
auto linearSize = uint64_t(linearPitch) * linearHeight * bitsPerElement *
numFragmentsPerPixel;
linearSize = linearDepth * ((linearSize + 7) / 8);
result.setSubresourceInfo(mipLevel, {
.dataWidth = linearPitch,
.dataHeight = linearHeight,
.dataDepth = linearDepth,
.offset = surfaceOffset,
.tiledWidth = outPitch,
.tiledHeight = outHeight,
.tiledDepth = outDepth,
.tiledOffset = surfaceOffset,
.tiledSize = surfaceSize,
.linearPitch = linearPitch,
.linearWidth = linearWidth,
.linearHeight = linearHeight,
.linearDepth = linearDepth,
.linearOffset = linearOffset,
.linearSize = linearSize,
});
linearOffset += arraySliceCount * linearSize;
surfaceOffset += arraySliceCount * surfaceSize;
}
result.totalSize = surfaceOffset;
result.totalTiledSize = surfaceOffset;
result.totalLinearSize = linearOffset;
return result;
}
@ -222,6 +263,7 @@ computeTexture1dInfo(ArrayMode arrayMode, gnm::TextureType type,
std::uint64_t surfaceOffset = 0;
std::uint64_t surfaceSize = 0;
std::uint64_t linearOffset = 0;
SurfaceInfo result;
result.width = width;
@ -308,25 +350,30 @@ computeTexture1dInfo(ArrayMode arrayMode, gnm::TextureType type,
}
surfaceSize = logicalSliceSizeBytes * paddedDepth;
auto linearSize =
linearDepth *
(linearPitch * linearHeight * bitsPerElement * numFragmentsPerPixel +
7) /
8;
auto linearSize = uint64_t(linearPitch) * linearHeight * bitsPerElement *
numFragmentsPerPixel;
linearSize = linearDepth * ((linearSize + 7) / 8);
result.setSubresourceInfo(mipLevel, {
.dataWidth = linearPitch,
.dataHeight = linearHeight,
.dataDepth = linearDepth,
.offset = surfaceOffset,
.tiledWidth = linearPitch,
.tiledHeight = linearHeight,
.tiledDepth = linearDepth,
.tiledOffset = surfaceOffset,
.tiledSize = surfaceSize,
.linearPitch = linearPitch,
.linearWidth = linearWidth,
.linearHeight = linearHeight,
.linearDepth = linearDepth,
.linearOffset = linearOffset,
.linearSize = linearSize,
});
surfaceOffset += arraySliceCount * surfaceSize;
linearOffset += arraySliceCount * linearSize;
}
result.totalSize = surfaceOffset;
result.totalTiledSize = surfaceOffset;
result.totalLinearSize = linearOffset;
return result;
}
@ -383,6 +430,7 @@ static SurfaceInfo computeTextureLinearInfo(
std::uint64_t surfaceOffset = 0;
std::uint64_t surfaceSize = 0;
std::uint64_t linearOffset = 0;
SurfaceInfo result;
result.width = width;
@ -437,20 +485,25 @@ static SurfaceInfo computeTextureLinearInfo(
linearPitch = linearWidth;
}
if (arrayMode == kArrayModeLinearGeneral) {
surfaceSize = (static_cast<uint64_t>(linearPitch) *
(linearHeight)*bitsPerElement * numFragmentsPerPixel +
7) /
8;
surfaceSize *= linearDepth;
auto linearSize = static_cast<uint64_t>(linearPitch) *
(linearHeight)*bitsPerElement * numFragmentsPerPixel;
linearSize = linearDepth * ((linearSize + 7) / 8);
if (arrayMode == kArrayModeLinearGeneral) {
surfaceSize = linearSize;
result.setSubresourceInfo(mipLevel, {
.dataWidth = linearPitch,
.dataHeight = linearHeight,
.dataDepth = linearDepth,
.offset = surfaceOffset,
.tiledWidth = linearPitch,
.tiledHeight = linearHeight,
.tiledDepth = linearDepth,
.tiledOffset = surfaceOffset,
.tiledSize = surfaceSize,
.linearSize = surfaceSize,
.linearPitch = linearPitch,
.linearWidth = linearWidth,
.linearHeight = linearHeight,
.linearDepth = linearDepth,
.linearOffset = linearOffset,
.linearSize = linearSize,
});
} else {
if (mipLevel > 0 && pitch > 0) {
@ -487,19 +540,26 @@ static SurfaceInfo computeTextureLinearInfo(
surfaceSize = (pixelsPerSlice * bitsPerElement + 7) / 8 * paddedDepth;
result.setSubresourceInfo(mipLevel, {
.dataWidth = paddedPitch,
.dataHeight = paddedHeight,
.dataDepth = paddedDepth,
.offset = surfaceOffset,
.tiledWidth = paddedPitch,
.tiledHeight = paddedHeight,
.tiledDepth = paddedDepth,
.tiledOffset = surfaceOffset,
.tiledSize = surfaceSize,
.linearSize = surfaceSize,
.linearPitch = linearPitch,
.linearWidth = linearWidth,
.linearHeight = linearHeight,
.linearDepth = linearDepth,
.linearOffset = linearOffset,
.linearSize = linearSize,
});
}
surfaceOffset += arraySliceCount * surfaceSize;
surfaceOffset += arraySliceCount * linearSize;
}
result.totalSize = surfaceOffset;
result.totalTiledSize = surfaceOffset;
result.totalLinearSize = linearOffset;
return result;
}
@ -533,20 +593,10 @@ SurfaceInfo amdgpu::computeSurfaceInfo(
case kArrayMode2dTiledThickPrt:
case kArrayMode3dTiledThinPrt:
case kArrayMode3dTiledThickPrt:
return computeTexture2dInfo(tileMode.arrayMode(), type, dfmt, width, height,
depth, pitch, baseArrayLayer, arrayCount,
baseMipLevel, mipCount, pow2pad);
return computeTexture2dInfo(tileMode, type, dfmt, width, height, depth,
pitch, baseArrayLayer, arrayCount, baseMipLevel,
mipCount, pow2pad);
}
std::abort();
}
// Convenience overload: pulls every surface parameter out of a texture
// descriptor and forwards to the full computeSurfaceInfo overload.
// The descriptor's width/height/depth/pitch fields are forwarded with 1 added,
// and the array-layer and mip-level counts are computed as last - base + 1.
// NOTE(review): the hunk header in front of this code lists fewer lines after
// the change than before it, so this overload may be one the commit deletes —
// confirm against the repository.
SurfaceInfo amdgpu::computeSurfaceInfo(const gnm::TBuffer &tbuffer,
                                       TileMode tileMode) {
  const auto layerCount = tbuffer.last_array - tbuffer.base_array + 1;
  const auto levelCount = tbuffer.last_level - tbuffer.base_level + 1;
  const bool padToPow2 = tbuffer.pow2pad != 0;

  return computeSurfaceInfo(tileMode, tbuffer.type, tbuffer.dfmt,
                            tbuffer.width + 1, tbuffer.height + 1,
                            tbuffer.depth + 1, tbuffer.pitch + 1,
                            tbuffer.base_array, layerCount, tbuffer.base_level,
                            levelCount, padToPow2);
}

View file

@ -21,9 +21,10 @@ struct Config {
uint64_t dstEndAddress;
uint32_t dataWidth;
uint32_t dataHeight;
uint32_t linearDataWidth;
uint32_t linearDataHeight;
uint32_t tileMode;
uint32_t macroTileMode;
uint32_t dfmt;
uint32_t numFragments;
uint32_t bitsPerElement;
uint32_t tiledSurfaceSize;
@ -66,7 +67,7 @@ struct TilerShader {
struct amdgpu::GpuTiler::Impl {
TilerShader detilerLinear{spirv_detilerLinear_comp};
TilerShader detiler1d{spirv_detiler1d_comp};
TilerShader detiler2d{spirv_detilerLinear_comp};
TilerShader detiler2d{spirv_detiler2d_comp};
TilerShader tilerLinear{spirv_tiler2d_comp};
TilerShader tiler1d{spirv_tiler1d_comp};
TilerShader tiler2d{spirv_tiler2d_comp};
@ -98,29 +99,28 @@ struct amdgpu::GpuTiler::Impl {
// Builds the tiler by allocating its private implementation object.
amdgpu::GpuTiler::GpuTiler() : mImpl(std::make_unique<Impl>()) {}

// Defaulted destructor, defined out of line next to the constructor.
amdgpu::GpuTiler::~GpuTiler() = default;
void amdgpu::GpuTiler::detile(Scheduler &scheduler,
const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode, gnm::DataFormat dfmt,
std::uint64_t srcTiledAddress,
std::uint64_t srcSize,
std::uint64_t dstLinearAddress,
std::uint64_t dstSize, int mipLevel,
int baseArray, int arrayCount) {
void amdgpu::GpuTiler::detile(
Scheduler &scheduler, const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode, std::uint64_t srcTiledAddress,
std::uint64_t srcSize, std::uint64_t dstLinearAddress,
std::uint64_t dstSize, int mipLevel, int baseArray, int arrayCount) {
auto commandBuffer = scheduler.getCommandBuffer();
Config config{};
auto &subresource = info.getSubresourceInfo(mipLevel);
config.srcAddress = srcTiledAddress + subresource.offset;
config.srcAddress = srcTiledAddress + subresource.tiledOffset +
baseArray * subresource.tiledSize;
config.srcEndAddress = srcTiledAddress + srcSize;
config.dstAddress = dstLinearAddress;
config.dstAddress = dstLinearAddress + subresource.linearOffset +
baseArray * subresource.linearSize;
config.dstEndAddress = dstLinearAddress + dstSize;
config.dataWidth = subresource.dataWidth;
config.dataHeight = subresource.dataHeight;
config.dataWidth = subresource.tiledWidth;
config.dataHeight = subresource.tiledHeight;
config.tileMode = tileMode.raw;
config.dfmt = dfmt;
config.macroTileMode = info.macroTileMode.raw;
config.numFragments = info.numFragments;
config.bitsPerElement = info.bitsPerElement;
uint32_t groupCountZ = subresource.dataDepth;
uint32_t groupCountZ = subresource.tiledDepth;
if (arrayCount > 1) {
config.tiledSurfaceSize = subresource.tiledSize;
@ -131,6 +131,9 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler,
config.linearSurfaceSize = 0;
}
config.linearDataWidth = subresource.linearPitch;
config.linearDataHeight = subresource.linearHeight;
VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
switch (tileMode.arrayMode()) {
@ -157,46 +160,39 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler,
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayMode3dTiledThickPrt:
config.macroTileMode =
getDefaultMacroTileModes()[computeMacroTileIndex(
tileMode, info.bitsPerElement,
1 << info.numFragments)]
.raw;
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler1d.shader);
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler2d.shader);
break;
}
vkCmdPushConstants(commandBuffer, mImpl->pipelineLayout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(config), &config);
vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight,
groupCountZ);
vkCmdDispatch(commandBuffer, subresource.linearWidth,
subresource.linearHeight, groupCountZ);
}
void amdgpu::GpuTiler::tile(Scheduler &scheduler,
const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode, gnm::DataFormat dfmt,
std::uint64_t srcLinearAddress,
std::uint64_t srcSize,
std::uint64_t dstTiledAddress,
std::uint64_t dstSize, int mipLevel, int baseArray,
int arrayCount) {
void amdgpu::GpuTiler::tile(
Scheduler &scheduler, const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode, std::uint64_t srcLinearAddress,
std::uint64_t srcSize, std::uint64_t dstTiledAddress, std::uint64_t dstSize,
int mipLevel, int baseArray, int arrayCount) {
auto commandBuffer = scheduler.getCommandBuffer();
Config config{};
auto &subresource = info.getSubresourceInfo(mipLevel);
config.srcAddress = srcLinearAddress;
config.srcAddress = srcLinearAddress + subresource.linearOffset +
baseArray * subresource.linearSize;
config.srcEndAddress = srcLinearAddress + srcSize;
config.dstAddress = dstTiledAddress + subresource.offset;
config.dstAddress = dstTiledAddress + subresource.tiledOffset +
baseArray * subresource.tiledSize;
config.dstEndAddress = dstTiledAddress + dstSize;
config.dataWidth = subresource.dataWidth;
config.dataHeight = subresource.dataHeight;
config.dataWidth = subresource.tiledWidth;
config.dataHeight = subresource.tiledHeight;
config.tileMode = tileMode.raw;
config.dfmt = dfmt;
config.macroTileMode = info.macroTileMode.raw;
config.numFragments = info.numFragments;
config.bitsPerElement = info.bitsPerElement;
uint32_t groupCountZ = subresource.dataDepth;
uint32_t groupCountZ = subresource.tiledDepth;
if (arrayCount > 1) {
config.tiledSurfaceSize = subresource.tiledSize;
@ -207,6 +203,9 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler,
config.linearSurfaceSize = 0;
}
config.linearDataWidth = subresource.linearPitch;
config.linearDataHeight = subresource.linearHeight;
VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
switch (tileMode.arrayMode()) {
@ -232,18 +231,13 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler,
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayMode3dTiledThickPrt:
config.macroTileMode =
getDefaultMacroTileModes()[computeMacroTileIndex(
tileMode, info.bitsPerElement,
1 << info.numFragments)]
.raw;
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler1d.shader);
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler2d.shader);
break;
}
vkCmdPushConstants(commandBuffer, mImpl->pipelineLayout,
VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(config), &config);
vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight,
groupCountZ);
vkCmdDispatch(commandBuffer, subresource.linearWidth,
subresource.linearHeight, groupCountZ);
}