mirror of
https://github.com/RPCSX/rpcsx.git
synced 2026-04-20 22:05:12 +00:00
gpu rewrite initial commit
This commit is contained in:
parent
0d4ed51cd9
commit
4cf808facd
133 changed files with 35491 additions and 4 deletions
387
rpcsx-gpu2/lib/amdgpu-tiler/src/tiler.cpp
Normal file
387
rpcsx-gpu2/lib/amdgpu-tiler/src/tiler.cpp
Normal file
|
|
@ -0,0 +1,387 @@
|
|||
#include "gnm/constants.hpp"
|
||||
#include <amdgpu/tiler.hpp>
|
||||
#include <gnm/gnm.hpp>
|
||||
#include <bit>
|
||||
|
||||
using namespace amdgpu;
|
||||
|
||||
/// Computes layout information for a 1D-tiled (thin/thick) surface.
///
/// Walks every mip level up to baseMipLevel + mipCount, computing the padded
/// slice dimensions and byte sizes, and records per-mip subresource info into
/// the returned SurfaceInfo. Offsets accumulate across mips, each multiplied
/// by the array slice count.
///
/// @param arrayMode      1D tiling mode (controls micro-tile thickness).
/// @param type           texture dimensionality / MSAA kind.
/// @param dfmt           data format; drives bits-per-element and block size.
/// @param pitch          row pitch in texels at mip 0.
/// @param pow2pad        if true, pad dimensions and slice count to powers of two.
static constexpr SurfaceInfo
computeTexture1dInfo(ArrayMode arrayMode, gnm::TextureType type,
                     gnm::DataFormat dfmt, std::uint32_t width,
                     std::uint32_t height, std::uint32_t depth,
                     std::uint32_t pitch, int baseArrayLayer, int arrayCount,
                     int baseMipLevel, int mipCount, bool pow2pad) {
  bool isCubemap = type == gnm::TextureType::Cube;
  bool isVolume = type == gnm::TextureType::Dim3D;

  auto bitsPerFragment = getBitsPerElement(dfmt);
  // For non-volume textures the incoming `depth` carries the array size;
  // cubemaps store 6 faces per layer, volumes have a single "array" slice.
  std::uint32_t arraySliceCount = depth;

  if (isCubemap) {
    arraySliceCount *= 6;
  } else if (isVolume) {
    arraySliceCount = 1;
  }

  // NOTE(review): for MSAA types this derives the fragment-count exponent from
  // baseArrayLayer + arrayCount - 1 — looks like it encodes log2(samples);
  // confirm against the callers that populate these fields.
  int numFragments = (type == gnm::TextureType::Msaa2D ||
                      type == gnm::TextureType::MsaaArray2D)
                         ? (baseArrayLayer + arrayCount - 1)
                         : 0;

  auto numFragmentsPerPixel = 1 << numFragments;
  auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;

  auto bitsPerElement = bitsPerFragment;
  depth = isVolume ? depth : 1;

  // Block-compressed formats store one element per 4x4 (or 8x1) texel block,
  // so scale bits-per-element accordingly. 16-bit BC is unsupported here.
  if (isBlockCompressed) {
    switch (bitsPerFragment) {
    case 1:
      bitsPerElement *= 8;
      break;
    case 4:
    case 8:
      bitsPerElement *= 16;
      break;
    case 16:
      std::abort();
      break;

    default:
      std::abort();
      break;
    }
  }

  if (pow2pad) {
    arraySliceCount = std::bit_ceil(arraySliceCount);
  }

  std::uint64_t surfaceOffset = 0;
  std::uint64_t surfaceSize = 0;

  SurfaceInfo result;
  result.width = width;
  result.height = height;
  result.depth = depth;
  result.pitch = pitch;
  result.numFragments = numFragments;
  result.bitsPerElement = bitsPerElement;
  result.arrayLayerCount = arraySliceCount;

  auto thickness = getMicroTileThickness(arrayMode);

  for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) {
    // Mip dimensions shrink by halving, clamped to 1.
    std::uint32_t elemWidth = std::max<std::uint64_t>(width >> mipLevel, 1);
    std::uint32_t elemPitch = std::max<std::uint64_t>(pitch >> mipLevel, 1);
    std::uint32_t elemHeight = std::max<std::uint64_t>(height >> mipLevel, 1);
    std::uint32_t elemDepth = std::max<std::uint64_t>(depth >> mipLevel, 1);

    std::uint32_t linearPitch = elemPitch;
    std::uint32_t linearWidth = elemWidth;
    std::uint32_t linearHeight = elemHeight;
    std::uint32_t linearDepth = elemDepth;

    // Convert texel dimensions to compressed-block dimensions.
    if (isBlockCompressed) {
      switch (bitsPerFragment) {
      case 1:
        linearWidth = std::max<std::uint64_t>((linearWidth + 7) / 8, 1);
        linearPitch = std::max<std::uint64_t>((linearPitch + 7) / 8, 1);
        break;
      case 4:
      case 8:
        linearWidth = std::max<std::uint64_t>((linearWidth + 3) / 4, 1);
        linearPitch = std::max<std::uint64_t>((linearPitch + 3) / 4, 1);
        linearHeight = std::max<std::uint64_t>((linearHeight + 3) / 4, 1);
        break;
      case 16:
        std::abort();
        break;

      default:
        std::abort();
        break;
      }
    }

    if (pow2pad) {
      linearPitch = std::bit_ceil(linearPitch);
      linearWidth = std::bit_ceil(linearWidth);
      linearHeight = std::bit_ceil(linearHeight);
      linearDepth = std::bit_ceil(linearDepth);
    }

    // Explicit pitch only applies to the base level; smaller mips fall back
    // to their natural width.
    if (mipLevel > 0 && pitch > 0) {
      linearPitch = linearWidth;
    }

    // Round pitch/height up to whole micro tiles.
    std::uint32_t paddedPitch =
        (linearPitch + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1);
    std::uint32_t paddedHeight =
        (linearHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1);
    std::uint32_t paddedDepth = linearDepth;

    if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) {
      if (isCubemap) {
        linearDepth = std::bit_ceil(linearDepth);
      }

      paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1);
    }

    std::uint32_t tempPitch = paddedPitch;
    std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) *
                                          paddedHeight * bitsPerElement *
                                          numFragmentsPerPixel;
    logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;

    // Widen the pitch one micro tile at a time until a physical slice is a
    // multiple of the pipe interleave size.
    uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
    while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) {
      tempPitch += kMicroTileWidth;
      logicalSliceSizeBytes = std::uint64_t(tempPitch) * paddedHeight *
                              bitsPerElement * numFragmentsPerPixel;
      logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
      physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
    }

    surfaceSize = logicalSliceSizeBytes * paddedDepth;
    // Unpadded (linear) size of the same subresource, rounded up to bytes.
    auto linearSize =
        linearDepth *
        (linearPitch * linearHeight * bitsPerElement * numFragmentsPerPixel +
         7) /
        8;

    result.setSubresourceInfo(mipLevel, {
                                            .dataWidth = linearPitch,
                                            .dataHeight = linearHeight,
                                            .dataDepth = linearDepth,
                                            .offset = surfaceOffset,
                                            .tiledSize = surfaceSize,
                                            .linearSize = linearSize,
                                        });

    // Each mip stores all array slices contiguously before the next mip.
    surfaceOffset += arraySliceCount * surfaceSize;
  }

  result.totalSize = surfaceOffset;
  return result;
}
|
||||
|
||||
/// Computes layout information for a linearly laid out surface
/// (kArrayModeLinearGeneral or kArrayModeLinearAligned).
///
/// LinearGeneral applies no alignment at all; LinearAligned pads pitch to a
/// 64-byte-derived alignment and pads slices up to the pipe interleave size.
/// Per-mip subresource data is recorded into the returned SurfaceInfo and
/// offsets accumulate across mips (each multiplied by the array slice count).
///
/// Fix vs. the previous revision: the `mipLevel > 0 && pitch > 0` pitch
/// override was applied twice on the aligned path (once before the
/// LinearGeneral branch and again inside the else); the second, redundant
/// re-check has been removed — linearPitch/linearWidth are not modified in
/// between, so behavior is unchanged.
static constexpr SurfaceInfo computeTextureLinearInfo(
    ArrayMode arrayMode, gnm::TextureType type, gnm::DataFormat dfmt,
    std::uint32_t width, std::uint32_t height, std::uint32_t depth,
    std::uint32_t pitch, int baseArrayLayer, int arrayCount, int baseMipLevel,
    int mipCount, bool pow2pad) {
  bool isCubemap = type == gnm::TextureType::Cube;
  bool isVolume = type == gnm::TextureType::Dim3D;

  auto bitsPerFragment = getBitsPerElement(dfmt);
  // For non-volume textures `depth` carries the array size; cubemaps have
  // 6 faces per layer, volumes a single "array" slice.
  std::uint32_t arraySliceCount = depth;

  if (isCubemap) {
    arraySliceCount *= 6;
  } else if (isVolume) {
    arraySliceCount = 1;
  }

  // NOTE(review): for MSAA types this derives the fragment-count exponent from
  // baseArrayLayer + arrayCount - 1 — presumably log2(samples); confirm
  // against callers.
  int numFragments = (type == gnm::TextureType::Msaa2D ||
                      type == gnm::TextureType::MsaaArray2D)
                         ? (baseArrayLayer + arrayCount - 1)
                         : 0;

  auto numFragmentsPerPixel = 1 << numFragments;
  auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;

  auto bitsPerElement = bitsPerFragment;
  depth = isVolume ? depth : 1;

  // Block-compressed formats store one element per texel block; scale the
  // element size accordingly. 16-bit BC is unsupported here.
  if (isBlockCompressed) {
    switch (bitsPerFragment) {
    case 1:
      bitsPerElement *= 8;
      break;
    case 4:
    case 8:
      bitsPerElement *= 16;
      break;
    case 16:
      std::abort();
      break;

    default:
      std::abort();
      break;
    }
  }

  if (pow2pad) {
    arraySliceCount = std::bit_ceil(arraySliceCount);
  }

  std::uint64_t surfaceOffset = 0;
  std::uint64_t surfaceSize = 0;

  SurfaceInfo result;
  result.width = width;
  result.height = height;
  result.depth = depth;
  result.pitch = pitch;
  result.numFragments = numFragments;
  result.bitsPerElement = bitsPerElement;
  result.arrayLayerCount = arraySliceCount;

  for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) {
    // Mip dimensions shrink by halving, clamped to 1.
    std::uint32_t elemWidth = std::max<std::uint64_t>(width >> mipLevel, 1);
    std::uint32_t elemPitch = std::max<std::uint64_t>(pitch >> mipLevel, 1);
    std::uint32_t elemHeight = std::max<std::uint64_t>(height >> mipLevel, 1);
    std::uint32_t elemDepth = std::max<std::uint64_t>(depth >> mipLevel, 1);

    std::uint32_t linearPitch = elemPitch;
    std::uint32_t linearWidth = elemWidth;
    std::uint32_t linearHeight = elemHeight;
    std::uint32_t linearDepth = elemDepth;

    // Convert texel dimensions to compressed-block dimensions.
    if (isBlockCompressed) {
      switch (bitsPerFragment) {
      case 1:
        linearWidth = std::max<std::uint64_t>((linearWidth + 7) / 8, 1);
        linearPitch = std::max<std::uint64_t>((linearPitch + 7) / 8, 1);
        break;
      case 4:
      case 8:
        linearWidth = std::max<std::uint64_t>((linearWidth + 3) / 4, 1);
        linearPitch = std::max<std::uint64_t>((linearPitch + 3) / 4, 1);
        linearHeight = std::max<std::uint64_t>((linearHeight + 3) / 4, 1);
        break;
      case 16:
        std::abort();
        break;

      default:
        std::abort();
        break;
      }
    }

    if (pow2pad) {
      linearPitch = std::bit_ceil(linearPitch);
      linearWidth = std::bit_ceil(linearWidth);
      linearHeight = std::bit_ceil(linearHeight);
      linearDepth = std::bit_ceil(linearDepth);
    }

    // Explicit pitch only applies to the base level; smaller mips fall back
    // to their natural width.
    if (mipLevel > 0 && pitch > 0) {
      linearPitch = linearWidth;
    }

    if (arrayMode == kArrayModeLinearGeneral) {
      // Completely unaligned layout: tiled size equals linear size.
      surfaceSize = (static_cast<uint64_t>(linearPitch) * linearHeight *
                         bitsPerElement * numFragmentsPerPixel +
                     7) /
                    8;
      surfaceSize *= linearDepth;

      result.setSubresourceInfo(mipLevel, {
                                              .dataWidth = linearPitch,
                                              .dataHeight = linearHeight,
                                              .dataDepth = linearDepth,
                                              .offset = surfaceOffset,
                                              .tiledSize = surfaceSize,
                                              .linearSize = surfaceSize,
                                          });
    } else {
      // LinearAligned: align pitch so that a row is at least 64 bytes
      // (minimum 8 elements).
      auto pitchAlign = std::max(8UL, 64UL / ((bitsPerElement + 7) / 8UL));
      std::uint32_t paddedPitch =
          (linearPitch + pitchAlign - 1) & ~(pitchAlign - 1);
      std::uint32_t paddedHeight = linearHeight;
      std::uint32_t paddedDepth = linearDepth;

      if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) {
        if (isCubemap) {
          linearDepth = std::bit_ceil(linearDepth);
        }

        auto thickness = getMicroTileThickness(arrayMode);
        paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1);
      }

      // Each slice must contain a whole number of pipe-interleave units
      // (at least 64 pixels); widen the pitch until it does.
      std::uint32_t pixelsPerPipeInterleave =
          kPipeInterleaveBytes / ((bitsPerElement + 7) / 8);
      std::uint32_t sliceAlignInPixel =
          pixelsPerPipeInterleave < 64 ? 64 : pixelsPerPipeInterleave;
      auto pixelsPerSlice = static_cast<uint64_t>(paddedPitch) * paddedHeight *
                            numFragmentsPerPixel;
      while (pixelsPerSlice % sliceAlignInPixel) {
        paddedPitch += pitchAlign;
        pixelsPerSlice = static_cast<uint64_t>(paddedPitch) * paddedHeight *
                         numFragmentsPerPixel;
      }

      surfaceSize = (pixelsPerSlice * bitsPerElement + 7) / 8 * paddedDepth;

      result.setSubresourceInfo(mipLevel, {
                                              .dataWidth = paddedPitch,
                                              .dataHeight = paddedHeight,
                                              .dataDepth = paddedDepth,
                                              .offset = surfaceOffset,
                                              .tiledSize = surfaceSize,
                                              .linearSize = surfaceSize,
                                          });
    }

    // Each mip stores all array slices contiguously before the next mip.
    surfaceOffset += arraySliceCount * surfaceSize;
  }

  result.totalSize = surfaceOffset;
  return result;
}
|
||||
|
||||
/// Dispatches surface layout computation to the implementation matching the
/// tile mode's array mode.
///
/// Linear modes go to computeTextureLinearInfo, 1D tiled modes to
/// computeTexture1dInfo. 2D/3D macro-tiled and PRT modes are not implemented
/// and abort, as does any unrecognized mode.
SurfaceInfo amdgpu::computeSurfaceInfo(
    TileMode tileMode, gnm::TextureType type, gnm::DataFormat dfmt,
    std::uint32_t width, std::uint32_t height, std::uint32_t depth,
    std::uint32_t pitch, int baseArrayLayer, int arrayCount, int baseMipLevel,
    int mipCount, bool pow2pad) {
  const auto mode = tileMode.arrayMode();

  if (mode == kArrayModeLinearGeneral || mode == kArrayModeLinearAligned) {
    return computeTextureLinearInfo(mode, type, dfmt, width, height, depth,
                                    pitch, baseArrayLayer, arrayCount,
                                    baseMipLevel, mipCount, pow2pad);
  }

  if (mode == kArrayMode1dTiledThin || mode == kArrayMode1dTiledThick) {
    return computeTexture1dInfo(mode, type, dfmt, width, height, depth, pitch,
                                baseArrayLayer, arrayCount, baseMipLevel,
                                mipCount, pow2pad);
  }

  // 2D/3D macro-tiled and PRT layouts are not supported here yet; any other
  // value is invalid. Either way, bail out loudly.
  std::abort();
}
|
||||
|
||||
/// Convenience overload: derives surface layout directly from a T# (texture
/// buffer) descriptor.
///
/// The descriptor stores sizes and range ends biased by one (0 means 1), so
/// every dimension is incremented and the array/mip ranges are converted from
/// inclusive [base, last] pairs to base + count form.
SurfaceInfo amdgpu::computeSurfaceInfo(const gnm::TBuffer &tbuffer,
                                       TileMode tileMode) {
  const std::uint32_t texWidth = tbuffer.width + 1;
  const std::uint32_t texHeight = tbuffer.height + 1;
  const std::uint32_t texDepth = tbuffer.depth + 1;
  const std::uint32_t texPitch = tbuffer.pitch + 1;
  const int layerCount = tbuffer.last_array - tbuffer.base_array + 1;
  const int levelCount = tbuffer.last_level - tbuffer.base_level + 1;

  return computeSurfaceInfo(tileMode, tbuffer.type, tbuffer.dfmt, texWidth,
                            texHeight, texDepth, texPitch, tbuffer.base_array,
                            layerCount, tbuffer.base_level, levelCount,
                            tbuffer.pow2pad != 0);
}
|
||||
441
rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_cpu.cpp
Normal file
441
rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_cpu.cpp
Normal file
|
|
@ -0,0 +1,441 @@
|
|||
#include "amdgpu/tiler_cpu.hpp"
|
||||
#include "amdgpu/tiler.hpp"
|
||||
#include "gnm/gnm.hpp"
|
||||
|
||||
/// Computes the bit offset of texel (x, y, z) within a 1D-tiled surface.
///
/// First replays the per-mip size computation (mirroring
/// computeTexture1dInfo) to find the byte offset of the requested mip and
/// array slice, then adds the micro-tile-local offset of the texel.
/// Returns a BIT offset (callers divide by 8 for bytes).
constexpr std::uint64_t
getTiledOffset1D(gnm::TextureType texType, bool isPow2Padded,
                 gnm::DataFormat dfmt, amdgpu::TileMode tileMode, int mipLevel,
                 int arraySlice, int numFragments, int width, int height,
                 int depth, int pitch, int x, int y, int z) {

  using namespace amdgpu;
  bool isCubemap = texType == gnm::TextureType::Cube;
  bool isVolume = texType == gnm::TextureType::Dim3D;

  auto bitsPerFragment = getBitsPerElement(dfmt);
  // For non-volume textures `depth` carries the array size.
  uint32_t arraySliceCount = depth;

  if (isCubemap) {
    arraySliceCount *= 6;
  } else if (isVolume) {
    arraySliceCount = 1;
  }

  auto numFragmentsPerPixel = 1 << numFragments;
  auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
  auto arrayMode = tileMode.arrayMode();

  auto bitsPerElement = bitsPerFragment;
  // Non-base mips use the pitch as the width source before shifting.
  auto paddedWidth = std::max((mipLevel != 0 ? pitch : width) >> mipLevel, 1);
  auto paddedHeight = std::max(height >> mipLevel, 1);

  // Thick 1D tiles span 4 slices; thin span 1.
  auto tileThickness = (arrayMode == amdgpu::kArrayMode1dTiledThick) ? 4 : 1;

  // Convert to compressed-block units; 16-bit BC is unsupported.
  if (isBlockCompressed) {
    switch (bitsPerFragment) {
    case 1:
      bitsPerElement *= 8;
      paddedWidth = std::max((paddedWidth + 7) / 8, 1);
      break;
    case 4:
    case 8:
      bitsPerElement *= 16;
      paddedWidth = std::max((paddedWidth + 3) / 4, 1);
      paddedHeight = std::max((paddedHeight + 3) / 4, 1);
      break;
    case 16:
      std::abort();
      break;

    default:
      std::abort();
      break;
    }
  }

  if (isPow2Padded) {
    arraySliceCount = std::bit_ceil(arraySliceCount);
    paddedWidth = std::bit_ceil(unsigned(paddedWidth));
    paddedHeight = std::bit_ceil(unsigned(paddedHeight));
  }

  uint64_t finalSurfaceOffset = 0;
  uint64_t finalSurfaceSize = 0;

  auto thickness = getMicroTileThickness(arrayMode);

  // Accumulate the sizes of all mips preceding the requested one; after the
  // loop, finalSurfaceSize holds the requested mip's slice size and
  // finalSurfaceOffset the offset of its first array slice.
  for (int i = 0; i <= mipLevel; i++) {
    finalSurfaceOffset += arraySliceCount * finalSurfaceSize;

    std::uint32_t elemWidth =
        std::max<std::uint64_t>((i > 0 ? pitch : width) >> i, 1);
    std::uint32_t elemHeight = std::max<std::uint64_t>(height >> i, 1);
    std::uint32_t elemDepth =
        std::max<std::uint64_t>((isVolume ? depth : 1) >> i, 1);

    if (isBlockCompressed) {
      switch (bitsPerFragment) {
      case 1:
        elemWidth = std::max<std::uint64_t>((elemWidth + 7) / 8, 1);
        break;
      case 4:
      case 8:
        elemWidth = std::max<std::uint64_t>((elemWidth + 3) / 4, 1);
        elemHeight = std::max<std::uint64_t>((elemHeight + 3) / 4, 1);
        break;
      case 16:
        std::abort();
        break;

      default:
        std::abort();
        break;
      }
    }

    if (isPow2Padded) {
      elemWidth = std::bit_ceil(elemWidth);
      elemHeight = std::bit_ceil(elemHeight);
      elemDepth = std::bit_ceil(elemDepth);
    }

    // Round dimensions up to whole micro tiles / tile thickness.
    elemWidth = (elemWidth + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1);
    elemHeight = (elemHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1);
    elemDepth = (elemDepth + thickness - 1) & ~(thickness - 1);

    std::uint32_t tempPitch = elemWidth;
    std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) *
                                          elemHeight * bitsPerElement *
                                          numFragmentsPerPixel;
    logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;

    // Widen the pitch until a physical slice is a multiple of the pipe
    // interleave size (mirrors computeTexture1dInfo).
    uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
    while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) {
      tempPitch += 8;
      logicalSliceSizeBytes = std::uint64_t(tempPitch) * elemHeight *
                              bitsPerElement * numFragmentsPerPixel;
      logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
      physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
    }

    finalSurfaceSize = logicalSliceSizeBytes * elemDepth;
  }

  finalSurfaceOffset += finalSurfaceSize * (uint64_t)arraySlice;

  // Per-tile byte size and tile grid geometry at the requested mip.
  auto tileBytes =
      (kMicroTileWidth * kMicroTileHeight * tileThickness * bitsPerElement +
       7) /
      8;
  auto tilesPerRow = paddedWidth / kMicroTileWidth;
  auto tilesPerSlice =
      std::max(tilesPerRow * (paddedHeight / kMicroTileHeight), 1U);

  // Index of the element inside its micro tile.
  uint64_t elementIndex = getElementIndex(x, y, z, bitsPerElement,
                                          tileMode.microTileMode(), arrayMode);

  uint64_t sliceOffset = (z / tileThickness) * tilesPerSlice * tileBytes;

  uint64_t tileRowIndex = y / kMicroTileHeight;
  uint64_t tileColumnIndex = x / kMicroTileWidth;
  uint64_t tileOffset =
      (tileRowIndex * tilesPerRow + tileColumnIndex) * tileBytes;

  uint64_t elementOffset = elementIndex * bitsPerElement;
  uint64_t finalOffset = (sliceOffset + tileOffset) * 8 + elementOffset;

  // Result is in bits: surface base offset (bytes) * 8 + intra-mip bits.
  return finalOffset + finalSurfaceOffset * 8;
}
|
||||
|
||||
/// Computes the bit offset of texel (x, y, z) in a linearly laid out surface.
///
/// Rows are `pitch` elements wide and slices are `pitch * height` elements;
/// the result is a plain row-major address scaled to bits. 1-bit formats are
/// packed 8 texels per byte, so the effective row width shrinks accordingly.
constexpr std::uint64_t getTiledOffsetLinear(gnm::DataFormat dfmt, int height,
                                             int pitch, int x, int y, int z) {
  auto elementBits = getBitsPerElement(dfmt);
  auto rowElements = pitch;
  auto sliceRows = height;

  // 1-bit texels are stored eight to a byte: one element covers 8 texels.
  if (getBitsPerElement(dfmt) == 1) {
    elementBits *= 8;
    rowElements = std::max((rowElements + 7) / 8, 1);
  }

  const uint64_t rowBits = elementBits * rowElements;
  const uint64_t sliceBits = uint64_t(rowElements) * sliceRows * elementBits;

  return sliceBits * z + rowBits * y + uint64_t(elementBits) * x;
}
|
||||
|
||||
/// Computes the bit offset of texel (x, y, z, fragment) within a 2D/3D
/// macro-tiled surface.
///
/// Follows the GCN addressing scheme: the element's position inside its
/// micro tile, the micro tile's position inside a macro tile, the macro
/// tile's position in the slice, and finally the bank/pipe swizzle that
/// scatters macro tiles across memory channels. Returns a BIT offset.
///
/// NOTE(review): mipLevel, width, depth, isCubemap and isVolume are computed
/// or accepted but never used below — mip/array handling appears incomplete
/// for the 2D path; confirm intended behavior before relying on non-zero
/// mip levels here.
constexpr std::uint64_t
getTiledOffset2D(gnm::TextureType texType, bool isPow2Padded,
                 gnm::DataFormat dfmt, amdgpu::TileMode tileMode,
                 amdgpu::MacroTileMode macroTileMode, int mipLevel,
                 int arraySlice, int numFragments, int width, int height,
                 int depth, int pitch, int x, int y, int z, int fragmentIndex) {
  using namespace amdgpu;

  bool isCubemap = texType == gnm::TextureType::Cube;
  bool isVolume = texType == gnm::TextureType::Dim3D;
  auto m_bitsPerFragment = getBitsPerElement(dfmt);

  auto m_isBlockCompressed = getTexelsPerElement(dfmt) > 1;
  // Tile swizzle is currently fixed at zero (no base-address swizzle).
  auto tileSwizzleMask = 0;
  auto numFragmentsPerPixel = 1 << numFragments;
  auto arrayMode = tileMode.arrayMode();

  // Micro tile thickness in slices: thin = 1, thick = 4, xthick = 8.
  auto tileThickness = 1;

  switch (arrayMode) {
  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayModeTiledThinPrt:
  case amdgpu::kArrayMode2dTiledThinPrt:
  case amdgpu::kArrayMode3dTiledThinPrt:
    tileThickness = 1;
    break;
  case amdgpu::kArrayMode1dTiledThick:
  case amdgpu::kArrayMode2dTiledThick:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayModeTiledThickPrt:
  case amdgpu::kArrayMode2dTiledThickPrt:
  case amdgpu::kArrayMode3dTiledThickPrt:
    tileThickness = 4;
    break;
  case amdgpu::kArrayMode2dTiledXThick:
  case amdgpu::kArrayMode3dTiledXThick:
    tileThickness = 8;
    break;
  default:
    break;
  }

  auto bitsPerElement = m_bitsPerFragment;
  auto paddedWidth = pitch;
  auto paddedHeight = height;

  // Convert to compressed-block units; 16-bit BC is unsupported.
  if (m_isBlockCompressed) {
    switch (m_bitsPerFragment) {
    case 1:
      bitsPerElement *= 8;
      paddedWidth = std::max((paddedWidth + 7) / 8, 1);
      break;
    case 4:
    case 8:
      bitsPerElement *= 16;
      paddedWidth = std::max((paddedWidth + 3) / 4, 1);
      paddedHeight = std::max((paddedHeight + 3) / 4, 1);
      break;
    case 16:
      std::abort();
      break;
    default:
      std::abort();
      break;
    }
  }

  // Macro tile geometry: hardware fields are log2-encoded
  // (numBanks uses 2 << n, i.e. 2..16 banks).
  auto bankWidthHW = macroTileMode.bankWidth();
  auto bankHeightHW = macroTileMode.bankHeight();
  auto macroAspectHW = macroTileMode.macroTileAspect();
  auto numBanksHW = macroTileMode.numBanks();

  auto bankWidth = 1 << bankWidthHW;
  auto bankHeight = 1 << bankHeightHW;
  unsigned numBanks = 2 << numBanksHW;
  auto macroTileAspect = 1 << macroAspectHW;

  uint32_t tileBytes1x =
      (tileThickness * bitsPerElement * kMicroTileWidth * kMicroTileHeight +
       7) /
      8;

  // Tile split: depth surfaces use the explicit split field; color surfaces
  // derive it from sample split, clamped below by 256 bytes and above by the
  // DRAM row size.
  auto sampleSplitHw = tileMode.sampleSplit();
  auto tileSplitHw = tileMode.tileSplit();
  uint32_t sampleSplit = 1 << sampleSplitHw;
  uint32_t tileSplitC =
      (tileMode.microTileMode() == amdgpu::kMicroTileModeDepth)
          ? (64 << tileSplitHw)
          : std::max(256U, tileBytes1x * sampleSplit);

  auto tileSplitBytes = std::min(kDramRowSize, tileSplitC);

  auto numPipes = getPipeCount(tileMode.pipeConfig());
  auto pipeInterleaveBits = std::countr_zero(kPipeInterleaveBytes);
  auto pipeInterleaveMask = (1 << pipeInterleaveBits) - 1;
  auto pipeBits = std::countr_zero(numPipes);
  auto bankBits = std::countr_zero(numBanks);
  // auto pipeMask = (numPipes - 1) << pipeInterleaveBits;
  auto bankSwizzleMask = tileSwizzleMask;
  auto pipeSwizzleMask = 0;
  auto macroTileWidth =
      (kMicroTileWidth * bankWidth * numPipes) * macroTileAspect;
  auto macroTileHeight =
      (kMicroTileHeight * bankHeight * numBanks) / macroTileAspect;

  auto microTileMode = tileMode.microTileMode();

  // Index of the element inside its micro tile.
  uint64_t elementIndex =
      getElementIndex(x, y, z, bitsPerElement, microTileMode, arrayMode);

  // PRT modes wrap coordinates to a single macro tile before computing the
  // pipe/bank indices.
  uint32_t xh = x, yh = y;
  if (arrayMode == amdgpu::kArrayModeTiledThinPrt ||
      arrayMode == amdgpu::kArrayModeTiledThickPrt) {
    xh %= macroTileWidth;
    yh %= macroTileHeight;
  }
  uint64_t pipe = getPipeIndex(xh, yh, tileMode.pipeConfig());
  uint64_t bank =
      getBankIndex(xh, yh, bankWidth, bankHeight, numBanks, numPipes);

  uint32_t tileBytes = (kMicroTileWidth * kMicroTileHeight * tileThickness *
                            bitsPerElement * numFragmentsPerPixel +
                        7) /
                       8;

  // Depth surfaces interleave fragments per pixel; color surfaces store each
  // fragment plane contiguously within the tile.
  uint64_t elementOffset = 0;
  if (microTileMode == amdgpu::kMicroTileModeDepth) {
    uint64_t pixelOffset = elementIndex * bitsPerElement * numFragmentsPerPixel;
    elementOffset = pixelOffset + (fragmentIndex * bitsPerElement);
  } else {
    uint64_t fragmentOffset =
        fragmentIndex * (tileBytes / numFragmentsPerPixel) * 8;
    elementOffset = fragmentOffset + (elementIndex * bitsPerElement);
  }

  // Tile split: oversized thin tiles are cut into tileSplitBytes chunks
  // distributed across "split slices".
  uint64_t slicesPerTile = 1;
  uint64_t tileSplitSlice = 0;
  if (tileBytes > tileSplitBytes && tileThickness == 1) {
    slicesPerTile = tileBytes / tileSplitBytes;
    tileSplitSlice = elementOffset / (tileSplitBytes * 8);
    elementOffset %= (tileSplitBytes * 8);
    tileBytes = tileSplitBytes;
  }

  // Per-bank-pipe share of a macro tile, and the macro tile grid.
  uint64_t macroTileBytes = (macroTileWidth / kMicroTileWidth) *
                            (macroTileHeight / kMicroTileHeight) * tileBytes /
                            (numPipes * numBanks);
  uint64_t macroTilesPerRow = paddedWidth / macroTileWidth;
  uint64_t macroTileRowIndex = y / macroTileHeight;
  uint64_t macroTileColumnIndex = x / macroTileWidth;
  uint64_t macroTileIndex =
      (macroTileRowIndex * macroTilesPerRow) + macroTileColumnIndex;
  uint64_t macro_tile_offset = macroTileIndex * macroTileBytes;
  uint64_t macroTilesPerSlice =
      macroTilesPerRow * (paddedHeight / macroTileHeight);
  uint64_t sliceBytes = macroTilesPerSlice * macroTileBytes;

  // NOTE(review): sliceOffset is computed from `slice` (== z) BEFORE the
  // arraySlice override below, so array slices only influence the swizzle
  // rotations — verify this matches the reference addressing library.
  uint32_t slice = z;
  uint64_t sliceOffset =
      (tileSplitSlice + slicesPerTile * slice / tileThickness) * sliceBytes;
  if (arraySlice != 0) {
    slice = arraySlice;
  }

  uint64_t tileRowIndex = (y / kMicroTileHeight) % bankHeight;
  uint64_t tileColumnIndex = ((x / kMicroTileWidth) / numPipes) % bankWidth;
  uint64_t tileIndex = (tileRowIndex * bankWidth) + tileColumnIndex;
  uint64_t tileOffset = tileIndex * tileBytes;

  uint64_t bankSwizzle = bankSwizzleMask;
  uint64_t pipeSwizzle = pipeSwizzleMask;

  // 3D modes rotate the pipe index per slice group.
  uint64_t pipeSliceRotation = 0;
  switch (arrayMode) {
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayMode3dTiledXThick:
    pipeSliceRotation =
        std::max(1UL, (numPipes / 2UL) - 1UL) * (slice / tileThickness);
    break;
  default:
    break;
  }
  pipeSwizzle += pipeSliceRotation;
  pipeSwizzle &= (numPipes - 1);
  pipe = pipe ^ pipeSwizzle;

  // 2D/3D modes rotate the bank index per slice group as well.
  uint32_t sliceRotation = 0;
  switch (arrayMode) {
  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayMode2dTiledThick:
  case amdgpu::kArrayMode2dTiledXThick:
    sliceRotation = ((numBanks / 2) - 1) * (slice / tileThickness);
    break;
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayMode3dTiledXThick:
    sliceRotation = std::max(1UL, (numPipes / 2UL) - 1UL) *
                    (slice / tileThickness) / numPipes;
    break;
  default:
    break;
  }
  // Thin modes additionally rotate banks per tile-split slice.
  uint64_t tileSplitSliceRotation = 0;
  switch (arrayMode) {
  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode2dTiledThinPrt:
  case amdgpu::kArrayMode3dTiledThinPrt:
    tileSplitSliceRotation = ((numBanks / 2) + 1) * tileSplitSlice;
    break;
  default:
    break;
  }
  bank ^= bankSwizzle + sliceRotation;
  bank ^= tileSplitSliceRotation;
  bank &= (numBanks - 1);

  uint64_t totalOffset =
      (sliceOffset + macro_tile_offset + tileOffset) * 8 + elementOffset;
  uint64_t bitOffset = totalOffset & 0x7;
  totalOffset /= 8;

  // Splice pipe and bank bits into the byte address just above the pipe
  // interleave region, then restore the sub-byte bit offset.
  uint64_t pipeInterleaveOffset = totalOffset & pipeInterleaveMask;
  uint64_t offset = totalOffset >> pipeInterleaveBits;

  uint64_t finalByteOffset =
      pipeInterleaveOffset | (pipe << (pipeInterleaveBits)) |
      (bank << (pipeInterleaveBits + pipeBits)) |
      (offset << (pipeInterleaveBits + pipeBits + bankBits));
  return (finalByteOffset << 3) | bitOffset;
}
|
||||
|
||||
/// Computes the bit offset of a texel in a surface of any supported layout,
/// dispatching on the tile mode's array mode.
///
/// Linear modes use plain row-major addressing; 1D modes use the micro-tiled
/// path; every 2D/3D/PRT mode goes through the macro-tiled path. Any value
/// outside the known set aborts.
std::uint64_t amdgpu::getTiledOffset(gnm::TextureType texType,
                                     bool isPow2Padded, int numFragments,
                                     gnm::DataFormat dfmt,
                                     amdgpu::TileMode tileMode,
                                     amdgpu::MacroTileMode macroTileMode,
                                     int mipLevel, int arraySlice, int width,
                                     int height, int depth, int pitch, int x,
                                     int y, int z, int fragmentIndex) {
  const auto mode = tileMode.arrayMode();

  switch (mode) {
  case amdgpu::kArrayModeLinearGeneral:
  case amdgpu::kArrayModeLinearAligned:
    // No tiling: row-major addressing only needs format, height and pitch.
    return getTiledOffsetLinear(dfmt, height, pitch, x, y, z);

  case amdgpu::kArrayMode1dTiledThin:
  case amdgpu::kArrayMode1dTiledThick:
    return getTiledOffset1D(texType, isPow2Padded, dfmt, tileMode, mipLevel,
                            arraySlice, numFragments, width, height, depth,
                            pitch, x, y, z);

  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayMode2dTiledThick:
  case amdgpu::kArrayMode2dTiledXThick:
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayMode3dTiledXThick:
  case amdgpu::kArrayModeTiledThinPrt:
  case amdgpu::kArrayModeTiledThickPrt:
  case amdgpu::kArrayMode2dTiledThinPrt:
  case amdgpu::kArrayMode2dTiledThickPrt:
  case amdgpu::kArrayMode3dTiledThinPrt:
  case amdgpu::kArrayMode3dTiledThickPrt:
    return getTiledOffset2D(texType, isPow2Padded, dfmt, tileMode,
                            macroTileMode, mipLevel, arraySlice, numFragments,
                            width, height, depth, pitch, x, y, z,
                            fragmentIndex);
  }

  // Unknown/corrupt array mode.
  std::abort();
}
|
||||
354
rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp
Normal file
354
rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp
Normal file
|
|
@ -0,0 +1,354 @@
|
|||
#include "amdgpu/tiler_vulkan.hpp"
|
||||
#include "Scheduler.hpp"
|
||||
#include "amdgpu/tiler.hpp"
|
||||
#include <bit>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <vk.hpp>
|
||||
|
||||
#include <shaders/detiler1d.comp.h>
|
||||
#include <shaders/detiler2d.comp.h>
|
||||
#include <shaders/detilerLinear.comp.h>
|
||||
#include <shaders/tiler1d.comp.h>
|
||||
#include <shaders/tiler2d.comp.h>
|
||||
#include <shaders/tilerLinear.comp.h>
|
||||
|
||||
struct TilerDecriptorSetLayout {
|
||||
VkDescriptorSetLayout layout;
|
||||
|
||||
TilerDecriptorSetLayout() {
|
||||
std::vector<VkDescriptorSetLayoutBinding> bindings{{
|
||||
.binding = 0,
|
||||
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
|
||||
.descriptorCount = 1,
|
||||
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
}};
|
||||
|
||||
VkDescriptorSetLayoutCreateInfo layoutInfo{
|
||||
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
|
||||
.bindingCount = static_cast<uint32_t>(bindings.size()),
|
||||
.pBindings = bindings.data(),
|
||||
};
|
||||
|
||||
VK_VERIFY(vkCreateDescriptorSetLayout(vk::context->device, &layoutInfo,
|
||||
nullptr, &layout));
|
||||
}
|
||||
|
||||
~TilerDecriptorSetLayout() {
|
||||
vkDestroyDescriptorSetLayout(vk::context->device, layout,
|
||||
vk::context->allocator);
|
||||
}
|
||||
};
|
||||
|
||||
struct TilerShader {
|
||||
VkShaderEXT shader;
|
||||
|
||||
TilerShader(TilerDecriptorSetLayout &setLayout,
|
||||
std::span<const std::uint32_t> spirv) {
|
||||
|
||||
VkShaderCreateInfoEXT shaderInfo{
|
||||
.sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT,
|
||||
.flags = 0,
|
||||
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.nextStage = 0,
|
||||
.codeType = VK_SHADER_CODE_TYPE_SPIRV_EXT,
|
||||
.codeSize = spirv.size_bytes(),
|
||||
.pCode = spirv.data(),
|
||||
.pName = "main",
|
||||
.setLayoutCount = 1,
|
||||
.pSetLayouts = &setLayout.layout,
|
||||
.pushConstantRangeCount = 0,
|
||||
.pPushConstantRanges = 0,
|
||||
.pSpecializationInfo = 0,
|
||||
};
|
||||
|
||||
VK_VERIFY(vk::CreateShadersEXT(vk::context->device, 1, &shaderInfo, nullptr,
|
||||
&shader));
|
||||
}
|
||||
|
||||
~TilerShader() {
|
||||
vk::DestroyShaderEXT(vk::context->device, shader, vk::context->allocator);
|
||||
}
|
||||
};
|
||||
|
||||
// Private implementation of GpuTiler: owns all Vulkan objects plus a small
// pool of per-dispatch descriptor sets and matching uniform-buffer slots.
struct amdgpu::GpuTiler::Impl {
  TilerDecriptorSetLayout descriptorSetLayout;
  std::mutex descriptorMtx; // guards inUseDescriptorSets
  VkDescriptorSet descriptorSets[4]{};
  VkDescriptorPool descriptorPool;
  // Bitmask over descriptorSets: bit i set means slot i is in flight.
  std::uint32_t inUseDescriptorSets = 0;

  // Host-visible uniform buffer holding one Config per descriptor slot.
  vk::Buffer configData;
  TilerShader detilerLinear{descriptorSetLayout, spirv_detilerLinear_comp};
  TilerShader detiler1d{descriptorSetLayout, spirv_detiler1d_comp};
  // NOTE(review): 2D detiling is not implemented yet — detile() aborts before
  // binding this shader — so the linear SPIR-V stands in as a placeholder.
  TilerShader detiler2d{descriptorSetLayout, spirv_detilerLinear_comp};
  // FIX: was constructed from spirv_tiler2d_comp; the linear tiler must use
  // the linear shader (shaders/tilerLinear.comp.h is included above).
  TilerShader tilerLinear{descriptorSetLayout, spirv_tilerLinear_comp};
  TilerShader tiler1d{descriptorSetLayout, spirv_tiler1d_comp};
  TilerShader tiler2d{descriptorSetLayout, spirv_tiler2d_comp};
  VkPipelineLayout pipelineLayout;

  // Must match the uniform block layout declared in the tiler compute
  // shaders (std140: two 64-bit addresses followed by seven 32-bit words).
  struct Config {
    uint64_t srcAddress;
    uint64_t dstAddress;
    uint32_t dataWidth;
    uint32_t dataHeight;
    uint32_t tileMode;
    uint32_t numFragments;
    uint32_t bitsPerElement;
    uint32_t tiledSurfaceSize;
    uint32_t linearSurfaceSize;
  };

  Impl() {
    std::size_t count = 256;

    configData = vk::Buffer::Allocate(
        vk::getHostVisibleMemory(), sizeof(Config) * count,
        VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
            VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);

    VkPipelineLayoutCreateInfo piplineLayoutInfo{
        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
        .setLayoutCount = 1,
        .pSetLayouts = &descriptorSetLayout.layout,
    };

    // Create with the same allocation callbacks ~Impl passes to
    // vkDestroyPipelineLayout (the original created with nullptr).
    VK_VERIFY(vkCreatePipelineLayout(vk::context->device, &piplineLayoutInfo,
                                     vk::context->allocator, &pipelineLayout));

    {
      auto maxSets = static_cast<std::uint32_t>(std::size(descriptorSets)) * 4;

      // The pool must provide one uniform-buffer descriptor for every set it
      // may allocate. The original advertised descriptorCount = 1, which
      // makes every allocation after the first fail with
      // VK_ERROR_OUT_OF_POOL_MEMORY on conforming implementations.
      VkDescriptorPoolSize poolSizes[]{{
          .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
          .descriptorCount = maxSets,
      }};

      VkDescriptorPoolCreateInfo info{
          .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
          .maxSets = maxSets,
          .poolSizeCount = static_cast<uint32_t>(std::size(poolSizes)),
          .pPoolSizes = poolSizes,
      };

      VK_VERIFY(vkCreateDescriptorPool(
          vk::context->device, &info, vk::context->allocator, &descriptorPool));
    }

    VkDescriptorSetAllocateInfo info{
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
        .descriptorPool = descriptorPool,
        .descriptorSetCount = 1,
        .pSetLayouts = &descriptorSetLayout.layout,
    };
    for (std::size_t i = 0; i < std::size(descriptorSets); ++i) {
      VK_VERIFY(vkAllocateDescriptorSets(vk::context->device, &info,
                                         descriptorSets + i));
    }
  }

  ~Impl() {
    vkDestroyDescriptorPool(vk::context->device, descriptorPool,
                            vk::context->allocator);
    vkDestroyPipelineLayout(vk::context->device, pipelineLayout,
                            vk::context->allocator);
  }

  // Reserves a free descriptor slot index; dies when all slots are in use.
  std::uint32_t allocateDescriptorSlot() {
    std::lock_guard lock(descriptorMtx);

    // Index of the lowest clear bit. FIX: the original used std::countl_one,
    // which counts ones from the MSB and therefore always returned 0 for this
    // LSB-packed mask — every caller got slot 0 and the slots were never
    // actually serialized.
    auto result = std::countr_one(inUseDescriptorSets);
    rx::dieIf(static_cast<std::size_t>(result) >= std::size(descriptorSets),
              "out of tiler descriptor sets");
    inUseDescriptorSets |= (1u << result);

    return result;
  }

  // Returns a slot obtained from allocateDescriptorSlot to the free pool.
  void releaseDescriptorSlot(std::uint32_t slot) {
    std::lock_guard lock(descriptorMtx);
    inUseDescriptorSets &= ~(1u << slot);
  }
};
|
||||
|
||||
// Construct the tiler; every Vulkan object lives behind the pimpl.
amdgpu::GpuTiler::GpuTiler() : mImpl(std::make_unique<Impl>()) {}

// Defined here (not in the header) so Impl is a complete type when the
// unique_ptr deleter is instantiated.
amdgpu::GpuTiler::~GpuTiler() = default;
|
||||
|
||||
// Records a compute dispatch that converts one mip level of a tiled surface
// (arrayCount layers starting at baseArray) into linear layout.
// srcTiledAddress / dstLinearAddress are GPU addresses consumed by the
// detiler shader; the descriptor slot is released after submission.
void amdgpu::GpuTiler::detile(Scheduler &scheduler,
                              const amdgpu::SurfaceInfo &info,
                              amdgpu::TileMode tileMode,
                              std::uint64_t srcTiledAddress,
                              std::uint64_t dstLinearAddress, int mipLevel,
                              int baseArray, int arrayCount) {
  auto commandBuffer = scheduler.getCommandBuffer();
  // One of 4 uniform-buffer slots; serializes concurrent dispatches.
  auto slot = mImpl->allocateDescriptorSlot();

  // Write this dispatch's parameters into the slot's region of the
  // host-visible config buffer.
  auto configOffset = slot * sizeof(Impl::Config);
  auto config = reinterpret_cast<Impl::Config *>(mImpl->configData.getData() +
                                                 configOffset);

  auto &subresource = info.getSubresourceInfo(mipLevel);
  // Source: tiled data at this mip's offset, stepped by whole tiled slices
  // to the first requested array layer.
  config->srcAddress = srcTiledAddress + subresource.offset +
                       (subresource.tiledSize * baseArray);
  // NOTE(review): unlike srcAddress, no subresource offset is added here —
  // presumably dstLinearAddress already points at this mip level; confirm
  // against callers.
  config->dstAddress = dstLinearAddress + (subresource.linearSize * baseArray);
  config->dataWidth = subresource.dataWidth;
  config->dataHeight = subresource.dataHeight;
  config->tileMode = tileMode.raw;
  config->numFragments = info.numFragments;
  config->bitsPerElement = info.bitsPerElement;
  uint32_t groupCountZ = subresource.dataDepth;

  if (arrayCount > 1) {
    // Multi-layer: Z dimension iterates layers and the shader advances by
    // whole per-layer surface sizes.
    config->tiledSurfaceSize = subresource.tiledSize;
    config->linearSurfaceSize = subresource.linearSize;
    groupCountZ = arrayCount;
  } else {
    // Single layer: zero sizes signal the shader to use Z as depth instead.
    config->tiledSurfaceSize = 0;
    config->linearSurfaceSize = 0;
  }

  VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};

  // Select the detiler variant matching the surface's array mode.
  switch (tileMode.arrayMode()) {
  case amdgpu::kArrayModeLinearGeneral:
  case amdgpu::kArrayModeLinearAligned:
    vk::CmdBindShadersEXT(commandBuffer, 1, stages,
                          &mImpl->detilerLinear.shader);
    break;

  case amdgpu::kArrayMode1dTiledThin:
  case amdgpu::kArrayMode1dTiledThick:
    vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler1d.shader);
    break;

  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayModeTiledThinPrt:
  case amdgpu::kArrayMode2dTiledThinPrt:
  case amdgpu::kArrayMode2dTiledThick:
  case amdgpu::kArrayMode2dTiledXThick:
  case amdgpu::kArrayModeTiledThickPrt:
  case amdgpu::kArrayMode2dTiledThickPrt:
  case amdgpu::kArrayMode3dTiledThinPrt:
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayMode3dTiledXThick:
  case amdgpu::kArrayMode3dTiledThickPrt:
    // 2D/3D macro-tiled detiling is not implemented yet: abort on purpose.
    // The bind below is unreachable until the abort is removed.
    std::abort();
    vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler2d.shader);
    break;
  }

  // Point the slot's descriptor set at this dispatch's Config region.
  VkDescriptorBufferInfo bufferInfo{
      .buffer = mImpl->configData.getHandle(),
      .offset = configOffset,
      .range = sizeof(Impl::Config),
  };

  VkWriteDescriptorSet writeDescSet{
      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
      .dstSet = mImpl->descriptorSets[slot],
      .dstBinding = 0,
      .descriptorCount = 1,
      .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
      .pBufferInfo = &bufferInfo,
  };

  vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);

  vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
                          mImpl->pipelineLayout, 0, 1,
                          &mImpl->descriptorSets[slot], 0, nullptr);

  // One workgroup per texel column/row; Z covers depth or array layers.
  vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight,
                groupCountZ);

  // Recycle the slot only once the GPU work has been handed off.
  scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); });
}
|
||||
|
||||
// Records a compute dispatch that converts one mip level of a linear surface
// (arrayCount layers starting at baseArray) into tiled layout — the inverse
// of detile(). The descriptor slot is released after submission.
void amdgpu::GpuTiler::tile(Scheduler &scheduler,
                            const amdgpu::SurfaceInfo &info,
                            amdgpu::TileMode tileMode,
                            std::uint64_t srcLinearAddress,
                            std::uint64_t dstTiledAddress, int mipLevel,
                            int baseArray, int arrayCount) {
  auto commandBuffer = scheduler.getCommandBuffer();
  // One of 4 uniform-buffer slots; serializes concurrent dispatches.
  auto slot = mImpl->allocateDescriptorSlot();

  // Write this dispatch's parameters into the slot's region of the
  // host-visible config buffer.
  auto configOffset = slot * sizeof(Impl::Config);
  auto config = reinterpret_cast<Impl::Config *>(mImpl->configData.getData() +
                                                 configOffset);

  auto &subresource = info.getSubresourceInfo(mipLevel);
  // NOTE(review): the linear source adds subresource.offset while detile()
  // adds it to the tiled side, and the tiled destination below gets neither
  // an offset nor a per-layer step — verify these conventions against the
  // shaders and callers.
  config->srcAddress = srcLinearAddress + subresource.offset +
                       subresource.linearSize * baseArray;
  config->dstAddress = dstTiledAddress;
  config->dataWidth = subresource.dataWidth;
  config->dataHeight = subresource.dataHeight;
  config->tileMode = tileMode.raw;
  config->numFragments = info.numFragments;
  config->bitsPerElement = info.bitsPerElement;
  uint32_t groupCountZ = subresource.dataDepth;

  if (arrayCount > 1) {
    // Multi-layer: Z dimension iterates layers and the shader advances by
    // whole per-layer surface sizes.
    config->tiledSurfaceSize = subresource.tiledSize;
    config->linearSurfaceSize = subresource.linearSize;
    groupCountZ = arrayCount;
  } else {
    // Single layer: zero sizes signal the shader to use Z as depth instead.
    config->tiledSurfaceSize = 0;
    config->linearSurfaceSize = 0;
  }

  VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};

  // Select the tiler variant matching the surface's array mode.
  switch (tileMode.arrayMode()) {
  case amdgpu::kArrayModeLinearGeneral:
  case amdgpu::kArrayModeLinearAligned:
    vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tilerLinear.shader);
    break;

  case amdgpu::kArrayMode1dTiledThin:
  case amdgpu::kArrayMode1dTiledThick:
    vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler1d.shader);
    break;

  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayModeTiledThinPrt:
  case amdgpu::kArrayMode2dTiledThinPrt:
  case amdgpu::kArrayMode2dTiledThick:
  case amdgpu::kArrayMode2dTiledXThick:
  case amdgpu::kArrayModeTiledThickPrt:
  case amdgpu::kArrayMode2dTiledThickPrt:
  case amdgpu::kArrayMode3dTiledThinPrt:
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayMode3dTiledXThick:
  case amdgpu::kArrayMode3dTiledThickPrt:
    // 2D/3D macro-tiled tiling is not implemented yet: abort on purpose.
    // The bind below is unreachable until the abort is removed.
    std::abort();
    vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler2d.shader);
    break;
  }

  // Point the slot's descriptor set at this dispatch's Config region.
  VkDescriptorBufferInfo bufferInfo{
      .buffer = mImpl->configData.getHandle(),
      .offset = configOffset,
      .range = sizeof(Impl::Config),
  };

  VkWriteDescriptorSet writeDescSet{
      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
      .dstSet = mImpl->descriptorSets[slot],
      .dstBinding = 0,
      .descriptorCount = 1,
      .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
      .pBufferInfo = &bufferInfo,
  };

  vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);

  vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
                          mImpl->pipelineLayout, 0, 1,
                          &mImpl->descriptorSets[slot], 0, nullptr);

  // One workgroup per texel column/row; Z covers depth or array layers.
  vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight,
                groupCountZ);

  // Recycle the slot only once the GPU work has been handed off.
  scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); });
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue