gpu rewrite initial commit

DH 2024-09-25 16:00:55 +03:00
parent 0d4ed51cd9
commit 4cf808facd
133 changed files with 35491 additions and 4 deletions


@@ -0,0 +1,387 @@
#include "gnm/constants.hpp"
#include <amdgpu/tiler.hpp>
#include <gnm/gnm.hpp>
#include <bit>
using namespace amdgpu;
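// Computes the memory layout of a 1D (micro) tiled surface: each mip level is
// padded to whole 8x8 micro tiles, the slice size is grown until the physical
// slice is a multiple of kPipeInterleaveBytes, and the running byte offset of
// every subresource is recorded in the returned SurfaceInfo.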
static constexpr SurfaceInfo
computeTexture1dInfo(ArrayMode arrayMode, gnm::TextureType type,
gnm::DataFormat dfmt, std::uint32_t width,
std::uint32_t height, std::uint32_t depth,
std::uint32_t pitch, int baseArrayLayer, int arrayCount,
int baseMipLevel, int mipCount, bool pow2pad) {
bool isCubemap = type == gnm::TextureType::Cube;
bool isVolume = type == gnm::TextureType::Dim3D;
auto bitsPerFragment = getBitsPerElement(dfmt);
std::uint32_t arraySliceCount = depth;
if (isCubemap) {
arraySliceCount *= 6;
} else if (isVolume) {
arraySliceCount = 1;
}
int numFragments = (type == gnm::TextureType::Msaa2D ||
type == gnm::TextureType::MsaaArray2D)
? (baseArrayLayer + arrayCount - 1)
: 0;
auto numFragmentsPerPixel = 1 << numFragments;
auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
auto bitsPerElement = bitsPerFragment;
depth = isVolume ? depth : 1;
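// Block-compressed formats address whole blocks rather than texels: 1-bit
// formats pack 8 texels per element, 4- and 8-bit BC formats pack a 4x4 block,
// so the element size grows while the element dimensions shrink.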
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
bitsPerElement *= 8;
break;
case 4:
case 8:
bitsPerElement *= 16;
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (pow2pad) {
arraySliceCount = std::bit_ceil(arraySliceCount);
}
std::uint64_t surfaceOffset = 0;
std::uint64_t surfaceSize = 0;
SurfaceInfo result;
result.width = width;
result.height = height;
result.depth = depth;
result.pitch = pitch;
result.numFragments = numFragments;
result.bitsPerElement = bitsPerElement;
result.arrayLayerCount = arraySliceCount;
auto thickness = getMicroTileThickness(arrayMode);
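// Walk every mip level up to baseMipLevel + mipCount, accumulating the byte
// offset of each one; all array slices of a mip are stored back to back before
// the next mip begins.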
for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) {
std::uint32_t elemWidth = std::max<std::uint64_t>(width >> mipLevel, 1);
std::uint32_t elemPitch = std::max<std::uint64_t>(pitch >> mipLevel, 1);
std::uint32_t elemHeight = std::max<std::uint64_t>(height >> mipLevel, 1);
std::uint32_t elemDepth = std::max<std::uint64_t>(depth >> mipLevel, 1);
std::uint32_t linearPitch = elemPitch;
std::uint32_t linearWidth = elemWidth;
std::uint32_t linearHeight = elemHeight;
std::uint32_t linearDepth = elemDepth;
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
linearWidth = std::max<std::uint64_t>((linearWidth + 7) / 8, 1);
linearPitch = std::max<std::uint64_t>((linearPitch + 7) / 8, 1);
break;
case 4:
case 8:
linearWidth = std::max<std::uint64_t>((linearWidth + 3) / 4, 1);
linearPitch = std::max<std::uint64_t>((linearPitch + 3) / 4, 1);
linearHeight = std::max<std::uint64_t>((linearHeight + 3) / 4, 1);
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (pow2pad) {
linearPitch = std::bit_ceil(linearPitch);
linearWidth = std::bit_ceil(linearWidth);
linearHeight = std::bit_ceil(linearHeight);
linearDepth = std::bit_ceil(linearDepth);
}
if (mipLevel > 0 && pitch > 0) {
linearPitch = linearWidth;
}
std::uint32_t paddedPitch =
(linearPitch + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1);
std::uint32_t paddedHeight =
(linearHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1);
std::uint32_t paddedDepth = linearDepth;
if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) {
if (isCubemap) {
linearDepth = std::bit_ceil(linearDepth);
}
paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1);
}
std::uint32_t tempPitch = paddedPitch;
std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) *
paddedHeight * bitsPerElement *
numFragmentsPerPixel;
logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) {
tempPitch += kMicroTileWidth;
logicalSliceSizeBytes = std::uint64_t(tempPitch) * paddedHeight *
bitsPerElement * numFragmentsPerPixel;
logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
}
surfaceSize = logicalSliceSizeBytes * paddedDepth;
auto linearSize =
linearDepth *
(linearPitch * linearHeight * bitsPerElement * numFragmentsPerPixel +
7) /
8;
result.setSubresourceInfo(mipLevel, {
.dataWidth = linearPitch,
.dataHeight = linearHeight,
.dataDepth = linearDepth,
.offset = surfaceOffset,
.tiledSize = surfaceSize,
.linearSize = linearSize,
});
surfaceOffset += arraySliceCount * surfaceSize;
}
result.totalSize = surfaceOffset;
return result;
}
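// Computes the layout of linearly stored surfaces. LinearGeneral keeps the
// caller-supplied pitch as-is, while LinearAligned pads the pitch and the
// slice size to the alignment derived from the element size and
// kPipeInterleaveBytes.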
static constexpr SurfaceInfo computeTextureLinearInfo(
ArrayMode arrayMode, gnm::TextureType type, gnm::DataFormat dfmt,
std::uint32_t width, std::uint32_t height, std::uint32_t depth,
std::uint32_t pitch, int baseArrayLayer, int arrayCount, int baseMipLevel,
int mipCount, bool pow2pad) {
bool isCubemap = type == gnm::TextureType::Cube;
bool isVolume = type == gnm::TextureType::Dim3D;
auto bitsPerFragment = getBitsPerElement(dfmt);
std::uint32_t arraySliceCount = depth;
if (isCubemap) {
arraySliceCount *= 6;
} else if (isVolume) {
arraySliceCount = 1;
}
int numFragments = (type == gnm::TextureType::Msaa2D ||
type == gnm::TextureType::MsaaArray2D)
? (baseArrayLayer + arrayCount - 1)
: 0;
auto numFragmentsPerPixel = 1 << numFragments;
auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
auto bitsPerElement = bitsPerFragment;
depth = isVolume ? depth : 1;
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
bitsPerElement *= 8;
break;
case 4:
case 8:
bitsPerElement *= 16;
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (pow2pad) {
arraySliceCount = std::bit_ceil(arraySliceCount);
}
std::uint64_t surfaceOffset = 0;
std::uint64_t surfaceSize = 0;
SurfaceInfo result;
result.width = width;
result.height = height;
result.depth = depth;
result.pitch = pitch;
result.numFragments = numFragments;
result.bitsPerElement = bitsPerElement;
result.arrayLayerCount = arraySliceCount;
for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) {
std::uint32_t elemWidth = std::max<std::uint64_t>(width >> mipLevel, 1);
std::uint32_t elemPitch = std::max<std::uint64_t>(pitch >> mipLevel, 1);
std::uint32_t elemHeight = std::max<std::uint64_t>(height >> mipLevel, 1);
std::uint32_t elemDepth = std::max<std::uint64_t>(depth >> mipLevel, 1);
std::uint32_t linearPitch = elemPitch;
std::uint32_t linearWidth = elemWidth;
std::uint32_t linearHeight = elemHeight;
std::uint32_t linearDepth = elemDepth;
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
linearWidth = std::max<std::uint64_t>((linearWidth + 7) / 8, 1);
linearPitch = std::max<std::uint64_t>((linearPitch + 7) / 8, 1);
break;
case 4:
case 8:
linearWidth = std::max<std::uint64_t>((linearWidth + 3) / 4, 1);
linearPitch = std::max<std::uint64_t>((linearPitch + 3) / 4, 1);
linearHeight = std::max<std::uint64_t>((linearHeight + 3) / 4, 1);
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (pow2pad) {
linearPitch = std::bit_ceil(linearPitch);
linearWidth = std::bit_ceil(linearWidth);
linearHeight = std::bit_ceil(linearHeight);
linearDepth = std::bit_ceil(linearDepth);
}
if (mipLevel > 0 && pitch > 0) {
linearPitch = linearWidth;
}
if (arrayMode == kArrayModeLinearGeneral) {
surfaceSize = (static_cast<uint64_t>(linearPitch) * linearHeight *
bitsPerElement * numFragmentsPerPixel + 7) / 8;
surfaceSize *= linearDepth;
result.setSubresourceInfo(mipLevel, {
.dataWidth = linearPitch,
.dataHeight = linearHeight,
.dataDepth = linearDepth,
.offset = surfaceOffset,
.tiledSize = surfaceSize,
.linearSize = surfaceSize,
});
} else {
if (mipLevel > 0 && pitch > 0) {
linearPitch = linearWidth;
}
auto pitchAlign = std::max(8UL, 64UL / ((bitsPerElement + 7) / 8UL));
std::uint32_t paddedPitch =
(linearPitch + pitchAlign - 1) & ~(pitchAlign - 1);
std::uint32_t paddedHeight = linearHeight;
std::uint32_t paddedDepth = linearDepth;
if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) {
if (isCubemap) {
linearDepth = std::bit_ceil(linearDepth);
}
auto thickness = getMicroTileThickness(arrayMode);
paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1);
}
std::uint32_t pixelsPerPipeInterleave =
kPipeInterleaveBytes / ((bitsPerElement + 7) / 8);
std::uint32_t sliceAlignInPixel =
pixelsPerPipeInterleave < 64 ? 64 : pixelsPerPipeInterleave;
auto pixelsPerSlice = static_cast<uint64_t>(paddedPitch) * paddedHeight *
numFragmentsPerPixel;
while (pixelsPerSlice % sliceAlignInPixel) {
paddedPitch += pitchAlign;
pixelsPerSlice = static_cast<uint64_t>(paddedPitch) * paddedHeight *
numFragmentsPerPixel;
}
surfaceSize = (pixelsPerSlice * bitsPerElement + 7) / 8 * paddedDepth;
result.setSubresourceInfo(mipLevel, {
.dataWidth = paddedPitch,
.dataHeight = paddedHeight,
.dataDepth = paddedDepth,
.offset = surfaceOffset,
.tiledSize = surfaceSize,
.linearSize = surfaceSize,
});
}
surfaceOffset += arraySliceCount * surfaceSize;
}
result.totalSize = surfaceOffset;
return result;
}
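// Picks the layout routine matching the tile mode's array mode. Macro (2D/3D)
// tiled layouts are not implemented yet and abort.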
SurfaceInfo amdgpu::computeSurfaceInfo(
TileMode tileMode, gnm::TextureType type, gnm::DataFormat dfmt,
std::uint32_t width, std::uint32_t height, std::uint32_t depth,
std::uint32_t pitch, int baseArrayLayer, int arrayCount, int baseMipLevel,
int mipCount, bool pow2pad) {
switch (tileMode.arrayMode()) {
case kArrayModeLinearGeneral:
case kArrayModeLinearAligned:
return computeTextureLinearInfo(
tileMode.arrayMode(), type, dfmt, width, height, depth, pitch,
baseArrayLayer, arrayCount, baseMipLevel, mipCount, pow2pad);
case kArrayMode1dTiledThin:
case kArrayMode1dTiledThick:
return computeTexture1dInfo(tileMode.arrayMode(), type, dfmt, width, height,
depth, pitch, baseArrayLayer, arrayCount,
baseMipLevel, mipCount, pow2pad);
case kArrayMode2dTiledThin:
case kArrayMode2dTiledThick:
case kArrayMode2dTiledXThick:
case kArrayMode3dTiledThin:
case kArrayMode3dTiledThick:
case kArrayMode3dTiledXThick:
case kArrayModeTiledThinPrt:
case kArrayModeTiledThickPrt:
case kArrayMode2dTiledThinPrt:
case kArrayMode2dTiledThickPrt:
case kArrayMode3dTiledThinPrt:
case kArrayMode3dTiledThickPrt:
std::abort();
}
std::abort();
}
SurfaceInfo amdgpu::computeSurfaceInfo(const gnm::TBuffer &tbuffer,
TileMode tileMode) {
return computeSurfaceInfo(
tileMode, tbuffer.type, tbuffer.dfmt, tbuffer.width + 1,
tbuffer.height + 1, tbuffer.depth + 1, tbuffer.pitch + 1,
tbuffer.base_array, tbuffer.last_array - tbuffer.base_array + 1,
tbuffer.base_level, tbuffer.last_level - tbuffer.base_level + 1,
tbuffer.pow2pad != 0);
}


@@ -0,0 +1,441 @@
#include "amdgpu/tiler_cpu.hpp"
#include "amdgpu/tiler.hpp"
#include "gnm/gnm.hpp"
constexpr std::uint64_t
getTiledOffset1D(gnm::TextureType texType, bool isPow2Padded,
gnm::DataFormat dfmt, amdgpu::TileMode tileMode, int mipLevel,
int arraySlice, int numFragments, int width, int height,
int depth, int pitch, int x, int y, int z) {
using namespace amdgpu;
bool isCubemap = texType == gnm::TextureType::Cube;
bool isVolume = texType == gnm::TextureType::Dim3D;
auto bitsPerFragment = getBitsPerElement(dfmt);
uint32_t arraySliceCount = depth;
if (isCubemap) {
arraySliceCount *= 6;
} else if (isVolume) {
arraySliceCount = 1;
}
auto numFragmentsPerPixel = 1 << numFragments;
auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
auto arrayMode = tileMode.arrayMode();
auto bitsPerElement = bitsPerFragment;
auto paddedWidth = std::max((mipLevel != 0 ? pitch : width) >> mipLevel, 1);
auto paddedHeight = std::max(height >> mipLevel, 1);
auto tileThickness = (arrayMode == amdgpu::kArrayMode1dTiledThick) ? 4 : 1;
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
bitsPerElement *= 8;
paddedWidth = std::max((paddedWidth + 7) / 8, 1);
break;
case 4:
case 8:
bitsPerElement *= 16;
paddedWidth = std::max((paddedWidth + 3) / 4, 1);
paddedHeight = std::max((paddedHeight + 3) / 4, 1);
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (isPow2Padded) {
arraySliceCount = std::bit_ceil(arraySliceCount);
paddedWidth = std::bit_ceil(unsigned(paddedWidth));
paddedHeight = std::bit_ceil(unsigned(paddedHeight));
}
uint64_t finalSurfaceOffset = 0;
uint64_t finalSurfaceSize = 0;
auto thickness = getMicroTileThickness(arrayMode);
for (int i = 0; i <= mipLevel; i++) {
finalSurfaceOffset += arraySliceCount * finalSurfaceSize;
std::uint32_t elemWidth =
std::max<std::uint64_t>((i > 0 ? pitch : width) >> i, 1);
std::uint32_t elemHeight = std::max<std::uint64_t>(height >> i, 1);
std::uint32_t elemDepth =
std::max<std::uint64_t>((isVolume ? depth : 1) >> i, 1);
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
elemWidth = std::max<std::uint64_t>((elemWidth + 7) / 8, 1);
break;
case 4:
case 8:
elemWidth = std::max<std::uint64_t>((elemWidth + 3) / 4, 1);
elemHeight = std::max<std::uint64_t>((elemHeight + 3) / 4, 1);
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (isPow2Padded) {
elemWidth = std::bit_ceil(elemWidth);
elemHeight = std::bit_ceil(elemHeight);
elemDepth = std::bit_ceil(elemDepth);
}
elemWidth = (elemWidth + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1);
elemHeight = (elemHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1);
elemDepth = (elemDepth + thickness - 1) & ~(thickness - 1);
std::uint32_t tempPitch = elemWidth;
std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) *
elemHeight * bitsPerElement *
numFragmentsPerPixel;
logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) {
tempPitch += 8;
logicalSliceSizeBytes = std::uint64_t(tempPitch) * elemHeight *
bitsPerElement * numFragmentsPerPixel;
logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
}
finalSurfaceSize = logicalSliceSizeBytes * elemDepth;
}
finalSurfaceOffset += finalSurfaceSize * (uint64_t)arraySlice;
auto tileBytes =
(kMicroTileWidth * kMicroTileHeight * tileThickness * bitsPerElement +
7) /
8;
auto tilesPerRow = paddedWidth / kMicroTileWidth;
auto tilesPerSlice =
std::max(tilesPerRow * (paddedHeight / kMicroTileHeight), 1U);
uint64_t elementIndex = getElementIndex(x, y, z, bitsPerElement,
tileMode.microTileMode(), arrayMode);
uint64_t sliceOffset = (z / tileThickness) * tilesPerSlice * tileBytes;
uint64_t tileRowIndex = y / kMicroTileHeight;
uint64_t tileColumnIndex = x / kMicroTileWidth;
uint64_t tileOffset =
(tileRowIndex * tilesPerRow + tileColumnIndex) * tileBytes;
uint64_t elementOffset = elementIndex * bitsPerElement;
uint64_t finalOffset = (sliceOffset + tileOffset) * 8 + elementOffset;
return finalOffset + finalSurfaceOffset * 8;
}
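// Bit offset of texel (x, y, z) in a linearly stored surface: plain row-major
// addressing with the pitch as the row stride.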
constexpr std::uint64_t getTiledOffsetLinear(gnm::DataFormat dfmt, int height,
int pitch, int x, int y, int z) {
auto bitsPerFragment = getBitsPerElement(dfmt);
auto bitsPerElement = bitsPerFragment;
auto paddedHeight = height;
auto paddedWidth = pitch;
if (bitsPerFragment == 1) {
bitsPerElement *= 8;
paddedWidth = std::max((paddedWidth + 7) / 8, 1);
}
uint64_t tiledRowSizeBits = bitsPerElement * paddedWidth;
uint64_t tiledSliceBits = paddedWidth * paddedHeight * bitsPerElement;
return tiledSliceBits * z + tiledRowSizeBits * y + bitsPerElement * x;
}
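// Bit offset of texel (x, y, z) in a macro (2D/3D) tiled surface. The element
// index inside the micro tile, the pipe and bank indices derived from the
// macro tile mode, tile-split and slice rotations are combined, then the
// address is interleaved at kPipeInterleaveBytes granularity.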
constexpr std::uint64_t
getTiledOffset2D(gnm::TextureType texType, bool isPow2Padded,
gnm::DataFormat dfmt, amdgpu::TileMode tileMode,
amdgpu::MacroTileMode macroTileMode, int mipLevel,
int arraySlice, int numFragments, int width, int height,
int depth, int pitch, int x, int y, int z, int fragmentIndex) {
using namespace amdgpu;
bool isCubemap = texType == gnm::TextureType::Cube;
bool isVolume = texType == gnm::TextureType::Dim3D;
auto m_bitsPerFragment = getBitsPerElement(dfmt);
auto m_isBlockCompressed = getTexelsPerElement(dfmt) > 1;
auto tileSwizzleMask = 0;
auto numFragmentsPerPixel = 1 << numFragments;
auto arrayMode = tileMode.arrayMode();
auto tileThickness = 1;
switch (arrayMode) {
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayModeTiledThinPrt:
case amdgpu::kArrayMode2dTiledThinPrt:
case amdgpu::kArrayMode3dTiledThinPrt:
tileThickness = 1;
break;
case amdgpu::kArrayMode1dTiledThick:
case amdgpu::kArrayMode2dTiledThick:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayModeTiledThickPrt:
case amdgpu::kArrayMode2dTiledThickPrt:
case amdgpu::kArrayMode3dTiledThickPrt:
tileThickness = 4;
break;
case amdgpu::kArrayMode2dTiledXThick:
case amdgpu::kArrayMode3dTiledXThick:
tileThickness = 8;
break;
default:
break;
}
auto bitsPerElement = m_bitsPerFragment;
auto paddedWidth = pitch;
auto paddedHeight = height;
if (m_isBlockCompressed) {
switch (m_bitsPerFragment) {
case 1:
bitsPerElement *= 8;
paddedWidth = std::max((paddedWidth + 7) / 8, 1);
break;
case 4:
case 8:
bitsPerElement *= 16;
paddedWidth = std::max((paddedWidth + 3) / 4, 1);
paddedHeight = std::max((paddedHeight + 3) / 4, 1);
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
auto bankWidthHW = macroTileMode.bankWidth();
auto bankHeightHW = macroTileMode.bankHeight();
auto macroAspectHW = macroTileMode.macroTileAspect();
auto numBanksHW = macroTileMode.numBanks();
auto bankWidth = 1 << bankWidthHW;
auto bankHeight = 1 << bankHeightHW;
unsigned numBanks = 2 << numBanksHW;
auto macroTileAspect = 1 << macroAspectHW;
uint32_t tileBytes1x =
(tileThickness * bitsPerElement * kMicroTileWidth * kMicroTileHeight +
7) /
8;
auto sampleSplitHw = tileMode.sampleSplit();
auto tileSplitHw = tileMode.tileSplit();
uint32_t sampleSplit = 1 << sampleSplitHw;
uint32_t tileSplitC =
(tileMode.microTileMode() == amdgpu::kMicroTileModeDepth)
? (64 << tileSplitHw)
: std::max(256U, tileBytes1x * sampleSplit);
auto tileSplitBytes = std::min(kDramRowSize, tileSplitC);
auto numPipes = getPipeCount(tileMode.pipeConfig());
auto pipeInterleaveBits = std::countr_zero(kPipeInterleaveBytes);
auto pipeInterleaveMask = (1 << pipeInterleaveBits) - 1;
auto pipeBits = std::countr_zero(numPipes);
auto bankBits = std::countr_zero(numBanks);
// auto pipeMask = (numPipes - 1) << pipeInterleaveBits;
auto bankSwizzleMask = tileSwizzleMask;
auto pipeSwizzleMask = 0;
auto macroTileWidth =
(kMicroTileWidth * bankWidth * numPipes) * macroTileAspect;
auto macroTileHeight =
(kMicroTileHeight * bankHeight * numBanks) / macroTileAspect;
auto microTileMode = tileMode.microTileMode();
uint64_t elementIndex =
getElementIndex(x, y, z, bitsPerElement, microTileMode, arrayMode);
uint32_t xh = x, yh = y;
if (arrayMode == amdgpu::kArrayModeTiledThinPrt ||
arrayMode == amdgpu::kArrayModeTiledThickPrt) {
xh %= macroTileWidth;
yh %= macroTileHeight;
}
uint64_t pipe = getPipeIndex(xh, yh, tileMode.pipeConfig());
uint64_t bank =
getBankIndex(xh, yh, bankWidth, bankHeight, numBanks, numPipes);
uint32_t tileBytes = (kMicroTileWidth * kMicroTileHeight * tileThickness *
bitsPerElement * numFragmentsPerPixel +
7) /
8;
uint64_t elementOffset = 0;
if (microTileMode == amdgpu::kMicroTileModeDepth) {
uint64_t pixelOffset = elementIndex * bitsPerElement * numFragmentsPerPixel;
elementOffset = pixelOffset + (fragmentIndex * bitsPerElement);
} else {
uint64_t fragmentOffset =
fragmentIndex * (tileBytes / numFragmentsPerPixel) * 8;
elementOffset = fragmentOffset + (elementIndex * bitsPerElement);
}
uint64_t slicesPerTile = 1;
uint64_t tileSplitSlice = 0;
if (tileBytes > tileSplitBytes && tileThickness == 1) {
slicesPerTile = tileBytes / tileSplitBytes;
tileSplitSlice = elementOffset / (tileSplitBytes * 8);
elementOffset %= (tileSplitBytes * 8);
tileBytes = tileSplitBytes;
}
uint64_t macroTileBytes = (macroTileWidth / kMicroTileWidth) *
(macroTileHeight / kMicroTileHeight) * tileBytes /
(numPipes * numBanks);
uint64_t macroTilesPerRow = paddedWidth / macroTileWidth;
uint64_t macroTileRowIndex = y / macroTileHeight;
uint64_t macroTileColumnIndex = x / macroTileWidth;
uint64_t macroTileIndex =
(macroTileRowIndex * macroTilesPerRow) + macroTileColumnIndex;
uint64_t macroTileOffset = macroTileIndex * macroTileBytes;
uint64_t macroTilesPerSlice =
macroTilesPerRow * (paddedHeight / macroTileHeight);
uint64_t sliceBytes = macroTilesPerSlice * macroTileBytes;
uint32_t slice = z;
uint64_t sliceOffset =
(tileSplitSlice + slicesPerTile * slice / tileThickness) * sliceBytes;
if (arraySlice != 0) {
slice = arraySlice;
}
uint64_t tileRowIndex = (y / kMicroTileHeight) % bankHeight;
uint64_t tileColumnIndex = ((x / kMicroTileWidth) / numPipes) % bankWidth;
uint64_t tileIndex = (tileRowIndex * bankWidth) + tileColumnIndex;
uint64_t tileOffset = tileIndex * tileBytes;
uint64_t bankSwizzle = bankSwizzleMask;
uint64_t pipeSwizzle = pipeSwizzleMask;
uint64_t pipeSliceRotation = 0;
switch (arrayMode) {
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
pipeSliceRotation =
std::max(1UL, (numPipes / 2UL) - 1UL) * (slice / tileThickness);
break;
default:
break;
}
pipeSwizzle += pipeSliceRotation;
pipeSwizzle &= (numPipes - 1);
pipe = pipe ^ pipeSwizzle;
uint32_t sliceRotation = 0;
switch (arrayMode) {
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayMode2dTiledThick:
case amdgpu::kArrayMode2dTiledXThick:
sliceRotation = ((numBanks / 2) - 1) * (slice / tileThickness);
break;
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
sliceRotation = std::max(1UL, (numPipes / 2UL) - 1UL) *
(slice / tileThickness) / numPipes;
break;
default:
break;
}
uint64_t tileSplitSliceRotation = 0;
switch (arrayMode) {
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode2dTiledThinPrt:
case amdgpu::kArrayMode3dTiledThinPrt:
tileSplitSliceRotation = ((numBanks / 2) + 1) * tileSplitSlice;
break;
default:
break;
}
bank ^= bankSwizzle + sliceRotation;
bank ^= tileSplitSliceRotation;
bank &= (numBanks - 1);
uint64_t totalOffset =
(sliceOffset + macroTileOffset + tileOffset) * 8 + elementOffset;
uint64_t bitOffset = totalOffset & 0x7;
totalOffset /= 8;
uint64_t pipeInterleaveOffset = totalOffset & pipeInterleaveMask;
uint64_t offset = totalOffset >> pipeInterleaveBits;
uint64_t finalByteOffset =
pipeInterleaveOffset | (pipe << (pipeInterleaveBits)) |
(bank << (pipeInterleaveBits + pipeBits)) |
(offset << (pipeInterleaveBits + pipeBits + bankBits));
return (finalByteOffset << 3) | bitOffset;
}
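// CPU-side address computation entry point: selects the linear, 1D or 2D path
// from the tile mode's array mode.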
std::uint64_t amdgpu::getTiledOffset(gnm::TextureType texType,
bool isPow2Padded, int numFragments,
gnm::DataFormat dfmt,
amdgpu::TileMode tileMode,
amdgpu::MacroTileMode macroTileMode,
int mipLevel, int arraySlice, int width,
int height, int depth, int pitch, int x,
int y, int z, int fragmentIndex) {
switch (tileMode.arrayMode()) {
case amdgpu::kArrayModeLinearGeneral:
case amdgpu::kArrayModeLinearAligned:
return getTiledOffsetLinear(dfmt, height, pitch, x, y, z);
case amdgpu::kArrayMode1dTiledThin:
case amdgpu::kArrayMode1dTiledThick: {
return getTiledOffset1D(texType, isPow2Padded, dfmt, tileMode, mipLevel,
arraySlice, numFragments, width, height, depth,
pitch, x, y, z);
}
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayMode2dTiledThick:
case amdgpu::kArrayMode2dTiledXThick:
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayModeTiledThinPrt:
case amdgpu::kArrayModeTiledThickPrt:
case amdgpu::kArrayMode2dTiledThinPrt:
case amdgpu::kArrayMode2dTiledThickPrt:
case amdgpu::kArrayMode3dTiledThinPrt:
case amdgpu::kArrayMode3dTiledThickPrt:
return getTiledOffset2D(texType, isPow2Padded, dfmt, tileMode,
macroTileMode, mipLevel, arraySlice, numFragments,
width, height, depth, pitch, x, y, z,
fragmentIndex);
}
std::abort();
}


@@ -0,0 +1,354 @@
#include "amdgpu/tiler_vulkan.hpp"
#include "Scheduler.hpp"
#include "amdgpu/tiler.hpp"
#include <bit>
#include <cstdint>
#include <cstring>
#include <memory>
#include <mutex>
#include <span>
#include <vector>
#include <vk.hpp>
#include <shaders/detiler1d.comp.h>
#include <shaders/detiler2d.comp.h>
#include <shaders/detilerLinear.comp.h>
#include <shaders/tiler1d.comp.h>
#include <shaders/tiler2d.comp.h>
#include <shaders/tilerLinear.comp.h>
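// Descriptor set layout shared by all tiler/detiler compute shaders: a single
// uniform buffer at binding 0 carrying the Config block defined below.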
struct TilerDescriptorSetLayout {
VkDescriptorSetLayout layout;
TilerDescriptorSetLayout() {
std::vector<VkDescriptorSetLayoutBinding> bindings{{
.binding = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
}};
VkDescriptorSetLayoutCreateInfo layoutInfo{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
.bindingCount = static_cast<uint32_t>(bindings.size()),
.pBindings = bindings.data(),
};
VK_VERIFY(vkCreateDescriptorSetLayout(vk::context->device, &layoutInfo,
nullptr, &layout));
}
~TilerDescriptorSetLayout() {
vkDestroyDescriptorSetLayout(vk::context->device, layout,
vk::context->allocator);
}
};
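// RAII wrapper around a VK_EXT_shader_object compute shader created from a
// SPIR-V blob.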
struct TilerShader {
VkShaderEXT shader;
TilerShader(TilerDescriptorSetLayout &setLayout,
std::span<const std::uint32_t> spirv) {
VkShaderCreateInfoEXT shaderInfo{
.sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT,
.flags = 0,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.nextStage = 0,
.codeType = VK_SHADER_CODE_TYPE_SPIRV_EXT,
.codeSize = spirv.size_bytes(),
.pCode = spirv.data(),
.pName = "main",
.setLayoutCount = 1,
.pSetLayouts = &setLayout.layout,
.pushConstantRangeCount = 0,
.pPushConstantRanges = 0,
.pSpecializationInfo = 0,
};
VK_VERIFY(vk::CreateShadersEXT(vk::context->device, 1, &shaderInfo, nullptr,
&shader));
}
~TilerShader() {
vk::DestroyShaderEXT(vk::context->device, shader, vk::context->allocator);
}
};
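// Owns the Vulkan objects shared by all tiling operations: one compute shader
// per array-mode family, a pipeline layout, a host-visible buffer of Config
// entries, and a small pool of descriptor sets handed out through a bitmask.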
struct amdgpu::GpuTiler::Impl {
TilerDescriptorSetLayout descriptorSetLayout;
std::mutex descriptorMtx;
VkDescriptorSet descriptorSets[4]{};
VkDescriptorPool descriptorPool;
std::uint32_t inUseDescriptorSets = 0;
vk::Buffer configData;
TilerShader detilerLinear{descriptorSetLayout, spirv_detilerLinear_comp};
TilerShader detiler1d{descriptorSetLayout, spirv_detiler1d_comp};
TilerShader detiler2d{descriptorSetLayout, spirv_detiler2d_comp};
TilerShader tilerLinear{descriptorSetLayout, spirv_tilerLinear_comp};
TilerShader tiler1d{descriptorSetLayout, spirv_tiler1d_comp};
TilerShader tiler2d{descriptorSetLayout, spirv_tiler2d_comp};
VkPipelineLayout pipelineLayout;
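// Parameters passed to the compute shaders; the field order is expected to
// match the uniform block declared in the tiler/detiler shaders.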
struct Config {
uint64_t srcAddress;
uint64_t dstAddress;
uint32_t dataWidth;
uint32_t dataHeight;
uint32_t tileMode;
uint32_t numFragments;
uint32_t bitsPerElement;
uint32_t tiledSurfaceSize;
uint32_t linearSurfaceSize;
};
Impl() {
std::size_t count = 256;
configData = vk::Buffer::Allocate(
vk::getHostVisibleMemory(), sizeof(Config) * count,
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
VkPipelineLayoutCreateInfo pipelineLayoutInfo{
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 1,
.pSetLayouts = &descriptorSetLayout.layout,
};
VK_VERIFY(vkCreatePipelineLayout(vk::context->device, &pipelineLayoutInfo,
nullptr, &pipelineLayout));
{
VkDescriptorPoolSize poolSizes[]{{
.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.descriptorCount = 1,
}};
VkDescriptorPoolCreateInfo info{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
.maxSets = static_cast<std::uint32_t>(std::size(descriptorSets)) * 4,
.poolSizeCount = static_cast<uint32_t>(std::size(poolSizes)),
.pPoolSizes = poolSizes,
};
VK_VERIFY(vkCreateDescriptorPool(
vk::context->device, &info, vk::context->allocator, &descriptorPool));
}
VkDescriptorSetAllocateInfo info{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
.descriptorPool = descriptorPool,
.descriptorSetCount = 1,
.pSetLayouts = &descriptorSetLayout.layout,
};
for (std::size_t i = 0; i < std::size(descriptorSets); ++i) {
VK_VERIFY(vkAllocateDescriptorSets(vk::context->device, &info,
descriptorSets + i));
}
}
~Impl() {
vkDestroyDescriptorPool(vk::context->device, descriptorPool,
vk::context->allocator);
vkDestroyPipelineLayout(vk::context->device, pipelineLayout,
vk::context->allocator);
}
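// Descriptor slots are tracked in a bitmask: a set bit marks a slot whose
// descriptor set is still referenced by work that has not been submitted yet.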
std::uint32_t allocateDescriptorSlot() {
std::lock_guard lock(descriptorMtx);
auto result = std::countr_one(inUseDescriptorSets);
rx::dieIf(result >= std::size(descriptorSets),
"out of tiler descriptor sets");
inUseDescriptorSets |= (1 << result);
return result;
}
void releaseDescriptorSlot(std::uint32_t slot) {
std::lock_guard lock(descriptorMtx);
inUseDescriptorSets &= ~(1u << slot);
}
};
amdgpu::GpuTiler::GpuTiler() { mImpl = std::make_unique<Impl>(); }
amdgpu::GpuTiler::~GpuTiler() = default;
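// Records a compute dispatch that converts one tiled mip level (and optionally
// a range of array slices) into the linear layout at dstLinearAddress. The
// dispatch is sized by the subresource's data dimensions, with Z covering
// either the depth or the array slice count.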
void amdgpu::GpuTiler::detile(Scheduler &scheduler,
const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode,
std::uint64_t srcTiledAddress,
std::uint64_t dstLinearAddress, int mipLevel,
int baseArray, int arrayCount) {
auto commandBuffer = scheduler.getCommandBuffer();
auto slot = mImpl->allocateDescriptorSlot();
auto configOffset = slot * sizeof(Impl::Config);
auto config = reinterpret_cast<Impl::Config *>(mImpl->configData.getData() +
configOffset);
auto &subresource = info.getSubresourceInfo(mipLevel);
config->srcAddress = srcTiledAddress + subresource.offset +
(subresource.tiledSize * baseArray);
config->dstAddress = dstLinearAddress + (subresource.linearSize * baseArray);
config->dataWidth = subresource.dataWidth;
config->dataHeight = subresource.dataHeight;
config->tileMode = tileMode.raw;
config->numFragments = info.numFragments;
config->bitsPerElement = info.bitsPerElement;
uint32_t groupCountZ = subresource.dataDepth;
if (arrayCount > 1) {
config->tiledSurfaceSize = subresource.tiledSize;
config->linearSurfaceSize = subresource.linearSize;
groupCountZ = arrayCount;
} else {
config->tiledSurfaceSize = 0;
config->linearSurfaceSize = 0;
}
VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
switch (tileMode.arrayMode()) {
case amdgpu::kArrayModeLinearGeneral:
case amdgpu::kArrayModeLinearAligned:
vk::CmdBindShadersEXT(commandBuffer, 1, stages,
&mImpl->detilerLinear.shader);
break;
case amdgpu::kArrayMode1dTiledThin:
case amdgpu::kArrayMode1dTiledThick:
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler1d.shader);
break;
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayModeTiledThinPrt:
case amdgpu::kArrayMode2dTiledThinPrt:
case amdgpu::kArrayMode2dTiledThick:
case amdgpu::kArrayMode2dTiledXThick:
case amdgpu::kArrayModeTiledThickPrt:
case amdgpu::kArrayMode2dTiledThickPrt:
case amdgpu::kArrayMode3dTiledThinPrt:
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayMode3dTiledThickPrt:
std::abort();
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler2d.shader);
break;
}
VkDescriptorBufferInfo bufferInfo{
.buffer = mImpl->configData.getHandle(),
.offset = configOffset,
.range = sizeof(Impl::Config),
};
VkWriteDescriptorSet writeDescSet{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = mImpl->descriptorSets[slot],
.dstBinding = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.pBufferInfo = &bufferInfo,
};
vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
mImpl->pipelineLayout, 0, 1,
&mImpl->descriptorSets[slot], 0, nullptr);
vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight,
groupCountZ);
scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); });
}
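// Reverse of detile: records a compute dispatch that converts linear data at
// srcLinearAddress into the tiled layout at dstTiledAddress for one mip level
// and an optional range of array slices.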
void amdgpu::GpuTiler::tile(Scheduler &scheduler,
const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode,
std::uint64_t srcLinearAddress,
std::uint64_t dstTiledAddress, int mipLevel,
int baseArray, int arrayCount) {
auto commandBuffer = scheduler.getCommandBuffer();
auto slot = mImpl->allocateDescriptorSlot();
auto configOffset = slot * sizeof(Impl::Config);
auto config = reinterpret_cast<Impl::Config *>(mImpl->configData.getData() +
configOffset);
auto &subresource = info.getSubresourceInfo(mipLevel);
config->srcAddress = srcLinearAddress + subresource.offset +
subresource.linearSize * baseArray;
config->dstAddress = dstTiledAddress;
config->dataWidth = subresource.dataWidth;
config->dataHeight = subresource.dataHeight;
config->tileMode = tileMode.raw;
config->numFragments = info.numFragments;
config->bitsPerElement = info.bitsPerElement;
uint32_t groupCountZ = subresource.dataDepth;
if (arrayCount > 1) {
config->tiledSurfaceSize = subresource.tiledSize;
config->linearSurfaceSize = subresource.linearSize;
groupCountZ = arrayCount;
} else {
config->tiledSurfaceSize = 0;
config->linearSurfaceSize = 0;
}
VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
switch (tileMode.arrayMode()) {
case amdgpu::kArrayModeLinearGeneral:
case amdgpu::kArrayModeLinearAligned:
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tilerLinear.shader);
break;
case amdgpu::kArrayMode1dTiledThin:
case amdgpu::kArrayMode1dTiledThick:
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler1d.shader);
break;
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayModeTiledThinPrt:
case amdgpu::kArrayMode2dTiledThinPrt:
case amdgpu::kArrayMode2dTiledThick:
case amdgpu::kArrayMode2dTiledXThick:
case amdgpu::kArrayModeTiledThickPrt:
case amdgpu::kArrayMode2dTiledThickPrt:
case amdgpu::kArrayMode3dTiledThinPrt:
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayMode3dTiledThickPrt:
std::abort();
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler2d.shader);
break;
}
VkDescriptorBufferInfo bufferInfo{
.buffer = mImpl->configData.getHandle(),
.offset = configOffset,
.range = sizeof(Impl::Config),
};
VkWriteDescriptorSet writeDescSet{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = mImpl->descriptorSets[slot],
.dstBinding = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.pBufferInfo = &bufferInfo,
};
vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
mImpl->pipelineLayout, 0, 1,
&mImpl->descriptorSets[slot], 0, nullptr);
vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight,
groupCountZ);
scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); });
}