gpu rewrite initial commit

DH 2024-09-25 16:00:55 +03:00
parent 0d4ed51cd9
commit 4cf808facd
133 changed files with 35491 additions and 4 deletions


@@ -0,0 +1,387 @@
#include "gnm/constants.hpp"
#include <amdgpu/tiler.hpp>
#include <gnm/gnm.hpp>
#include <bit>
using namespace amdgpu;
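// Computes the memory layout of a 1D (micro) tiled surface: each mip level is
// padded to whole 8x8 micro tiles, the slice size is grown until the physical
// slice is a multiple of kPipeInterleaveBytes, and the running byte offset of
// every subresource is recorded in the returned SurfaceInfo.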
static constexpr SurfaceInfo
computeTexture1dInfo(ArrayMode arrayMode, gnm::TextureType type,
gnm::DataFormat dfmt, std::uint32_t width,
std::uint32_t height, std::uint32_t depth,
std::uint32_t pitch, int baseArrayLayer, int arrayCount,
int baseMipLevel, int mipCount, bool pow2pad) {
bool isCubemap = type == gnm::TextureType::Cube;
bool isVolume = type == gnm::TextureType::Dim3D;
auto bitsPerFragment = getBitsPerElement(dfmt);
std::uint32_t arraySliceCount = depth;
if (isCubemap) {
arraySliceCount *= 6;
} else if (isVolume) {
arraySliceCount = 1;
}
int numFragments = (type == gnm::TextureType::Msaa2D ||
type == gnm::TextureType::MsaaArray2D)
? (baseArrayLayer + arrayCount - 1)
: 0;
auto numFragmentsPerPixel = 1 << numFragments;
auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
auto bitsPerElement = bitsPerFragment;
depth = isVolume ? depth : 1;
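// Block-compressed formats address whole blocks rather than texels: 1-bit
// formats pack 8 texels per element, 4- and 8-bit BC formats pack a 4x4 block,
// so the element size grows while the element dimensions shrink.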
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
bitsPerElement *= 8;
break;
case 4:
case 8:
bitsPerElement *= 16;
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (pow2pad) {
arraySliceCount = std::bit_ceil(arraySliceCount);
}
std::uint64_t surfaceOffset = 0;
std::uint64_t surfaceSize = 0;
SurfaceInfo result;
result.width = width;
result.height = height;
result.depth = depth;
result.pitch = pitch;
result.numFragments = numFragments;
result.bitsPerElement = bitsPerElement;
result.arrayLayerCount = arraySliceCount;
auto thickness = getMicroTileThickness(arrayMode);
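// Walk every mip level up to baseMipLevel + mipCount, accumulating the byte
// offset of each one; all array slices of a mip are stored back to back before
// the next mip begins.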
for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) {
std::uint32_t elemWidth = std::max<std::uint64_t>(width >> mipLevel, 1);
std::uint32_t elemPitch = std::max<std::uint64_t>(pitch >> mipLevel, 1);
std::uint32_t elemHeight = std::max<std::uint64_t>(height >> mipLevel, 1);
std::uint32_t elemDepth = std::max<std::uint64_t>(depth >> mipLevel, 1);
std::uint32_t linearPitch = elemPitch;
std::uint32_t linearWidth = elemWidth;
std::uint32_t linearHeight = elemHeight;
std::uint32_t linearDepth = elemDepth;
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
linearWidth = std::max<std::uint64_t>((linearWidth + 7) / 8, 1);
linearPitch = std::max<std::uint64_t>((linearPitch + 7) / 8, 1);
break;
case 4:
case 8:
linearWidth = std::max<std::uint64_t>((linearWidth + 3) / 4, 1);
linearPitch = std::max<std::uint64_t>((linearPitch + 3) / 4, 1);
linearHeight = std::max<std::uint64_t>((linearHeight + 3) / 4, 1);
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (pow2pad) {
linearPitch = std::bit_ceil(linearPitch);
linearWidth = std::bit_ceil(linearWidth);
linearHeight = std::bit_ceil(linearHeight);
linearDepth = std::bit_ceil(linearDepth);
}
if (mipLevel > 0 && pitch > 0) {
linearPitch = linearWidth;
}
std::uint32_t paddedPitch =
(linearPitch + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1);
std::uint32_t paddedHeight =
(linearHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1);
std::uint32_t paddedDepth = linearDepth;
if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) {
if (isCubemap) {
linearDepth = std::bit_ceil(linearDepth);
}
paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1);
}
std::uint32_t tempPitch = paddedPitch;
std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) *
paddedHeight * bitsPerElement *
numFragmentsPerPixel;
logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) {
tempPitch += kMicroTileWidth;
logicalSliceSizeBytes = std::uint64_t(tempPitch) * paddedHeight *
bitsPerElement * numFragmentsPerPixel;
logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
}
surfaceSize = logicalSliceSizeBytes * paddedDepth;
auto linearSize =
linearDepth *
(linearPitch * linearHeight * bitsPerElement * numFragmentsPerPixel +
7) /
8;
result.setSubresourceInfo(mipLevel, {
.dataWidth = linearPitch,
.dataHeight = linearHeight,
.dataDepth = linearDepth,
.offset = surfaceOffset,
.tiledSize = surfaceSize,
.linearSize = linearSize,
});
surfaceOffset += arraySliceCount * surfaceSize;
}
result.totalSize = surfaceOffset;
return result;
}
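// Computes the layout of linearly stored surfaces. LinearGeneral keeps the
// caller-supplied pitch as-is, while LinearAligned pads the pitch and the
// slice size to the alignment derived from the element size and
// kPipeInterleaveBytes.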
static constexpr SurfaceInfo computeTextureLinearInfo(
ArrayMode arrayMode, gnm::TextureType type, gnm::DataFormat dfmt,
std::uint32_t width, std::uint32_t height, std::uint32_t depth,
std::uint32_t pitch, int baseArrayLayer, int arrayCount, int baseMipLevel,
int mipCount, bool pow2pad) {
bool isCubemap = type == gnm::TextureType::Cube;
bool isVolume = type == gnm::TextureType::Dim3D;
auto bitsPerFragment = getBitsPerElement(dfmt);
std::uint32_t arraySliceCount = depth;
if (isCubemap) {
arraySliceCount *= 6;
} else if (isVolume) {
arraySliceCount = 1;
}
int numFragments = (type == gnm::TextureType::Msaa2D ||
type == gnm::TextureType::MsaaArray2D)
? (baseArrayLayer + arrayCount - 1)
: 0;
auto numFragmentsPerPixel = 1 << numFragments;
auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
auto bitsPerElement = bitsPerFragment;
depth = isVolume ? depth : 1;
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
bitsPerElement *= 8;
break;
case 4:
case 8:
bitsPerElement *= 16;
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (pow2pad) {
arraySliceCount = std::bit_ceil(arraySliceCount);
}
std::uint64_t surfaceOffset = 0;
std::uint64_t surfaceSize = 0;
SurfaceInfo result;
result.width = width;
result.height = height;
result.depth = depth;
result.pitch = pitch;
result.numFragments = numFragments;
result.bitsPerElement = bitsPerElement;
result.arrayLayerCount = arraySliceCount;
for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) {
std::uint32_t elemWidth = std::max<std::uint64_t>(width >> mipLevel, 1);
std::uint32_t elemPitch = std::max<std::uint64_t>(pitch >> mipLevel, 1);
std::uint32_t elemHeight = std::max<std::uint64_t>(height >> mipLevel, 1);
std::uint32_t elemDepth = std::max<std::uint64_t>(depth >> mipLevel, 1);
std::uint32_t linearPitch = elemPitch;
std::uint32_t linearWidth = elemWidth;
std::uint32_t linearHeight = elemHeight;
std::uint32_t linearDepth = elemDepth;
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
linearWidth = std::max<std::uint64_t>((linearWidth + 7) / 8, 1);
linearPitch = std::max<std::uint64_t>((linearPitch + 7) / 8, 1);
break;
case 4:
case 8:
linearWidth = std::max<std::uint64_t>((linearWidth + 3) / 4, 1);
linearPitch = std::max<std::uint64_t>((linearPitch + 3) / 4, 1);
linearHeight = std::max<std::uint64_t>((linearHeight + 3) / 4, 1);
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (pow2pad) {
linearPitch = std::bit_ceil(linearPitch);
linearWidth = std::bit_ceil(linearWidth);
linearHeight = std::bit_ceil(linearHeight);
linearDepth = std::bit_ceil(linearDepth);
}
if (mipLevel > 0 && pitch > 0) {
linearPitch = linearWidth;
}
if (arrayMode == kArrayModeLinearGeneral) {
surfaceSize = (static_cast<uint64_t>(linearPitch) * linearHeight *
bitsPerElement * numFragmentsPerPixel + 7) / 8;
surfaceSize *= linearDepth;
result.setSubresourceInfo(mipLevel, {
.dataWidth = linearPitch,
.dataHeight = linearHeight,
.dataDepth = linearDepth,
.offset = surfaceOffset,
.tiledSize = surfaceSize,
.linearSize = surfaceSize,
});
} else {
if (mipLevel > 0 && pitch > 0) {
linearPitch = linearWidth;
}
auto pitchAlign = std::max(8UL, 64UL / ((bitsPerElement + 7) / 8UL));
std::uint32_t paddedPitch =
(linearPitch + pitchAlign - 1) & ~(pitchAlign - 1);
std::uint32_t paddedHeight = linearHeight;
std::uint32_t paddedDepth = linearDepth;
if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) {
if (isCubemap) {
linearDepth = std::bit_ceil(linearDepth);
}
auto thickness = getMicroTileThickness(arrayMode);
paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1);
}
std::uint32_t pixelsPerPipeInterleave =
kPipeInterleaveBytes / ((bitsPerElement + 7) / 8);
std::uint32_t sliceAlignInPixel =
pixelsPerPipeInterleave < 64 ? 64 : pixelsPerPipeInterleave;
auto pixelsPerSlice = static_cast<uint64_t>(paddedPitch) * paddedHeight *
numFragmentsPerPixel;
while (pixelsPerSlice % sliceAlignInPixel) {
paddedPitch += pitchAlign;
pixelsPerSlice = static_cast<uint64_t>(paddedPitch) * paddedHeight *
numFragmentsPerPixel;
}
surfaceSize = (pixelsPerSlice * bitsPerElement + 7) / 8 * paddedDepth;
result.setSubresourceInfo(mipLevel, {
.dataWidth = paddedPitch,
.dataHeight = paddedHeight,
.dataDepth = paddedDepth,
.offset = surfaceOffset,
.tiledSize = surfaceSize,
.linearSize = surfaceSize,
});
}
surfaceOffset += arraySliceCount * surfaceSize;
}
result.totalSize = surfaceOffset;
return result;
}
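// Picks the layout routine matching the tile mode's array mode. Macro (2D/3D)
// tiled layouts are not implemented yet and abort.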
SurfaceInfo amdgpu::computeSurfaceInfo(
TileMode tileMode, gnm::TextureType type, gnm::DataFormat dfmt,
std::uint32_t width, std::uint32_t height, std::uint32_t depth,
std::uint32_t pitch, int baseArrayLayer, int arrayCount, int baseMipLevel,
int mipCount, bool pow2pad) {
switch (tileMode.arrayMode()) {
case kArrayModeLinearGeneral:
case kArrayModeLinearAligned:
return computeTextureLinearInfo(
tileMode.arrayMode(), type, dfmt, width, height, depth, pitch,
baseArrayLayer, arrayCount, baseMipLevel, mipCount, pow2pad);
case kArrayMode1dTiledThin:
case kArrayMode1dTiledThick:
return computeTexture1dInfo(tileMode.arrayMode(), type, dfmt, width, height,
depth, pitch, baseArrayLayer, arrayCount,
baseMipLevel, mipCount, pow2pad);
case kArrayMode2dTiledThin:
case kArrayMode2dTiledThick:
case kArrayMode2dTiledXThick:
case kArrayMode3dTiledThin:
case kArrayMode3dTiledThick:
case kArrayMode3dTiledXThick:
case kArrayModeTiledThinPrt:
case kArrayModeTiledThickPrt:
case kArrayMode2dTiledThinPrt:
case kArrayMode2dTiledThickPrt:
case kArrayMode3dTiledThinPrt:
case kArrayMode3dTiledThickPrt:
std::abort();
}
std::abort();
}
SurfaceInfo amdgpu::computeSurfaceInfo(const gnm::TBuffer &tbuffer,
TileMode tileMode) {
return computeSurfaceInfo(
tileMode, tbuffer.type, tbuffer.dfmt, tbuffer.width + 1,
tbuffer.height + 1, tbuffer.depth + 1, tbuffer.pitch + 1,
tbuffer.base_array, tbuffer.last_array - tbuffer.base_array + 1,
tbuffer.base_level, tbuffer.last_level - tbuffer.base_level + 1,
tbuffer.pow2pad != 0);
}


@@ -0,0 +1,441 @@
#include "amdgpu/tiler_cpu.hpp"
#include "amdgpu/tiler.hpp"
#include "gnm/gnm.hpp"
constexpr std::uint64_t
getTiledOffset1D(gnm::TextureType texType, bool isPow2Padded,
gnm::DataFormat dfmt, amdgpu::TileMode tileMode, int mipLevel,
int arraySlice, int numFragments, int width, int height,
int depth, int pitch, int x, int y, int z) {
using namespace amdgpu;
bool isCubemap = texType == gnm::TextureType::Cube;
bool isVolume = texType == gnm::TextureType::Dim3D;
auto bitsPerFragment = getBitsPerElement(dfmt);
uint32_t arraySliceCount = depth;
if (isCubemap) {
arraySliceCount *= 6;
} else if (isVolume) {
arraySliceCount = 1;
}
auto numFragmentsPerPixel = 1 << numFragments;
auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
auto arrayMode = tileMode.arrayMode();
auto bitsPerElement = bitsPerFragment;
auto paddedWidth = std::max((mipLevel != 0 ? pitch : width) >> mipLevel, 1);
auto paddedHeight = std::max(height >> mipLevel, 1);
auto tileThickness = (arrayMode == amdgpu::kArrayMode1dTiledThick) ? 4 : 1;
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
bitsPerElement *= 8;
paddedWidth = std::max((paddedWidth + 7) / 8, 1);
break;
case 4:
case 8:
bitsPerElement *= 16;
paddedWidth = std::max((paddedWidth + 3) / 4, 1);
paddedHeight = std::max((paddedHeight + 3) / 4, 1);
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (isPow2Padded) {
arraySliceCount = std::bit_ceil(arraySliceCount);
paddedWidth = std::bit_ceil(unsigned(paddedWidth));
paddedHeight = std::bit_ceil(unsigned(paddedHeight));
}
uint64_t finalSurfaceOffset = 0;
uint64_t finalSurfaceSize = 0;
auto thickness = getMicroTileThickness(arrayMode);
for (int i = 0; i <= mipLevel; i++) {
finalSurfaceOffset += arraySliceCount * finalSurfaceSize;
std::uint32_t elemWidth =
std::max<std::uint64_t>((i > 0 ? pitch : width) >> i, 1);
std::uint32_t elemHeight = std::max<std::uint64_t>(height >> i, 1);
std::uint32_t elemDepth =
std::max<std::uint64_t>((isVolume ? depth : 1) >> i, 1);
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
elemWidth = std::max<std::uint64_t>((elemWidth + 7) / 8, 1);
break;
case 4:
case 8:
elemWidth = std::max<std::uint64_t>((elemWidth + 3) / 4, 1);
elemHeight = std::max<std::uint64_t>((elemHeight + 3) / 4, 1);
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (isPow2Padded) {
elemWidth = std::bit_ceil(elemWidth);
elemHeight = std::bit_ceil(elemHeight);
elemDepth = std::bit_ceil(elemDepth);
}
elemWidth = (elemWidth + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1);
elemHeight = (elemHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1);
elemDepth = (elemDepth + thickness - 1) & ~(thickness - 1);
std::uint32_t tempPitch = elemWidth;
std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) *
elemHeight * bitsPerElement *
numFragmentsPerPixel;
logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) {
tempPitch += 8;
logicalSliceSizeBytes = std::uint64_t(tempPitch) * elemHeight *
bitsPerElement * numFragmentsPerPixel;
logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
}
finalSurfaceSize = logicalSliceSizeBytes * elemDepth;
}
finalSurfaceOffset += finalSurfaceSize * (uint64_t)arraySlice;
auto tileBytes =
(kMicroTileWidth * kMicroTileHeight * tileThickness * bitsPerElement +
7) /
8;
auto tilesPerRow = paddedWidth / kMicroTileWidth;
auto tilesPerSlice =
std::max(tilesPerRow * (paddedHeight / kMicroTileHeight), 1U);
uint64_t elementIndex = getElementIndex(x, y, z, bitsPerElement,
tileMode.microTileMode(), arrayMode);
uint64_t sliceOffset = (z / tileThickness) * tilesPerSlice * tileBytes;
uint64_t tileRowIndex = y / kMicroTileHeight;
uint64_t tileColumnIndex = x / kMicroTileWidth;
uint64_t tileOffset =
(tileRowIndex * tilesPerRow + tileColumnIndex) * tileBytes;
uint64_t elementOffset = elementIndex * bitsPerElement;
uint64_t finalOffset = (sliceOffset + tileOffset) * 8 + elementOffset;
return finalOffset + finalSurfaceOffset * 8;
}
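// Bit offset of texel (x, y, z) in a linearly stored surface: plain row-major
// addressing with the pitch as the row stride.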
constexpr std::uint64_t getTiledOffsetLinear(gnm::DataFormat dfmt, int height,
int pitch, int x, int y, int z) {
auto bitsPerFragment = getBitsPerElement(dfmt);
auto bitsPerElement = bitsPerFragment;
auto paddedHeight = height;
auto paddedWidth = pitch;
if (bitsPerFragment == 1) {
bitsPerElement *= 8;
paddedWidth = std::max((paddedWidth + 7) / 8, 1);
}
uint64_t tiledRowSizeBits = bitsPerElement * paddedWidth;
uint64_t tiledSliceBits = paddedWidth * paddedHeight * bitsPerElement;
return tiledSliceBits * z + tiledRowSizeBits * y + bitsPerElement * x;
}
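// Bit offset of texel (x, y, z) in a macro (2D/3D) tiled surface. The element
// index inside the micro tile, the pipe and bank indices derived from the
// macro tile mode, tile-split and slice rotations are combined, then the
// address is interleaved at kPipeInterleaveBytes granularity.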
constexpr std::uint64_t
getTiledOffset2D(gnm::TextureType texType, bool isPow2Padded,
gnm::DataFormat dfmt, amdgpu::TileMode tileMode,
amdgpu::MacroTileMode macroTileMode, int mipLevel,
int arraySlice, int numFragments, int width, int height,
int depth, int pitch, int x, int y, int z, int fragmentIndex) {
using namespace amdgpu;
bool isCubemap = texType == gnm::TextureType::Cube;
bool isVolume = texType == gnm::TextureType::Dim3D;
auto m_bitsPerFragment = getBitsPerElement(dfmt);
auto m_isBlockCompressed = getTexelsPerElement(dfmt) > 1;
auto tileSwizzleMask = 0;
auto numFragmentsPerPixel = 1 << numFragments;
auto arrayMode = tileMode.arrayMode();
auto tileThickness = 1;
switch (arrayMode) {
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayModeTiledThinPrt:
case amdgpu::kArrayMode2dTiledThinPrt:
case amdgpu::kArrayMode3dTiledThinPrt:
tileThickness = 1;
break;
case amdgpu::kArrayMode1dTiledThick:
case amdgpu::kArrayMode2dTiledThick:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayModeTiledThickPrt:
case amdgpu::kArrayMode2dTiledThickPrt:
case amdgpu::kArrayMode3dTiledThickPrt:
tileThickness = 4;
break;
case amdgpu::kArrayMode2dTiledXThick:
case amdgpu::kArrayMode3dTiledXThick:
tileThickness = 8;
break;
default:
break;
}
auto bitsPerElement = m_bitsPerFragment;
auto paddedWidth = pitch;
auto paddedHeight = height;
if (m_isBlockCompressed) {
switch (m_bitsPerFragment) {
case 1:
bitsPerElement *= 8;
paddedWidth = std::max((paddedWidth + 7) / 8, 1);
break;
case 4:
case 8:
bitsPerElement *= 16;
paddedWidth = std::max((paddedWidth + 3) / 4, 1);
paddedHeight = std::max((paddedHeight + 3) / 4, 1);
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
auto bankWidthHW = macroTileMode.bankWidth();
auto bankHeightHW = macroTileMode.bankHeight();
auto macroAspectHW = macroTileMode.macroTileAspect();
auto numBanksHW = macroTileMode.numBanks();
auto bankWidth = 1 << bankWidthHW;
auto bankHeight = 1 << bankHeightHW;
unsigned numBanks = 2 << numBanksHW;
auto macroTileAspect = 1 << macroAspectHW;
uint32_t tileBytes1x =
(tileThickness * bitsPerElement * kMicroTileWidth * kMicroTileHeight +
7) /
8;
auto sampleSplitHw = tileMode.sampleSplit();
auto tileSplitHw = tileMode.tileSplit();
uint32_t sampleSplit = 1 << sampleSplitHw;
uint32_t tileSplitC =
(tileMode.microTileMode() == amdgpu::kMicroTileModeDepth)
? (64 << tileSplitHw)
: std::max(256U, tileBytes1x * sampleSplit);
auto tileSplitBytes = std::min(kDramRowSize, tileSplitC);
auto numPipes = getPipeCount(tileMode.pipeConfig());
auto pipeInterleaveBits = std::countr_zero(kPipeInterleaveBytes);
auto pipeInterleaveMask = (1 << pipeInterleaveBits) - 1;
auto pipeBits = std::countr_zero(numPipes);
auto bankBits = std::countr_zero(numBanks);
// auto pipeMask = (numPipes - 1) << pipeInterleaveBits;
auto bankSwizzleMask = tileSwizzleMask;
auto pipeSwizzleMask = 0;
auto macroTileWidth =
(kMicroTileWidth * bankWidth * numPipes) * macroTileAspect;
auto macroTileHeight =
(kMicroTileHeight * bankHeight * numBanks) / macroTileAspect;
auto microTileMode = tileMode.microTileMode();
uint64_t elementIndex =
getElementIndex(x, y, z, bitsPerElement, microTileMode, arrayMode);
uint32_t xh = x, yh = y;
if (arrayMode == amdgpu::kArrayModeTiledThinPrt ||
arrayMode == amdgpu::kArrayModeTiledThickPrt) {
xh %= macroTileWidth;
yh %= macroTileHeight;
}
uint64_t pipe = getPipeIndex(xh, yh, tileMode.pipeConfig());
uint64_t bank =
getBankIndex(xh, yh, bankWidth, bankHeight, numBanks, numPipes);
uint32_t tileBytes = (kMicroTileWidth * kMicroTileHeight * tileThickness *
bitsPerElement * numFragmentsPerPixel +
7) /
8;
uint64_t elementOffset = 0;
if (microTileMode == amdgpu::kMicroTileModeDepth) {
uint64_t pixelOffset = elementIndex * bitsPerElement * numFragmentsPerPixel;
elementOffset = pixelOffset + (fragmentIndex * bitsPerElement);
} else {
uint64_t fragmentOffset =
fragmentIndex * (tileBytes / numFragmentsPerPixel) * 8;
elementOffset = fragmentOffset + (elementIndex * bitsPerElement);
}
uint64_t slicesPerTile = 1;
uint64_t tileSplitSlice = 0;
if (tileBytes > tileSplitBytes && tileThickness == 1) {
slicesPerTile = tileBytes / tileSplitBytes;
tileSplitSlice = elementOffset / (tileSplitBytes * 8);
elementOffset %= (tileSplitBytes * 8);
tileBytes = tileSplitBytes;
}
uint64_t macroTileBytes = (macroTileWidth / kMicroTileWidth) *
(macroTileHeight / kMicroTileHeight) * tileBytes /
(numPipes * numBanks);
uint64_t macroTilesPerRow = paddedWidth / macroTileWidth;
uint64_t macroTileRowIndex = y / macroTileHeight;
uint64_t macroTileColumnIndex = x / macroTileWidth;
uint64_t macroTileIndex =
(macroTileRowIndex * macroTilesPerRow) + macroTileColumnIndex;
uint64_t macroTileOffset = macroTileIndex * macroTileBytes;
uint64_t macroTilesPerSlice =
macroTilesPerRow * (paddedHeight / macroTileHeight);
uint64_t sliceBytes = macroTilesPerSlice * macroTileBytes;
uint32_t slice = z;
uint64_t sliceOffset =
(tileSplitSlice + slicesPerTile * slice / tileThickness) * sliceBytes;
if (arraySlice != 0) {
slice = arraySlice;
}
uint64_t tileRowIndex = (y / kMicroTileHeight) % bankHeight;
uint64_t tileColumnIndex = ((x / kMicroTileWidth) / numPipes) % bankWidth;
uint64_t tileIndex = (tileRowIndex * bankWidth) + tileColumnIndex;
uint64_t tileOffset = tileIndex * tileBytes;
uint64_t bankSwizzle = bankSwizzleMask;
uint64_t pipeSwizzle = pipeSwizzleMask;
uint64_t pipeSliceRotation = 0;
switch (arrayMode) {
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
pipeSliceRotation =
std::max(1UL, (numPipes / 2UL) - 1UL) * (slice / tileThickness);
break;
default:
break;
}
pipeSwizzle += pipeSliceRotation;
pipeSwizzle &= (numPipes - 1);
pipe = pipe ^ pipeSwizzle;
uint32_t sliceRotation = 0;
switch (arrayMode) {
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayMode2dTiledThick:
case amdgpu::kArrayMode2dTiledXThick:
sliceRotation = ((numBanks / 2) - 1) * (slice / tileThickness);
break;
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
sliceRotation = std::max(1UL, (numPipes / 2UL) - 1UL) *
(slice / tileThickness) / numPipes;
break;
default:
break;
}
uint64_t tileSplitSliceRotation = 0;
switch (arrayMode) {
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode2dTiledThinPrt:
case amdgpu::kArrayMode3dTiledThinPrt:
tileSplitSliceRotation = ((numBanks / 2) + 1) * tileSplitSlice;
break;
default:
break;
}
bank ^= bankSwizzle + sliceRotation;
bank ^= tileSplitSliceRotation;
bank &= (numBanks - 1);
uint64_t totalOffset =
(sliceOffset + macroTileOffset + tileOffset) * 8 + elementOffset;
uint64_t bitOffset = totalOffset & 0x7;
totalOffset /= 8;
uint64_t pipeInterleaveOffset = totalOffset & pipeInterleaveMask;
uint64_t offset = totalOffset >> pipeInterleaveBits;
uint64_t finalByteOffset =
pipeInterleaveOffset | (pipe << (pipeInterleaveBits)) |
(bank << (pipeInterleaveBits + pipeBits)) |
(offset << (pipeInterleaveBits + pipeBits + bankBits));
return (finalByteOffset << 3) | bitOffset;
}
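// CPU-side address computation entry point: selects the linear, 1D or 2D path
// from the tile mode's array mode.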
std::uint64_t amdgpu::getTiledOffset(gnm::TextureType texType,
bool isPow2Padded, int numFragments,
gnm::DataFormat dfmt,
amdgpu::TileMode tileMode,
amdgpu::MacroTileMode macroTileMode,
int mipLevel, int arraySlice, int width,
int height, int depth, int pitch, int x,
int y, int z, int fragmentIndex) {
switch (tileMode.arrayMode()) {
case amdgpu::kArrayModeLinearGeneral:
case amdgpu::kArrayModeLinearAligned:
return getTiledOffsetLinear(dfmt, height, pitch, x, y, z);
case amdgpu::kArrayMode1dTiledThin:
case amdgpu::kArrayMode1dTiledThick: {
return getTiledOffset1D(texType, isPow2Padded, dfmt, tileMode, mipLevel,
arraySlice, numFragments, width, height, depth,
pitch, x, y, z);
}
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayMode2dTiledThick:
case amdgpu::kArrayMode2dTiledXThick:
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayModeTiledThinPrt:
case amdgpu::kArrayModeTiledThickPrt:
case amdgpu::kArrayMode2dTiledThinPrt:
case amdgpu::kArrayMode2dTiledThickPrt:
case amdgpu::kArrayMode3dTiledThinPrt:
case amdgpu::kArrayMode3dTiledThickPrt:
return getTiledOffset2D(texType, isPow2Padded, dfmt, tileMode,
macroTileMode, mipLevel, arraySlice, numFragments,
width, height, depth, pitch, x, y, z,
fragmentIndex);
}
std::abort();
}


@@ -0,0 +1,354 @@
#include "amdgpu/tiler_vulkan.hpp"
#include "Scheduler.hpp"
#include "amdgpu/tiler.hpp"
#include <bit>
#include <cstdint>
#include <cstring>
#include <memory>
#include <mutex>
#include <span>
#include <vector>
#include <vk.hpp>
#include <shaders/detiler1d.comp.h>
#include <shaders/detiler2d.comp.h>
#include <shaders/detilerLinear.comp.h>
#include <shaders/tiler1d.comp.h>
#include <shaders/tiler2d.comp.h>
#include <shaders/tilerLinear.comp.h>
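// Descriptor set layout shared by all tiler/detiler compute shaders: a single
// uniform buffer at binding 0 carrying the Config block defined below.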
struct TilerDescriptorSetLayout {
VkDescriptorSetLayout layout;
TilerDescriptorSetLayout() {
std::vector<VkDescriptorSetLayoutBinding> bindings{{
.binding = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
}};
VkDescriptorSetLayoutCreateInfo layoutInfo{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
.bindingCount = static_cast<uint32_t>(bindings.size()),
.pBindings = bindings.data(),
};
VK_VERIFY(vkCreateDescriptorSetLayout(vk::context->device, &layoutInfo,
nullptr, &layout));
}
~TilerDescriptorSetLayout() {
vkDestroyDescriptorSetLayout(vk::context->device, layout,
vk::context->allocator);
}
};
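// RAII wrapper around a VK_EXT_shader_object compute shader created from a
// SPIR-V blob.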
struct TilerShader {
VkShaderEXT shader;
TilerShader(TilerDescriptorSetLayout &setLayout,
std::span<const std::uint32_t> spirv) {
VkShaderCreateInfoEXT shaderInfo{
.sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT,
.flags = 0,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.nextStage = 0,
.codeType = VK_SHADER_CODE_TYPE_SPIRV_EXT,
.codeSize = spirv.size_bytes(),
.pCode = spirv.data(),
.pName = "main",
.setLayoutCount = 1,
.pSetLayouts = &setLayout.layout,
.pushConstantRangeCount = 0,
.pPushConstantRanges = 0,
.pSpecializationInfo = 0,
};
VK_VERIFY(vk::CreateShadersEXT(vk::context->device, 1, &shaderInfo, nullptr,
&shader));
}
~TilerShader() {
vk::DestroyShaderEXT(vk::context->device, shader, vk::context->allocator);
}
};
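// Owns the Vulkan objects shared by all tiling operations: one compute shader
// per array-mode family, a pipeline layout, a host-visible buffer of Config
// entries, and a small pool of descriptor sets handed out through a bitmask.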
struct amdgpu::GpuTiler::Impl {
TilerDescriptorSetLayout descriptorSetLayout;
std::mutex descriptorMtx;
VkDescriptorSet descriptorSets[4]{};
VkDescriptorPool descriptorPool;
std::uint32_t inUseDescriptorSets = 0;
vk::Buffer configData;
TilerShader detilerLinear{descriptorSetLayout, spirv_detilerLinear_comp};
TilerShader detiler1d{descriptorSetLayout, spirv_detiler1d_comp};
TilerShader detiler2d{descriptorSetLayout, spirv_detiler2d_comp};
TilerShader tilerLinear{descriptorSetLayout, spirv_tilerLinear_comp};
TilerShader tiler1d{descriptorSetLayout, spirv_tiler1d_comp};
TilerShader tiler2d{descriptorSetLayout, spirv_tiler2d_comp};
VkPipelineLayout pipelineLayout;
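// Parameters passed to the compute shaders; the field order is expected to
// match the uniform block declared in the tiler/detiler shaders.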
struct Config {
uint64_t srcAddress;
uint64_t dstAddress;
uint32_t dataWidth;
uint32_t dataHeight;
uint32_t tileMode;
uint32_t numFragments;
uint32_t bitsPerElement;
uint32_t tiledSurfaceSize;
uint32_t linearSurfaceSize;
};
Impl() {
std::size_t count = 256;
configData = vk::Buffer::Allocate(
vk::getHostVisibleMemory(), sizeof(Config) * count,
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
VkPipelineLayoutCreateInfo pipelineLayoutInfo{
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 1,
.pSetLayouts = &descriptorSetLayout.layout,
};
VK_VERIFY(vkCreatePipelineLayout(vk::context->device, &pipelineLayoutInfo,
nullptr, &pipelineLayout));
{
VkDescriptorPoolSize poolSizes[]{{
.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.descriptorCount = 1,
}};
VkDescriptorPoolCreateInfo info{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
.maxSets = static_cast<std::uint32_t>(std::size(descriptorSets)) * 4,
.poolSizeCount = static_cast<uint32_t>(std::size(poolSizes)),
.pPoolSizes = poolSizes,
};
VK_VERIFY(vkCreateDescriptorPool(
vk::context->device, &info, vk::context->allocator, &descriptorPool));
}
VkDescriptorSetAllocateInfo info{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
.descriptorPool = descriptorPool,
.descriptorSetCount = 1,
.pSetLayouts = &descriptorSetLayout.layout,
};
for (std::size_t i = 0; i < std::size(descriptorSets); ++i) {
VK_VERIFY(vkAllocateDescriptorSets(vk::context->device, &info,
descriptorSets + i));
}
}
~Impl() {
vkDestroyDescriptorPool(vk::context->device, descriptorPool,
vk::context->allocator);
vkDestroyPipelineLayout(vk::context->device, pipelineLayout,
vk::context->allocator);
}
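// Descriptor slots are tracked in a bitmask: a set bit marks a slot whose
// descriptor set is still referenced by work that has not been submitted yet.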
std::uint32_t allocateDescriptorSlot() {
std::lock_guard lock(descriptorMtx);
auto result = std::countr_one(inUseDescriptorSets);
rx::dieIf(result >= std::size(descriptorSets),
"out of tiler descriptor sets");
inUseDescriptorSets |= (1 << result);
return result;
}
void releaseDescriptorSlot(std::uint32_t slot) {
std::lock_guard lock(descriptorMtx);
inUseDescriptorSets &= ~(1u << slot);
}
};
amdgpu::GpuTiler::GpuTiler() { mImpl = std::make_unique<Impl>(); }
amdgpu::GpuTiler::~GpuTiler() = default;
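// Records a compute dispatch that converts one tiled mip level (and optionally
// a range of array slices) into the linear layout at dstLinearAddress. The
// dispatch is sized by the subresource's data dimensions, with Z covering
// either the depth or the array slice count.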
void amdgpu::GpuTiler::detile(Scheduler &scheduler,
const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode,
std::uint64_t srcTiledAddress,
std::uint64_t dstLinearAddress, int mipLevel,
int baseArray, int arrayCount) {
auto commandBuffer = scheduler.getCommandBuffer();
auto slot = mImpl->allocateDescriptorSlot();
auto configOffset = slot * sizeof(Impl::Config);
auto config = reinterpret_cast<Impl::Config *>(mImpl->configData.getData() +
configOffset);
auto &subresource = info.getSubresourceInfo(mipLevel);
config->srcAddress = srcTiledAddress + subresource.offset +
(subresource.tiledSize * baseArray);
config->dstAddress = dstLinearAddress + (subresource.linearSize * baseArray);
config->dataWidth = subresource.dataWidth;
config->dataHeight = subresource.dataHeight;
config->tileMode = tileMode.raw;
config->numFragments = info.numFragments;
config->bitsPerElement = info.bitsPerElement;
uint32_t groupCountZ = subresource.dataDepth;
if (arrayCount > 1) {
config->tiledSurfaceSize = subresource.tiledSize;
config->linearSurfaceSize = subresource.linearSize;
groupCountZ = arrayCount;
} else {
config->tiledSurfaceSize = 0;
config->linearSurfaceSize = 0;
}
VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
switch (tileMode.arrayMode()) {
case amdgpu::kArrayModeLinearGeneral:
case amdgpu::kArrayModeLinearAligned:
vk::CmdBindShadersEXT(commandBuffer, 1, stages,
&mImpl->detilerLinear.shader);
break;
case amdgpu::kArrayMode1dTiledThin:
case amdgpu::kArrayMode1dTiledThick:
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler1d.shader);
break;
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayModeTiledThinPrt:
case amdgpu::kArrayMode2dTiledThinPrt:
case amdgpu::kArrayMode2dTiledThick:
case amdgpu::kArrayMode2dTiledXThick:
case amdgpu::kArrayModeTiledThickPrt:
case amdgpu::kArrayMode2dTiledThickPrt:
case amdgpu::kArrayMode3dTiledThinPrt:
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayMode3dTiledThickPrt:
std::abort();
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler2d.shader);
break;
}
VkDescriptorBufferInfo bufferInfo{
.buffer = mImpl->configData.getHandle(),
.offset = configOffset,
.range = sizeof(Impl::Config),
};
VkWriteDescriptorSet writeDescSet{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = mImpl->descriptorSets[slot],
.dstBinding = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.pBufferInfo = &bufferInfo,
};
vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
mImpl->pipelineLayout, 0, 1,
&mImpl->descriptorSets[slot], 0, nullptr);
vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight,
groupCountZ);
scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); });
}
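// Reverse of detile: records a compute dispatch that converts linear data at
// srcLinearAddress into the tiled layout at dstTiledAddress for one mip level
// and an optional range of array slices.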
void amdgpu::GpuTiler::tile(Scheduler &scheduler,
const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode,
std::uint64_t srcLinearAddress,
std::uint64_t dstTiledAddress, int mipLevel,
int baseArray, int arrayCount) {
auto commandBuffer = scheduler.getCommandBuffer();
auto slot = mImpl->allocateDescriptorSlot();
auto configOffset = slot * sizeof(Impl::Config);
auto config = reinterpret_cast<Impl::Config *>(mImpl->configData.getData() +
configOffset);
auto &subresource = info.getSubresourceInfo(mipLevel);
config->srcAddress = srcLinearAddress + subresource.offset +
subresource.linearSize * baseArray;
config->dstAddress = dstTiledAddress;
config->dataWidth = subresource.dataWidth;
config->dataHeight = subresource.dataHeight;
config->tileMode = tileMode.raw;
config->numFragments = info.numFragments;
config->bitsPerElement = info.bitsPerElement;
uint32_t groupCountZ = subresource.dataDepth;
if (arrayCount > 1) {
config->tiledSurfaceSize = subresource.tiledSize;
config->linearSurfaceSize = subresource.linearSize;
groupCountZ = arrayCount;
} else {
config->tiledSurfaceSize = 0;
config->linearSurfaceSize = 0;
}
VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
switch (tileMode.arrayMode()) {
case amdgpu::kArrayModeLinearGeneral:
case amdgpu::kArrayModeLinearAligned:
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tilerLinear.shader);
break;
case amdgpu::kArrayMode1dTiledThin:
case amdgpu::kArrayMode1dTiledThick:
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler1d.shader);
break;
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayModeTiledThinPrt:
case amdgpu::kArrayMode2dTiledThinPrt:
case amdgpu::kArrayMode2dTiledThick:
case amdgpu::kArrayMode2dTiledXThick:
case amdgpu::kArrayModeTiledThickPrt:
case amdgpu::kArrayMode2dTiledThickPrt:
case amdgpu::kArrayMode3dTiledThinPrt:
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayMode3dTiledThickPrt:
std::abort();
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler2d.shader);
break;
}
VkDescriptorBufferInfo bufferInfo{
.buffer = mImpl->configData.getHandle(),
.offset = configOffset,
.range = sizeof(Impl::Config),
};
VkWriteDescriptorSet writeDescSet{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = mImpl->descriptorSets[slot],
.dstBinding = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.pBufferInfo = &bufferInfo,
};
vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
mImpl->pipelineLayout, 0, 1,
&mImpl->descriptorSets[slot], 0, nullptr);
vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight,
groupCountZ);
scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); });
}