gpu2: implement depth textures

initial 2d tiler implementation
fixed mtbuf index order
simplify v_mac_*_f32 instructions
This commit is contained in:
DH 2024-09-28 03:12:12 +03:00
parent 61d58b696f
commit 4185b1aa40
18 changed files with 790 additions and 99 deletions

View file

@ -232,10 +232,25 @@ struct CachedIndexBuffer : Cache::Entry {
gnm::PrimitiveType primType;
};
/// Translates an image kind into the Vulkan aspect mask used when building
/// subresource ranges and buffer/image copy regions for that image.
///
/// Any value that is not one of the known enumerators yields
/// VK_IMAGE_ASPECT_NONE.
constexpr VkImageAspectFlags toAspect(ImageKind kind) {
  if (kind == ImageKind::Color) {
    return VK_IMAGE_ASPECT_COLOR_BIT;
  }
  if (kind == ImageKind::Depth) {
    return VK_IMAGE_ASPECT_DEPTH_BIT;
  }
  if (kind == ImageKind::Stencil) {
    return VK_IMAGE_ASPECT_STENCIL_BIT;
  }
  return VK_IMAGE_ASPECT_NONE;
}
struct CachedImage : Cache::Entry {
vk::Image image;
ImageKind kind;
SurfaceInfo info;
TileMode acquiredTileMode;
gnm::DataFormat acquiredDfmt{};
void flush(Cache::Tag &tag, Scheduler &scheduler, std::uint64_t beginAddress,
std::uint64_t endAddress) override {
@ -246,7 +261,7 @@ struct CachedImage : Cache::Entry {
// std::printf("writing image to buffer to %lx\n", baseAddress);
VkImageSubresourceRange subresourceRange{
.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.aspectMask = toAspect(kind),
.baseMipLevel = 0,
.levelCount = image.getMipLevels(),
.baseArrayLayer = 0,
@ -270,7 +285,7 @@ struct CachedImage : Cache::Entry {
mipLevel > 0 ? 0 : std::max(info.pitch >> mipLevel, 1u),
.imageSubresource =
{
.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.aspectMask = toAspect(kind),
.mipLevel = mipLevel,
.baseArrayLayer = 0,
.layerCount = image.getArrayLayers(),
@ -287,9 +302,9 @@ struct CachedImage : Cache::Entry {
VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
transferBuffer.getHandle(), 1, &region);
tiler.tile(scheduler, info, acquiredTileMode, transferBuffer.getAddress(),
tiledBuffer.deviceAddress, mipLevel, 0,
image.getArrayLayers());
tiler.tile(scheduler, info, acquiredTileMode, acquiredDfmt,
transferBuffer.getAddress(), tiledBuffer.deviceAddress,
mipLevel, 0, image.getArrayLayers());
}
transitionImageLayout(scheduler.getCommandBuffer(), image,
@ -307,7 +322,8 @@ struct CachedImageView : Cache::Entry {
ImageKey ImageKey::createFrom(const gnm::TBuffer &buffer) {
return {
.address = buffer.address(),
.readAddress = buffer.address(),
.writeAddress = buffer.address(),
.type = buffer.type,
.dfmt = buffer.dfmt,
.nfmt = buffer.nfmt,
@ -324,6 +340,7 @@ ImageKey ImageKey::createFrom(const gnm::TBuffer &buffer) {
.mipCount = buffer.last_level - buffer.base_level + 1u,
.baseArrayLayer = static_cast<std::uint32_t>(buffer.base_array),
.arrayLayerCount = buffer.last_array - buffer.base_array + 1u,
.kind = ImageKind::Color,
.pow2pad = buffer.pow2pad != 0,
};
}
@ -714,7 +731,7 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) {
if ((access & Access::Read) != Access::None) {
auto tiledBuffer =
getBuffer(key.address, surfaceInfo.totalSize, Access::Read);
getBuffer(key.readAddress, surfaceInfo.totalSize, Access::Read);
auto &tiler = mParent->mDevice->tiler;
auto detiledBuffer =
@ -722,7 +739,7 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) {
VK_BUFFER_USAGE_2_TRANSFER_DST_BIT_KHR |
VK_BUFFER_USAGE_2_TRANSFER_SRC_BIT_KHR);
VkImageSubresourceRange subresourceRange{
.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.aspectMask = toAspect(key.kind),
.baseMipLevel = key.baseMipLevel,
.levelCount = key.mipCount,
.baseArrayLayer = key.baseArrayLayer,
@ -756,8 +773,8 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) {
.size = info.linearSize * key.arrayLayerCount,
});
} else {
tiler.detile(*mScheduler, surfaceInfo, key.tileMode, srcAddress,
dstAddress, mipLevel, 0, key.arrayLayerCount);
tiler.detile(*mScheduler, surfaceInfo, key.tileMode, key.dfmt,
srcAddress, dstAddress, mipLevel, 0, key.arrayLayerCount);
}
regions.push_back({
@ -766,7 +783,7 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) {
mipLevel > 0 ? 0 : std::max(key.pitch >> mipLevel, 1u),
.imageSubresource =
{
.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.aspectMask = toAspect(key.kind),
.mipLevel = mipLevel,
.baseArrayLayer = key.baseArrayLayer,
.layerCount = key.arrayLayerCount,
@ -808,8 +825,13 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) {
auto cached = std::make_shared<CachedImage>();
cached->image = std::move(image);
cached->info = std::move(surfaceInfo);
cached->baseAddress = key.address;
cached->baseAddress = (access & Access::Write) != Access::None
? key.writeAddress
: key.readAddress;
cached->kind = key.kind;
cached->acquiredAccess = access;
cached->acquiredTileMode = key.tileMode;
cached->acquiredDfmt = key.dfmt;
mAcquiredResources.push_back(cached);
return {.handle = cached->image.getHandle()};
@ -827,14 +849,16 @@ Cache::ImageView Cache::Tag::getImageView(const ImageViewKey &key,
.a = gnm::toVkComponentSwizzle(key.A),
},
{
.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
.aspectMask = toAspect(key.kind),
.baseMipLevel = key.baseMipLevel,
.levelCount = key.mipCount,
.baseArrayLayer = key.baseArrayLayer,
.layerCount = key.arrayLayerCount,
});
auto cached = std::make_shared<CachedImageView>();
cached->baseAddress = key.address;
cached->baseAddress = (access & Access::Write) != Access::None
? key.writeAddress
: key.readAddress;
cached->acquiredAccess = access;
cached->view = std::move(result);

View file

@ -22,8 +22,15 @@ struct ShaderKey {
shader::gcn::Environment env;
};
/// Identifies which aspect of a surface an image (or image view) represents.
/// Enumerators keep their implicit values 0, 1 and 2.
enum class ImageKind {
  Color,   // colour data
  Depth,   // depth data
  Stencil, // stencil data
};
struct ImageKey {
std::uint64_t address;
std::uint64_t readAddress;
std::uint64_t writeAddress;
gnm::TextureType type;
gnm::DataFormat dfmt;
gnm::NumericFormat nfmt;
@ -35,6 +42,7 @@ struct ImageKey {
unsigned mipCount = 1;
unsigned baseArrayLayer = 0;
unsigned arrayLayerCount = 1;
ImageKind kind = ImageKind::Color;
bool pow2pad = false;
static ImageKey createFrom(const gnm::TBuffer &tbuffer);

View file

@ -301,10 +301,10 @@ bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
.layerCount = 1,
});
amdgpu::flip(cacheTag, commandBuffer, vk::context->swapchainExtent,
buffer.address, swapchainImageView,
{bufferAttr.width, bufferAttr.height}, compSwap,
getDefaultTileModes()[13], dfmt, nfmt);
amdgpu::flip(
cacheTag, commandBuffer, vk::context->swapchainExtent, buffer.address,
swapchainImageView, {bufferAttr.width, bufferAttr.height}, compSwap,
getDefaultTileModes()[bufferAttr.tilingMode == 1 ? 10 : 8], dfmt, nfmt);
transitionImageLayout(commandBuffer, swapchainImage,
VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
@ -316,11 +316,11 @@ bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
});
} else {
ImageKey frameKey{
.address = buffer.address,
.readAddress = buffer.address,
.type = gnm::TextureType::Dim2D,
.dfmt = dfmt,
.nfmt = nfmt,
.tileMode = getDefaultTileModes()[13],
.tileMode = getDefaultTileModes()[bufferAttr.tilingMode == 1 ? 10 : 8],
.extent =
{
.width = bufferAttr.width,
@ -429,7 +429,7 @@ bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
};
vkQueueSubmit2(vk::context->presentQueue, 1, &submitInfo, fence);
// vkQueueWaitIdle(queue);
vkQueueWaitIdle(vk::context->presentQueue);
}
scheduler.then([=, this, cacheTag = std::move(cacheTag)] {

View file

@ -405,6 +405,20 @@ struct DbRenderControl {
};
};
// Typed view of the 32-bit register word that the register table exposes as
// `dbDepthView` (register index 0x2). The bit-fields and `raw` share storage
// through the union, so the whole word can be read or written via `raw`.
// NOTE(review): the bit positions quoted below assume the compiler allocates
// bit-fields starting from the least significant bit and packs the `bool`
// fields into the same 32-bit unit — confirm for every supported toolchain.
struct DbDepthView {
union {
struct {
// 11-bit field (bits 0..10); presumably the first slice of the depth view.
std::uint32_t sliceStart : 11;
// Two unnamed padding bits (bits 11..12).
std::uint32_t : 2;
// 11-bit field (bits 13..23); presumably the last slice of the depth view.
std::uint32_t sliceMax : 11;
// Bit 24. When clear, the draw path adds write access to the depth surface.
bool zReadOnly : 1;
// Bit 25. When clear, the draw path adds write access to the stencil surface.
bool stencilReadOnly : 1;
};
// The complete register word.
std::uint32_t raw;
};
};
struct CbBlendControl {
union {
struct {
@ -574,7 +588,7 @@ struct Registers {
union {
Register<0x0, DbRenderControl> dbRenderControl;
Register<0x1> dbCountControl;
Register<0x2> dbDepthView;
Register<0x2, DbDepthView> dbDepthView;
Register<0x3> dbRenderOverride;
Register<0x4> dbRenderOverride2;
Register<0x5> dbHTileDataBase;

View file

@ -1,6 +1,7 @@
#include "Renderer.hpp"
#include "Device.hpp"
#include "gnm/descriptors.hpp"
#include "gnm/gnm.hpp"
#include "rx/MemoryTable.hpp"
#include <amdgpu/tiler.hpp>
@ -227,7 +228,8 @@ struct ShaderResources : eval::Evaluator {
bufferMemoryTable.map(*pointerBase,
*pointerBase + *pointerOffset + pointer.size,
Access::Read);
resourceSlotToAddress.push_back({slotOffset + pointer.resourceSlot, *pointerBase});
resourceSlotToAddress.push_back(
{slotOffset + pointer.resourceSlot, *pointerBase});
}
for (auto &bufferRes : res.buffers) {
@ -352,7 +354,8 @@ struct ShaderResources : eval::Evaluator {
sSampler.force_unorm_coords = true;
}
slotResources[slotOffset + sampler.resourceSlot] = samplerResources.size();
slotResources[slotOffset + sampler.resourceSlot] =
samplerResources.size();
samplerResources.push_back(
cacheTag->getSampler(amdgpu::SamplerKey::createFrom(sSampler)));
}
@ -503,11 +506,78 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
VkRect2D viewPortScissors[8]{};
unsigned renderTargets = 0;
VkRenderingAttachmentInfo depthAttachment{};
VkRenderingAttachmentInfo stencilAttachment{};
auto depthAccess = Access::None;
auto stencilAccess = Access::None;
if (pipe.context.dbDepthControl.depthEnable) {
if (!pipe.context.dbRenderControl.depthClearEnable) {
depthAccess |= Access::Read;
}
if (!pipe.context.dbDepthView.zReadOnly) {
depthAccess |= Access::Write;
}
}
if (pipe.context.dbDepthControl.stencilEnable) {
if (!pipe.context.dbRenderControl.stencilClearEnable) {
stencilAccess |= Access::Read;
}
if (!pipe.context.dbDepthView.stencilReadOnly) {
stencilAccess |= Access::Write;
}
}
if (depthAccess != Access::None) {
auto viewPortScissor = pipe.context.paScScreenScissor;
auto viewPortRect = gnm::toVkRect2D(viewPortScissor);
auto imageView = cacheTag.getImageView(
{{
.readAddress = pipe.context.dbZReadBase,
.writeAddress = pipe.context.dbZWriteBase,
.dfmt = gnm::getDataFormat(pipe.context.dbZInfo.format),
.nfmt = gnm::getNumericFormat(pipe.context.dbZInfo.format),
.extent =
{
.width = viewPortRect.extent.width,
.height = viewPortRect.extent.height,
.depth = 1,
},
.pitch = viewPortRect.extent.width,
.kind = ImageKind::Depth,
}},
depthAccess);
depthAttachment = {
.sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
.imageView = imageView.handle,
.imageLayout = VK_IMAGE_LAYOUT_GENERAL,
.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
.storeOp = VK_ATTACHMENT_STORE_OP_STORE,
};
if ((depthAccess & Access::Read) == Access::None) {
depthAttachment.clearValue.depthStencil.depth = pipe.context.dbDepthClear;
depthAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR;
}
if ((depthAccess & Access::Write) == Access::None) {
depthAttachment.storeOp = VK_ATTACHMENT_STORE_OP_NONE;
}
}
for (auto &cbColor : pipe.context.cbColor) {
if (targetMask == 0) {
break;
}
if (cbColor.info.dfmt == gnm::kDataFormatInvalid) {
continue;
}
auto viewPortScissor = pipe.context.paScScreenScissor;
// viewPortScissor = gnm::intersection(
// viewPortScissor, pipe.context.paScVportScissor[renderTargets]);
@ -533,7 +603,9 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
ImageViewKey renderTargetInfo{};
renderTargetInfo.type = gnm::TextureType::Dim2D;
renderTargetInfo.pitch = vkViewPortScissor.extent.width;
renderTargetInfo.address = static_cast<std::uint64_t>(cbColor.base) << 8;
renderTargetInfo.readAddress = static_cast<std::uint64_t>(cbColor.base)
<< 8;
renderTargetInfo.writeAddress = renderTargetInfo.readAddress;
renderTargetInfo.extent.width = vkViewPortScissor.extent.width;
renderTargetInfo.extent.height = vkViewPortScissor.extent.height;
renderTargetInfo.extent.depth = 1;
@ -545,9 +617,7 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
renderTargetInfo.tileMode =
cbColor.info.linearGeneral
? TileMode{.raw = 0}
: getDefaultTileModes()[/*cbColor.attrib.tileModeIndex*/
13];
: getDefaultTileModes()[cbColor.attrib.tileModeIndex];
// std::printf("draw to %lx\n", renderTargetInfo.address);
auto access = Access::None;
@ -613,6 +683,10 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
targetMask >>= 4;
}
if (renderTargets == 0) {
return;
}
// if (pipe.context.cbTargetMask == 0) {
// return;
// }
@ -654,7 +728,7 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
.vgprCount = pgm.rsrc1.getVGprCount(),
.sgprCount = pgm.rsrc1.getSGprCount(),
.userSgprs = std::span(pgm.userData.data(), pgm.rsrc2.userSgpr),
.supportsBarycentric = vk::context->supportsBarycentric,
// .supportsBarycentric = vk::context->supportsBarycentric,
.supportsInt8 = vk::context->supportsInt8,
.supportsInt64Atomics = vk::context->supportsInt64Atomics,
};
@ -754,29 +828,33 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
break;
case shader::gcn::ConfigType::ViewPortOffsetX:
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.paClVports[0].xOffset / (viewPorts[0].width / 2.f) -
pipe.context.paClVports[slot.data].xOffset /
(viewPorts[0].width / 2.f) -
1);
break;
case shader::gcn::ConfigType::ViewPortOffsetY:
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.paClVports[0].yOffset / (viewPorts[0].height / 2.f) -
pipe.context.paClVports[slot.data].yOffset /
(viewPorts[slot.data].height / 2.f) -
1);
break;
case shader::gcn::ConfigType::ViewPortOffsetZ:
configPtr[index] =
std::bit_cast<std::uint32_t>(pipe.context.paClVports[0].zOffset);
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.paClVports[slot.data].zOffset);
break;
case shader::gcn::ConfigType::ViewPortScaleX:
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.paClVports[0].xScale / (viewPorts[0].width / 2.f));
pipe.context.paClVports[slot.data].xScale /
(viewPorts[slot.data].width / 2.f));
break;
case shader::gcn::ConfigType::ViewPortScaleY:
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.paClVports[0].yScale / (viewPorts[0].height / 2.f));
pipe.context.paClVports[slot.data].yScale /
(viewPorts[slot.data].height / 2.f));
break;
case shader::gcn::ConfigType::ViewPortScaleZ:
configPtr[index] =
std::bit_cast<std::uint32_t>(pipe.context.paClVports[0].zScale);
configPtr[index] = std::bit_cast<std::uint32_t>(
pipe.context.paClVports[slot.data].zScale);
break;
case shader::gcn::ConfigType::PsInputVGpr:
if (slot.data > psVgprInputs) {
@ -882,8 +960,8 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
.layerCount = 1,
.colorAttachmentCount = renderTargets,
.pColorAttachments = colorAttachments,
// .pDepthAttachment = &depthAttachment,
// .pStencilAttachment = &stencilAttachment,
.pDepthAttachment = &depthAttachment,
// .pStencilAttachment = &stencilAttachment,
};
vkCmdBeginRendering(commandBuffer, &renderInfo);
@ -1092,7 +1170,7 @@ void amdgpu::flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer,
ImageViewKey framebuffer{};
framebuffer.type = gnm::TextureType::Dim2D;
framebuffer.pitch = imageExtent.width;
framebuffer.address = address;
framebuffer.readAddress = address;
framebuffer.extent.width = imageExtent.width;
framebuffer.extent.height = imageExtent.height;
framebuffer.extent.depth = 1;

View file

@ -1,11 +1,11 @@
#pragma once
#include <array>
#include <bit>
#include <cstdint>
#include <cstdlib>
#include <gnm/constants.hpp>
#include <gnm/descriptors.hpp>
#include <bit>
namespace amdgpu {
inline constexpr uint32_t kMicroTileWidth = 8;
@ -496,6 +496,28 @@ constexpr std::uint32_t getPipeCount(PipeConfig pipeConfig) {
}
}
/// Derives an index into the macro tile mode table from a tile mode, the
/// element size in bits and the number of fragments per pixel.
///
/// The effective micro tile size in bytes is limited by the tile split and by
/// kDramRowSize; the index is the count of trailing zero bits of that size
/// expressed in 64-byte units. Array modes for which isPrt() is true are
/// shifted by 8.
constexpr int computeMacroTileIndex(amdgpu::TileMode tileMode,
                                    uint32_t bitsPerElement,
                                    uint32_t numFragmentsPerPixel) {
  const auto mode = tileMode.arrayMode();
  const auto microMode = tileMode.microTileMode();
  const auto sampleSplitField = tileMode.sampleSplit();
  const auto tileSplitField = tileMode.tileSplit();

  // Size in bytes of one micro tile holding a single fragment per pixel.
  const uint32_t thickness = getMicroTileThickness(mode);
  const uint32_t singleFragmentTileBytes =
      bitsPerElement * kMicroTileWidth * kMicroTileHeight * thickness / 8;

  // Non-depth surfaces split on a multiple of the single-fragment tile size
  // (never less than 256 bytes); depth surfaces take the split from the tile
  // mode's tileSplit field.
  const uint32_t sampleSplitFactor = 1 << sampleSplitField;
  uint32_t split = std::max(256U, sampleSplitFactor * singleFragmentTileBytes);
  if (microMode == amdgpu::kMicroTileModeDepth) {
    split = 64UL << tileSplitField;
  }

  const uint32_t clampedSplit = std::min(kDramRowSize, split);
  const uint32_t effectiveTileBytes =
      std::min(clampedSplit, numFragmentsPerPixel * singleFragmentTileBytes);

  const uint32_t index = std::countr_zero(effectiveTileBytes / 64);
  if (isPrt(mode)) {
    return index + 8;
  }
  return index;
}
SurfaceInfo computeSurfaceInfo(TileMode tileMode, gnm::TextureType type,
gnm::DataFormat dfmt, std::uint32_t width,
std::uint32_t height, std::uint32_t depth,

View file

@ -1,4 +1,5 @@
#pragma once
#include "gnm/constants.hpp"
#include "tiler.hpp"
#include <Scheduler.hpp>
#include <memory>
@ -10,13 +11,13 @@ struct GpuTiler {
~GpuTiler();
void detile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode, std::uint64_t srcTiledAddress,
std::uint64_t dstLinearAddress, int mipLevel, int baseArray,
int arrayCount);
amdgpu::TileMode tileMode, gnm::DataFormat dfmt,
std::uint64_t srcTiledAddress, std::uint64_t dstLinearAddress,
int mipLevel, int baseArray, int arrayCount);
void tile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode, std::uint64_t srcLinearAddress,
std::uint64_t dstTiledAddress, int mipLevel, int baseArray,
int arrayCount);
amdgpu::TileMode tileMode, gnm::DataFormat dfmt,
std::uint64_t srcLinearAddress, std::uint64_t dstTiledAddress,
int mipLevel, int baseArray, int arrayCount);
private:
std::unique_ptr<Impl> mImpl;

View file

@ -18,17 +18,24 @@ void main() {
uvec3 pos = gl_GlobalInvocationID;
uint64_t tiledSliceOffset = 0;
uint64_t linearSliceOffset = 0;
int arraySlice = 0;
int fragmentIndex = 0;
if (config.tiledSurfaceSize != 0) {
tiledSliceOffset = pos.z * config.tiledSurfaceSize;
linearSliceOffset = pos.z * config.linearSurfaceSize;
pos.z = 0;
}
uint64_t tiledByteOffset = getTiledBitOffset1D(
uint64_t tiledByteOffset = getTiledBitOffset2D(
config.dfmt,
config.tileMode,
pos,
config.macroTileMode,
config.dataSize,
config.bitsPerElement
arraySlice,
config.numFragments,
pos,
fragmentIndex
) / 8;
tiledByteOffset += tiledSliceOffset;

View file

@ -537,6 +537,86 @@ uint32_t tileMode_getSampleSplit(uint32_t tileMode) {
return (tileMode & 0x06000000) >> 25;
}
// Returns the 2-bit field stored in bits 0..1 of a macro tile mode word.
// Despite its name, the parameter carries the macro tile mode value.
uint32_t macroTileMode_getBankWidth(uint32_t tileMode) {
    return tileMode & 0x3u;
}
// Returns the 2-bit field stored in bits 2..3 of a macro tile mode word.
// Despite its name, the parameter carries the macro tile mode value.
uint32_t macroTileMode_getBankHeight(uint32_t tileMode) {
    return (tileMode >> 2) & 0x3u;
}
// Returns the 2-bit field stored in bits 4..5 of a macro tile mode word.
// Despite its name, the parameter carries the macro tile mode value.
uint32_t macroTileMode_getMacroTileAspect(uint32_t tileMode) {
    return (tileMode >> 4) & 0x3u;
}
// Returns the 2-bit field stored in bits 6..7 of a macro tile mode word.
// Despite its name, the parameter carries the macro tile mode value.
uint32_t macroTileMode_getNumBanks(uint32_t tileMode) {
    return (tileMode >> 6) & 0x3u;
}
// Maps a pipe configuration value to its number of pipes.
// Returns 0 for any configuration that is not handled here.
uint32_t getPipeCount(uint32_t pipeConfig) {
    if (pipeConfig == kPipeConfigP16) {
        return 16;
    }

    bool isEightPipe = pipeConfig == kPipeConfigP8_32x32_8x16 ||
                       pipeConfig == kPipeConfigP8_32x32_16x16;
    if (isEightPipe) {
        return 8;
    }

    return 0;
}
// Computes the pipe index of the element at (x, y) for the given pipe
// configuration by XOR-ing selected coordinate bits.
// Returns 0 for a configuration that is not handled here.
uint32_t getPipeIndex(uint32_t x, uint32_t y, uint32_t pipeCfg) {
    // Only bits 3..6 of x and bits 3..5 of y take part in the hash.
    uint32_t x3 = (x >> 3) & 1u;
    uint32_t x4 = (x >> 4) & 1u;
    uint32_t x5 = (x >> 5) & 1u;
    uint32_t x6 = (x >> 6) & 1u;
    uint32_t y3 = (y >> 3) & 1u;
    uint32_t y4 = (y >> 4) & 1u;
    uint32_t y5 = (y >> 5) & 1u;

    uint32_t pipe = 0;

    if (pipeCfg == kPipeConfigP8_32x32_8x16) {
        pipe = (x4 ^ y3 ^ x5) | ((x3 ^ y4) << 1) | ((x5 ^ y5) << 2);
    } else if (pipeCfg == kPipeConfigP8_32x32_16x16 || pipeCfg == kPipeConfigP16) {
        // Both configurations share the low three bits.
        pipe = (x3 ^ y3 ^ x4) | ((x4 ^ y4) << 1) | ((x5 ^ y5) << 2);
        if (pipeCfg == kPipeConfigP16) {
            // The 16-pipe configuration adds a fourth bit.
            pipe |= (x6 ^ y5) << 3;
        }
    }

    return pipe;
}
// Computes the bank index of the element at (x, y).
//
// The coordinates are first scaled down by the bank width times the pipe count
// (x) and by the bank height (y), using the position of the lowest set bit of
// each factor as the shift amount. Selected bits of the scaled coordinates are
// then XOR-ed according to the number of banks. Bank counts other than 2, 4, 8
// and 16 produce 0.
uint32_t getBankIndex(uint32_t x, uint32_t y, uint32_t bank_width, uint32_t bank_height, uint32_t num_banks, uint32_t num_pipes) {
    uint32_t xShift = findLSB(bank_width * num_pipes);
    uint32_t yShift = findLSB(bank_height);
    uint32_t xs = x >> xShift;
    uint32_t ys = y >> yShift;

    // Only bits 3..6 of the scaled coordinates are used.
    uint32_t xs3 = (xs >> 3) & 1u;
    uint32_t xs4 = (xs >> 4) & 1u;
    uint32_t xs5 = (xs >> 5) & 1u;
    uint32_t xs6 = (xs >> 6) & 1u;
    uint32_t ys3 = (ys >> 3) & 1u;
    uint32_t ys4 = (ys >> 4) & 1u;
    uint32_t ys5 = (ys >> 5) & 1u;
    uint32_t ys6 = (ys >> 6) & 1u;

    uint32_t bank = 0;

    if (num_banks == 2u) {
        bank = xs3 ^ ys3;
    } else if (num_banks == 4u) {
        bank = (xs3 ^ ys4) | ((xs4 ^ ys3) << 1);
    } else if (num_banks == 8u) {
        bank = (xs3 ^ ys5) | ((xs4 ^ ys4 ^ ys5) << 1) | ((xs5 ^ ys3) << 2);
    } else if (num_banks == 16u) {
        bank = (xs3 ^ ys6) | ((xs4 ^ ys5 ^ ys6) << 1) | ((xs5 ^ ys4) << 2) |
               ((xs6 ^ ys3) << 3);
    }

    return bank;
}
uint32_t bit_ceil(uint32_t x) {
x = x - 1;
x |= x >> 1;
@ -704,13 +784,223 @@ uint64_t getTiledBitOffset1D(uint32_t tileMode, uvec3 pos, uvec2 dataSize, uint3
return (sliceOffset + tileOffset) * 8 + elementOffset;
}
// Computes the bit offset of one element inside a macro-tiled (2D/3D array
// mode) surface.
//
// Parameters:
//   dfmt          - data format; only its bits-per-element value is used.
//   tileMode      - tile mode word (array mode, micro tile mode, pipe config,
//                   tile split and sample split are decoded from it).
//   macroTileMode - macro tile mode word (bank width/height, macro tile aspect
//                   and bank count are decoded from it).
//   dataSize      - x is used as the padded width and y as the padded height.
//   arraySlice    - when non-zero, replaces pos.z for the slice rotations.
//   numFragments  - exponent: the fragment count is 1 << numFragments.
//   pos           - element coordinates.
//   fragmentIndex - index of the fragment within the pixel.
//
// Returns the final byte address shifted left by 3, OR-ed with the bit offset
// inside that byte. The callers visible in this change divide the result by 8.
uint64_t getTiledBitOffset2D(uint32_t dfmt, uint32_t tileMode, uint32_t macroTileMode,
uvec2 dataSize, int arraySlice, uint32_t numFragments, u32vec3 pos, int fragmentIndex) {
uint32_t bitsPerFragment = getBitsPerElement(dfmt);
// NOTE(review): isBlockCompressed is computed but never read in this function.
bool isBlockCompressed = getTexelsPerElement(dfmt) > 1;
// Always zero here, so the bank swizzle below contributes nothing.
uint32_t tileSwizzleMask = 0;
uint32_t numFragmentsPerPixel = 1 << numFragments;
uint32_t arrayMode = tileMode_getArrayMode(tileMode);
// Micro tile thickness: 1 for thin modes, 4 for thick, 8 for xthick; any
// array mode not listed keeps the default of 1.
uint32_t tileThickness = 1;
switch (arrayMode) {
case kArrayMode2dTiledThin:
case kArrayMode3dTiledThin:
case kArrayModeTiledThinPrt:
case kArrayMode2dTiledThinPrt:
case kArrayMode3dTiledThinPrt:
tileThickness = 1;
break;
case kArrayMode1dTiledThick:
case kArrayMode2dTiledThick:
case kArrayMode3dTiledThick:
case kArrayModeTiledThickPrt:
case kArrayMode2dTiledThickPrt:
case kArrayMode3dTiledThickPrt:
tileThickness = 4;
break;
case kArrayMode2dTiledXThick:
case kArrayMode3dTiledXThick:
tileThickness = 8;
break;
default:
break;
}
uint32_t bitsPerElement = bitsPerFragment;
uint32_t paddedWidth = dataSize.x;
uint32_t paddedHeight = dataSize.y;
// Decode the macro tile mode fields; each is stored as an exponent.
uint32_t bankWidthHW = macroTileMode_getBankWidth(macroTileMode);
uint32_t bankHeightHW = macroTileMode_getBankHeight(macroTileMode);
uint32_t macroAspectHW = macroTileMode_getMacroTileAspect(macroTileMode);
uint32_t numBanksHW = macroTileMode_getNumBanks(macroTileMode);
uint32_t bankWidth = 1 << bankWidthHW;
uint32_t bankHeight = 1 << bankHeightHW;
// The bank count starts at 2 (field value 0 means 2 banks).
uint32_t numBanks = 2 << numBanksHW;
uint32_t macroTileAspect = 1 << macroAspectHW;
// Bytes in one micro tile with a single fragment per pixel, rounded up.
uint32_t tileBytes1x =
(tileThickness * bitsPerElement * kMicroTileWidth * kMicroTileHeight +
7) /
8;
uint32_t sampleSplitHw = tileMode_getSampleSplit(tileMode);
uint32_t tileSplitHw = tileMode_getTileSplit(tileMode);
uint32_t sampleSplit = 1 << sampleSplitHw;
// Depth micro tiles take the split from the tile mode's tileSplit field;
// everything else uses a multiple of the single-fragment tile size with a
// 256-byte floor. The result is capped at kDramRowSize.
uint32_t tileSplitC =
(tileMode_getMicroTileMode(tileMode) == kMicroTileModeDepth)
? (64 << tileSplitHw)
: max(256U, tileBytes1x * sampleSplit);
uint32_t tileSplitBytes = min(kDramRowSize, tileSplitC);
// NOTE(review): getPipeCount returns 0 for unhandled pipe configs, which
// would make the findLSB and divisions below meaningless — confirm callers
// only pass supported configurations.
uint32_t numPipes = getPipeCount(tileMode_getPipeConfig(tileMode));
uint32_t pipeInterleaveBits = findLSB(kPipeInterleaveBytes);
uint32_t pipeInterleaveMask = (1 << pipeInterleaveBits) - 1;
uint32_t pipeBits = findLSB(numPipes);
uint32_t bankBits = findLSB(numBanks);
uint32_t bankSwizzleMask = tileSwizzleMask;
uint32_t pipeSwizzleMask = 0;
// Macro tile dimensions in elements.
uint32_t macroTileWidth =
(kMicroTileWidth * bankWidth * numPipes) * macroTileAspect;
uint32_t macroTileHeight =
(kMicroTileHeight * bankHeight * numBanks) / macroTileAspect;
uint32_t microTileMode = tileMode_getMicroTileMode(tileMode);
// Index of the element inside its micro tile.
uint64_t elementIndex =
getElementIndex(pos, bitsPerElement, microTileMode, arrayMode);
// For the two plain PRT array modes the coordinates fed to the pipe/bank
// hashes wrap at the macro tile size.
uint32_t xh = pos.x;
uint32_t yh = pos.y;
if (arrayMode == kArrayModeTiledThinPrt ||
arrayMode == kArrayModeTiledThickPrt) {
xh %= macroTileWidth;
yh %= macroTileHeight;
}
uint64_t pipe = getPipeIndex(xh, yh, tileMode_getPipeConfig(tileMode));
uint64_t bank =
getBankIndex(xh, yh, bankWidth, bankHeight, numBanks, numPipes);
// Bytes in one micro tile including all fragments, rounded up.
uint32_t tileBytes = (kMicroTileWidth * kMicroTileHeight * tileThickness *
bitsPerElement * numFragmentsPerPixel +
7) /
8;
// Bit offset of the element inside the micro tile. Depth tiles keep the
// fragments of a pixel adjacent; other tiles store one full plane per
// fragment.
uint64_t elementOffset = 0;
if (microTileMode == kMicroTileModeDepth) {
uint64_t pixelOffset = elementIndex * bitsPerElement * numFragmentsPerPixel;
elementOffset = pixelOffset + (fragmentIndex * bitsPerElement);
} else {
uint64_t fragmentOffset =
fragmentIndex * (tileBytes / numFragmentsPerPixel) * 8;
elementOffset = fragmentOffset + (elementIndex * bitsPerElement);
}
// A thin micro tile larger than the tile split is spread across several
// "tile split slices"; pick the slice and reduce the offset into it.
uint64_t slicesPerTile = 1;
uint64_t tileSplitSlice = 0;
if (tileBytes > tileSplitBytes && tileThickness == 1) {
slicesPerTile = tileBytes / tileSplitBytes;
tileSplitSlice = elementOffset / (tileSplitBytes * 8);
elementOffset %= (tileSplitBytes * 8);
tileBytes = tileSplitBytes;
}
// Bytes one macro tile contributes to a single pipe/bank pair.
uint64_t macroTileBytes = (macroTileWidth / kMicroTileWidth) *
(macroTileHeight / kMicroTileHeight) * tileBytes /
(numPipes * numBanks);
uint64_t macroTilesPerRow = paddedWidth / macroTileWidth;
uint64_t macroTileRowIndex = pos.y / macroTileHeight;
uint64_t macroTileColumnIndex = pos.x / macroTileWidth;
uint64_t macroTileIndex =
(macroTileRowIndex * macroTilesPerRow) + macroTileColumnIndex;
uint64_t macro_tile_offset = macroTileIndex * macroTileBytes;
uint64_t macroTilesPerSlice =
macroTilesPerRow * (paddedHeight / macroTileHeight);
uint64_t sliceBytes = macroTilesPerSlice * macroTileBytes;
// The slice offset is computed from pos.z; arraySlice (when non-zero) only
// overrides the slice used for the rotations further down.
// NOTE(review): confirm this ordering is intended.
uint32_t slice = pos.z;
uint64_t sliceOffset =
(tileSplitSlice + slicesPerTile * slice / tileThickness) * sliceBytes;
if (arraySlice != 0) {
slice = arraySlice;
}
// Position of the micro tile inside its bank.
uint64_t tileRowIndex = (pos.y / kMicroTileHeight) % bankHeight;
uint64_t tileColumnIndex = ((pos.x / kMicroTileWidth) / numPipes) % bankWidth;
uint64_t tileIndex = (tileRowIndex * bankWidth) + tileColumnIndex;
uint64_t tileOffset = tileIndex * tileBytes;
uint64_t bankSwizzle = bankSwizzleMask;
uint64_t pipeSwizzle = pipeSwizzleMask;
// 3D array modes rotate the pipe index per slice.
uint64_t pipeSliceRotation = 0;
switch (arrayMode) {
case kArrayMode3dTiledThin:
case kArrayMode3dTiledThick:
case kArrayMode3dTiledXThick:
pipeSliceRotation =
max(1UL, (numPipes / 2UL) - 1UL) * (slice / tileThickness);
break;
default:
break;
}
pipeSwizzle += pipeSliceRotation;
pipeSwizzle &= (numPipes - 1);
pipe = pipe ^ pipeSwizzle;
// 2D and 3D (non-PRT) array modes rotate the bank index per slice.
uint64_t sliceRotation = 0;
switch (arrayMode) {
case kArrayMode2dTiledThin:
case kArrayMode2dTiledThick:
case kArrayMode2dTiledXThick:
sliceRotation = ((numBanks / 2) - 1) * (slice / tileThickness);
break;
case kArrayMode3dTiledThin:
case kArrayMode3dTiledThick:
case kArrayMode3dTiledXThick:
sliceRotation = max(1UL, (numPipes / 2UL) - 1UL) * (slice / tileThickness) / numPipes;
break;
default:
break;
}
// Thin 2D/3D modes (including their PRT variants) additionally rotate the
// bank index per tile split slice.
uint64_t tileSplitSliceRotation = 0;
switch (arrayMode) {
case kArrayMode2dTiledThin:
case kArrayMode3dTiledThin:
case kArrayMode2dTiledThinPrt:
case kArrayMode3dTiledThinPrt:
tileSplitSliceRotation = ((numBanks / 2) + 1) * tileSplitSlice;
break;
default:
break;
}
bank ^= bankSwizzle + sliceRotation;
bank ^= tileSplitSliceRotation;
bank &= (numBanks - 1);
// Combine the offsets in bits, then split into a byte offset and the
// remaining bit offset inside the byte.
uint64_t totalOffset =
(sliceOffset + macro_tile_offset + tileOffset) * 8 + elementOffset;
uint64_t bitOffset = totalOffset & 0x7;
totalOffset /= 8;
// Final byte address layout, from low to high bits:
// pipe interleave offset | pipe | bank | remaining offset.
uint64_t pipeInterleaveOffset = totalOffset & pipeInterleaveMask;
uint64_t offset = totalOffset >> pipeInterleaveBits;
uint64_t finalByteOffset =
pipeInterleaveOffset | (pipe << (pipeInterleaveBits)) |
(bank << (pipeInterleaveBits + pipeBits)) |
(offset << (pipeInterleaveBits + pipeBits + bankBits));
return (finalByteOffset << 3) | bitOffset;
}
// Parameters of one tile/detile dispatch, bound as uniform buffer 0.
// NOTE(review): the member order must stay in sync with the host-side Config
// struct that fills this buffer (the tail of that struct visible in
// GpuTiler::Impl lists the same fields from tileMode onwards) — confirm the
// leading members and the buffer layout rules match as well.
layout(binding=0) uniform Config {
// Source and destination buffer addresses.
uint64_t srcAddress;
uint64_t dstAddress;
// Passed as dataSize to the tiled-offset functions by the shader entry points.
uvec2 dataSize;
// Tile mode word decoded with the tileMode_get* helpers.
uint32_t tileMode;
// Macro tile mode word decoded with the macroTileMode_get* helpers.
uint32_t macroTileMode;
// Data format value forwarded to getTiledBitOffset2D.
uint32_t dfmt;
// Forwarded to getTiledBitOffset2D, which uses it as an exponent
// (fragment count = 1 << numFragments).
uint32_t numFragments;
// Bits per element. NOTE(review): the getTiledBitOffset2D call no longer
// takes this value; whether other shader code still reads it is not
// visible here.
uint32_t bitsPerElement;
// Per-slice sizes in bytes. When tiledSurfaceSize is non-zero the entry
// points treat the z invocation index as a slice number and offset both
// buffers by z times these sizes.
uint32_t tiledSurfaceSize;
uint32_t linearSurfaceSize;
// Explicit padding at the end of the block.
uint32_t padding0;
uint32_t padding1;
} config;

View file

@ -18,17 +18,23 @@ void main() {
uvec3 pos = gl_GlobalInvocationID;
uint64_t tiledSliceOffset = 0;
uint64_t linearSliceOffset = 0;
int arraySlice = 0;
int fragmentIndex = 0;
if (config.tiledSurfaceSize != 0) {
tiledSliceOffset = pos.z * config.tiledSurfaceSize;
linearSliceOffset = pos.z * config.linearSurfaceSize;
pos.z = 0;
}
uint64_t tiledByteOffset = getTiledBitOffset1D(
uint64_t tiledByteOffset = getTiledBitOffset2D(
config.dfmt,
config.tileMode,
pos,
config.macroTileMode,
config.dataSize,
config.bitsPerElement
arraySlice,
config.numFragments,
pos,
fragmentIndex
) / 8;
tiledByteOffset += tiledSliceOffset;

View file

@ -5,6 +5,169 @@
using namespace amdgpu;
// FIXME: should be properly implemented
//
// Computes the per-mip-level layout (dimensions, offset, tiled and linear
// sizes) and the total size of a surface that uses a macro-tiled array mode.
//
// Only `arrayMode`'s micro tile thickness is consulted; pitch and height are
// padded to the micro tile size and the slice size is grown until it is a
// multiple of kPipeInterleaveBytes. No macro-tile (bank/pipe) alignment is
// applied in this function, which is presumably what the FIXME refers to.
//
// Parameters:
//   arrayMode      - array mode; used for getMicroTileThickness only.
//   type           - texture type; selects cubemap / volume / MSAA handling.
//   dfmt           - data format; supplies bits and texels per element.
//   width, height  - surface size of mip level 0.
//   depth          - depth for volumes, otherwise the array slice count.
//   pitch          - row pitch of mip level 0.
//   baseArrayLayer, arrayCount - only used to derive numFragments for MSAA
//                    types. NOTE(review): confirm this is the intended source
//                    of the fragment count.
//   baseMipLevel, mipCount - levels 0 .. baseMipLevel + mipCount - 1 are laid
//                    out.
//   pow2pad        - round slice count and per-level dimensions up to powers
//                    of two.
//
// Calls std::abort() for block-compressed formats whose bits-per-element is
// not 1, 4 or 8.
static SurfaceInfo
computeTexture2dInfo(ArrayMode arrayMode, gnm::TextureType type,
gnm::DataFormat dfmt, std::uint32_t width,
std::uint32_t height, std::uint32_t depth,
std::uint32_t pitch, int baseArrayLayer, int arrayCount,
int baseMipLevel, int mipCount, bool pow2pad) {
bool isCubemap = type == gnm::TextureType::Cube;
bool isVolume = type == gnm::TextureType::Dim3D;
auto bitsPerFragment = getBitsPerElement(dfmt);
// Slice count: `depth` slices, times 6 faces for cubemaps; volumes count as
// a single slice.
std::uint32_t arraySliceCount = depth;
if (isCubemap) {
arraySliceCount *= 6;
} else if (isVolume) {
arraySliceCount = 1;
}
// numFragments is an exponent (fragments per pixel = 1 << numFragments).
int numFragments = (type == gnm::TextureType::Msaa2D ||
type == gnm::TextureType::MsaaArray2D)
? (baseArrayLayer + arrayCount - 1)
: 0;
auto numFragmentsPerPixel = 1 << numFragments;
auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
auto bitsPerElement = bitsPerFragment;
// From here on `depth` is the volume depth; non-volume surfaces use 1.
depth = isVolume ? depth : 1;
// For block-compressed formats one element is a whole block, so the element
// size is scaled by the number of texels per block.
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
bitsPerElement *= 8;
break;
case 4:
case 8:
bitsPerElement *= 16;
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (pow2pad) {
arraySliceCount = std::bit_ceil(arraySliceCount);
}
std::uint64_t surfaceOffset = 0;
std::uint64_t surfaceSize = 0;
SurfaceInfo result;
result.width = width;
result.height = height;
result.depth = depth;
result.pitch = pitch;
result.numFragments = numFragments;
result.bitsPerElement = bitsPerElement;
result.arrayLayerCount = arraySliceCount;
auto thickness = getMicroTileThickness(arrayMode);
// Lay out every mip level from 0 up to the last requested one so that the
// offsets of the requested levels account for the levels before them.
for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) {
// Dimensions of this level in texels, never below 1.
std::uint32_t elemWidth = std::max<std::uint64_t>(width >> mipLevel, 1);
std::uint32_t elemPitch = std::max<std::uint64_t>(pitch >> mipLevel, 1);
std::uint32_t elemHeight = std::max<std::uint64_t>(height >> mipLevel, 1);
std::uint32_t elemDepth = std::max<std::uint64_t>(depth >> mipLevel, 1);
std::uint32_t linearPitch = elemPitch;
std::uint32_t linearWidth = elemWidth;
std::uint32_t linearHeight = elemHeight;
std::uint32_t linearDepth = elemDepth;
// Convert texel dimensions to block dimensions for compressed formats.
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
linearWidth = std::max<std::uint64_t>((linearWidth + 7) / 8, 1);
linearPitch = std::max<std::uint64_t>((linearPitch + 7) / 8, 1);
break;
case 4:
case 8:
linearWidth = std::max<std::uint64_t>((linearWidth + 3) / 4, 1);
linearPitch = std::max<std::uint64_t>((linearPitch + 3) / 4, 1);
linearHeight = std::max<std::uint64_t>((linearHeight + 3) / 4, 1);
break;
case 16:
std::abort();
break;
default:
std::abort();
break;
}
}
if (pow2pad) {
linearPitch = std::bit_ceil(linearPitch);
linearWidth = std::bit_ceil(linearWidth);
linearHeight = std::bit_ceil(linearHeight);
linearDepth = std::bit_ceil(linearDepth);
}
// Levels above 0 derive their pitch from the width instead of the shifted
// level-0 pitch.
if (mipLevel > 0 && pitch > 0) {
linearPitch = linearWidth;
}
// Round pitch and height up to whole micro tiles.
std::uint32_t paddedPitch =
(linearPitch + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1);
std::uint32_t paddedHeight =
(linearHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1);
std::uint32_t paddedDepth = linearDepth;
// Round the depth up to a multiple of the micro tile thickness. Because
// linearDepth is always 1 for cubemaps at this point, the cubemap branch
// below cannot be reached.
if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) {
if (isCubemap) {
linearDepth = std::bit_ceil(linearDepth);
}
paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1);
}
// Grow the pitch one micro tile at a time until the physical slice size is
// a multiple of the pipe interleave size.
std::uint32_t tempPitch = paddedPitch;
std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) *
paddedHeight * bitsPerElement *
numFragmentsPerPixel;
logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) {
tempPitch += kMicroTileWidth;
logicalSliceSizeBytes = std::uint64_t(tempPitch) * paddedHeight *
bitsPerElement * numFragmentsPerPixel;
logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
}
// Tiled size of one array slice of this level.
surfaceSize = logicalSliceSizeBytes * paddedDepth;
// Size of the same level when stored linearly.
// NOTE(review): unlike the tiled size above, no operand is widened to 64
// bits here, and the multiplication by linearDepth happens before the
// division by 8 — confirm this cannot overflow and that the rounding is
// the intended one.
auto linearSize =
linearDepth *
(linearPitch * linearHeight * bitsPerElement * numFragmentsPerPixel +
7) /
8;
result.setSubresourceInfo(mipLevel, {
.dataWidth = linearPitch,
.dataHeight = linearHeight,
.dataDepth = linearDepth,
.offset = surfaceOffset,
.tiledSize = surfaceSize,
.linearSize = linearSize,
});
// The next level starts after all array slices of this one.
surfaceOffset += arraySliceCount * surfaceSize;
}
result.totalSize = surfaceOffset;
return result;
}
static SurfaceInfo
computeTexture1dInfo(ArrayMode arrayMode, gnm::TextureType type,
gnm::DataFormat dfmt, std::uint32_t width,
@ -370,7 +533,9 @@ SurfaceInfo amdgpu::computeSurfaceInfo(
case kArrayMode2dTiledThickPrt:
case kArrayMode3dTiledThinPrt:
case kArrayMode3dTiledThickPrt:
std::abort();
return computeTexture2dInfo(tileMode.arrayMode(), type, dfmt, width, height,
depth, pitch, baseArrayLayer, arrayCount,
baseMipLevel, mipCount, pow2pad);
}
std::abort();

View file

@ -175,9 +175,9 @@ getTiledOffset2D(gnm::TextureType texType, bool isPow2Padded,
bool isCubemap = texType == gnm::TextureType::Cube;
bool isVolume = texType == gnm::TextureType::Dim3D;
auto m_bitsPerFragment = getBitsPerElement(dfmt);
auto bitsPerFragment = getBitsPerElement(dfmt);
auto m_isBlockCompressed = getTexelsPerElement(dfmt) > 1;
auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
auto tileSwizzleMask = 0;
auto numFragmentsPerPixel = 1 << numFragments;
auto arrayMode = tileMode.arrayMode();
@ -208,12 +208,12 @@ getTiledOffset2D(gnm::TextureType texType, bool isPow2Padded,
break;
}
auto bitsPerElement = m_bitsPerFragment;
auto bitsPerElement = bitsPerFragment;
auto paddedWidth = pitch;
auto paddedHeight = height;
if (m_isBlockCompressed) {
switch (m_bitsPerFragment) {
if (isBlockCompressed) {
switch (bitsPerFragment) {
case 1:
bitsPerElement *= 8;
paddedWidth = std::max((paddedWidth + 7) / 8, 1);

View file

@ -93,10 +93,13 @@ struct amdgpu::GpuTiler::Impl {
uint32_t dataWidth;
uint32_t dataHeight;
uint32_t tileMode;
uint32_t macroTileMode;
uint32_t dfmt;
uint32_t numFragments;
uint32_t bitsPerElement;
uint32_t tiledSurfaceSize;
uint32_t linearSurfaceSize;
uint32_t padding[2];
};
Impl() {
@ -119,7 +122,8 @@ struct amdgpu::GpuTiler::Impl {
{
VkDescriptorPoolSize poolSizes[]{{
.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.descriptorCount = static_cast<std::uint32_t>(std::size(descriptorSets)) * 2,
.descriptorCount =
static_cast<std::uint32_t>(std::size(descriptorSets)) * 2,
}};
VkDescriptorPoolCreateInfo info{
@ -174,7 +178,7 @@ amdgpu::GpuTiler::~GpuTiler() = default;
void amdgpu::GpuTiler::detile(Scheduler &scheduler,
const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode,
amdgpu::TileMode tileMode, gnm::DataFormat dfmt,
std::uint64_t srcTiledAddress,
std::uint64_t dstLinearAddress, int mipLevel,
int baseArray, int arrayCount) {
@ -192,6 +196,7 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler,
config->dataWidth = subresource.dataWidth;
config->dataHeight = subresource.dataHeight;
config->tileMode = tileMode.raw;
config->dfmt = dfmt;
config->numFragments = info.numFragments;
config->bitsPerElement = info.bitsPerElement;
uint32_t groupCountZ = subresource.dataDepth;
@ -231,8 +236,13 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler,
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayMode3dTiledThickPrt:
std::abort();
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler2d.shader);
config->macroTileMode =
getDefaultMacroTileModes()[computeMacroTileIndex(
tileMode, info.bitsPerElement,
1 << info.numFragments)]
.raw;
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler1d.shader);
break;
}
@ -265,7 +275,7 @@ void amdgpu::GpuTiler::detile(Scheduler &scheduler,
void amdgpu::GpuTiler::tile(Scheduler &scheduler,
const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode,
amdgpu::TileMode tileMode, gnm::DataFormat dfmt,
std::uint64_t srcLinearAddress,
std::uint64_t dstTiledAddress, int mipLevel,
int baseArray, int arrayCount) {
@ -283,6 +293,7 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler,
config->dataWidth = subresource.dataWidth;
config->dataHeight = subresource.dataHeight;
config->tileMode = tileMode.raw;
config->dfmt = dfmt;
config->numFragments = info.numFragments;
config->bitsPerElement = info.bitsPerElement;
uint32_t groupCountZ = subresource.dataDepth;
@ -321,8 +332,12 @@ void amdgpu::GpuTiler::tile(Scheduler &scheduler,
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayMode3dTiledThickPrt:
std::abort();
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler2d.shader);
config->macroTileMode =
getDefaultMacroTileModes()[computeMacroTileIndex(
tileMode, info.bitsPerElement,
1 << info.numFragments)]
.raw;
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler1d.shader);
break;
}

View file

@ -15,6 +15,7 @@
#extension GL_EXT_shader_atomic_float2 : require
#extension GL_EXT_nonuniform_qualifier: require
#extension GL_EXT_samplerless_texture_functions : require
#extension GL_EXT_debug_printf : enable
#define FLT_MAX 3.402823466e+38
#define FLT_MIN 1.175494351e-38
@ -235,6 +236,8 @@ float32_t ps_input_vgpr(int32_t index, f32vec4 fragCoord, bool frontFace) {
case kPsVGprInputPosFixed:
return 0;
}
// debugPrintfEXT("ps_input_vgpr: invalid index %d", index);
return 0;
}
@ -385,8 +388,10 @@ uint32_t v_cndmask_b32(uint32_t x, uint32_t y, uint64_t mask) {
// V_ADD_F32: returns the single-precision sum x + y.
float32_t v_add_f32(float32_t x, float32_t y) {
  float32_t sum = x + y;
  return sum;
}
// V_SUB_F32: returns the single-precision difference x - y.
float32_t v_sub_f32(float32_t x, float32_t y) {
  float32_t difference = x - y;
  return difference;
}
// V_SUBREV_F32: subtraction with the operands reversed, i.e. y - x.
float32_t v_subrev_f32(float32_t x, float32_t y) {
  float32_t difference = y - x;
  return difference;
}
float32_t v_mac_legacy_f32(float32_t x, float32_t y, float32_t dst) {
return x == 0 || y == 0 ? dst : fma(x, y, dst);
void v_mac_legacy_f32(inout float32_t dst, float32_t x, float32_t y) {
if (!(x == 0 || y == 0)) {
dst = fma(x, y, dst);
}
}
float32_t v_mul_legacy_f32(float32_t x, float32_t y) {
return x == 0 || y == 0 ? 0 : x * y;
@ -425,7 +430,7 @@ uint32_t v_and_b32(uint32_t x, uint32_t y) { return x & y; }
// V_OR_B32: bitwise OR of the two 32-bit operands.
uint32_t v_or_b32(uint32_t x, uint32_t y) {
  uint32_t combined = x | y;
  return combined;
}
// V_XOR_B32: bitwise exclusive OR of the two 32-bit operands.
uint32_t v_xor_b32(uint32_t x, uint32_t y) {
  uint32_t toggled = x ^ y;
  return toggled;
}
// V_BFM_B32: builds a bit-field mask of (x & 31) consecutive one bits,
// shifted left by (y & 31) positions.
uint32_t v_bfm_b32(uint32_t x, uint32_t y) {
  uint32_t width = x & 0x1f;
  uint32_t shift = y & 0x1f;
  // The literal 1 is deliberately left as a plain integer literal so the
  // shift/subtract is evaluated with the same types as before.
  return ((1 << width) - 1) << shift;
}
float32_t v_mac_f32(float32_t x, float32_t y, float32_t dst) { return fma(x, y, dst); }
void v_mac_f32(inout float32_t dst, float32_t x, float32_t y) { dst = fma(x, y, dst); }
// V_MADMK_F32: fused multiply-add with the constant k as the multiplier,
// i.e. x * k + y.
float32_t v_madmk_f32(float32_t x, float32_t y, float32_t k) {
  float32_t result = fma(x, k, y);
  return result;
}
// V_MADAK_F32: fused multiply-add with the constant k as the addend,
// i.e. x * y + k.
float32_t v_madak_f32(float32_t x, float32_t y, float32_t k) {
  float32_t result = fma(x, y, k);
  return result;
}
// V_BCNT_U32_B32 (single-operand form): number of set bits in x.
uint32_t v_bcnt_u32_b32(uint32_t x) {
  return bitCount(x);
}
@ -2575,6 +2580,8 @@ void image_sample(inout f32vec4 vdata, f32vec3 vaddr, int32_t textureIndexHint,
return;
}
// debugPrintfEXT("image_sample: textureType: %u, coord: %v3f, result: %v4f, dmask: %u", textureType, vaddr, result, dmask);
int vdataIndex = 0;
for (int i = 0; i < 4; ++i) {
if ((dmask & (1 << i)) != 0) {

View file

@ -1422,12 +1422,6 @@ static void createInitialValues(GcnConverter &converter,
if (stage != gcn::Stage::Cs) {
context.writeReg(loc, builder, gcn::RegId::Exec, 0, context.imm64(1));
// context.writeReg(loc, builder, gcn::RegId::ThreadId, 0,
// context.imm32(0));
replaceVariableWithConstant(
context.getOrCreateRegisterVariable(gcn::RegId::ThreadId),
context.imm32(0));
}
if (stage == gcn::Stage::VsVs || stage == gcn::Stage::GsVs ||
@ -1561,6 +1555,12 @@ gcn::convertToSpv(Context &context, ir::Region body,
createInitialValues(converter, env, stage, result.info, body);
instructionsToSpv(converter, importer, stage, env, semanticInfo, result.info,
body);
if (stage != gcn::Stage::Cs) {
replaceVariableWithConstant(
context.getOrCreateRegisterVariable(gcn::RegId::ThreadId),
context.imm32(0));
}
createEntryPoint(context, stage, std::move(body));
for (int userSgpr = std::countr_zero(context.requiredUserSgprs);

View file

@ -127,8 +127,6 @@ readVop2Inst(GcnInstruction &inst, std::uint64_t &address,
if (op == ir::vop2::MADMK_F32 || op == ir::vop2::MADAK_F32) {
inst.addOperand(createImmediateGcnOperand(address));
} else if (op == ir::vop2::MAC_F32) {
inst.addOperand(createVgprGcnOperand(vdst).withR());
}
}
@ -343,8 +341,6 @@ readVop3Inst(GcnInstruction &inst, std::uint64_t &address,
.withNeg(((neg >> 2) & 1) != 0));
} else if (op == ir::vop3::MADMK_F32 || op == ir::vop3::MADAK_F32) {
inst.addOperand(createImmediateGcnOperand(address));
} else if (op == ir::vop3::MAC_F32) {
inst.addOperand(createSgprGcnOperand(address, vdst).withRW());
}
} else if (op >= 384 && op < ir::vop1::OpCount + 384) {
// vop1
@ -527,14 +523,14 @@ readMtbufInst(GcnInstruction &inst, std::uint64_t &address,
inst.op = op;
inst.addOperand(createVgprGcnOperand(vdata).withAccess(dataAccess));
if (idxen) {
inst.addOperand(createVgprGcnOperand(vaddr).withR());
if (offen) {
inst.addOperand(createVgprGcnOperand(vaddr + (idxen ? 1 : 0)).withR());
} else {
inst.addOperand(GcnOperand::createConstant(0u));
}
if (offen) {
inst.addOperand(createVgprGcnOperand(vaddr + (idxen ? 1 : 0)).withR());
if (idxen) {
inst.addOperand(createVgprGcnOperand(vaddr).withR());
} else {
inst.addOperand(GcnOperand::createConstant(0u));
}

View file

@ -1081,6 +1081,23 @@ static ir::Value deserializeGcnRegion(
auto instSem =
semInfo.findSemantic(ir::getInstructionId(isaInst.kind, isaInst.op));
// Helper that guards the code emitted for the current instruction with an
// EXEC_TEST check. It builds the following shape:
//
//   <EXEC_TEST> ; selection-merge(mergeBlock) ; branch-conditional
//   instBlock:   (placed right after `instrBegin`)
//     ... instruction body ...
//     branch mergeBlock
//   mergeBlock:
//
// NOTE(review): `instrBegin` presumably marks where the current
// instruction's code starts; confirm against the enclosing function.
auto createExecTest = [&] {
// Label both paths converge on, created through the main builder.
auto mergeBlock = builder.createSpvLabel(loc);
// Insert an unconditional branch to the merge label immediately before it.
gcn::Builder::createInsertBefore(converter, mergeBlock)
.createSpvBranch(loc, mergeBlock);
// Open a new block right after `instrBegin`.
auto instBlock = gcn::Builder::createInsertAfter(converter, instrBegin)
.createSpvLabel(loc);
// Everything below is inserted in front of `instBlock`.
auto prependInstBuilder =
gcn::Builder::createInsertBefore(converter, instBlock);
// EXEC_TEST value, typed from the EXEC_TEST semantic's return type.
auto exec = prependInstBuilder.createValue(
loc, ir::amdgpu::EXEC_TEST,
converter.getType(execTestSem->returnType));
// Structured selection: declare the merge target, then branch on `exec`
// with `instBlock` and `mergeBlock` as the two targets.
prependInstBuilder.createSpvSelectionMerge(
loc, mergeBlock, ir::spv::SelectionControl::None);
prependInstBuilder.createSpvBranchConditional(loc, exec, instBlock,
mergeBlock);
};
if (instSem == nullptr) {
if (isaInst == ir::sopp::BRANCH) {
auto target =
@ -1268,6 +1285,9 @@ static ir::Value deserializeGcnRegion(
inst.addOperand(createOperandRead(loc, paramBuilder, uint32TV, op));
}
if (isaInst == ir::exp::EXP) {
createExecTest();
}
continue;
}
@ -1400,20 +1420,7 @@ static ir::Value deserializeGcnRegion(
}
if (!hasDestination && injectExecTest) {
auto mergeBlock = builder.createSpvLabel(loc);
gcn::Builder::createInsertBefore(converter, mergeBlock)
.createSpvBranch(loc, mergeBlock);
auto instBlock = gcn::Builder::createInsertAfter(converter, instrBegin)
.createSpvLabel(loc);
auto prependInstBuilder =
gcn::Builder::createInsertBefore(converter, instBlock);
auto exec = prependInstBuilder.createValue(
loc, ir::amdgpu::EXEC_TEST,
converter.getType(execTestSem->returnType));
prependInstBuilder.createSpvSelectionMerge(
loc, mergeBlock, ir::spv::SelectionControl::None);
prependInstBuilder.createSpvBranchConditional(loc, exec, instBlock,
mergeBlock);
createExecTest();
}
}

View file

@ -252,5 +252,56 @@ constexpr ZFormat getZFormat(DataFormat dfmt) {
constexpr StencilFormat getStencilFormat(DataFormat dfmt) {
return dfmt == kDataFormat8 ? kStencil8 : kStencilInvalid;
}
} // namespace gnm
/// Translates a depth format into its data format:
/// kZFormat32Float -> kDataFormat32, kZFormat16 -> kDataFormat16.
/// Any other value (including kZFormatInvalid) yields kDataFormatInvalid.
constexpr DataFormat getDataFormat(ZFormat format) {
  if (format == kZFormat32Float) {
    return kDataFormat32;
  }
  if (format == kZFormat16) {
    return kDataFormat16;
  }
  return kDataFormatInvalid;
}
/// Translates a depth format into its numeric format:
/// kZFormat32Float -> kNumericFormatFloat, kZFormat16 -> kNumericFormatUInt.
/// Any other value (including kZFormatInvalid) yields kNumericFormatUNorm.
///
/// NOTE(review): 16-bit depth is reported as UInt rather than UNorm; confirm
/// this is what the consumers of this mapping expect.
constexpr NumericFormat getNumericFormat(ZFormat format) {
  if (format == kZFormat32Float) {
    return kNumericFormatFloat;
  }
  if (format == kZFormat16) {
    return kNumericFormatUInt;
  }
  return kNumericFormatUNorm;
}
/// Translates a stencil format into its data format: kStencil8 maps to
/// kDataFormat8; everything else (including kStencilInvalid) maps to
/// kDataFormatInvalid.
constexpr DataFormat getDataFormat(StencilFormat format) {
  return format == kStencil8 ? kDataFormat8 : kDataFormatInvalid;
}
/// Translates a stencil format into its numeric format: kStencil8 maps to
/// kNumericFormatSInt; everything else (including kStencilInvalid) maps to
/// kNumericFormatUNorm.
///
/// NOTE(review): 8-bit stencil is reported as SInt rather than UInt; confirm
/// this is what the consumers of this mapping expect.
constexpr NumericFormat getNumericFormat(StencilFormat format) {
  return format == kStencil8 ? kNumericFormatSInt : kNumericFormatUNorm;
}
} // namespace gnm