diff --git a/rpcsx/gpu/Cache.cpp b/rpcsx/gpu/Cache.cpp index 4692cb54c..69ae65231 100644 --- a/rpcsx/gpu/Cache.cpp +++ b/rpcsx/gpu/Cache.cpp @@ -200,6 +200,68 @@ void Cache::ShaderResources::loadResources( buffer.address()); } + for (auto &imageBuffer : res.imageBuffers) { + auto word0 = eval(imageBuffer.words[0]).zExtScalar(); + auto word1 = eval(imageBuffer.words[1]).zExtScalar(); + auto word2 = eval(imageBuffer.words[2]).zExtScalar(); + auto word3 = eval(imageBuffer.words[3]).zExtScalar(); + + if (!word0 || !word1 || !word2 || !word3) { + res.dump(); + rx::die("failed to evaluate V#"); + } + + gnm::TBuffer tbuffer{}; + std::memcpy(reinterpret_cast(&tbuffer), &*word0, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&tbuffer) + 1, &*word1, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&tbuffer) + 2, &*word2, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&tbuffer) + 3, &*word3, + sizeof(std::uint32_t)); + + if (imageBuffer.words[4] != nullptr) { + auto word4 = eval(imageBuffer.words[4]).zExtScalar(); + auto word5 = eval(imageBuffer.words[5]).zExtScalar(); + auto word6 = eval(imageBuffer.words[6]).zExtScalar(); + auto word7 = eval(imageBuffer.words[7]).zExtScalar(); + + if (!word4 || !word5 || !word6 || !word7) { + res.dump(); + rx::die("failed to evaluate 256 bit T#"); + } + + std::memcpy(reinterpret_cast(&tbuffer) + 4, &*word4, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&tbuffer) + 5, &*word5, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&tbuffer) + 6, &*word6, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&tbuffer) + 7, &*word7, + sizeof(std::uint32_t)); + } + + auto info = computeSurfaceInfo( + getDefaultTileModes()[tbuffer.tiling_idx], tbuffer.type, tbuffer.dfmt, + tbuffer.width + 1, tbuffer.height + 1, tbuffer.depth + 1, + tbuffer.pitch + 1, 0, tbuffer.last_array + 1, 0, tbuffer.last_level + 1, + tbuffer.pow2pad != 0); + + if (auto it = imageMemoryTable.queryArea(tbuffer.address()); + it != 
imageMemoryTable.end() && + it.beginAddress() == tbuffer.address() && + it.size() == info.totalTiledSize) { + it.get().second |= imageBuffer.access; + } else { + imageMemoryTable.map( + tbuffer.address(), tbuffer.address() + info.totalTiledSize, + {ImageBufferKey::createFrom(tbuffer), imageBuffer.access}); + } + resourceSlotToAddress.emplace_back(slotOffset + imageBuffer.resourceSlot, + tbuffer.address()); + } + for (auto &texture : res.textures) { auto word0 = eval(texture.words[0]).zExtScalar(); auto word1 = eval(texture.words[1]).zExtScalar(); @@ -325,6 +387,28 @@ void Cache::ShaderResources::buildMemoryTable(MemoryTable &memoryTable) { } } } +void Cache::ShaderResources::buildImageMemoryTable(MemoryTable &memoryTable) { + memoryTable.count = 0; + + for (auto p : imageMemoryTable) { + auto range = rx::AddressRange::fromBeginEnd(p.beginAddress, p.endAddress); + auto buffer = cacheTag->getImageBuffer(p.payload.first, p.payload.second); + + auto memoryTableSlot = memoryTable.count; + memoryTable.slots[memoryTable.count++] = { + .address = p.beginAddress, + .size = range.size(), + .flags = static_cast(p.payload.second), + .deviceAddress = buffer.deviceAddress, + }; + + for (auto [slot, address] : resourceSlotToAddress) { + if (address >= p.beginAddress && address < p.endAddress) { + slotResources[slot] = memoryTableSlot; + } + } + } +} std::uint32_t Cache::ShaderResources::getResourceSlot(std::uint32_t id) { if (auto it = slotResources.find(id); it != slotResources.end()) { @@ -386,6 +470,10 @@ Cache::ShaderResources::eval(ir::InstructionId instId, rx::die("resource depends on texture value"); } + if (instId == ir::amdgpu::IMAGE_BUFFER) { + rx::die("resource depends on image buffer value"); + } + if (instId == ir::amdgpu::SAMPLER) { rx::die("resource depends on sampler value"); } @@ -865,6 +953,7 @@ struct CachedImage : Cache::Entry { SurfaceInfo info; bool expensive() { + return false; if (kDisableCache) { return false; } @@ -1139,7 +1228,7 @@ ImageBufferKey 
ImageBufferKey::createFrom(const ImageKey &imageKey) { } SamplerKey SamplerKey::createFrom(const gnm::SSampler &sampler) { - float lodBias = ((std::int16_t(sampler.lod_bias) << 2) >> 2) / float(256.f); + float lodBias = sampler.lod_bias / 256.f; // FIXME: lodBias can be scaled by gnm::TBuffer return { @@ -1152,8 +1241,8 @@ SamplerKey SamplerKey::createFrom(const gnm::SSampler &sampler) { .mipLodBias = lodBias, .maxAnisotropy = 0, // max_aniso_ratio .compareOp = toVkCompareOp(sampler.depth_compare_func), - .minLod = static_cast(sampler.min_lod), - .maxLod = static_cast(sampler.max_lod), + .minLod = sampler.min_lod / 256.f, + .maxLod = sampler.max_lod / 256.f, .borderColor = toVkBorderColor(sampler.border_color_type), .anisotropyEnable = false, .compareEnable = sampler.depth_compare_func != gnm::CompareFunc::Never, @@ -1334,12 +1423,9 @@ Cache::Buffer Cache::Tag::getBuffer(rx::AddressRange range, Access access) { auto it = table.queryArea(range.beginAddress()); if (it == table.end() || !it.range().contains(range)) { - if (mParent->flushImages(*this, range)) { - mScheduler->submit(); - mScheduler->wait(); - } - - if (mParent->flushImageBuffers(*this, range)) { + auto flushRange = mParent->flushImages(*this, range); + flushRange = flushRange.merge(mParent->flushImageBuffers(*this, range)); + if (flushRange) { mScheduler->submit(); mScheduler->wait(); } @@ -1375,18 +1461,18 @@ Cache::Buffer Cache::Tag::getBuffer(rx::AddressRange range, Access access) { addressRange.beginAddress(), addressRange.size()) || !mParent->isInSync(addressRange, cached->tagId)) { - if (mParent->flushImages(*this, range)) { + auto flushedRange = mParent->flushImages(*this, range); + flushedRange = + flushedRange.merge(mParent->flushImageBuffers(*this, range)); + + if (flushedRange) { getScheduler().submit(); getScheduler().wait(); } - if (mParent->flushImageBuffers(*this, range)) { - getScheduler().submit(); - getScheduler().wait(); - } - - mParent->trackUpdate(EntryType::HostVisibleBuffer, 
addressRange, it.get(), - getReadId(), cached->expensive()); + mParent->trackUpdate( + EntryType::HostVisibleBuffer, addressRange, it.get(), getReadId(), + (access & Access::Write) == Access::None && cached->expensive()); amdgpu::RemoteMemory memory{mParent->mVmId}; cached->update(addressRange, memory.getPointer(addressRange.beginAddress())); @@ -1448,13 +1534,18 @@ Cache::Buffer Cache::Tag::getInternalDeviceLocalBuffer(std::uint64_t size) { } void Cache::Tag::buildDescriptors(VkDescriptorSet descriptorSet) { + auto &res = mStorage->shaderResources; auto memoryTableBuffer = getMemoryTable(); + auto imageMemoryTableBuffer = getImageMemoryTable(); auto memoryTable = std::bit_cast(memoryTableBuffer.data); - mStorage->shaderResources.buildMemoryTable(*memoryTable); + auto imageMemoryTable = + std::bit_cast(imageMemoryTableBuffer.data); - for (auto &sampler : mStorage->shaderResources.samplerResources) { - uint32_t index = - &sampler - mStorage->shaderResources.samplerResources.data(); + res.buildMemoryTable(*memoryTable); + res.buildImageMemoryTable(*imageMemoryTable); + + for (auto &sampler : res.samplerResources) { + uint32_t index = &sampler - res.samplerResources.data(); VkDescriptorImageInfo samplerInfo{.sampler = sampler.handle}; @@ -1471,8 +1562,8 @@ void Cache::Tag::buildDescriptors(VkDescriptorSet descriptorSet) { vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr); } - for (auto &imageResources : mStorage->shaderResources.imageResources) { - auto dim = (&imageResources - mStorage->shaderResources.imageResources) + 1; + for (auto &imageResources : res.imageResources) { + auto dim = (&imageResources - res.imageResources) + 1; auto binding = static_cast( Cache::getDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, dim)); @@ -1725,8 +1816,9 @@ Cache::ImageBuffer Cache::Tag::getImageBuffer(const ImageBufferKey &key, auto tiledBuffer = getBuffer(range, Access::Read); if (tiledBuffer.tagId != cached->tagId) { - 
mParent->trackUpdate(EntryType::ImageBuffer, range, it.get(), - tiledBuffer.tagId, cached->expensive()); + mParent->trackUpdate( + EntryType::ImageBuffer, range, it.get(), tiledBuffer.tagId, + (access & Access::Write) == Access::None && cached->expensive()); cached->update(this, cached->addressRange, tiledBuffer); } @@ -1881,8 +1973,9 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { imageBufferKey.address = key.readAddress; auto imageBuffer = getImageBuffer(imageBufferKey, Access::Read); if (imageBuffer.tagId != cached->tagId) { - mParent->trackUpdate(EntryType::Image, storeRange, it.get(), - imageBuffer.tagId, cached->expensive()); + mParent->trackUpdate( + EntryType::Image, storeRange, it.get(), imageBuffer.tagId, + (access & Access::Write) == Access::None && cached->expensive()); cached->update(this, cached->addressRange, imageBuffer); } @@ -1992,6 +2085,11 @@ void Cache::Tag::release() { mAcquiredMemoryTable = -1; } + if (mAcquiredImageMemoryTable + 1 != 0) { + getCache()->mMemoryTablePool.release(mAcquiredImageMemoryTable); + mAcquiredImageMemoryTable = -1; + } + std::vector> tmpResources; bool hasSubmits = false; @@ -2141,6 +2239,7 @@ Cache::Shader Cache::GraphicsTag::getShader( } std::uint64_t memoryTableAddress = getMemoryTable().deviceAddress; + std::uint64_t imageMemoryTableAddress = getImageMemoryTable().deviceAddress; std::uint64_t gdsAddress = mParent->getGdsBuffer().getAddress(); mStorage->shaderResources.cacheTag = this; @@ -2228,6 +2327,14 @@ Cache::Shader Cache::GraphicsTag::getShader( configPtr[index] = static_cast(memoryTableAddress >> 32); } break; + case gcn::ConfigType::ImageMemoryTable: + if (slot.data == 0) { + configPtr[index] = static_cast(imageMemoryTableAddress); + } else { + configPtr[index] = + static_cast(imageMemoryTableAddress >> 32); + } + break; case gcn::ConfigType::Gds: if (slot.data == 0) { configPtr[index] = static_cast(gdsAddress); @@ -2295,6 +2402,7 @@ Cache::ComputeTag::getShader(const 
Registers::ComputeConfig &pgm) { } std::uint64_t memoryTableAddress = getMemoryTable().deviceAddress; + std::uint64_t imageMemoryTableAddress = getImageMemoryTable().deviceAddress; std::uint64_t gdsAddress = mParent->getGdsBuffer().getAddress(); mStorage->shaderResources.cacheTag = this; @@ -2365,6 +2473,14 @@ Cache::ComputeTag::getShader(const Registers::ComputeConfig &pgm) { configPtr[index] = static_cast(memoryTableAddress >> 32); } break; + case gcn::ConfigType::ImageMemoryTable: + if (slot.data == 0) { + configPtr[index] = static_cast(imageMemoryTableAddress); + } else { + configPtr[index] = + static_cast(imageMemoryTableAddress >> 32); + } + break; case gcn::ConfigType::Gds: if (slot.data == 0) { configPtr[index] = static_cast(gdsAddress); @@ -2585,12 +2701,10 @@ void Cache::invalidate(Tag &tag, rx::AddressRange range) { markHostInvalidated(mDevice, mVmId, range.beginAddress(), range.size()); } void Cache::flush(Tag &tag, rx::AddressRange range) { - if (flushImages(tag, range)) { - tag.getScheduler().submit(); - tag.getScheduler().wait(); - } + auto flushedRange = flushImages(tag, range); + flushedRange = flushedRange.merge(flushImageBuffers(tag, range)); - if (flushImageBuffers(tag, range)) { + if (flushedRange) { tag.getScheduler().submit(); tag.getScheduler().wait(); } diff --git a/rpcsx/gpu/Cache.hpp b/rpcsx/gpu/Cache.hpp index 67257a8d0..aa7d2acd0 100644 --- a/rpcsx/gpu/Cache.hpp +++ b/rpcsx/gpu/Cache.hpp @@ -87,6 +87,8 @@ struct ImageBufferKey { static ImageBufferKey createFrom(const gnm::TBuffer &tbuffer); static ImageBufferKey createFrom(const ImageKey &imageKey); + + constexpr auto operator<=>(const ImageBufferKey &) const = default; }; struct SamplerKey { @@ -244,6 +246,7 @@ private: std::uint32_t slotOffset = 0; rx::MemoryTableWithPayload bufferMemoryTable; + rx::MemoryTableWithPayload> imageMemoryTable; std::vector> resourceSlotToAddress; std::vector samplerResources; std::vector imageResources[3]; @@ -256,6 +259,7 @@ private: cacheTag = nullptr; 
slotOffset = 0; bufferMemoryTable.clear(); + imageMemoryTable.clear(); resourceSlotToAddress.clear(); samplerResources.clear(); for (auto &res : imageResources) { @@ -268,6 +272,7 @@ private: void loadResources(shader::gcn::Resources &res, std::span userSgprs); void buildMemoryTable(MemoryTable &memoryTable); + void buildImageMemoryTable(MemoryTable &memoryTable); std::uint32_t getResourceSlot(std::uint32_t id); template T readPointer(std::uint64_t address) { @@ -317,6 +322,7 @@ private: std::unique_lock mResourcesLock; TagId mTagId{}; std::uint32_t mAcquiredMemoryTable = -1; + std::uint32_t mAcquiredImageMemoryTable = -1; }; public: @@ -392,6 +398,24 @@ public: return result; } + Buffer getImageMemoryTable() { + if (mAcquiredImageMemoryTable + 1 == 0) { + mAcquiredImageMemoryTable = mParent->mMemoryTablePool.acquire(); + } + + auto &buffer = mParent->mMemoryTableBuffer; + auto offset = mAcquiredImageMemoryTable * kMemoryTableSize; + + Buffer result{ + .offset = offset, + .deviceAddress = buffer.getAddress() + offset, + .tagId = getReadId(), + .data = buffer.getData() + offset, + }; + + return result; + } + std::shared_ptr findShader(const ShaderKey &key, const ShaderKey *dependedKey = nullptr); friend Cache; diff --git a/rpcsx/gpu/Device.cpp b/rpcsx/gpu/Device.cpp index 3b629042d..f5dfd44f4 100644 --- a/rpcsx/gpu/Device.cpp +++ b/rpcsx/gpu/Device.cpp @@ -224,20 +224,23 @@ Device::Device() : vkContext(createVkContext(this)) { rx::AddressRange::fromBeginSize(address, rx::mem::pageSize); auto tag = getCacheTag(vmId, sched); - if (tag.getCache()->flushImages(tag, range)) { + auto flushedRange = tag.getCache()->flushImages(tag, range); + flushedRange = + flushedRange.merge(tag.getCache()->flushImageBuffers(tag, range)); + + if (flushedRange) { sched.submit(); sched.wait(); } - if (tag.getCache()->flushImageBuffers(tag, range)) { - sched.submit(); - sched.wait(); + flushedRange = tag.getCache()->flushBuffers(flushedRange); + + if (flushedRange) { + unlockReadWrite(vmId, 
flushedRange.beginAddress(), + flushedRange.size()); + } else { + unlockReadWrite(vmId, range.beginAddress(), range.size()); } - - auto flushedRange = tag.getCache()->flushBuffers(range); - - assert(flushedRange.isValid() && flushedRange.size() > 0); - unlockReadWrite(vmId, flushedRange.beginAddress(), flushedRange.size()); } } }); @@ -915,6 +918,23 @@ void Device::waitForIdle() { } } + { + auto &queue = graphicsPipes[0].ceQueue; + if (queue.wptr != queue.rptr) { + allProcessed = false; + } + } + + for (auto &pipe : computePipes) { + for (auto &queue : pipe.queues) { + for (auto &ring : queue) { + if (ring.wptr != ring.rptr) { + allProcessed = false; + } + } + } + } + if (allProcessed) { break; } diff --git a/rpcsx/gpu/lib/amdgpu-tiler/include/amdgpu/tiler.hpp b/rpcsx/gpu/lib/amdgpu-tiler/include/amdgpu/tiler.hpp index 2c745f6c2..53e81e622 100644 --- a/rpcsx/gpu/lib/amdgpu-tiler/include/amdgpu/tiler.hpp +++ b/rpcsx/gpu/lib/amdgpu-tiler/include/amdgpu/tiler.hpp @@ -138,6 +138,9 @@ struct TileMode { ((static_cast(mode) << 25) & 0x06000000); return *this; } + + constexpr auto operator<=>(const TileMode &other) const { return raw <=> other.raw; } + constexpr bool operator==(const TileMode &other) const { return raw == other.raw; } }; struct MacroTileMode { diff --git a/rpcsx/gpu/lib/gcn-shader/include/shader/GcnConverter.hpp b/rpcsx/gpu/lib/gcn-shader/include/shader/GcnConverter.hpp index 958d72f2d..c45932dbd 100644 --- a/rpcsx/gpu/lib/gcn-shader/include/shader/GcnConverter.hpp +++ b/rpcsx/gpu/lib/gcn-shader/include/shader/GcnConverter.hpp @@ -104,6 +104,7 @@ enum class ConfigType { UserSgpr, ResourceSlot, MemoryTable, + ImageMemoryTable, Gds, PsInputVGpr, VsInputSGpr, @@ -156,6 +157,11 @@ struct Resources { ir::Value words[4]; }; + struct ImageBuffer : Resource { + Access access; + ir::Value words[8]; + }; + struct Sampler : Resource { bool unorm; ir::Value words[4]; @@ -165,8 +171,9 @@ struct Resources { bool hasUnknown = false; std::uint32_t slots = 0; std::vector 
pointers; - std::vector textures; std::vector buffers; + std::vector textures; + std::vector imageBuffers; std::vector samplers; void print(std::ostream &os, ir::NameStorage &ns) const; diff --git a/rpcsx/gpu/lib/gcn-shader/include/shader/GcnInstruction.hpp b/rpcsx/gpu/lib/gcn-shader/include/shader/GcnInstruction.hpp index 4c4fad05d..9386fc624 100644 --- a/rpcsx/gpu/lib/gcn-shader/include/shader/GcnInstruction.hpp +++ b/rpcsx/gpu/lib/gcn-shader/include/shader/GcnInstruction.hpp @@ -29,6 +29,8 @@ struct GcnOperand { Buffer, Texture128, Texture256, + ImageBuffer128, + ImageBuffer256, Sampler, Pointer, }; @@ -137,6 +139,13 @@ struct GcnOperand { .kind = (is128 ? Kind::Texture128 : Kind::Texture256), }; } + static constexpr GcnOperand createImageBuffer(GcnOperand firstReg, bool is128) { + return { + .firstRegisterKind = firstReg.kind, + .firstRegisterIndex = static_cast(firstReg.value), + .kind = (is128 ? Kind::ImageBuffer128 : Kind::ImageBuffer256), + }; + } static constexpr GcnOperand createBuffer(GcnOperand firstReg) { return { .firstRegisterKind = firstReg.kind, diff --git a/rpcsx/gpu/lib/gcn-shader/include/shader/dialect/amdgpu.hpp b/rpcsx/gpu/lib/gcn-shader/include/shader/dialect/amdgpu.hpp index 5e3754256..1137d5309 100644 --- a/rpcsx/gpu/lib/gcn-shader/include/shader/dialect/amdgpu.hpp +++ b/rpcsx/gpu/lib/gcn-shader/include/shader/dialect/amdgpu.hpp @@ -10,6 +10,7 @@ enum Op { VBUFFER, SAMPLER, TBUFFER, + IMAGE_BUFFER, POINTER, OMOD, NEG_ABS, @@ -40,6 +41,8 @@ inline const char *getInstructionName(unsigned op) { return "sampler"; case TBUFFER: return "tbuffer"; + case IMAGE_BUFFER: + return "image_buffer"; case POINTER: return "pointer"; case OMOD: diff --git a/rpcsx/gpu/lib/gcn-shader/include/shader/gcn.hpp b/rpcsx/gpu/lib/gcn-shader/include/shader/gcn.hpp index b92df2b52..f04386d4e 100644 --- a/rpcsx/gpu/lib/gcn-shader/include/shader/gcn.hpp +++ b/rpcsx/gpu/lib/gcn-shader/include/shader/gcn.hpp @@ -41,6 +41,7 @@ enum RegId { VgprCount, ThreadId, MemoryTable, 
+ ImageMemoryTable, Gds, }; diff --git a/rpcsx/gpu/lib/gcn-shader/shaders/rdna.glsl b/rpcsx/gpu/lib/gcn-shader/shaders/rdna.glsl index d30f69108..33b19b39d 100644 --- a/rpcsx/gpu/lib/gcn-shader/shaders/rdna.glsl +++ b/rpcsx/gpu/lib/gcn-shader/shaders/rdna.glsl @@ -63,7 +63,7 @@ DEFINE_SIZEOF(float64_t, 8); uint thread_id; uint64_t exec; -float32_t swizzle(f32vec4 comp, int sel) { +uint32_t swizzle(u32vec4 comp, int sel) { switch (sel) { case 0: return 0; case 1: return 1; @@ -76,8 +76,8 @@ float32_t swizzle(f32vec4 comp, int sel) { return 1; } -f32vec4 swizzle(f32vec4 comp, int selX, int selY, int selZ, int selW) { - return f32vec4(swizzle(comp, selX), swizzle(comp, selY), swizzle(comp, selZ), swizzle(comp, selW)); +u32vec4 swizzle(u32vec4 comp, int selX, int selY, int selZ, int selW) { + return u32vec4(swizzle(comp, selX), swizzle(comp, selY), swizzle(comp, selZ), swizzle(comp, selW)); } int32_t sext(int32_t x, uint bits) { @@ -247,10 +247,13 @@ float32_t ps_input_vgpr(int32_t index, f32vec4 fragCoord, bool frontFace) { case kPsVGprInputFrontFace: return intBitsToFloat(frontFace ? 
1 : 0); case kPsVGprInputAncillary: + debugPrintfEXT("ps_input_vgpr: kPsVGprInputAncillary"); return 0; case kPsVGprInputSampleCoverage: + debugPrintfEXT("ps_input_vgpr: kPsVGprInputSampleCoverage"); return 0; case kPsVGprInputPosFixed: + debugPrintfEXT("ps_input_vgpr: kPsVGprInputPosFixed"); return 0; } @@ -1389,6 +1392,11 @@ const int kBufferChannelTypeUInt = 0x00000004; const int kBufferChannelTypeSInt = 0x00000005; const int kBufferChannelTypeSNormNoZero = 0x00000006; const int kBufferChannelTypeFloat = 0x00000007; +const int kNumericFormatSrgb = 0x00000009; +const int kNumericFormatUBNorm = 0x0000000A; +const int kNumericFormatUBNormNoZero = 0x0000000B; +const int kNumericFormatUBInt = 0x0000000C; +const int kNumericFormatUBScaled = 0x0000000D; uint64_t compute_vbuffer_address(uint size, u32vec4 vbuffer, uint64_t soff, uint64_t OFFSET, bool IDXEN, uint64_t vINDEX, uint64_t vOFFSET) { bool addTid = vbuffer_addtid_en(vbuffer); @@ -1594,8 +1602,10 @@ uint32_t convert_from_nfmt(uint32_t data, uint bits, uint nfmt) { data = zext(data, bits); switch (nfmt) { + case kNumericFormatSrgb: case kBufferChannelTypeUNorm: - return floatBitsToUint(float(uint(data)) / ((1 << bits) - 1)); + uint32_t result = floatBitsToUint(float(uint(data)) / ((1 << bits) - 1)); + return result; case kBufferChannelTypeSNorm: return floatBitsToUint(float(sext(int(data), bits)) / ((1 << (bits - 1)) - 1)); @@ -1607,7 +1617,7 @@ uint32_t convert_from_nfmt(uint32_t data, uint bits, uint nfmt) { return floatBitsToUint(float(sext(int(data), bits))); case kBufferChannelTypeUInt: - return data; + return floatBitsToUint(float(data)); case kBufferChannelTypeSInt: return uint32_t(sext(int(data), bits)); @@ -1617,38 +1627,52 @@ uint32_t convert_from_nfmt(uint32_t data, uint bits, uint nfmt) { case kBufferChannelTypeFloat: return data; + + default: + debugPrintfEXT("convert_from_nfmt: unexpected nfmt %x", nfmt); + break; } return 0; } uint32_t convert_to_nfmt(uint32_t data, uint bits, uint nfmt) { - data = 
zext(data, bits); - + uint32_t result = 0; switch (nfmt) { case kBufferChannelTypeUNorm: - return uint32_t(clamp(uintBitsToFloat(data), 0, 1) * ((1 << bits) - 1)); + result = uint32_t(clamp(uintBitsToFloat(data), 0, 1) * ((1 << bits) - 1)); + break; case kBufferChannelTypeSNorm: - return uint32_t(clamp(uintBitsToFloat(data), -1, 1) * ((1 << (bits - 1)) - 1)); + result = uint32_t(clamp(uintBitsToFloat(data), -1, 1) * ((1 << (bits - 1)) - 1)); + break; case kBufferChannelTypeUScaled: - return uint32_t(uintBitsToFloat(data)); + result = uint32_t(uintBitsToFloat(data)); + break; case kBufferChannelTypeUInt: - return data; + result = uint32_t(uintBitsToFloat(data)); + break; case kBufferChannelTypeSInt: - return uint32_t(sext(int32_t(data), bits)); + result = uint32_t(sext(int32_t(uintBitsToFloat(data)), bits)); + break; case kBufferChannelTypeSNormNoZero: - return uint32_t(clamp(uintBitsToFloat(data), -1, 1) * ((1 << bits) - 1) / 2 - 1); + result = uint32_t(clamp(uintBitsToFloat(data), -1, 1) * ((1 << bits) - 1) / 2 - 1); + break; case kBufferChannelTypeFloat: - return data; + result = data; + break; + + default: + debugPrintfEXT("convert_to_nfmt: unexpected nfmt %x", nfmt); + break; } - return 0; + return zext(result, bits); } uint32_t convert_from_format_x(uint32_t data, uint dfmt, uint nfmt) { @@ -1877,6 +1901,10 @@ uint32_t convert_to_format(uint element, u32vec4 data, uint dfmt, uint nfmt) { (convert_to_nfmt(data[2], 10, nfmt) << 20) | (convert_to_nfmt(data[3], 2, nfmt) << 30) ); + + default: + debugPrintfEXT("convert_to_format: unexpected format"); + break; } return uint32_t(0); @@ -1938,6 +1966,7 @@ u32vec4 buffer_load_format(uint dfmt, uint nfmt, uint32_t vOFFSET, uint32_t vIND uint64_t address = compute_vbuffer_address(data_size, vbuffer, soff, OFFSET, IDXEN, vINDEX, vOFFSET); if (address == 0 || dfmt == kBufferFormatInvalid) { + debugPrintfEXT("buffer_load_format: invalid buffer"); return u32vec4(0); } @@ -1945,6 +1974,7 @@ u32vec4 buffer_load_format(uint 
dfmt, uint nfmt, uint32_t vOFFSET, uint32_t vIND uint64_t deviceAddress = findMemoryAddress(address, data_size, memoryLocationHint, deviceAreaSize); if (deviceAddress == kInvalidAddress || deviceAreaSize < data_size) { + debugPrintfEXT("buffer_load_format: out of buffer memory"); return u32vec4(0); } @@ -2189,7 +2219,8 @@ uint32_t[16] s_load_dwordx16(int32_t memoryLocationHint, uint64_t sbase, int32_t uint64_t deviceAddress = findMemoryAddress(base_address + (offset & ~0x3ul), size, memoryLocationHint, deviceAreaSize); \ int32_t _offset = 0; \ for (int i = 0; i < N; i++) { \ - if (deviceAddress == kInvalidAddress || _offset + SIZEOF(uint32_t) > deviceAreaSize) { \ + if (deviceAddress == kInvalidAddress || offset + _offset > size - SIZEOF(uint32_t) || _offset + SIZEOF(uint32_t) > deviceAreaSize) { \ + debugPrintfEXT("S_BUFFER_LOAD_DWORD: out of buffer memory"); \ sdst[i] = 0; \ } else { \ sdst[i] = MEMORY_DATA_REF(uint32_t, deviceAddress + _offset); \ @@ -2210,6 +2241,10 @@ u32vec2 s_buffer_load_dwordx2(int32_t memoryLocationHint, u32vec4 vbuffer, int32 u32vec4 s_buffer_load_dwordx4(int32_t memoryLocationHint, u32vec4 vbuffer, int32_t offset) { u32vec4 sdst; S_BUFFER_LOAD_DWORD(sdst, memoryLocationHint, vbuffer, offset, 4); + if (offset == 48) { + uint64_t base = vbuffer_base(vbuffer); + debugPrintfEXT("s_buffer_load_dwordx4: %v4u, base=%lx", sdst, base); + } return sdst; } uint32_t[8] s_buffer_load_dwordx8(int32_t memoryLocationHint, u32vec4 vbuffer, int32_t offset) { @@ -2629,6 +2664,132 @@ int findTexture3DIndex(int32_t textureIndexHint, uint32_t tbuffer[8]) { return textureIndexHint; } +const uint32_t kImageFlagDmask = 0xf; +const uint32_t kImageFlagR128 = 1 << 4; +const uint32_t kImageFlagDA = 1 << 5; +const uint32_t kImageFlagUnorm = 1 << 6; +const uint32_t kImageFlagTFE = 1 << 7; + +struct ImageInfo { + uint64_t offset; + uvec3 extent; + uint16_t pitch; + uint16_t baseArraySlice; + uint16_t arraySliceCount; + uint8_t baseMipLevel; + uint8_t mipLevelCount; + 
uint8_t type; + uint8_t dataSize; +}; + +ImageInfo getImageInfo(uint32_t tbuffer[8], uint32_t mipLevel) { + uint8_t type = tbuffer_type(tbuffer); + uint16_t width = uint16_t(tbuffer_width(tbuffer) + 1u); + uint16_t height = uint16_t(tbuffer_height(tbuffer) + 1u); + uint16_t depth = uint16_t(tbuffer_depth(tbuffer) + 1u); + uint16_t pitch = uint16_t(tbuffer_pitch(tbuffer) + 1u); + uint16_t baseArray = tbuffer_base_array(tbuffer); + uint16_t lastArray = tbuffer_last_array(tbuffer); + uint8_t baseLevel = tbuffer_base_level(tbuffer); + uint8_t lastLevel = tbuffer_last_level(tbuffer); + bool pow2pad = tbuffer_pow2pad(tbuffer); + bool isVolume = type == kTextureType3D; + bool isCubemap = type == kTextureTypeCube; + depth = isVolume ? depth : uint16_t(1); + + uint arraySliceCount = depth; + + if (isCubemap) { + arraySliceCount *= 6; + } else if (isVolume) { + arraySliceCount = 1; + } + + if (pow2pad) { + if ((arraySliceCount & (arraySliceCount - 1)) != 0 || + (width & (width - 1)) != 0 || + (height & (height - 1)) != 0 || + (depth & (depth - 1)) != 0 || + (pitch & (pitch - 1)) != 0) { + debugPrintfEXT("getImageInfo: pow2pad"); + } + } + + uint64_t offset = 0; + uint8_t dfmt = tbuffer_dfmt(tbuffer); + uint dataSize = size_of_format(dfmt); + + for (uint32_t i = 0; i < mipLevel; ++i) { + uint16_t mipHeight = uint16_t(max(height >> i, 1)); + uint16_t mipDepth = uint16_t(max(depth >> i, 1)); + uint16_t mipPitch = uint16_t(max(pitch >> i, 1)); + + offset += arraySliceCount * dataSize * mipHeight * mipPitch * mipDepth; + } + + width = uint16_t(max(width >> mipLevel, 1)); + height = uint16_t(max(height >> mipLevel, 1)); + depth = uint16_t(max(depth >> mipLevel, 1)); + pitch = uint16_t(max(pitch >> mipLevel, 1)); + + ImageInfo result; + result.offset = offset; + result.extent = uvec3(width, height, depth); + result.pitch = pitch; + result.baseArraySlice = baseArray; + result.arraySliceCount = uint16_t(min(arraySliceCount, lastArray - baseArray + 1)); + result.baseMipLevel = 
baseLevel; + result.mipLevelCount = uint8_t(lastLevel - baseLevel + 1); + result.type = type; + result.dataSize = uint8_t(dataSize); + + return result; +} + +uint64_t findImageUnormPixelAddress(int32_t imageMemoryIndexHint, uint32_t tbuffer[8], uint32_t mipLevel, uint32_t arrayLayer, i32vec3 pos) { + ImageInfo img = getImageInfo(tbuffer, mipLevel); + + if (any(lessThan(pos, ivec3(0))) || any(greaterThan(pos, img.extent))) { + return kInvalidAddress; + } + + uint64_t address = tbuffer_base(tbuffer); + + address += img.offset; + address += img.dataSize * pos.x; + address += img.dataSize * img.pitch * pos.y; + address += img.dataSize * img.pitch * img.extent.y * pos.z; + + uint64_t deviceAreaSize = 0; + uint64_t deviceAddress = findImageMemoryAddress(address, img.dataSize, imageMemoryIndexHint, deviceAreaSize); + + if (deviceAddress == kInvalidAddress || deviceAreaSize < img.dataSize) { + return kInvalidAddress; + } + + return deviceAddress; +} + +uint64_t findImagePixelAddress(int32_t imageMemoryIndexHint, uint32_t tbuffer[8], float mipLevel, uint32_t arrayLayer, f32vec3 pos, bool unorm) { + i32vec3 unormPos; + if (unorm) { + unormPos = i32vec3(pos); + } else { + unormPos.x = int32_t((tbuffer_width(tbuffer) + 1) * pos.x); + unormPos.y = int32_t((tbuffer_height(tbuffer) + 1) * pos.y); + unormPos.z = int32_t((tbuffer_depth(tbuffer) + 1) * pos.z); + } + + uint8_t baseLevel = tbuffer_base_level(tbuffer); + uint8_t lastLevel = tbuffer_last_level(tbuffer); + + uint32_t umipLevel = baseLevel + uint32_t((lastLevel - baseLevel + 1) * mipLevel); + umipLevel = min(umipLevel, lastLevel + 1); + umipLevel = max(umipLevel, baseLevel); + + return findImageUnormPixelAddress(imageMemoryIndexHint, tbuffer, umipLevel, arrayLayer, unormPos); +} + // void image_gather4(inout u32vec4 vdata, u32vec4 vaddr, int32_t textureIndexHint, uint32_t tbuffer[8], int32_t samplerIndexHint, u32vec4 samplerDescriptor) {} // image_gather4_cl // image_gather4_l @@ -2685,35 +2846,74 @@ int 
findTexture3DIndex(int32_t textureIndexHint, uint32_t tbuffer[8]) { // void image_atomic_umin() {} // void image_atomic_xor() {} -void image_load(inout f32vec4 vdata, i32vec3 vaddr, int32_t textureIndexHint, uint32_t tbuffer[8], uint32_t dmask) { - uint8_t textureType = tbuffer_type(tbuffer); - f32vec4 result; +void image_load(inout u32vec4 vdata, u32vec3 vaddr, int32_t imageBufferIndexHint, uint32_t tbuffer[8], uint32_t flags) { + u32vec3 pos = u32vec3(0); - switch (uint(textureType)) { + switch (uint(tbuffer_type(tbuffer))) { case kTextureType1D: case kTextureTypeArray1D: - result = texelFetch(textures1D[findTexture1DIndex(textureIndexHint, tbuffer)], vaddr.x, 0); + pos.x = vaddr[0]; break; case kTextureType2D: case kTextureTypeCube: case kTextureTypeArray2D: - case kTextureTypeMsaa2D: - case kTextureTypeMsaaArray2D: - result = texelFetch(textures2D[findTexture2DIndex(textureIndexHint, tbuffer)], vaddr.xy, 0); + pos.xy = vaddr.xy; break; case kTextureType3D: - result = texelFetch(textures3D[findTexture3DIndex(textureIndexHint, tbuffer)], vaddr, 0); + pos.xyz = vaddr.xyz; break; - default: + case kTextureTypeMsaa2D: + case kTextureTypeMsaaArray2D: + debugPrintfEXT("image_load: MSAA"); + pos.xy = ivec2(vaddr.xy); + break; + } + + uint64_t deviceAddress = findImageUnormPixelAddress(imageBufferIndexHint, tbuffer, 0, 0, i32vec3(pos)); + if (deviceAddress == kInvalidAddress) { + debugPrintfEXT("image_load: invalid address"); return; } + uint8_t dfmt = tbuffer_dfmt(tbuffer); + uint8_t nfmt = tbuffer_nfmt(tbuffer); + + uint data_size = size_of_format(dfmt); + uint elements_count = (data_size + SIZEOF(uint32_t) - 1) / SIZEOF(uint32_t); + uint channel_count = components_of_format(dfmt); + uint channel_size = data_size / channel_count; + uint channels_per_element; + + if (data_size > SIZEOF(uint32_t)) { + channels_per_element = SIZEOF(uint32_t) / channel_size; + } else { + channels_per_element = channel_count; + } + + u32vec4 result = u32vec4(0); + int outIndex = 0; + + for 
(uint element = 0; element < elements_count; element++) { + uint32_t data = MEMORY_DATA_REF(uint32_t, deviceAddress); + u32vec4 unpacked = convert_from_format(data, dfmt, nfmt); + deviceAddress += SIZEOF(uint32_t); + for (int channel = 0; channel < channels_per_element; channel++) { + result[outIndex++] = unpacked[channel]; + } + } + + result = swizzle(result, + tbuffer_dst_sel_x(tbuffer), + tbuffer_dst_sel_y(tbuffer), + tbuffer_dst_sel_z(tbuffer), + tbuffer_dst_sel_w(tbuffer)); + int vdataIndex = 0; for (int i = 0; i < 4; ++i) { - if ((dmask & (1 << i)) != 0) { + if ((flags & (1 << i)) != 0) { vdata[vdataIndex++] = result[i]; } } @@ -2722,36 +2922,80 @@ void image_load(inout f32vec4 vdata, i32vec3 vaddr, int32_t textureIndexHint, ui // void image_load_pck() {} // void image_load_pck_sgn() {} -void image_load_mip(inout f32vec4 vdata, u32vec4 vaddr_u, int32_t textureIndexHint, uint32_t tbuffer[8], uint32_t dmask) { - uint8_t textureType = tbuffer_type(tbuffer); - f32vec4 result; - i32vec4 vaddr = i32vec4(vaddr_u); +void image_load_mip(inout u32vec4 vdata, u32vec4 vaddr, int32_t imageBufferIndexHint, uint32_t tbuffer[8], uint32_t flags) { + u32vec3 pos = u32vec3(0); + uint32_t mipLevel = 0; - switch (uint(textureType)) { + switch (uint(tbuffer_type(tbuffer))) { case kTextureType1D: case kTextureTypeArray1D: - result = texelFetch(textures1D[findTexture1DIndex(textureIndexHint, tbuffer)], vaddr.x, vaddr.y); + pos.x = vaddr[0]; + mipLevel = vaddr.y; break; case kTextureType2D: case kTextureTypeCube: case kTextureTypeArray2D: - case kTextureTypeMsaa2D: - case kTextureTypeMsaaArray2D: - result = texelFetch(textures2D[findTexture2DIndex(textureIndexHint, tbuffer)], vaddr.xy, vaddr.z); + pos.xy = vaddr.xy; + mipLevel = vaddr.z; break; case kTextureType3D: - result = texelFetch(textures3D[findTexture3DIndex(textureIndexHint, tbuffer)], vaddr.xyz, vaddr.w); + pos.xyz = vaddr.xyz; + mipLevel = vaddr.w; break; - default: + case kTextureTypeMsaa2D: + case 
kTextureTypeMsaaArray2D: + debugPrintfEXT("image_load_mip: MSAA"); + pos.xy = ivec2(vaddr.xy); + mipLevel = vaddr.z; + break; + } + + uint64_t deviceAddress = findImageUnormPixelAddress(imageBufferIndexHint, tbuffer, mipLevel, 0, i32vec3(pos)); + if (deviceAddress == kInvalidAddress) { + debugPrintfEXT("image_load_mip: invalid address"); return; } + uint8_t dfmt = tbuffer_dfmt(tbuffer); + uint8_t nfmt = tbuffer_nfmt(tbuffer); + + uint data_size = size_of_format(dfmt); + uint elements_count = (data_size + SIZEOF(uint32_t) - 1) / SIZEOF(uint32_t); + uint channel_count = components_of_format(dfmt); + uint channel_size = data_size / channel_count; + uint channels_per_element; + + if (data_size > SIZEOF(uint32_t)) { + channels_per_element = SIZEOF(uint32_t) / channel_size; + } else { + channels_per_element = channel_count; + } + + u32vec4 result = u32vec4(0); + int outIndex = 0; + + for (uint element = 0; element < elements_count; element++) { + uint32_t data = MEMORY_DATA_REF(uint32_t, deviceAddress); + u32vec4 unpacked = convert_from_format(data, dfmt, nfmt); + deviceAddress += SIZEOF(uint32_t); + // debugPrintfEXT("image_load_mip: data: %x, unpacked: %v4x, element: %u, channels: %u", data, unpacked, element, channels_per_element); + for (int channel = 0; channel < channels_per_element; channel++) { + result[outIndex++] = unpacked[channel]; + } + } + + result = swizzle(result, + tbuffer_dst_sel_x(tbuffer), + tbuffer_dst_sel_y(tbuffer), + tbuffer_dst_sel_z(tbuffer), + tbuffer_dst_sel_w(tbuffer)); + int vdataIndex = 0; for (int i = 0; i < 4; ++i) { - if ((dmask & (1 << i)) != 0) { + if ((flags & (1 << i)) != 0) { vdata[vdataIndex++] = result[i]; } } @@ -2759,10 +3003,203 @@ void image_load_mip(inout f32vec4 vdata, u32vec4 vaddr_u, int32_t textureIndexHi // void image_load_mip_pck() {} // void image_load_mip_pck_sgn() {} -// void image_store() {} -// void image_store_pck() {} -// void image_store_mip() {} -// void image_store_mip_pck() {} + +void image_store(u32vec4 
vdata, u32vec4 vaddr, int32_t imageBufferIndexHint, uint32_t tbuffer[8], uint32_t flags) { + u32vec3 pos = u32vec3(0); + + switch (uint(tbuffer_type(tbuffer))) { + case kTextureType1D: + case kTextureTypeArray1D: + pos.x = vaddr[0]; + break; + + case kTextureType2D: + case kTextureTypeCube: + case kTextureTypeArray2D: + pos.xy = vaddr.xy; + break; + + case kTextureType3D: + pos.xyz = vaddr.xyz; + break; + + case kTextureTypeMsaa2D: + case kTextureTypeMsaaArray2D: + debugPrintfEXT("image_store: MSAA"); + pos.xy = ivec2(vaddr.xy); + break; + } + + uint64_t deviceAddress = findImageUnormPixelAddress(imageBufferIndexHint, tbuffer, 0, 0, i32vec3(pos)); + if (deviceAddress == kInvalidAddress) { + debugPrintfEXT("image_store: invalid address"); + return; + } + + if ((flags & kImageFlagDmask) != 0xf) { + debugPrintfEXT("image_store: unexpected dmask. flags %x", flags); + } + + uint8_t dfmt = tbuffer_dfmt(tbuffer); + uint8_t nfmt = tbuffer_nfmt(tbuffer); + + uint data_size = size_of_format(dfmt); + uint elements_count = (data_size + SIZEOF(uint32_t) - 1) / SIZEOF(uint32_t); + + for (uint element = 0; element < elements_count; element++) { + uint32_t value = convert_to_format(element, vdata, dfmt, nfmt); + MEMORY_DATA_REF(uint32_t, deviceAddress) = value; + deviceAddress += SIZEOF(uint32_t); + } +} +void image_store_pck(u32vec4 vdata, u32vec4 vaddr, int32_t imageBufferIndexHint, uint32_t tbuffer[8], uint32_t flags) { + u32vec3 pos = u32vec3(0); + + switch (uint(tbuffer_type(tbuffer))) { + case kTextureType1D: + case kTextureTypeArray1D: + pos.x = vaddr[0]; + break; + + case kTextureType2D: + case kTextureTypeCube: + case kTextureTypeArray2D: + pos.xy = vaddr.xy; + break; + + case kTextureType3D: + pos.xyz = vaddr.xyz; + break; + + case kTextureTypeMsaa2D: + case kTextureTypeMsaaArray2D: + debugPrintfEXT("image_store_pck: MSAA"); + pos.xy = ivec2(vaddr.xy); + break; + } + + uint64_t deviceAddress = findImageUnormPixelAddress(imageBufferIndexHint, tbuffer, 0, 0, i32vec3(pos)); + 
if (deviceAddress == kInvalidAddress) { + debugPrintfEXT("image_store_pck: invalid address"); + return; + } + + if ((flags & kImageFlagDmask) != 0xf) { + debugPrintfEXT("image_store_pck: unexpected dmask. flags %x", flags); + } + + uint8_t dfmt = tbuffer_dfmt(tbuffer); + uint8_t nfmt = tbuffer_nfmt(tbuffer); + + uint data_size = size_of_format(dfmt); + uint elements_count = (data_size + SIZEOF(uint32_t) - 1) / SIZEOF(uint32_t); + + for (uint element = 0; element < elements_count; element++) { + MEMORY_DATA_REF(uint32_t, deviceAddress) = vdata[element]; + deviceAddress += SIZEOF(uint32_t); + } +} +void image_store_mip(u32vec4 vdata, u32vec4 vaddr, int32_t imageBufferIndexHint, uint32_t tbuffer[8], uint32_t flags) { + u32vec3 pos = u32vec3(0); + uint32_t mipLevel = 0; + + switch (uint(tbuffer_type(tbuffer))) { + case kTextureType1D: + case kTextureTypeArray1D: + pos.x = vaddr[0]; + mipLevel = vaddr.y; + break; + + case kTextureType2D: + case kTextureTypeCube: + case kTextureTypeArray2D: + pos.xy = vaddr.xy; + mipLevel = vaddr.z; + break; + + case kTextureType3D: + pos.xyz = vaddr.xyz; + mipLevel = vaddr.w; + break; + + case kTextureTypeMsaa2D: + case kTextureTypeMsaaArray2D: + debugPrintfEXT("image_store_mip: MSAA"); + pos.xy = ivec2(vaddr.xy); + mipLevel = vaddr.z; + break; + } + + uint64_t deviceAddress = findImageUnormPixelAddress(imageBufferIndexHint, tbuffer, mipLevel, 0, i32vec3(pos)); + if (deviceAddress == kInvalidAddress) { + debugPrintfEXT("image_store_mip: invalid address"); + return; + } + + if ((flags & kImageFlagDmask) != 0xf) { + debugPrintfEXT("image_store_mip: unexpected dmask. 
flags %x", flags); + } + + uint8_t dfmt = tbuffer_dfmt(tbuffer); + uint8_t nfmt = tbuffer_nfmt(tbuffer); + + uint data_size = size_of_format(dfmt); + uint elements_count = (data_size + SIZEOF(uint32_t) - 1) / SIZEOF(uint32_t); + + for (uint element = 0; element < elements_count; element++) { + uint32_t value = convert_to_format(element, vdata, dfmt, nfmt); + MEMORY_DATA_REF(uint32_t, deviceAddress) = value; + deviceAddress += SIZEOF(uint32_t); + } +} +void image_store_mip_pck(u32vec4 vdata, u32vec4 vaddr, int32_t imageBufferIndexHint, uint32_t tbuffer[8], uint32_t flags) { + u32vec3 pos = u32vec3(0); + uint32_t mipLevel = 0; + + switch (uint(tbuffer_type(tbuffer))) { + case kTextureType1D: + case kTextureTypeArray1D: + pos.x = vaddr[0]; + mipLevel = vaddr.y; + break; + + case kTextureType2D: + case kTextureTypeCube: + case kTextureTypeArray2D: + pos.xy = vaddr.xy; + mipLevel = vaddr.z; + break; + + case kTextureType3D: + pos.xyz = vaddr.xyz; + mipLevel = vaddr.w; + break; + + case kTextureTypeMsaa2D: + case kTextureTypeMsaaArray2D: + debugPrintfEXT("image_store_mip_pck: MSAA"); + return; + } + + uint64_t deviceAddress = findImageUnormPixelAddress(imageBufferIndexHint, tbuffer, mipLevel, 0, i32vec3(pos)); + if (deviceAddress == kInvalidAddress) { + debugPrintfEXT("image_store_mip_pck: invalid address"); + return; + } + + if ((flags & kImageFlagDmask) != 0xf) { + debugPrintfEXT("image_store_mip_pck: unexpected dmask. 
flags %x", flags); + } + + uint8_t dfmt = tbuffer_dfmt(tbuffer); + uint data_size = size_of_format(dfmt); + uint elements_count = (data_size + SIZEOF(uint32_t) - 1) / SIZEOF(uint32_t); + + for (uint element = 0; element < elements_count; element++) { + MEMORY_DATA_REF(uint32_t, deviceAddress) = vdata[element]; + deviceAddress += SIZEOF(uint32_t); + } +} void image_sample(inout f32vec4 vdata, f32vec3 vaddr, int32_t textureIndexHint, uint32_t tbuffer[8], int32_t samplerIndexHint, u32vec4 ssampler, uint32_t dmask) { uint8_t textureType = tbuffer_type(tbuffer); @@ -2980,38 +3417,8 @@ void image_get_lod(inout f32vec2 vdata, u32vec3 vaddr, int32_t textureIndexHint, } void image_get_resinfo(inout u32vec4 vdata, int32_t vmipid, int32_t textureIndexHint, uint32_t tbuffer[8], uint32_t dmask) { - i32vec4 result = i32vec4(1); - - switch (uint(tbuffer_type(tbuffer))) { - case kTextureType1D: { - int texIndex = findTexture1DIndex(textureIndexHint, tbuffer); - result.x = textureSize(textures1D[texIndex], vmipid); - result.w = textureQueryLevels(textures1D[texIndex]); - break; - } - - case kTextureTypeArray1D: - case kTextureType2D: - case kTextureTypeCube: - case kTextureTypeArray2D: { - int texIndex = findTexture2DIndex(textureIndexHint, tbuffer); - result.xy = textureSize(textures2D[texIndex], vmipid); - result.w = textureQueryLevels(textures2D[texIndex]); - break; - } - - case kTextureTypeMsaa2D: - case kTextureTypeMsaaArray2D: - result.xy = textureSize(textures2D[findTexture2DIndex(textureIndexHint, tbuffer)], 0); - break; - - case kTextureType3D: { - int texIndex = findTexture3DIndex(textureIndexHint, tbuffer); - result.xyz = textureSize(textures3D[texIndex], vmipid); - result.w = textureQueryLevels(textures3D[texIndex]); - break; - } - } + ImageInfo img = getImageInfo(tbuffer, vmipid); + i32vec4 result = i32vec4(img.extent, img.mipLevelCount); int vdataIndex = 0; for (int i = 0; i < 4; ++i) { diff --git a/rpcsx/gpu/lib/gcn-shader/src/GcnConverter.cpp 
b/rpcsx/gpu/lib/gcn-shader/src/GcnConverter.cpp index c33d093eb..6aa35d6ad 100644 --- a/rpcsx/gpu/lib/gcn-shader/src/GcnConverter.cpp +++ b/rpcsx/gpu/lib/gcn-shader/src/GcnConverter.cpp @@ -102,6 +102,10 @@ struct ResourcesBuilder { p.resourceSlot = resources.slots++; resources.textures.push_back(p); } + void addImageBuffer(gcn::Resources::ImageBuffer p) { + p.resourceSlot = resources.slots++; + resources.imageBuffers.push_back(p); + } void addBuffer(gcn::Resources::Buffer p) { p.resourceSlot = resources.slots++; resources.buffers.push_back(p); @@ -334,6 +338,27 @@ struct ResourcesBuilder { continue; } + if (inst == ir::amdgpu::IMAGE_BUFFER) { + auto access = static_cast(*inst.getOperand(1).getAsInt32()); + auto words = inst.getOperands().subspan(2); + if (words.size() > 4) { + addImageBuffer({ + .access = access, + .words = {words[0].getAsValue(), words[1].getAsValue(), + words[2].getAsValue(), words[3].getAsValue(), + words[4].getAsValue(), words[5].getAsValue(), + words[6].getAsValue(), words[7].getAsValue()}, + }); + } else { + addImageBuffer({ + .access = access, + .words = {words[0].getAsValue(), words[1].getAsValue(), + words[2].getAsValue(), words[3].getAsValue()}, + }); + } + continue; + } + if (inst == ir::amdgpu::SAMPLER) { auto words = inst.getOperands().subspan(1); auto unorm = *inst.getOperand(5).getAsBool(); @@ -408,6 +433,20 @@ void gcn::Resources::print(std::ostream &os, ir::NameStorage &ns) const { } } + if (!imageBuffers.empty()) { + os << "image buffers:\n"; + for (auto &buffer : imageBuffers) { + os << " #" << buffer.resourceSlot << ":\n"; + printAccess(buffer.access); + + for (auto &word : buffer.words) { + os << " word" << (&word - buffer.words) << ": "; + printFlat(os, word, ns); + os << "\n"; + } + } + } + if (!buffers.empty()) { os << "buffers:\n"; for (auto &buffer : buffers) { @@ -876,7 +915,8 @@ static void instructionsToSpv(GcnConverter &converter, gcn::Import &importer, } if (inst == ir::amdgpu::POINTER || inst == ir::amdgpu::VBUFFER || - 
inst == ir::amdgpu::SAMPLER || inst == ir::amdgpu::TBUFFER) { + inst == ir::amdgpu::SAMPLER || inst == ir::amdgpu::TBUFFER || + inst == ir::amdgpu::IMAGE_BUFFER) { toAnalyze.push_back(inst.staticCast()); continue; } @@ -1216,7 +1256,7 @@ static void instructionsToSpv(GcnConverter &converter, gcn::Import &importer, auto spvFnCall = builder.createSpvFunctionCall( inst.getLocation(), inst.getOperand(0).getAsValue(), function); - for (auto arg : inst.getOperands().subspan(1)) { + for (auto &arg : inst.getOperands().subspan(1)) { spvFnCall.addOperand(arg); } @@ -1622,6 +1662,13 @@ static void createInitialValues(GcnConverter &converter, stage, builder, info.create(gcn::ConfigType::MemoryTable, word))); } + for (int word = 0; word < 2; ++word) { + context.writeReg(loc, builder, gcn::RegId::ImageMemoryTable, word, + converter.createReadConfig( + stage, builder, + info.create(gcn::ConfigType::ImageMemoryTable, word))); + } + for (int word = 0; word < 2; ++word) { context.writeReg( loc, builder, gcn::RegId::Gds, word, diff --git a/rpcsx/gpu/lib/gcn-shader/src/GcnInstruction.cpp b/rpcsx/gpu/lib/gcn-shader/src/GcnInstruction.cpp index e312a9b0b..6e0bacc73 100644 --- a/rpcsx/gpu/lib/gcn-shader/src/GcnInstruction.cpp +++ b/rpcsx/gpu/lib/gcn-shader/src/GcnInstruction.cpp @@ -557,6 +557,11 @@ readMimgInst(GcnInstruction &inst, std::uint64_t &address, auto srsrc = fetchMaskedValue(words[1], srsrcMask) << 2; auto ssamp = fetchMaskedValue(words[1], ssampMask) << 2; + const uint32_t kImageFlagR128 = 1 << 4; + const uint32_t kImageFlagDA = 1 << 5; + const uint32_t kImageFlagUnorm = 1 << 6; + const uint32_t kImageFlagTFE = 1 << 7; + std::uint8_t textureAccess = 0; bool hasSampler = false; @@ -583,14 +588,32 @@ readMimgInst(GcnInstruction &inst, std::uint64_t &address, inst.addOperand(createVgprGcnOperand(vdata).withRW()); inst.addOperand(createVgprGcnOperand(vaddr).withR()); auto tbufferStart = createSgprGcnOperand(address, srsrc); - inst.addOperand( - 
GcnOperand::createTexture(tbufferStart, r128).withAccess(textureAccess)); - inst.addOperand(tbufferStart); if (hasSampler) { + inst.addOperand(GcnOperand::createTexture(tbufferStart, r128) + .withAccess(textureAccess)); + inst.addOperand(tbufferStart); + auto samplerStart = createSgprGcnOperand(address, ssamp); inst.addOperand(GcnOperand::createSampler(samplerStart, unrm).withR()); inst.addOperand(samplerStart); + } else { + inst.addOperand(GcnOperand::createImageBuffer(tbufferStart, r128) + .withAccess(textureAccess)); + inst.addOperand(tbufferStart); + } + + if (r128) { + dmask |= kImageFlagR128; + } + if (da) { + dmask |= kImageFlagDA; + } + if (unrm) { + dmask |= kImageFlagUnorm; + } + if (tfe) { + dmask |= kImageFlagTFE; } inst.addOperand(GcnOperand::createConstant(dmask)); @@ -982,6 +1005,20 @@ void GcnOperand::print(std::ostream &os) const { getUnderlyingOperand(7).print(os); os << "}"; break; + case Kind::ImageBuffer128: + os << "buffer T#{"; + getUnderlyingOperand(0).print(os); + os << ".."; + getUnderlyingOperand(3).print(os); + os << "}"; + break; + case Kind::ImageBuffer256: + os << "buffer T#{"; + getUnderlyingOperand(0).print(os); + os << ".."; + getUnderlyingOperand(7).print(os); + os << "}"; + break; case Kind::Pointer: os << "ptr{"; getUnderlyingOperand(0).print(os); diff --git a/rpcsx/gpu/lib/gcn-shader/src/analyze.cpp b/rpcsx/gpu/lib/gcn-shader/src/analyze.cpp index 4080e23c1..c4c224d5d 100644 --- a/rpcsx/gpu/lib/gcn-shader/src/analyze.cpp +++ b/rpcsx/gpu/lib/gcn-shader/src/analyze.cpp @@ -132,6 +132,7 @@ static std::unordered_set g_instsWithoutSideEffects = { ir::getInstructionId(ir::amdgpu::VBUFFER), ir::getInstructionId(ir::amdgpu::SAMPLER), ir::getInstructionId(ir::amdgpu::TBUFFER), + ir::getInstructionId(ir::amdgpu::IMAGE_BUFFER), ir::getInstructionId(ir::amdgpu::POINTER), ir::getInstructionId(ir::amdgpu::PS_INPUT_VGPR), ir::getInstructionId(ir::amdgpu::PS_COMP_SWAP), diff --git a/rpcsx/gpu/lib/gcn-shader/src/gcn.cpp 
b/rpcsx/gpu/lib/gcn-shader/src/gcn.cpp index 3e0ef1fbd..01dfdcbea 100644 --- a/rpcsx/gpu/lib/gcn-shader/src/gcn.cpp +++ b/rpcsx/gpu/lib/gcn-shader/src/gcn.cpp @@ -9,6 +9,7 @@ #include "spv.hpp" #include "transform.hpp" +#include #include #include @@ -121,6 +122,14 @@ inline shader::spv::TypeInfo getRegisterInfo(unsigned id) { .componentsCount = 2, }; + case gcn::RegId::ImageMemoryTable: + return { + .baseType = ir::spv::OpTypeVector, + .componentType = ir::spv::OpTypeInt, + .componentWidth = 32, + .componentsCount = 2, + }; + case gcn::RegId::Gds: return { .baseType = ir::spv::OpTypeVector, @@ -161,6 +170,8 @@ inline const char *getRegisterName(unsigned id) { return "thread_id"; case gcn::MemoryTable: return "memory_table"; + case gcn::ImageMemoryTable: + return "image_memory_table"; case gcn::Gds: return "gds"; } @@ -190,6 +201,8 @@ static std::optional getRegIdByName(std::string_view variableName) { return gcn::RegId::ThreadId; if (variableName == "memory_table") return gcn::RegId::MemoryTable; + if (variableName == "image_memory_table") + return gcn::RegId::ImageMemoryTable; if (variableName == "gds") return gcn::RegId::Gds; @@ -825,6 +838,8 @@ static ir::Value deserializeGcnRegion( case GcnOperand::Kind::Buffer: case GcnOperand::Kind::Texture128: case GcnOperand::Kind::Texture256: + case GcnOperand::Kind::ImageBuffer128: + case GcnOperand::Kind::ImageBuffer256: case GcnOperand::Kind::Sampler: case GcnOperand::Kind::Pointer: break; @@ -849,6 +864,37 @@ static ir::Value deserializeGcnRegion( op.getUnderlyingOperand(2)), createOperandReadImpl(loc, builder, uint32TV, op.getUnderlyingOperand(3))); + case GcnOperand::Kind::ImageBuffer128: + return builder.createValue( + loc, ir::amdgpu::IMAGE_BUFFER, type, op.access, + createOperandReadImpl(loc, builder, uint32TV, + op.getUnderlyingOperand(0)), + createOperandReadImpl(loc, builder, uint32TV, + op.getUnderlyingOperand(1)), + createOperandReadImpl(loc, builder, uint32TV, + op.getUnderlyingOperand(2)), + 
createOperandReadImpl(loc, builder, uint32TV, + op.getUnderlyingOperand(3))); + + case GcnOperand::Kind::ImageBuffer256: + return builder.createValue( + loc, ir::amdgpu::IMAGE_BUFFER, type, op.access, + createOperandReadImpl(loc, builder, uint32TV, + op.getUnderlyingOperand(0)), + createOperandReadImpl(loc, builder, uint32TV, + op.getUnderlyingOperand(1)), + createOperandReadImpl(loc, builder, uint32TV, + op.getUnderlyingOperand(2)), + createOperandReadImpl(loc, builder, uint32TV, + op.getUnderlyingOperand(3)), + createOperandReadImpl(loc, builder, uint32TV, + op.getUnderlyingOperand(4)), + createOperandReadImpl(loc, builder, uint32TV, + op.getUnderlyingOperand(5)), + createOperandReadImpl(loc, builder, uint32TV, + op.getUnderlyingOperand(6)), + createOperandReadImpl(loc, builder, uint32TV, + op.getUnderlyingOperand(7))); case GcnOperand::Kind::Texture128: return builder.createValue( @@ -1381,8 +1427,8 @@ static ir::Value deserializeGcnRegion( converter.getTypePointer(ir::spv::StorageClass::Function, paramType), ir::spv::StorageClass::Function); - auto result = createOperandRead(loc, builder, paramType, op); - builder.createSpvStore(loc, arg, result); + auto result = createOperandRead(loc, builder, paramType, op); + builder.createSpvStore(loc, arg, result); callArgs.push_back(arg); } @@ -1631,10 +1677,9 @@ ir::Node gcn::Import::getOrCloneImpl(ir::Context &context, ir::Node node, if (shader::spv::getTypeInfo( inst.getOperand(0).getAsValue().getOperand(1).getAsValue()) != getRegisterInfo(*regId)) { - std::fprintf(stderr, - "unexpected type for register variable " - "'%s', expected %u\n", - name->c_str(), getRegisterInfo(*regId).width()); + std::println( + stderr, "unexpected type for register variable '{}', expected {}", + *name, getRegisterInfo(*regId).width()); std::abort(); } @@ -1799,7 +1844,7 @@ gcn::deserialize(gcn::Context &context, const gcn::Environment &environment, .createSpvBranch(child.getLocation(), regionEntry); child.remove(); } else { - 
std::fprintf(stderr, "failed to evaluate branch!\n"); + std::println(stderr, "failed to evaluate branch!"); } context.requiredUserSgprs |= evaluator.usedUserSgprs; } diff --git a/rpcsx/gpu/shaders/rect_list.geom.glsl b/rpcsx/gpu/shaders/rect_list.geom.glsl index 79e9fdf74..00136c33d 100644 --- a/rpcsx/gpu/shaders/rect_list.geom.glsl +++ b/rpcsx/gpu/shaders/rect_list.geom.glsl @@ -3,6 +3,9 @@ layout (triangles, invocations = 1) in; layout (triangle_strip, max_vertices = 4) out; +layout (location=0) in vec4 inp[3]; +layout (location=0) out vec4 outp; + void main(void) { vec4 topLeft = gl_in[0].gl_Position; @@ -23,15 +26,19 @@ void main(void) topLeft.w ); + outp = inp[0]; gl_Position = topLeft; EmitVertex(); + outp = inp[2]; gl_Position = bottomLeft; EmitVertex(); + outp = vec4(inp[1].x, inp[0].y, inp[0].z, inp[0].w); gl_Position = topRight; EmitVertex(); + outp = vec4(inp[1].x, inp[2].y, inp[0].z, inp[0].w); gl_Position = bottomRight; EmitVertex(); diff --git a/rpcsx/iodev/gc.cpp b/rpcsx/iodev/gc.cpp index b36edf96d..ab4862257 100644 --- a/rpcsx/iodev/gc.cpp +++ b/rpcsx/iodev/gc.cpp @@ -91,6 +91,8 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, orbis::g_currentThread->tproc->vmId, {args->cmds + i * 4, 4}); } + + // gpu.waitForIdle(); } else { return orbis::ErrorCode::BUSY; } @@ -105,6 +107,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, auto args = reinterpret_cast(argp); if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { + gpu.waitForIdle(); gpu.submitSwitchBuffer(orbis::g_currentThread->tproc->vmId); } else { return orbis::ErrorCode::BUSY; @@ -132,18 +135,23 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, {args->cmds + i * 4, 4}); } - // ORBIS_LOG_ERROR("submit and write eop", args->eopValue, args->waitFlag); + // ORBIS_LOG_ERROR("submit and write eop", args->eopValue, + // args->waitFlag); gpu.submitWriteEop(gcFile->gfxPipe, args->waitFlag, args->eopValue); } 
else { return orbis::ErrorCode::BUSY; } - // orbis::bridge.sendDoFlip(); break; } case 0xc0048116: { // submit done? + if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { + gpu.waitForIdle(); + } else { + return orbis::ErrorCode::BUSY; + } break; } case 0xc0048117: @@ -243,7 +251,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { gpu.submitComputeQueue(args->meId, args->pipeId, args->queueId, - args->nextStartOffsetInDw); + args->nextStartOffsetInDw); } else { return orbis::ErrorCode::BUSY; }