diff --git a/rpcsx-gpu2/Cache.cpp b/rpcsx-gpu2/Cache.cpp index 74f30c235..b74ec13fd 100644 --- a/rpcsx-gpu2/Cache.cpp +++ b/rpcsx-gpu2/Cache.cpp @@ -4,6 +4,7 @@ #include "gnm/vulkan.hpp" #include "rx/MemoryTable.hpp" #include "rx/die.hpp" +#include "shader/Evaluator.hpp" #include "shader/GcnConverter.hpp" #include "shader/dialect.hpp" #include "shader/glsl.hpp" @@ -12,10 +13,12 @@ #include #include #include +#include #include #include using namespace amdgpu; +using namespace shader; static bool isPrimRequiresConversion(gnm::PrimitiveType primType) { switch (primType) { @@ -81,21 +84,274 @@ static ConverterFn *getPrimConverterFn(gnm::PrimitiveType primType, } } -static VkShaderStageFlagBits shaderStageToVk(shader::gcn::Stage stage) { +void Cache::ShaderResources::loadResources( + gcn::Resources &res, std::span userSgprs) { + this->userSgprs = userSgprs; + for (auto &pointer : res.pointers) { + auto pointerBase = eval(pointer.base).zExtScalar(); + auto pointerOffset = eval(pointer.offset).zExtScalar(); + + if (!pointerBase || !pointerOffset) { + res.dump(); + rx::die("failed to evaluate pointer"); + } + + bufferMemoryTable.map(*pointerBase, + *pointerBase + *pointerOffset + pointer.size, + Access::Read); + resourceSlotToAddress.push_back( + {slotOffset + pointer.resourceSlot, *pointerBase}); + } + + for (auto &bufferRes : res.buffers) { + auto word0 = eval(bufferRes.words[0]).zExtScalar(); + auto word1 = eval(bufferRes.words[1]).zExtScalar(); + auto word2 = eval(bufferRes.words[2]).zExtScalar(); + auto word3 = eval(bufferRes.words[3]).zExtScalar(); + + if (!word0 || !word1 || !word2 || !word3) { + res.dump(); + rx::die("failed to evaluate V#"); + } + + gnm::VBuffer buffer{}; + std::memcpy(reinterpret_cast(&buffer), &*word0, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 1, &*word1, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 2, &*word2, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 3, &*word3, + 
sizeof(std::uint32_t)); + + bufferMemoryTable.map(buffer.address(), buffer.address() + buffer.size(), + bufferRes.access); + resourceSlotToAddress.push_back( + {slotOffset + bufferRes.resourceSlot, buffer.address()}); + } + + for (auto &texture : res.textures) { + auto word0 = eval(texture.words[0]).zExtScalar(); + auto word1 = eval(texture.words[1]).zExtScalar(); + auto word2 = eval(texture.words[2]).zExtScalar(); + auto word3 = eval(texture.words[3]).zExtScalar(); + + if (!word0 || !word1 || !word2 || !word3) { + res.dump(); + rx::die("failed to evaluate 128 bit T#"); + } + + gnm::TBuffer buffer{}; + std::memcpy(reinterpret_cast(&buffer), &*word0, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 1, &*word1, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 2, &*word2, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 3, &*word3, + sizeof(std::uint32_t)); + + if (texture.words[4] != nullptr) { + auto word4 = eval(texture.words[4]).zExtScalar(); + auto word5 = eval(texture.words[5]).zExtScalar(); + auto word6 = eval(texture.words[6]).zExtScalar(); + auto word7 = eval(texture.words[7]).zExtScalar(); + + if (!word4 || !word5 || !word6 || !word7) { + res.dump(); + rx::die("failed to evaluate 256 bit T#"); + } + + std::memcpy(reinterpret_cast(&buffer) + 4, &*word4, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 5, &*word5, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 6, &*word6, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&buffer) + 7, &*word7, + sizeof(std::uint32_t)); + } + + std::vector *resources = nullptr; + + switch (buffer.type) { + case gnm::TextureType::Array1D: + case gnm::TextureType::Dim1D: + resources = &imageResources[0]; + break; + case gnm::TextureType::Dim2D: + case gnm::TextureType::Array2D: + case gnm::TextureType::Msaa2D: + case gnm::TextureType::MsaaArray2D: + case gnm::TextureType::Cube: + resources = &imageResources[1]; + break; + case 
gnm::TextureType::Dim3D: + resources = &imageResources[2]; + break; + } + + rx::dieIf(resources == nullptr, + "ShaderResources: unexpected texture type %u", + static_cast(buffer.type)); + + slotResources[slotOffset + texture.resourceSlot] = resources->size(); + resources->push_back(cacheTag->getImageView( + amdgpu::ImageKey::createFrom(buffer), texture.access)); + } + + for (auto &sampler : res.samplers) { + auto word0 = eval(sampler.words[0]).zExtScalar(); + auto word1 = eval(sampler.words[1]).zExtScalar(); + auto word2 = eval(sampler.words[2]).zExtScalar(); + auto word3 = eval(sampler.words[3]).zExtScalar(); + + if (!word0 || !word1 || !word2 || !word3) { + res.dump(); + rx::die("failed to evaluate S#"); + } + + gnm::SSampler sSampler{}; + std::memcpy(reinterpret_cast(&sSampler), &*word0, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&sSampler) + 1, &*word1, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&sSampler) + 2, &*word2, + sizeof(std::uint32_t)); + std::memcpy(reinterpret_cast(&sSampler) + 3, &*word3, + sizeof(std::uint32_t)); + + if (sampler.unorm) { + sSampler.force_unorm_coords = true; + } + + slotResources[slotOffset + sampler.resourceSlot] = samplerResources.size(); + samplerResources.push_back( + cacheTag->getSampler(amdgpu::SamplerKey::createFrom(sSampler))); + } + + slotOffset += res.slots; +} + +void Cache::ShaderResources::buildMemoryTable(MemoryTable &memoryTable) { + memoryTable.count = 0; + + for (auto p : bufferMemoryTable) { + auto size = p.endAddress - p.beginAddress; + auto buffer = cacheTag->getBuffer(p.beginAddress, size, p.payload); + + auto memoryTableSlot = memoryTable.count; + memoryTable.slots[memoryTable.count++] = { + .address = p.beginAddress, + .size = size, + .flags = static_cast(p.payload), + .deviceAddress = buffer.deviceAddress, + }; + + for (auto [slot, address] : resourceSlotToAddress) { + if (address >= p.beginAddress && address < p.endAddress) { + slotResources[slot] = memoryTableSlot; + } + } + } 
+} + +std::uint32_t Cache::ShaderResources::getResourceSlot(std::uint32_t id) { + if (auto it = slotResources.find(id); it != slotResources.end()) { + return it->second; + } + return -1; +} + +eval::Value +Cache::ShaderResources::eval(ir::InstructionId instId, + std::span operands) { + if (instId == ir::amdgpu::POINTER) { + auto type = operands[0].getAsValue(); + auto loadSize = *operands[1].getAsInt32(); + auto base = eval(operands[2]).zExtScalar(); + auto offset = eval(operands[3]).zExtScalar(); + + if (!base || !offset) { + rx::die("failed to evaluate pointer dependency"); + } + + eval::Value result; + auto address = *base + *offset; + + switch (loadSize) { + case 1: + result = readPointer(address); + break; + case 2: + result = readPointer(address); + break; + case 4: + result = readPointer(address); + break; + case 8: + result = readPointer(address); + break; + case 12: + result = readPointer(address); + break; + case 16: + result = readPointer(address); + break; + case 32: + result = readPointer>(address); + break; + default: + rx::die("unexpected pointer load size"); + } + + return result; + } + + if (instId == ir::amdgpu::VBUFFER) { + rx::die("resource depends on buffer value"); + } + + if (instId == ir::amdgpu::TBUFFER) { + rx::die("resource depends on texture value"); + } + + if (instId == ir::amdgpu::SAMPLER) { + rx::die("resource depends on sampler value"); + } + + if (instId == ir::amdgpu::USER_SGPR) { + auto index = static_cast(*operands[1].getAsInt32()); + rx::dieIf(index >= userSgprs.size(), "out of user sgprs"); + return userSgprs[index]; + } + + if (instId == ir::amdgpu::IMM) { + auto address = static_cast(*operands[1].getAsInt64()); + + std::uint32_t result; + cacheTag->readMemory(&result, address, sizeof(result)); + return result; + } + + return Evaluator::eval(instId, operands); +} + +static VkShaderStageFlagBits shaderStageToVk(gcn::Stage stage) { switch (stage) { - case shader::gcn::Stage::Ps: + case gcn::Stage::Ps: return 
VK_SHADER_STAGE_FRAGMENT_BIT; - case shader::gcn::Stage::VsVs: + case gcn::Stage::VsVs: return VK_SHADER_STAGE_VERTEX_BIT; - // case shader::gcn::Stage::VsEs: - // case shader::gcn::Stage::VsLs: - case shader::gcn::Stage::Cs: + // case gcn::Stage::VsEs: + // case gcn::Stage::VsLs: + case gcn::Stage::Cs: return VK_SHADER_STAGE_COMPUTE_BIT; - // case shader::gcn::Stage::Gs: - // case shader::gcn::Stage::GsVs: - // case shader::gcn::Stage::Hs: - // case shader::gcn::Stage::DsVs: - // case shader::gcn::Stage::DsEs: + // case gcn::Stage::Gs: + // case gcn::Stage::GsVs: + // case gcn::Stage::Hs: + // case gcn::Stage::DsVs: + // case gcn::Stage::DsEs: default: rx::die("unsupported shader stage %u", int(stage)); @@ -200,7 +456,7 @@ struct Cache::Entry { struct CachedShader : Cache::Entry { std::uint64_t magic; VkShaderEXT handle; - shader::gcn::ShaderInfo info; + gcn::ShaderInfo info; std::vector>> usedMemory; ~CachedShader() { @@ -395,16 +651,6 @@ ImageKey ImageKey::createFrom(const gnm::TBuffer &buffer) { }; } -ImageViewKey ImageViewKey::createFrom(const gnm::TBuffer &buffer) { - ImageViewKey result{}; - static_cast(result) = ImageKey::createFrom(buffer); - result.R = buffer.dst_sel_x; - result.G = buffer.dst_sel_y; - result.B = buffer.dst_sel_z; - result.A = buffer.dst_sel_w; - return result; -} - SamplerKey SamplerKey::createFrom(const gnm::SSampler &sampler) { float lodBias = ((std::int16_t(sampler.lod_bias) << 2) >> 2) / float(256.f); // FIXME: lodBias can be scaled by gnm::TBuffer @@ -433,17 +679,17 @@ Cache::Shader Cache::Tag::getShader(const ShaderKey &key, auto stage = shaderStageToVk(key.stage); if (auto result = findShader(key, dependedKey)) { auto cachedShader = static_cast(result.get()); - mAcquiredResources.push_back(result); + mStorage->mAcquiredResources.push_back(result); return {cachedShader->handle, &cachedShader->info, stage}; } auto vmId = mParent->mVmIm; - std::optional converted; + std::optional converted; { - shader::gcn::Context context; - auto 
deserialized = shader::gcn::deserialize( + gcn::Context context; + auto deserialized = gcn::deserialize( context, key.env, mParent->mDevice->gcnSemantic, key.address, [vmId](std::uint64_t address) -> std::uint32_t { return *RemoteMemory{vmId}.getPointer(address); @@ -451,9 +697,9 @@ Cache::Shader Cache::Tag::getShader(const ShaderKey &key, // deserialized.print(std::cerr, context.ns); - converted = shader::gcn::convertToSpv( - context, deserialized, mParent->mDevice->gcnSemanticModuleInfo, - key.stage, key.env); + converted = gcn::convertToSpv(context, deserialized, + mParent->mDevice->gcnSemanticModuleInfo, + key.stage, key.env); if (!converted) { return {}; } @@ -510,7 +756,7 @@ Cache::Shader Cache::Tag::getShader(const ShaderKey &key, } mParent->mShaders.map(key.address, key.address + 8, result); - mAcquiredResources.push_back(result); + mStorage->mAcquiredResources.push_back(result); return {handle, &result->info, stage}; } @@ -602,7 +848,7 @@ Cache::Buffer Cache::Tag::getBuffer(std::uint64_t address, std::uint64_t size, cached->tagId = (access & Access::Write) != Access::Write ? 
getWriteId() : getReadId(); - mAcquiredResources.push_back(cached); + mStorage->mAcquiredResources.push_back(cached); return { .handle = cached->buffer.getHandle(), @@ -613,15 +859,12 @@ Cache::Buffer Cache::Tag::getBuffer(std::uint64_t address, std::uint64_t size, }; } -Cache::Buffer Cache::Tag::getInternalBuffer(std::uint64_t size) { - auto buffer = vk::Buffer::Allocate( - vk::getHostVisibleMemory(), size, - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | - VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | - VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | - VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_INDEX_BUFFER_BIT); +Cache::Buffer Cache::Tag::getInternalHostVisibleBuffer(std::uint64_t size) { + auto buffer = vk::Buffer::Allocate(vk::getHostVisibleMemory(), size, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT); auto cached = std::make_shared(); cached->baseAddress = 0; @@ -630,7 +873,7 @@ Cache::Buffer Cache::Tag::getInternalBuffer(std::uint64_t size) { cached->size = size; cached->tagId = getReadId(); - mAcquiredResources.push_back(cached); + mStorage->mAcquiredResources.push_back(cached); return { .handle = cached->buffer.getHandle(), @@ -641,6 +884,89 @@ Cache::Buffer Cache::Tag::getInternalBuffer(std::uint64_t size) { }; } +Cache::Buffer Cache::Tag::getInternalDeviceLocalBuffer(std::uint64_t size) { + auto buffer = vk::Buffer::Allocate(vk::getDeviceLocalMemory(), size, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT); + + auto cached = std::make_shared(); + cached->baseAddress = 0; + cached->acquiredAccess = Access::None; + cached->buffer = std::move(buffer); + cached->size = size; + cached->tagId = getReadId(); + + mStorage->mAcquiredResources.push_back(cached); + + return { + .handle = 
cached->buffer.getHandle(), + .offset = 0, + .deviceAddress = cached->buffer.getAddress(), + .tagId = getReadId(), + .data = cached->buffer.getData(), + }; +} + +void Cache::Tag::buildDescriptors(VkDescriptorSet descriptorSet) { + auto memoryTableBuffer = getMemoryTable(); + auto memoryTable = std::bit_cast(memoryTableBuffer.data); + mStorage->shaderResources.buildMemoryTable(*memoryTable); + + for (auto &sampler : mStorage->shaderResources.samplerResources) { + uint32_t index = + &sampler - mStorage->shaderResources.samplerResources.data(); + + VkDescriptorImageInfo samplerInfo{.sampler = sampler.handle}; + + VkWriteDescriptorSet writeDescSet{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = descriptorSet, + .dstBinding = Cache::getDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLER), + .dstArrayElement = index, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER, + .pImageInfo = &samplerInfo, + }; + + vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr); + } + + for (auto &imageResources : mStorage->shaderResources.imageResources) { + auto dim = (&imageResources - mStorage->shaderResources.imageResources) + 1; + auto binding = static_cast( + Cache::getDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, dim)); + + for (auto &image : imageResources) { + uint32_t index = &image - imageResources.data(); + + VkDescriptorImageInfo imageInfo{ + .imageView = image.handle, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }; + + VkWriteDescriptorSet writeDescSet{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = descriptorSet, + .dstBinding = binding, + .dstArrayElement = index, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .pImageInfo = &imageInfo, + }; + + vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr); + } + } + + for (auto &mtConfig : mStorage->memoryTableConfigSlots) { + auto config = mStorage->descriptorBuffers[mtConfig.bufferIndex]; + 
config[mtConfig.configIndex] = + mStorage->shaderResources.getResourceSlot(mtConfig.resourceSlot); + } +} + Cache::IndexBuffer Cache::Tag::getIndexBuffer(std::uint64_t address, std::uint32_t indexCount, gnm::PrimitiveType primType, @@ -682,7 +1008,8 @@ Cache::IndexBuffer Cache::Tag::getIndexBuffer(std::uint64_t address, auto &resource = it.get(); auto indexBuffer = static_cast(resource.get()); if (indexBuffer->size == size && resource->tagId == indexBuffer->tagId) { - mAcquiredResources.push_back(resource); + mStorage->mAcquiredResources.push_back(resource); + return { .handle = indexBuffer->buffer.getHandle(), .offset = 0, @@ -739,15 +1066,17 @@ Cache::IndexBuffer Cache::Tag::getIndexBuffer(std::uint64_t address, cached->primType = primType; cached->indexType = indexType; + auto handle = cached->buffer.getHandle(); + mParent->mIndexBuffers.map(address, address + size, cached); - mAcquiredResources.push_back(cached); + mStorage->mAcquiredResources.push_back(std::move(cached)); return { - .handle = cached->buffer.getHandle(), + .handle = handle, .offset = 0, .indexCount = indexCount, - .primType = cached->primType, - .indexType = cached->indexType, + .primType = primType, + .indexType = indexType, }; } @@ -882,7 +1211,6 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { mScheduler->afterSubmit([detiledBuffer = std::move(detiledBuffer)] {}); - for (unsigned mipLevel = key.baseMipLevel; mipLevel < key.baseMipLevel + key.mipCount; ++mipLevel) { auto &info = surfaceInfo.getSubresourceInfo(mipLevel); @@ -918,22 +1246,15 @@ Cache::Image Cache::Tag::getImage(const ImageKey &key, Access access) { cached->acquiredAccess = access; cached->acquiredTileMode = key.tileMode; cached->acquiredDfmt = key.dfmt; - mAcquiredResources.push_back(cached); + mStorage->mAcquiredResources.push_back(cached); return {.handle = cached->image.getHandle(), .subresource = subresourceRange}; } -Cache::ImageView Cache::Tag::getImageView(const ImageViewKey &key, - Access 
access) { +Cache::ImageView Cache::Tag::getImageView(const ImageKey &key, Access access) { auto image = getImage(key, access); auto result = vk::ImageView(gnm::toVkImageViewType(key.type), image.handle, - gnm::toVkFormat(key.dfmt, key.nfmt), - { - .r = gnm::toVkComponentSwizzle(key.R), - .g = gnm::toVkComponentSwizzle(key.G), - .b = gnm::toVkComponentSwizzle(key.B), - .a = gnm::toVkComponentSwizzle(key.A), - }, + gnm::toVkFormat(key.dfmt, key.nfmt), {}, { .aspectMask = toAspect(key.kind), .baseMipLevel = key.baseMipLevel, @@ -948,7 +1269,7 @@ Cache::ImageView Cache::Tag::getImageView(const ImageViewKey &key, cached->acquiredAccess = access; cached->view = std::move(result); - mAcquiredResources.push_back(cached); + mStorage->mAcquiredResources.push_back(cached); return { .handle = cached->view.getHandle(), @@ -978,41 +1299,289 @@ int Cache::Tag::compareMemory(const void *source, std::uint64_t address, return std::memcmp(memoryPtr, source, size); } +void Cache::GraphicsTag::release() { + if (mAcquiredGraphicsDescriptorSet + 1 != 0) { + getCache()->mGraphicsDescriptorSetPool.release( + mAcquiredGraphicsDescriptorSet); + mAcquiredGraphicsDescriptorSet = -1; + } + + Tag::release(); +} + +void Cache::ComputeTag::release() { + if (mAcquiredComputeDescriptorSet + 1 != 0) { + getCache()->mComputeDescriptorSetPool.release( + mAcquiredComputeDescriptorSet); + mAcquiredComputeDescriptorSet = -1; + } + + Tag::release(); +} + void Cache::Tag::release() { - for (auto ds : mGraphicsDescriptorSets) { - getCache()->destroyGraphicsDescriptorSets(ds); + if (mAcquiredMemoryTable + 1 != 0) { + getCache()->mMemoryTablePool.release(mAcquiredMemoryTable); + mAcquiredMemoryTable = -1; } - for (auto ds : mComputeDescriptorSets) { - getCache()->destroyComputeDescriptorSet(ds); - } - - mGraphicsDescriptorSets.clear(); - mComputeDescriptorSets.clear(); - - if (mAcquiredResources.empty()) { + if (mStorage == nullptr) { return; } - while (!mAcquiredResources.empty()) { - auto resource = 
std::move(mAcquiredResources.back()); - mAcquiredResources.pop_back(); + while (!mStorage->mAcquiredResources.empty()) { + auto resource = std::move(mStorage->mAcquiredResources.back()); + mStorage->mAcquiredResources.pop_back(); resource->flush(*this, *mScheduler, 0, ~static_cast(0)); } mScheduler->submit(); - mScheduler->then([mAcquiredResources = std::move(mAcquiredResources)] {}); + mScheduler->wait(); + + mStorage->clear(); + auto storageIndex = mStorage - mParent->mTagStorages; + // std::println("release tag storage {}", storageIndex); + mStorage = nullptr; + mParent->mTagStoragePool.release(storageIndex); } -Cache::Tag Cache::createTag(Scheduler &scheduler) { - auto tag = Tag{this, scheduler, mNextTagId}; - mNextTagId = static_cast(static_cast(mNextTagId) + 2); - return tag; +Cache::Shader +Cache::GraphicsTag::getPixelShader(const SpiShaderPgm &pgm, + const Registers::Context &context, + std::span viewPorts) { + gcn::PsVGprInput + psVgprInput[static_cast(gcn::PsVGprInput::Count)]; + std::size_t psVgprInputs = 0; + + SpiPsInput spiInputAddr = context.spiPsInputAddr; + + if (spiInputAddr.perspSampleEna) { + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::IPerspSample; + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::JPerspSample; + } + if (spiInputAddr.perspCenterEna) { + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::IPerspCenter; + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::JPerspCenter; + } + if (spiInputAddr.perspCentroidEna) { + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::IPerspCentroid; + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::JPerspCentroid; + } + if (spiInputAddr.perspPullModelEna) { + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::IW; + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::JW; + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::_1W; + } + if (spiInputAddr.linearSampleEna) { + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::ILinearSample; + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::JLinearSample; + } + if 
(spiInputAddr.linearCenterEna) { + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::ILinearCenter; + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::JLinearCenter; + } + if (spiInputAddr.linearCentroidEna) { + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::ILinearCentroid; + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::JLinearCentroid; + } + if (spiInputAddr.posXFloatEna) { + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::X; + } + if (spiInputAddr.posYFloatEna) { + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::Y; + } + if (spiInputAddr.posZFloatEna) { + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::Z; + } + if (spiInputAddr.posWFloatEna) { + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::W; + } + if (spiInputAddr.frontFaceEna) { + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::FrontFace; + } + if (spiInputAddr.ancillaryEna) { + rx::die("unimplemented ancillary fs input"); + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::Ancillary; + } + if (spiInputAddr.sampleCoverageEna) { + rx::die("unimplemented sample coverage fs input"); + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::SampleCoverage; + } + if (spiInputAddr.posFixedPtEna) { + rx::die("unimplemented pos fixed fs input"); + psVgprInput[psVgprInputs++] = gcn::PsVGprInput::PosFixed; + } + + return getShader(gcn::Stage::Ps, pgm, context, {}, viewPorts, + {psVgprInput, psVgprInputs}); +} + +Cache::Shader +Cache::GraphicsTag::getVertexShader(gcn::Stage stage, const SpiShaderPgm &pgm, + const Registers::Context &context, + gnm::PrimitiveType vsPrimType, + std::span viewPorts) { + return getShader(stage, pgm, context, vsPrimType, viewPorts, {}); +} + +Cache::Shader +Cache::GraphicsTag::getShader(gcn::Stage stage, const SpiShaderPgm &pgm, + const Registers::Context &context, + gnm::PrimitiveType vsPrimType, + std::span viewPorts, + std::span psVgprInput) { + auto descriptorSets = getDescriptorSets(); + gcn::Environment env{ + .vgprCount = pgm.rsrc1.getVGprCount(), + .sgprCount = pgm.rsrc1.getSGprCount(), + 
.userSgprs = std::span(pgm.userData.data(), pgm.rsrc2.userSgpr), + .supportsBarycentric = vk::context->supportsBarycentric, + .supportsInt8 = vk::context->supportsInt8, + .supportsInt64Atomics = vk::context->supportsInt64Atomics, + }; + + auto shader = Tag::getShader({ + .address = pgm.address << 8, + .stage = stage, + .env = env, + }); + + if (!shader.handle) { + return shader; + } + + std::uint64_t memoryTableAddress = getMemoryTable().deviceAddress; + + std::uint64_t gdsAddress = mParent->getGdsBuffer().getAddress(); + mStorage->shaderResources.cacheTag = this; + + std::uint32_t slotOffset = mStorage->shaderResources.slotOffset; + + mStorage->shaderResources.loadResources( + shader.info->resources, + std::span(pgm.userData.data(), pgm.rsrc2.userSgpr)); + + const auto &configSlots = shader.info->configSlots; + + auto configSize = configSlots.size() * sizeof(std::uint32_t); + auto configBuffer = getInternalHostVisibleBuffer(configSize); + + auto configPtr = reinterpret_cast(configBuffer.data); + + for (std::size_t index = 0; const auto &slot : configSlots) { + switch (slot.type) { + case gcn::ConfigType::Imm: + readMemory(&configPtr[index], slot.data, sizeof(std::uint32_t)); + break; + case gcn::ConfigType::UserSgpr: + configPtr[index] = pgm.userData[slot.data]; + break; + case gcn::ConfigType::ViewPortOffsetX: + configPtr[index] = + std::bit_cast(context.paClVports[slot.data].xOffset / + (viewPorts[slot.data].width / 2.f) - + 1); + break; + case gcn::ConfigType::ViewPortOffsetY: + configPtr[index] = + std::bit_cast(context.paClVports[slot.data].yOffset / + (viewPorts[slot.data].height / 2.f) - + 1); + break; + case gcn::ConfigType::ViewPortOffsetZ: + configPtr[index] = + std::bit_cast(context.paClVports[slot.data].zOffset); + break; + case gcn::ConfigType::ViewPortScaleX: + configPtr[index] = + std::bit_cast(context.paClVports[slot.data].xScale / + (viewPorts[slot.data].width / 2.f)); + break; + case gcn::ConfigType::ViewPortScaleY: + configPtr[index] = + 
std::bit_cast(context.paClVports[slot.data].yScale / + (viewPorts[slot.data].height / 2.f)); + break; + case gcn::ConfigType::ViewPortScaleZ: + configPtr[index] = + std::bit_cast(context.paClVports[slot.data].zScale); + break; + case gcn::ConfigType::PsInputVGpr: + if (slot.data > psVgprInput.size()) { + configPtr[index] = ~0; + } else { + configPtr[index] = std::bit_cast(psVgprInput[slot.data]); + } + break; + case gcn::ConfigType::VsPrimType: + configPtr[index] = static_cast(vsPrimType); + break; + + case gcn::ConfigType::ResourceSlot: + mStorage->memoryTableConfigSlots.push_back({ + .bufferIndex = + static_cast(mStorage->descriptorBuffers.size()), + .configIndex = static_cast(index), + .resourceSlot = static_cast(slotOffset + slot.data), + }); + break; + + case gcn::ConfigType::MemoryTable: + if (slot.data == 0) { + configPtr[index] = static_cast(memoryTableAddress); + } else { + configPtr[index] = static_cast(memoryTableAddress >> 32); + } + break; + case gcn::ConfigType::Gds: + if (slot.data == 0) { + configPtr[index] = static_cast(gdsAddress); + } else { + configPtr[index] = static_cast(gdsAddress >> 32); + } + break; + + case gcn::ConfigType::CbCompSwap: + configPtr[index] = std::bit_cast( + context.cbColor[slot.data].info.compSwap); + break; + } + + ++index; + } + + mStorage->descriptorBuffers.push_back(configPtr); + + VkDescriptorBufferInfo bufferInfo{ + .buffer = configBuffer.handle, + .offset = configBuffer.offset, + .range = configSize, + }; + + auto stageIndex = Cache::getStageIndex(shader.stage); + + VkWriteDescriptorSet writeDescSet{ + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = descriptorSets[stageIndex], + .dstBinding = 0, + .descriptorCount = 1, + .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .pBufferInfo = &bufferInfo, + }; + + vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr); + return shader; +} + +Cache::Shader +Cache::ComputeTag::getShader(const Registers::ComputeConfig &pgm) { + return {}; } 
Cache::Cache(Device *device, int vmId) : mDevice(device), mVmIm(vmId) { - mMemoryTableBuffer = - vk::Buffer::Allocate(vk::getHostVisibleMemory(), 0x10000); + mMemoryTableBuffer = vk::Buffer::Allocate( + vk::getHostVisibleMemory(), kMemoryTableSize * kMemoryTableCount); + mGdsBuffer = vk::Buffer::Allocate(vk::getHostVisibleMemory(), 0x40000); { @@ -1080,8 +1649,82 @@ Cache::Cache(Device *device, int vmId) : mDevice(device), mVmIm(vmId) { vk::context->allocator, &mComputePipelineLayout)); } + + { + VkDescriptorPoolSize descriptorPoolSizes[]{ + { + .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + .descriptorCount = 4 * (kDescriptorSetCount * 2 / 4), + }, + { + .type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 3 * 32 * (kDescriptorSetCount * 2 / 4), + }, + { + .type = VK_DESCRIPTOR_TYPE_SAMPLER, + .descriptorCount = 32 * (kDescriptorSetCount * 2 / 4), + }, + { + .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 32 * (kDescriptorSetCount * 2 / 4), + }, + }; + + VkDescriptorPoolCreateInfo info{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .maxSets = kDescriptorSetCount * 2, + .poolSizeCount = static_cast(std::size(descriptorPoolSizes)), + .pPoolSizes = descriptorPoolSizes, + }; + + VK_VERIFY(vkCreateDescriptorPool(vk::context->device, &info, + vk::context->allocator, &mDescriptorPool)); + } + + { + VkDescriptorSetAllocateInfo info{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = mDescriptorPool, + .descriptorSetCount = + static_cast(mGraphicsDescriptorSetLayouts.size()), + .pSetLayouts = mGraphicsDescriptorSetLayouts.data(), + }; + + for (auto &graphicsSet : mGraphicsDescriptorSets) { + vkAllocateDescriptorSets(vk::context->device, &info, graphicsSet.data()); + } + } + + { + VkDescriptorSetAllocateInfo info{ + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = mDescriptorPool, + .descriptorSetCount = 1, + .pSetLayouts = &mComputeDescriptorSetLayout, + }; + + for (auto 
&computeSet : mComputeDescriptorSets) { + vkAllocateDescriptorSets(vk::context->device, &info, &computeSet); + } + } +} + +Cache::~Cache() { + vkDestroyDescriptorPool(vk::context->device, mDescriptorPool, + vk::context->allocator); + + vkDestroyPipelineLayout(vk::context->device, mGraphicsPipelineLayout, + vk::context->allocator); + vkDestroyPipelineLayout(vk::context->device, mComputePipelineLayout, + vk::context->allocator); + + for (auto &layout : mGraphicsDescriptorSetLayouts) { + vkDestroyDescriptorSetLayout(vk::context->device, layout, + vk::context->allocator); + } + vkDestroyDescriptorSetLayout(vk::context->device, mComputeDescriptorSetLayout, + vk::context->allocator); } -Cache::~Cache() {} void Cache::addFrameBuffer(Scheduler &scheduler, int index, std::uint64_t address, std::uint32_t width, @@ -1142,117 +1785,6 @@ void Cache::flush(Scheduler &scheduler, std::uint64_t address, // flushCacheImpl(scheduler, tag, mShaders, beginAddress, endAddress); flushCacheImpl(scheduler, tag, mSyncTable, beginAddress, endAddress); -} - -std::array -Cache::createGraphicsDescriptorSets() { - std::lock_guard lock(mDescriptorMtx); - - if (!mGraphicsDescriptorSets.empty()) { - auto result = mGraphicsDescriptorSets.back(); - mGraphicsDescriptorSets.pop_back(); - return result; - } - - constexpr auto maxSets = Cache::kGraphicsStages.size() * 128; - - if (mGraphicsDescriptorPool == nullptr) { - VkDescriptorPoolSize poolSizes[]{ - { - .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - .descriptorCount = 1 * (maxSets / 4), - }, - { - .type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, - .descriptorCount = 3 * 16 * (maxSets / 4), - }, - { - .type = VK_DESCRIPTOR_TYPE_SAMPLER, - .descriptorCount = 16 * (maxSets / 4), - }, - { - .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - .descriptorCount = 16 * (maxSets / 4), - }, - }; - - VkDescriptorPoolCreateInfo info{ - .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, - .maxSets = maxSets, - .poolSizeCount = static_cast(std::size(poolSizes)), - 
.pPoolSizes = poolSizes, - }; - - VK_VERIFY(vkCreateDescriptorPool(vk::context->device, &info, - vk::context->allocator, - &mGraphicsDescriptorPool)); - } - - VkDescriptorSetAllocateInfo info{ - .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, - .descriptorPool = mGraphicsDescriptorPool, - .descriptorSetCount = - static_cast(mGraphicsDescriptorSetLayouts.size()), - .pSetLayouts = mGraphicsDescriptorSetLayouts.data(), - }; - - std::array result; - VK_VERIFY( - vkAllocateDescriptorSets(vk::context->device, &info, result.data())); - return result; -} - -VkDescriptorSet Cache::createComputeDescriptorSet() { - std::lock_guard lock(mDescriptorMtx); - - if (!mComputeDescriptorSets.empty()) { - auto result = mComputeDescriptorSets.back(); - mComputeDescriptorSets.pop_back(); - return result; - } - - constexpr auto maxSets = 128; - - if (mComputeDescriptorPool == nullptr) { - VkDescriptorPoolSize poolSizes[]{ - { - .type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - .descriptorCount = 1 * (maxSets / 4), - }, - { - .type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, - .descriptorCount = 3 * 16 * (maxSets / 4), - }, - { - .type = VK_DESCRIPTOR_TYPE_SAMPLER, - .descriptorCount = 16 * (maxSets / 4), - }, - { - .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - .descriptorCount = 16 * (maxSets / 4), - }, - }; - - VkDescriptorPoolCreateInfo info{ - .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, - .maxSets = maxSets, - .poolSizeCount = static_cast(std::size(poolSizes)), - .pPoolSizes = poolSizes, - }; - - VK_VERIFY(vkCreateDescriptorPool(vk::context->device, &info, - vk::context->allocator, - &mComputeDescriptorPool)); - } - - VkDescriptorSetAllocateInfo info{ - .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, - .descriptorPool = mComputeDescriptorPool, - .descriptorSetCount = 1, - .pSetLayouts = &mComputeDescriptorSetLayout, - }; - - VkDescriptorSet result; - VK_VERIFY(vkAllocateDescriptorSets(vk::context->device, &info, &result)); - return result; + scheduler.submit(); + 
scheduler.wait(); } diff --git a/rpcsx-gpu2/Cache.hpp b/rpcsx-gpu2/Cache.hpp index c703af75f..21f174b34 100644 --- a/rpcsx-gpu2/Cache.hpp +++ b/rpcsx-gpu2/Cache.hpp @@ -5,12 +5,17 @@ #include "gnm/constants.hpp" #include "rx/die.hpp" #include "shader/Access.hpp" +#include "shader/Evaluator.hpp" #include "shader/GcnConverter.hpp" #include #include #include +#include +#include +#include #include #include +#include #include namespace amdgpu { @@ -22,11 +27,7 @@ struct ShaderKey { shader::gcn::Environment env; }; -enum class ImageKind { - Color, - Depth, - Stencil -}; +enum class ImageKind { Color, Depth, Stencil }; struct ImageKey { std::uint64_t readAddress; @@ -48,15 +49,6 @@ struct ImageKey { static ImageKey createFrom(const gnm::TBuffer &tbuffer); }; -struct ImageViewKey : ImageKey { - gnm::Swizzle R = gnm::Swizzle::R; - gnm::Swizzle G = gnm::Swizzle::G; - gnm::Swizzle B = gnm::Swizzle::B; - gnm::Swizzle A = gnm::Swizzle::A; - - static ImageViewKey createFrom(const gnm::TBuffer &tbuffer); -}; - struct SamplerKey { VkFilter magFilter; VkFilter minFilter; @@ -98,6 +90,10 @@ struct Cache { }; static constexpr int getStageIndex(VkShaderStageFlagBits stage) { + if (stage == VK_SHADER_STAGE_COMPUTE_BIT) { + return 0; + } + auto it = std::find(kGraphicsStages.begin(), kGraphicsStages.end(), stage); if (it == kGraphicsStages.end()) { @@ -107,9 +103,10 @@ struct Cache { return it - kGraphicsStages.begin(); } - static constexpr int getDescriptorBinding(VkDescriptorType type, int dim = 0) { - auto it = - std::find(kDescriptorBindings.begin(), kDescriptorBindings.end(), type + dim * 1000); + static constexpr int getDescriptorBinding(VkDescriptorType type, + int dim = 0) { + auto it = std::find(kDescriptorBindings.begin(), kDescriptorBindings.end(), + type + dim * 1000); if (it == kDescriptorBindings.end()) { return -1; @@ -124,17 +121,17 @@ struct Cache { int vmId = -1; struct Shader { - VkShaderEXT handle; + VkShaderEXT handle = VK_NULL_HANDLE; shader::gcn::ShaderInfo *info; 
VkShaderStageFlagBits stage; }; struct Sampler { - VkSampler handle; + VkSampler handle = VK_NULL_HANDLE; }; struct Buffer { - VkBuffer handle; + VkBuffer handle = VK_NULL_HANDLE; std::uint64_t offset; std::uint64_t deviceAddress; TagId tagId; @@ -142,7 +139,7 @@ struct Cache { }; struct IndexBuffer { - VkBuffer handle; + VkBuffer handle = VK_NULL_HANDLE; std::uint64_t offset; std::uint32_t indexCount; gnm::PrimitiveType primType; @@ -150,73 +147,150 @@ struct Cache { }; struct Image { - VkImage handle; + VkImage handle = VK_NULL_HANDLE; VkImageSubresourceRange subresource; }; struct ImageView { - VkImageView handle; + VkImageView handle = VK_NULL_HANDLE; VkImage imageHandle; VkImageSubresourceRange subresource; }; - class Tag { - Cache *mParent = nullptr; - Scheduler *mScheduler = nullptr; - TagId mTagId{}; + class Tag; + +private: + struct MemoryTableSlot { + std::uint64_t address; + union { + struct { + std::uint64_t size : 40; + std::uint64_t flags : 4; + }; + std::uint64_t sizeAndFlags; + }; + std::uint64_t deviceAddress; + }; + + struct MemoryTable { + std::uint32_t count; + std::uint32_t pad; + MemoryTableSlot slots[]; + }; + + struct ShaderResources : shader::eval::Evaluator { + std::map slotResources; + std::span userSgprs; + Tag *cacheTag = nullptr; + + std::uint32_t slotOffset = 0; + rx::MemoryTableWithPayload bufferMemoryTable; + std::vector> resourceSlotToAddress; + std::vector samplerResources; + std::vector imageResources[3]; + + using Evaluator::eval; + + void clear() { + slotResources.clear(); + userSgprs = {}; + cacheTag = nullptr; + slotOffset = 0; + bufferMemoryTable.clear(); + resourceSlotToAddress.clear(); + samplerResources.clear(); + for (auto &res : imageResources) { + res.clear(); + } + + Evaluator::invalidate(); + } + + void loadResources(shader::gcn::Resources &res, + std::span userSgprs); + void buildMemoryTable(MemoryTable &memoryTable); + std::uint32_t getResourceSlot(std::uint32_t id); + + template T readPointer(std::uint64_t 
address) { + T result{}; + cacheTag->readMemory(&result, address, sizeof(result)); + return result; + } + + shader::eval::Value + eval(shader::ir::InstructionId instId, + std::span operands) override; + }; + + struct TagStorage { + struct MemoryTableConfigSlot { + std::uint32_t bufferIndex; + std::uint32_t configIndex; + std::uint32_t resourceSlot; + }; std::vector> mAcquiredResources; - std::vector> - mGraphicsDescriptorSets; + std::vector memoryTableConfigSlots; + std::vector descriptorBuffers; + ShaderResources shaderResources; - std::vector mComputeDescriptorSets; + TagStorage() = default; + TagStorage(const TagStorage &) = delete; - public: - Tag() = default; - Tag(Cache *parent, Scheduler &scheduler, TagId id) - : mParent(parent), mScheduler(&scheduler), mTagId(id) {} + void clear() { + mAcquiredResources.clear(); + memoryTableConfigSlots.clear(); + descriptorBuffers.clear(); + shaderResources.clear(); + } + }; + + struct TagData { + TagStorage *mStorage = nullptr; + Scheduler *mScheduler = nullptr; + Cache *mParent = nullptr; + TagId mTagId{}; + std::uint32_t mAcquiredMemoryTable = -1; + }; + +public: + struct Tag : protected TagData { Tag(const Tag &) = delete; - Tag(Tag &&other) { other.swap(*this); } - Tag &operator=(Tag &&other) { - other.swap(*this); + Tag() noexcept = default; + Tag(Tag &&other) noexcept { swap(other); } + Tag &operator=(Tag &&other) noexcept { + swap(other); return *this; } + ~Tag() { release(); } - void submitAndWait() { - mScheduler->submit(); - mScheduler->wait(); + void swap(Tag &other) { + std::swap(static_cast(*this), static_cast(other)); } - Scheduler &getScheduler() const { return *mScheduler; } - - ~Tag() { release(); } + Shader getShader(const ShaderKey &key, + const ShaderKey *dependedKey = nullptr); TagId getReadId() const { return TagId{std::uint64_t(mTagId) - 1}; } TagId getWriteId() const { return mTagId; } - void swap(Tag &other) { - std::swap(mParent, other.mParent); - std::swap(mScheduler, other.mScheduler); - 
std::swap(mTagId, other.mTagId); - std::swap(mAcquiredResources, other.mAcquiredResources); - std::swap(mGraphicsDescriptorSets, other.mGraphicsDescriptorSets); - std::swap(mComputeDescriptorSets, other.mComputeDescriptorSets); - } - Cache *getCache() const { return mParent; } Device *getDevice() const { return mParent->mDevice; } + Scheduler &getScheduler() const { return *mScheduler; } int getVmId() const { return mParent->mVmIm; } - Shader getShader(const ShaderKey &key, - const ShaderKey *dependedKey = nullptr); + Buffer getInternalHostVisibleBuffer(std::uint64_t size); + Buffer getInternalDeviceLocalBuffer(std::uint64_t size); + + void buildDescriptors(VkDescriptorSet descriptorSet); + Sampler getSampler(const SamplerKey &key); Buffer getBuffer(std::uint64_t address, std::uint64_t size, Access access); - Buffer getInternalBuffer(std::uint64_t size); IndexBuffer getIndexBuffer(std::uint64_t address, std::uint32_t indexCount, gnm::PrimitiveType primType, gnm::IndexType indexType); Image getImage(const ImageKey &key, Access access); - ImageView getImageView(const ImageViewKey &key, Access access); + ImageView getImageView(const ImageKey &key, Access access); void readMemory(void *target, std::uint64_t address, std::uint64_t size); void writeMemory(const void *source, std::uint64_t address, std::uint64_t size); @@ -232,28 +306,138 @@ struct Cache { return getCache()->getComputePipelineLayout(); } - std::array - createGraphicsDescriptorSets() { - auto result = getCache()->createGraphicsDescriptorSets(); - mGraphicsDescriptorSets.push_back(result); - return result; - } + Buffer getMemoryTable() { + if (mAcquiredMemoryTable + 1 == 0) { + mAcquiredMemoryTable = mParent->mMemoryTablePool.acquire(); + } + + auto &buffer = mParent->mMemoryTableBuffer; + auto offset = mAcquiredMemoryTable * kMemoryTableSize; + + Buffer result{ + .offset = offset, + .deviceAddress = buffer.getAddress() + offset, + .tagId = getReadId(), + .data = buffer.getData() + offset, + }; - 
VkDescriptorSet createComputeDescriptorSet() { - auto result = getCache()->createComputeDescriptorSet(); - mComputeDescriptorSets.push_back(result); return result; } std::shared_ptr findShader(const ShaderKey &key, const ShaderKey *dependedKey = nullptr); + friend Cache; }; + struct GraphicsTag : Tag { + GraphicsTag() = default; + GraphicsTag(GraphicsTag &&other) noexcept { swap(other); } + GraphicsTag &operator=(GraphicsTag &&other) noexcept { + swap(other); + return *this; + } + ~GraphicsTag() { release(); } + + std::array getDescriptorSets() { + if (mAcquiredGraphicsDescriptorSet + 1 == 0) { + mAcquiredGraphicsDescriptorSet = + mParent->mGraphicsDescriptorSetPool.acquire(); + } + + return mParent->mGraphicsDescriptorSets[mAcquiredGraphicsDescriptorSet]; + } + + Shader getShader(shader::gcn::Stage stage, const SpiShaderPgm &pgm, + const Registers::Context &context, + gnm::PrimitiveType vsPrimType, + std::span viewPorts, + std::span psVgprInput); + + Shader getPixelShader(const SpiShaderPgm &pgm, + const Registers::Context &context, + std::span viewPorts); + + Shader getVertexShader(shader::gcn::Stage stage, const SpiShaderPgm &pgm, + const Registers::Context &context, + gnm::PrimitiveType vsPrimType, + std::span viewPorts); + void release(); + + void swap(GraphicsTag &other) { + Tag::swap(other); + std::swap(mAcquiredGraphicsDescriptorSet, + other.mAcquiredGraphicsDescriptorSet); + } + + private: + std::uint32_t mAcquiredGraphicsDescriptorSet = -1; + }; + + struct ComputeTag : Tag { + ComputeTag() = default; + ComputeTag(ComputeTag &&other) noexcept { swap(other); } + ComputeTag &operator=(ComputeTag &&other) noexcept { + swap(other); + return *this; + } + ~ComputeTag() { release(); } + + Shader getShader(const Registers::ComputeConfig &pgm); + + VkDescriptorSet getDescriptorSet() { + if (mAcquiredComputeDescriptorSet + 1 == 0) { + mAcquiredComputeDescriptorSet = + mParent->mComputeDescriptorSetPool.acquire(); + } + + return 
mParent->mComputeDescriptorSets[mAcquiredComputeDescriptorSet]; + } + + void release(); + + void swap(ComputeTag &other) { + Tag::swap(other); + std::swap(mAcquiredComputeDescriptorSet, + other.mAcquiredComputeDescriptorSet); + } + + private: + std::uint32_t mAcquiredComputeDescriptorSet = -1; + }; + +private: + template T createTagImpl(Scheduler &scheduler) { + T result; + + auto id = mNextTagId.load(std::memory_order::acquire); + while (!mNextTagId.compare_exchange_weak( + id, TagId{static_cast(id) + 2}, + std::memory_order::release, std::memory_order::relaxed)) { + } + + auto storageIndex = mTagStoragePool.acquire(); + + // std::println("acquire tag storage {}", storageIndex); + result.mStorage = mTagStorages + storageIndex; + result.mTagId = id; + result.mParent = this; + result.mScheduler = &scheduler; + + return result; + } + +public: Cache(Device *device, int vmId); ~Cache(); - Tag createTag(Scheduler &scheduler); - vk::Buffer &getMemoryTableBuffer() { return mMemoryTableBuffer; } + Tag createTag(Scheduler &scheduler) { return createTagImpl(scheduler); } + GraphicsTag createGraphicsTag(Scheduler &scheduler) { + return createTagImpl(scheduler); + } + ComputeTag createComputeTag(Scheduler &scheduler) { + return createTagImpl(scheduler); + } + vk::Buffer &getGdsBuffer() { return mGdsBuffer; } void addFrameBuffer(Scheduler &scheduler, int index, std::uint64_t address, @@ -273,21 +457,6 @@ struct Cache { flush(scheduler, 0, ~static_cast(0)); } - const std::array & - getGraphicsDescriptorSetLayouts() const { - return mGraphicsDescriptorSetLayouts; - } - - VkDescriptorSetLayout - getGraphicsDescriptorSetLayout(VkShaderStageFlagBits stage) const { - int index = getStageIndex(stage); - rx::dieIf(index < 0, "getGraphicsDescriptorSetLayout: unexpected stage"); - return mGraphicsDescriptorSetLayouts[index]; - } - - VkDescriptorSetLayout getComputeDescriptorSetLayout() const { - return mComputeDescriptorSetLayout; - } VkPipelineLayout getGraphicsPipelineLayout() const { 
return mGraphicsPipelineLayout; } @@ -296,19 +465,8 @@ struct Cache { return mComputePipelineLayout; } - std::array - createGraphicsDescriptorSets(); - VkDescriptorSet createComputeDescriptorSet(); - - void destroyGraphicsDescriptorSets( - const std::array &set) { - std::lock_guard lock(mDescriptorMtx); - mGraphicsDescriptorSets.push_back(set); - } - - void destroyComputeDescriptorSet(VkDescriptorSet set) { - std::lock_guard lock(mDescriptorMtx); - mComputeDescriptorSets.push_back(set); + auto &getGraphicsDescriptorSetLayouts() const { + return mGraphicsDescriptorSetLayouts; } private: @@ -316,21 +474,31 @@ private: Device *mDevice; int mVmIm; - TagId mNextTagId{2}; - vk::Buffer mMemoryTableBuffer; + std::atomic mNextTagId{TagId{2}}; vk::Buffer mGdsBuffer; - std::mutex mDescriptorMtx; + static constexpr auto kMemoryTableSize = 0x10000; + static constexpr auto kMemoryTableCount = 64; + static constexpr auto kDescriptorSetCount = 128; + static constexpr auto kTagStorageCount = 128; + + rx::ConcurrentBitPool mMemoryTablePool; + vk::Buffer mMemoryTableBuffer; + std::array mGraphicsDescriptorSetLayouts{}; VkDescriptorSetLayout mComputeDescriptorSetLayout{}; VkPipelineLayout mGraphicsPipelineLayout{}; VkPipelineLayout mComputePipelineLayout{}; - VkDescriptorPool mGraphicsDescriptorPool{}; - VkDescriptorPool mComputeDescriptorPool{}; - std::vector> - mGraphicsDescriptorSets; - std::vector mComputeDescriptorSets; + VkDescriptorPool mDescriptorPool{}; + + rx::ConcurrentBitPool mGraphicsDescriptorSetPool; + rx::ConcurrentBitPool mComputeDescriptorSetPool; + rx::ConcurrentBitPool mTagStoragePool; + std::array + mGraphicsDescriptorSets[kDescriptorSetCount]; + VkDescriptorSet mComputeDescriptorSets[kDescriptorSetCount]; + TagStorage mTagStorages[kTagStorageCount]; std::map mSamplers; std::shared_ptr mFrameBuffers[10]; diff --git a/rpcsx-gpu2/Device.hpp b/rpcsx-gpu2/Device.hpp index 6759c826c..620b0d32d 100644 --- a/rpcsx-gpu2/Device.hpp +++ b/rpcsx-gpu2/Device.hpp @@ -1,9 +1,9 
@@ #pragma once #include "Cache.hpp" +#include "FlipPipeline.hpp" #include "Pipe.hpp" #include "amdgpu/bridge/bridge.hpp" #include "amdgpu/tiler_vulkan.hpp" -#include "FlipPipeline.hpp" #include "rx/MemoryTable.hpp" #include "shader/SemanticInfo.hpp" #include "shader/SpvConverter.hpp" @@ -70,6 +70,14 @@ struct Device { return caches[vmId].createTag(scheduler); } + Cache::GraphicsTag getGraphicsTag(int vmId, Scheduler &scheduler) { + return caches[vmId].createGraphicsTag(scheduler); + } + + Cache::ComputeTag getComputeTag(int vmId, Scheduler &scheduler) { + return caches[vmId].createComputeTag(scheduler); + } + void mapProcess(std::int64_t pid, int vmId, const char *shmName); void unmapProcess(std::int64_t pid); void protectMemory(int pid, std::uint64_t address, std::uint64_t size, diff --git a/rpcsx-gpu2/FlipPipeline.cpp b/rpcsx-gpu2/FlipPipeline.cpp index c170b519f..85adec48a 100644 --- a/rpcsx-gpu2/FlipPipeline.cpp +++ b/rpcsx-gpu2/FlipPipeline.cpp @@ -242,37 +242,10 @@ FlipPipeline::FlipPipeline() { void FlipPipeline::bind(Scheduler &sched, FlipType type, VkImageView imageView, VkSampler sampler) { auto cmdBuffer = sched.getCommandBuffer(); + auto descriptorIndex = descriptorSetPool.acquire(); - auto allocateDescriptorSetIndex = [this] { - auto mask = freeDescriptorSets.load(std::memory_order::acquire); - - while (true) { - auto index = std::countr_one(mask); - if (index >= std::size(descriptorSets)) { - mask = freeDescriptorSets.load(std::memory_order::relaxed); - continue; - } - - if (!freeDescriptorSets.compare_exchange_weak( - mask, mask | (1 << index), std::memory_order::release, - std::memory_order::relaxed)) { - continue; - } - - return index; - } - }; - - auto descriptorIndex = allocateDescriptorSetIndex(); - - sched.afterSubmit([this, descriptorIndex] { - decltype(freeDescriptorSets)::value_type mask = 1 << descriptorIndex; - - while (!freeDescriptorSets.compare_exchange_weak( - mask, mask & ~(1 << descriptorIndex), std::memory_order::release, - 
std::memory_order::acquire)) { - } - }); + sched.afterSubmit( + [this, descriptorIndex] { descriptorSetPool.release(descriptorIndex); }); auto descriptorSet = descriptorSets[descriptorIndex]; VkDescriptorImageInfo imageInfo = { diff --git a/rpcsx-gpu2/FlipPipeline.hpp b/rpcsx-gpu2/FlipPipeline.hpp index 8e286d45a..f32300228 100644 --- a/rpcsx-gpu2/FlipPipeline.hpp +++ b/rpcsx-gpu2/FlipPipeline.hpp @@ -1,8 +1,7 @@ #pragma once #include "Scheduler.hpp" -#include -#include +#include #include enum class FlipType { @@ -11,6 +10,7 @@ enum class FlipType { }; struct FlipPipeline { + static constexpr auto kDescriptorSetCount = 16; VkShaderModule flipVertShaderModule{}; VkShaderModule flipFragStdShaderModule{}; VkShaderModule flipFragAltShaderModule{}; @@ -18,8 +18,8 @@ struct FlipPipeline { VkDescriptorSetLayout descriptorSetLayout{}; VkPipeline pipelines[2]{}; VkDescriptorPool descriptorPool{}; - VkDescriptorSet descriptorSets[8]{}; - std::atomic freeDescriptorSets{0}; + VkDescriptorSet descriptorSets[kDescriptorSetCount]{}; + rx::ConcurrentBitPool descriptorSetPool; FlipPipeline(const FlipPipeline &) = delete; FlipPipeline(); diff --git a/rpcsx-gpu2/Registers.hpp b/rpcsx-gpu2/Registers.hpp index cddc737fd..5673aea78 100644 --- a/rpcsx-gpu2/Registers.hpp +++ b/rpcsx-gpu2/Registers.hpp @@ -203,6 +203,8 @@ struct PaScRect { std::uint16_t top; std::uint16_t right; std::uint16_t bottom; + + bool isValid() const { return left < right && top < bottom; } }; struct SpiShaderPgm { diff --git a/rpcsx-gpu2/Renderer.cpp b/rpcsx-gpu2/Renderer.cpp index df21dd8f4..d988bb9dd 100644 --- a/rpcsx-gpu2/Renderer.cpp +++ b/rpcsx-gpu2/Renderer.cpp @@ -1,8 +1,6 @@ #include "Renderer.hpp" #include "Device.hpp" -#include "gnm/descriptors.hpp" #include "gnm/gnm.hpp" -#include "rx/MemoryTable.hpp" #include #include @@ -14,7 +12,6 @@ #include #include -#include #include using namespace shader; @@ -35,39 +32,26 @@ VkRect2D toVkRect2D(amdgpu::PaScRect rect) { }; } -amdgpu::PaScRect 
intersection(amdgpu::PaScRect rect, amdgpu::PaScRect scissor) { +amdgpu::PaScRect intersection(amdgpu::PaScRect lhs, amdgpu::PaScRect rhs) { + if (!lhs.isValid()) { + return rhs; + } + + if (!rhs.isValid()) { + return lhs; + } + amdgpu::PaScRect result{ - .left = std::max(rect.left, scissor.left), - .top = std::max(rect.top, scissor.top), - .right = std::min(rect.right, scissor.right), - .bottom = std::min(rect.bottom, scissor.bottom), + .left = std::max(lhs.left, rhs.left), + .top = std::max(lhs.top, rhs.top), + .right = std::min(lhs.right, rhs.right), + .bottom = std::min(lhs.bottom, rhs.bottom), }; - result.top = std::min(result.top, result.bottom); - result.bottom = std::max(result.top, result.bottom); - result.left = std::min(result.left, result.right); - result.right = std::max(result.left, result.right); return result; } } // namespace gnm -struct MemoryTableSlot { - std::uint64_t address; - union { - struct { - std::uint64_t size : 40; - std::uint64_t flags : 4; - }; - std::uint64_t sizeAndFlags; - }; - std::uint64_t deviceAddress; -}; -struct MemoryTable { - std::uint32_t count; - std::uint32_t pad; - MemoryTableSlot slots[]; -}; - static VkShaderEXT getPrimTypeRectGeomShader(amdgpu::Cache &cache) { static VkShaderEXT shader = VK_NULL_HANDLE; if (shader != VK_NULL_HANDLE) { @@ -151,282 +135,6 @@ static VkPrimitiveTopology toVkPrimitiveType(gnm::PrimitiveType type) { } } -struct ShaderResources : eval::Evaluator { - amdgpu::Cache::Tag *cacheTag; - shader::eval::Evaluator evaluator; - std::map slotResources; - std::span userSgprs; - - std::uint32_t slotOffset = 0; - rx::MemoryTableWithPayload bufferMemoryTable; - std::vector> resourceSlotToAddress; - std::vector samplerResources; - std::vector imageResources[3]; - - using Evaluator::eval; - - ShaderResources() = default; - - void loadResources(shader::gcn::Resources &res, - std::span userSgprs) { - this->userSgprs = userSgprs; - for (auto &pointer : res.pointers) { - auto pointerBase = 
eval(pointer.base).zExtScalar(); - auto pointerOffset = eval(pointer.offset).zExtScalar(); - - if (!pointerBase || !pointerOffset) { - res.dump(); - rx::die("failed to evaluate pointer"); - } - - bufferMemoryTable.map(*pointerBase, - *pointerBase + *pointerOffset + pointer.size, - Access::Read); - resourceSlotToAddress.push_back( - {slotOffset + pointer.resourceSlot, *pointerBase}); - } - - for (auto &bufferRes : res.buffers) { - auto word0 = eval(bufferRes.words[0]).zExtScalar(); - auto word1 = eval(bufferRes.words[1]).zExtScalar(); - auto word2 = eval(bufferRes.words[2]).zExtScalar(); - auto word3 = eval(bufferRes.words[3]).zExtScalar(); - - if (!word0 || !word1 || !word2 || !word3) { - res.dump(); - rx::die("failed to evaluate V#"); - } - - gnm::VBuffer buffer{}; - std::memcpy(reinterpret_cast(&buffer), &*word0, - sizeof(std::uint32_t)); - std::memcpy(reinterpret_cast(&buffer) + 1, &*word1, - sizeof(std::uint32_t)); - std::memcpy(reinterpret_cast(&buffer) + 2, &*word2, - sizeof(std::uint32_t)); - std::memcpy(reinterpret_cast(&buffer) + 3, &*word3, - sizeof(std::uint32_t)); - - bufferMemoryTable.map(buffer.address(), buffer.address() + buffer.size(), - bufferRes.access); - resourceSlotToAddress.push_back( - {slotOffset + bufferRes.resourceSlot, buffer.address()}); - } - - for (auto &texture : res.textures) { - auto word0 = eval(texture.words[0]).zExtScalar(); - auto word1 = eval(texture.words[1]).zExtScalar(); - auto word2 = eval(texture.words[2]).zExtScalar(); - auto word3 = eval(texture.words[3]).zExtScalar(); - - if (!word0 || !word1 || !word2 || !word3) { - res.dump(); - rx::die("failed to evaluate 128 bit T#"); - } - - gnm::TBuffer buffer{}; - std::memcpy(reinterpret_cast(&buffer), &*word0, - sizeof(std::uint32_t)); - std::memcpy(reinterpret_cast(&buffer) + 1, &*word1, - sizeof(std::uint32_t)); - std::memcpy(reinterpret_cast(&buffer) + 2, &*word2, - sizeof(std::uint32_t)); - std::memcpy(reinterpret_cast(&buffer) + 3, &*word3, - sizeof(std::uint32_t)); - - if 
(texture.words[4] != nullptr) { - auto word4 = eval(texture.words[4]).zExtScalar(); - auto word5 = eval(texture.words[5]).zExtScalar(); - auto word6 = eval(texture.words[6]).zExtScalar(); - auto word7 = eval(texture.words[7]).zExtScalar(); - - if (!word4 || !word5 || !word6 || !word7) { - res.dump(); - rx::die("failed to evaluate 256 bit T#"); - } - - std::memcpy(reinterpret_cast(&buffer) + 4, &*word4, - sizeof(std::uint32_t)); - std::memcpy(reinterpret_cast(&buffer) + 5, &*word5, - sizeof(std::uint32_t)); - std::memcpy(reinterpret_cast(&buffer) + 6, &*word6, - sizeof(std::uint32_t)); - std::memcpy(reinterpret_cast(&buffer) + 7, &*word7, - sizeof(std::uint32_t)); - } - - std::vector *resources = nullptr; - - switch (buffer.type) { - case gnm::TextureType::Array1D: - case gnm::TextureType::Dim1D: - resources = &imageResources[0]; - break; - case gnm::TextureType::Dim2D: - case gnm::TextureType::Array2D: - case gnm::TextureType::Msaa2D: - case gnm::TextureType::MsaaArray2D: - case gnm::TextureType::Cube: - resources = &imageResources[1]; - break; - case gnm::TextureType::Dim3D: - resources = &imageResources[2]; - break; - } - - rx::dieIf(resources == nullptr, - "ShaderResources: unexpected texture type %u", - static_cast(buffer.type)); - - slotResources[slotOffset + texture.resourceSlot] = resources->size(); - resources->push_back(cacheTag->getImageView( - amdgpu::ImageViewKey::createFrom(buffer), texture.access)); - } - - for (auto &sampler : res.samplers) { - auto word0 = eval(sampler.words[0]).zExtScalar(); - auto word1 = eval(sampler.words[1]).zExtScalar(); - auto word2 = eval(sampler.words[2]).zExtScalar(); - auto word3 = eval(sampler.words[3]).zExtScalar(); - - if (!word0 || !word1 || !word2 || !word3) { - res.dump(); - rx::die("failed to evaluate S#"); - } - - gnm::SSampler sSampler{}; - std::memcpy(reinterpret_cast(&sSampler), &*word0, - sizeof(std::uint32_t)); - std::memcpy(reinterpret_cast(&sSampler) + 1, &*word1, - sizeof(std::uint32_t)); - 
std::memcpy(reinterpret_cast(&sSampler) + 2, &*word2, - sizeof(std::uint32_t)); - std::memcpy(reinterpret_cast(&sSampler) + 3, &*word3, - sizeof(std::uint32_t)); - - if (sampler.unorm) { - sSampler.force_unorm_coords = true; - } - - slotResources[slotOffset + sampler.resourceSlot] = - samplerResources.size(); - samplerResources.push_back( - cacheTag->getSampler(amdgpu::SamplerKey::createFrom(sSampler))); - } - - slotOffset += res.slots; - } - - void buildMemoryTable(MemoryTable &memoryTable) { - memoryTable.count = 0; - - for (auto p : bufferMemoryTable) { - auto size = p.endAddress - p.beginAddress; - auto buffer = cacheTag->getBuffer(p.beginAddress, size, p.payload); - - auto memoryTableSlot = memoryTable.count; - memoryTable.slots[memoryTable.count++] = { - .address = p.beginAddress, - .size = size, - .flags = static_cast(p.payload), - .deviceAddress = buffer.deviceAddress, - }; - - for (auto [slot, address] : resourceSlotToAddress) { - if (address >= p.beginAddress && address < p.endAddress) { - slotResources[slot] = memoryTableSlot; - } - } - } - } - - std::uint32_t getResourceSlot(std::uint32_t id) { - if (auto it = slotResources.find(id); it != slotResources.end()) { - return it->second; - } - return -1; - } - - template T readPointer(std::uint64_t address) { - T result{}; - cacheTag->readMemory(&result, address, sizeof(result)); - return result; - } - - eval::Value eval(ir::InstructionId instId, - std::span operands) override { - if (instId == ir::amdgpu::POINTER) { - auto type = operands[0].getAsValue(); - auto loadSize = *operands[1].getAsInt32(); - auto base = eval(operands[2]).zExtScalar(); - auto offset = eval(operands[3]).zExtScalar(); - - if (!base || !offset) { - rx::die("failed to evaluate pointer dependency"); - } - - eval::Value result; - auto address = *base + *offset; - - switch (loadSize) { - case 1: - result = readPointer(address); - break; - case 2: - result = readPointer(address); - break; - case 4: - result = readPointer(address); - break; 
- case 8: - result = readPointer(address); - break; - case 12: - result = readPointer(address); - break; - case 16: - result = readPointer(address); - break; - case 32: - result = readPointer>(address); - break; - default: - rx::die("unexpected pointer load size"); - } - - return result; - } - - if (instId == ir::amdgpu::VBUFFER) { - rx::die("resource depends on buffer value"); - } - - if (instId == ir::amdgpu::TBUFFER) { - rx::die("resource depends on texture value"); - } - - if (instId == ir::amdgpu::SAMPLER) { - rx::die("resource depends on sampler value"); - } - - if (instId == ir::amdgpu::USER_SGPR) { - auto index = static_cast(*operands[1].getAsInt32()); - rx::dieIf(index >= userSgprs.size(), "out of user sgprs"); - return userSgprs[index]; - } - - if (instId == ir::amdgpu::IMM) { - auto address = static_cast(*operands[1].getAsInt64()); - - std::uint32_t result; - cacheTag->readMemory(&result, address, sizeof(result)); - return result; - } - - return Evaluator::eval(instId, operands); - } -}; - void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, std::uint32_t vertexCount, std::uint32_t firstInstance, std::uint32_t instanceCount, std::uint64_t indiciesAddress, @@ -449,7 +157,7 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, return; } - auto cacheTag = pipe.device->getCacheTag(vmId, pipe.scheduler); + auto cacheTag = pipe.device->getGraphicsTag(vmId, pipe.scheduler); auto targetMask = pipe.context.cbTargetMask.raw; VkRenderingAttachmentInfo colorAttachments[8]{}; @@ -460,8 +168,12 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, VkRect2D viewPortScissors[8]{}; unsigned renderTargets = 0; - VkRenderingAttachmentInfo depthAttachment{}; - VkRenderingAttachmentInfo stencilAttachment{}; + VkRenderingAttachmentInfo depthAttachment{ + .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + }; + VkRenderingAttachmentInfo stencilAttachment{ + .sType = 
VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, + }; auto depthAccess = Access::None; auto stencilAccess = Access::None; @@ -484,12 +196,15 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, } } + // FIXME + stencilAccess = Access::None; + if (depthAccess != Access::None) { auto viewPortScissor = pipe.context.paScScreenScissor; auto viewPortRect = gnm::toVkRect2D(viewPortScissor); auto imageView = cacheTag.getImageView( - {{ + { .readAddress = pipe.context.dbZReadBase, .writeAddress = pipe.context.dbZWriteBase, .dfmt = gnm::getDataFormat(pipe.context.dbZInfo.format), @@ -502,7 +217,7 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, }, .pitch = viewPortRect.extent.width, .kind = ImageKind::Depth, - }}, + }, depthAccess); depthAttachment = { @@ -533,12 +248,12 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, } auto viewPortScissor = pipe.context.paScScreenScissor; - // viewPortScissor = gnm::intersection( - // viewPortScissor, pipe.context.paScVportScissor[renderTargets]); - // viewPortScissor = - // gnm::intersection(viewPortScissor, pipe.context.paScWindowScissor); - // viewPortScissor = - // gnm::intersection(viewPortScissor, pipe.context.paScGenericScissor); + viewPortScissor = gnm::intersection( + viewPortScissor, pipe.context.paScVportScissor[renderTargets]); + viewPortScissor = + gnm::intersection(viewPortScissor, pipe.context.paScWindowScissor); + viewPortScissor = + gnm::intersection(viewPortScissor, pipe.context.paScGenericScissor); auto viewPortRect = gnm::toVkRect2D(viewPortScissor); @@ -554,7 +269,7 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, auto vkViewPortScissor = gnm::toVkRect2D(viewPortScissor); viewPortScissors[renderTargets] = vkViewPortScissor; - ImageViewKey renderTargetInfo{}; + ImageKey renderTargetInfo{}; renderTargetInfo.type = gnm::TextureType::Dim2D; renderTargetInfo.pitch = vkViewPortScissor.extent.width; 
renderTargetInfo.readAddress = static_cast(cbColor.base) @@ -572,7 +287,6 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, cbColor.info.linearGeneral ? TileMode{.raw = 0} : getDefaultTileModes()[cbColor.attrib.tileModeIndex]; - // std::printf("draw to %lx\n", renderTargetInfo.address); auto access = Access::None; @@ -640,13 +354,6 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, if (renderTargets == 0) { return; } - - // if (pipe.context.cbTargetMask == 0) { - // return; - // } - - // auto cache = pipe.device->getCache(vmId); - if (indiciesAddress == 0) { indexCount = vertexCount; } @@ -659,244 +366,34 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, VkShaderEXT shaders[stages.size()]{}; auto pipelineLayout = cacheTag.getGraphicsPipelineLayout(); - auto descriptorSets = cacheTag.createGraphicsDescriptorSets(); - - std::vector descriptorBuffers; - auto &memoryTableBuffer = cacheTag.getCache()->getMemoryTableBuffer(); - std::uint64_t memoryTableAddress = memoryTableBuffer.getAddress(); - auto memoryTable = std::bit_cast(memoryTableBuffer.getData()); - - std::uint64_t gdsAddress = cacheTag.getCache()->getGdsBuffer().getAddress(); - ShaderResources shaderResources; - shaderResources.cacheTag = &cacheTag; - - struct MemoryTableConfigSlot { - std::uint32_t bufferIndex; - std::uint32_t configIndex; - std::uint32_t resourceSlot; - }; - std::vector memoryTableConfigSlots; - - auto addShader = [&](const SpiShaderPgm &pgm, shader::gcn::Stage stage) { - shader::gcn::Environment env{ - .vgprCount = pgm.rsrc1.getVGprCount(), - .sgprCount = pgm.rsrc1.getSGprCount(), - .userSgprs = std::span(pgm.userData.data(), pgm.rsrc2.userSgpr), - .supportsBarycentric = vk::context->supportsBarycentric, - .supportsInt8 = vk::context->supportsInt8, - .supportsInt64Atomics = vk::context->supportsInt64Atomics, - }; - - auto shader = cacheTag.getShader({ - .address = pgm.address << 8, - .stage = stage, - .env = 
env, - }); - - std::uint32_t slotOffset = shaderResources.slotOffset; - - shaderResources.loadResources( - shader.info->resources, - std::span(pgm.userData.data(), pgm.rsrc2.userSgpr)); - - const auto &configSlots = shader.info->configSlots; - - auto configSize = configSlots.size() * sizeof(std::uint32_t); - auto configBuffer = cacheTag.getInternalBuffer(configSize); - - auto configPtr = reinterpret_cast(configBuffer.data); - - shader::gcn::PsVGprInput - psVgprInput[static_cast(shader::gcn::PsVGprInput::Count)]; - std::size_t psVgprInputs = 0; - - if (stage == shader::gcn::Stage::Ps) { - SpiPsInput spiInputAddr = pipe.context.spiPsInputAddr; - - if (spiInputAddr.perspSampleEna) { - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::IPerspSample; - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JPerspSample; - } - if (spiInputAddr.perspCenterEna) { - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::IPerspCenter; - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JPerspCenter; - } - if (spiInputAddr.perspCentroidEna) { - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::IPerspCentroid; - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JPerspCentroid; - } - if (spiInputAddr.perspPullModelEna) { - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::IW; - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JW; - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::_1W; - } - if (spiInputAddr.linearSampleEna) { - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::ILinearSample; - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JLinearSample; - } - if (spiInputAddr.linearCenterEna) { - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::ILinearCenter; - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::JLinearCenter; - } - if (spiInputAddr.linearCentroidEna) { - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::ILinearCentroid; - psVgprInput[psVgprInputs++] = 
shader::gcn::PsVGprInput::JLinearCentroid; - } - if (spiInputAddr.posXFloatEna) { - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::X; - } - if (spiInputAddr.posYFloatEna) { - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::Y; - } - if (spiInputAddr.posZFloatEna) { - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::Z; - } - if (spiInputAddr.posWFloatEna) { - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::W; - } - if (spiInputAddr.frontFaceEna) { - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::FrontFace; - } - if (spiInputAddr.ancillaryEna) { - rx::die("unimplemented ancillary fs input"); - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::Ancillary; - } - if (spiInputAddr.sampleCoverageEna) { - rx::die("unimplemented sample coverage fs input"); - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::SampleCoverage; - } - if (spiInputAddr.posFixedPtEna) { - rx::die("unimplemented pos fixed fs input"); - psVgprInput[psVgprInputs++] = shader::gcn::PsVGprInput::PosFixed; - } - } - - for (std::size_t index = 0; const auto &slot : configSlots) { - switch (slot.type) { - case shader::gcn::ConfigType::Imm: - cacheTag.readMemory(&configPtr[index], slot.data, - sizeof(std::uint32_t)); - break; - case shader::gcn::ConfigType::UserSgpr: - configPtr[index] = pgm.userData[slot.data]; - break; - case shader::gcn::ConfigType::ViewPortOffsetX: - configPtr[index] = std::bit_cast( - pipe.context.paClVports[slot.data].xOffset / - (viewPorts[0].width / 2.f) - - 1); - break; - case shader::gcn::ConfigType::ViewPortOffsetY: - configPtr[index] = std::bit_cast( - pipe.context.paClVports[slot.data].yOffset / - (viewPorts[slot.data].height / 2.f) - - 1); - break; - case shader::gcn::ConfigType::ViewPortOffsetZ: - configPtr[index] = std::bit_cast( - pipe.context.paClVports[slot.data].zOffset); - break; - case shader::gcn::ConfigType::ViewPortScaleX: - configPtr[index] = std::bit_cast( - pipe.context.paClVports[slot.data].xScale / - 
(viewPorts[slot.data].width / 2.f)); - break; - case shader::gcn::ConfigType::ViewPortScaleY: - configPtr[index] = std::bit_cast( - pipe.context.paClVports[slot.data].yScale / - (viewPorts[slot.data].height / 2.f)); - break; - case shader::gcn::ConfigType::ViewPortScaleZ: - configPtr[index] = std::bit_cast( - pipe.context.paClVports[slot.data].zScale); - break; - case shader::gcn::ConfigType::PsInputVGpr: - if (slot.data > psVgprInputs) { - configPtr[index] = ~0; - } else { - configPtr[index] = - std::bit_cast(psVgprInput[slot.data]); - } - break; - case shader::gcn::ConfigType::VsPrimType: - if (indexBuffer.handle == VK_NULL_HANDLE && - pipe.uConfig.vgtPrimitiveType != indexBuffer.primType) { - configPtr[index] = - static_cast(pipe.uConfig.vgtPrimitiveType.value); - } else { - configPtr[index] = 0; - } - break; - - case shader::gcn::ConfigType::ResourceSlot: - memoryTableConfigSlots.push_back({ - .bufferIndex = static_cast(descriptorBuffers.size()), - .configIndex = static_cast(index), - .resourceSlot = static_cast(slotOffset + slot.data), - }); - break; - - case shader::gcn::ConfigType::MemoryTable: - if (slot.data == 0) { - configPtr[index] = static_cast(memoryTableAddress); - } else { - configPtr[index] = - static_cast(memoryTableAddress >> 32); - } - break; - case shader::gcn::ConfigType::Gds: - if (slot.data == 0) { - configPtr[index] = static_cast(gdsAddress); - } else { - configPtr[index] = static_cast(gdsAddress >> 32); - } - break; - - case shader::gcn::ConfigType::CbCompSwap: - configPtr[index] = std::bit_cast( - pipe.context.cbColor[slot.data].info.compSwap); - break; - } - - ++index; - } - - VkDescriptorBufferInfo bufferInfo{ - .buffer = configBuffer.handle, - .offset = configBuffer.offset, - .range = configSize, - }; - - auto stageIndex = Cache::getStageIndex(shader.stage); - - VkWriteDescriptorSet writeDescSet{ - .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, - .dstSet = descriptorSets[stageIndex], - .dstBinding = 0, - .descriptorCount = 1, - 
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
-        .pBufferInfo = &bufferInfo,
-    };
-
-    vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
-
-    shaders[stageIndex] = shader.handle
-                              ? shader.handle
-                              : getFillRedFragShader(*cacheTag.getCache());
-    descriptorBuffers.push_back(configPtr);
-  };
+  auto descriptorSets = cacheTag.getDescriptorSets();
 
+  Cache::Shader vertexShader;
   if (pipe.context.vgtShaderStagesEn.vsEn == amdgpu::VsStage::VsReal) {
-    addShader(pipe.sh.spiShaderPgmVs, shader::gcn::Stage::VsVs);
+    gnm::PrimitiveType vsPrimType = {};
+    if (indexBuffer.handle == VK_NULL_HANDLE &&
+        pipe.uConfig.vgtPrimitiveType != indexBuffer.primType) {
+      vsPrimType = pipe.uConfig.vgtPrimitiveType.value;
+    }
+
+    vertexShader =
+        cacheTag.getVertexShader(gcn::Stage::VsVs, pipe.sh.spiShaderPgmVs,
+                                 pipe.context, vsPrimType, viewPorts);
   }
 
-  if (true) {
-    addShader(pipe.sh.spiShaderPgmPs, shader::gcn::Stage::Ps);
-  } else {
+  auto pixelShader =
+      cacheTag.getPixelShader(pipe.sh.spiShaderPgmPs, pipe.context, viewPorts);
+
+  shaders[Cache::getStageIndex(VK_SHADER_STAGE_VERTEX_BIT)] =
+      vertexShader.handle;
+  shaders[Cache::getStageIndex(VK_SHADER_STAGE_FRAGMENT_BIT)] =
+      pixelShader.handle;
+
+  if (pixelShader.handle == nullptr) {
     shaders[Cache::getStageIndex(VK_SHADER_STAGE_FRAGMENT_BIT)] =
         getFillRedFragShader(*cacheTag.getCache());
   }
   if (pipe.uConfig.vgtPrimitiveType == gnm::PrimitiveType::RectList) {
     shaders[Cache::getStageIndex(VK_SHADER_STAGE_GEOMETRY_BIT)] =
         getPrimTypeRectGeomShader(*cacheTag.getCache());
@@ -906,18 +403,24 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
     vertexCount = indexBuffer.indexCount;
   }
 
-  auto commandBuffer = pipe.scheduler.getCommandBuffer();
-
   VkRenderingInfo renderInfo{
       .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
       .renderArea = gnm::toVkRect2D(pipe.context.paScScreenScissor),
       .layerCount = 1,
       .colorAttachmentCount = renderTargets,
       .pColorAttachments = colorAttachments,
-      .pDepthAttachment = 
&depthAttachment, - // .pStencilAttachment = &stencilAttachment, + .pDepthAttachment = + depthAccess != Access::None ? &depthAttachment : nullptr, + .pStencilAttachment = + stencilAccess != Access::None ? &stencilAttachment : nullptr, }; + cacheTag.buildDescriptors(descriptorSets[0]); + + pipe.scheduler.afterSubmit([cacheTag = std::move(cacheTag)] {}); + + auto commandBuffer = pipe.scheduler.getCommandBuffer(); + vkCmdBeginRendering(commandBuffer, &renderInfo); vkCmdSetRasterizerDiscardEnable(commandBuffer, VK_FALSE); @@ -991,57 +494,6 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, vk::CmdBindShadersEXT(commandBuffer, stages.size(), stages.data(), shaders); - shaderResources.buildMemoryTable(*memoryTable); - - for (auto &sampler : shaderResources.samplerResources) { - uint32_t index = &sampler - shaderResources.samplerResources.data(); - - VkDescriptorImageInfo samplerInfo{.sampler = sampler.handle}; - - VkWriteDescriptorSet writeDescSet{ - .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, - .dstSet = descriptorSets[0], - .dstBinding = Cache::getDescriptorBinding(VK_DESCRIPTOR_TYPE_SAMPLER), - .dstArrayElement = index, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER, - .pImageInfo = &samplerInfo, - }; - - vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr); - } - - for (auto &imageResources : shaderResources.imageResources) { - auto dim = (&imageResources - shaderResources.imageResources) + 1; - for (auto &image : imageResources) { - uint32_t index = &image - imageResources.data(); - - VkDescriptorImageInfo imageInfo{ - .imageView = image.handle, - .imageLayout = VK_IMAGE_LAYOUT_GENERAL, - }; - - VkWriteDescriptorSet writeDescSet{ - .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, - .dstSet = descriptorSets[0], - .dstBinding = static_cast(Cache::getDescriptorBinding( - VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, dim)), - .dstArrayElement = index, - .descriptorCount = 1, - .descriptorType = 
VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, - .pImageInfo = &imageInfo, - }; - - vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr); - } - } - - for (auto &mtConfig : memoryTableConfigSlots) { - auto config = descriptorBuffers[mtConfig.bufferIndex]; - config[mtConfig.configIndex] = - shaderResources.getResourceSlot(mtConfig.resourceSlot); - } - if (indexBuffer.handle != VK_NULL_HANDLE) { vkCmdBindIndexBuffer(commandBuffer, indexBuffer.handle, indexBuffer.offset, gnm::toVkIndexType(indexBuffer.indexType)); @@ -1054,10 +506,19 @@ void amdgpu::draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex, vkCmdEndRendering(commandBuffer); pipe.scheduler.submit(); - pipe.scheduler.then([=, cacheTag = std::move(cacheTag), - shaderResources = std::move(shaderResources)] {}); } +// void amdgpu::dispatch(Scheduler &sched, +// amdgpu::Registers::ComputeConfig &computeConfig, int +// vmId, std::uint32_t groupCountX, std::uint32_t +// groupCountY, std::uint32_t groupCountZ) { + +// vkCmdDispatch(sched.getCommandBuffer(), groupCountX, groupCountY, +// groupCountZ); + +// sched.submit(); +// } + static void transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image, VkImageLayout oldLayout, VkImageLayout newLayout, @@ -1115,10 +576,10 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image, void amdgpu::flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer, VkExtent2D targetExtent, std::uint64_t address, - VkImageView target, VkExtent2D imageExtent, - FlipType type, TileMode tileMode, gnm::DataFormat dfmt, + VkImageView target, VkExtent2D imageExtent, FlipType type, + TileMode tileMode, gnm::DataFormat dfmt, gnm::NumericFormat nfmt) { - ImageViewKey framebuffer{}; + ImageKey framebuffer{}; framebuffer.readAddress = address; framebuffer.type = gnm::TextureType::Dim2D; framebuffer.dfmt = dfmt; @@ -1181,7 +642,8 @@ void amdgpu::flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer, vkCmdBeginRendering(commandBuffer, &renderInfo); - 
cacheTag.getDevice()->flipPipeline.bind(cacheTag.getScheduler(), type, imageView.handle, sampler.handle); + cacheTag.getDevice()->flipPipeline.bind(cacheTag.getScheduler(), type, + imageView.handle, sampler.handle); vkCmdSetViewportWithCount(commandBuffer, 1, viewPorts); vkCmdSetScissorWithCount(commandBuffer, 1, viewPortScissors); diff --git a/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp index efdba029c..b720fe504 100644 --- a/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp +++ b/rpcsx-gpu2/lib/amdgpu-tiler/src/tiler_vulkan.cpp @@ -1,9 +1,9 @@ #include "amdgpu/tiler_vulkan.hpp" #include "Scheduler.hpp" #include "amdgpu/tiler.hpp" -#include #include #include +#include #include #include @@ -72,11 +72,11 @@ struct TilerShader { }; struct amdgpu::GpuTiler::Impl { + static constexpr auto kDescriptorSetCount = 32; TilerDecriptorSetLayout descriptorSetLayout; - std::mutex descriptorMtx; - VkDescriptorSet descriptorSets[32]{}; + rx::ConcurrentBitPool descriptorSetPool; + VkDescriptorSet descriptorSets[kDescriptorSetCount]{}; VkDescriptorPool descriptorPool; - std::uint32_t inUseDescriptorSets = 0; vk::Buffer configData; TilerShader detilerLinear{descriptorSetLayout, spirv_detilerLinear_comp}; @@ -156,20 +156,10 @@ struct amdgpu::GpuTiler::Impl { vk::context->allocator); } - std::uint32_t allocateDescriptorSlot() { - std::lock_guard lock(descriptorMtx); - - auto result = std::countl_one(inUseDescriptorSets); - rx::dieIf(result >= std::size(descriptorSets), - "out of tiler descriptor sets"); - inUseDescriptorSets |= (1 << result); - - return result; - } + std::uint32_t allocateDescriptorSlot() { return descriptorSetPool.acquire(); } void releaseDescriptorSlot(std::uint32_t slot) { - std::lock_guard lock(descriptorMtx); - inUseDescriptorSets &= ~(1u << slot); + descriptorSetPool.release(slot); } }; diff --git a/rpcsx-gpu2/lib/gcn-shader/include/shader/Evaluator.hpp 
b/rpcsx-gpu2/lib/gcn-shader/include/shader/Evaluator.hpp index 58e8226c3..9f7748f26 100644 --- a/rpcsx-gpu2/lib/gcn-shader/include/shader/Evaluator.hpp +++ b/rpcsx-gpu2/lib/gcn-shader/include/shader/Evaluator.hpp @@ -10,6 +10,7 @@ public: virtual ~Evaluator() = default; void invalidate(ir::Value node) { values.erase(node); } + void invalidate() { values.clear(); } void setValue(ir::Value node, Value value) { values[node] = value; } Value eval(const ir::Operand &op, ir::Value type = nullptr); diff --git a/rpcsx-gpu2/lib/gnm/lib/gnm-vulkan/src/vulkan.cpp b/rpcsx-gpu2/lib/gnm/lib/gnm-vulkan/src/vulkan.cpp index 0fccdddf0..f6946b9c8 100644 --- a/rpcsx-gpu2/lib/gnm/lib/gnm-vulkan/src/vulkan.cpp +++ b/rpcsx-gpu2/lib/gnm/lib/gnm-vulkan/src/vulkan.cpp @@ -74,7 +74,7 @@ VkFormat gnm::toVkFormat(DataFormat dfmt, NumericFormat nfmt) { case kDataFormat5_6_5: switch (nfmt) { case kNumericFormatUNorm: - return VK_FORMAT_R5G6B5_UNORM_PACK16; + return VK_FORMAT_B5G6R5_UNORM_PACK16; default: break; diff --git a/rpcsx-gpu2/lib/vk/include/Scheduler.hpp b/rpcsx-gpu2/lib/vk/include/Scheduler.hpp index 391714ec4..a88521302 100644 --- a/rpcsx-gpu2/lib/vk/include/Scheduler.hpp +++ b/rpcsx-gpu2/lib/vk/include/Scheduler.hpp @@ -98,12 +98,12 @@ public: wait(); - for (auto &&fn : mAfterSubmitTasks) { - fn(); + while (!mAfterSubmitTasks.empty()) { + auto task = std::move(mAfterSubmitTasks.back()); + mAfterSubmitTasks.pop_back(); + std::move(task)(); } - mAfterSubmitTasks.clear(); - std::vector> taskList; for (auto it = mTasks.begin(); it != mTasks.end(); it = mTasks.erase(it)) { diff --git a/rx/include/rx/ConcurrentBitPool.hpp b/rx/include/rx/ConcurrentBitPool.hpp new file mode 100644 index 000000000..c93c85311 --- /dev/null +++ b/rx/include/rx/ConcurrentBitPool.hpp @@ -0,0 +1,72 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace rx { +namespace detail { +template auto pickBitSetBaseType() { + if constexpr (Count <= 8) { + return std::array, 1>{}; + } else if 
constexpr (Count <= 16) {
+    return std::array, 1>{};
+  } else if constexpr (Count <= 32) {
+    return std::array, 1>{};
+  } else {
+    return std::array, (Count + 63) / 64>();
+  }
+}
+
+template
+using ConcurrentBitPoolBaseType = decltype(pickBitSetBaseType());
+} // namespace detail
+
+template
+class ConcurrentBitPool {
+  detail::ConcurrentBitPoolBaseType mStorage{{}};
+  using WordType = std::remove_cvref_t::value_type;
+  static constexpr auto kWordBitWidth = sizeof(WordType) * 8;
+
+public:
+  ElementType acquire() {
+    while (true) {
+      for (auto &node : mStorage) {
+        auto mask = node.load(std::memory_order::acquire);
+
+        auto bitIndex = std::countr_one(mask);
+        if (bitIndex >= kWordBitWidth) {
+          continue;
+        }
+
+        auto pattern = static_cast(1) << bitIndex;
+
+        if (!node.compare_exchange_strong(mask, mask | pattern,
+                                          std::memory_order::acq_rel,
+                                          std::memory_order::relaxed)) {
+          continue;
+        }
+
+        auto wordIndex = &node - mStorage.data();
+        return static_cast(kWordBitWidth * wordIndex + bitIndex);
+      }
+    }
+  }
+
+  void release(ElementType index) {
+    auto rawIndex = static_cast(index);
+    auto bitIndex = rawIndex % kWordBitWidth;
+    auto wordIndex = rawIndex / kWordBitWidth;
+
+    WordType pattern = static_cast(1) << bitIndex;
+    WordType mask = pattern;
+
+    while (!mStorage[wordIndex].compare_exchange_weak(
+        mask, mask & ~pattern, std::memory_order::release,
+        std::memory_order::acquire)) {
+    }
+  }
+};
+} // namespace rx