#include "Device.hpp" #include "FlipPipeline.hpp" #include "Renderer.hpp" #include "amdgpu/tiler.hpp" #include "gnm/constants.hpp" #include "gnm/pm4.hpp" #include "rx/bits.hpp" #include "rx/die.hpp" #include "rx/mem.hpp" #include "shader/spv.hpp" #include "shaders/rdna-semantic-spirv.hpp" #include "vk.hpp" #include #include using namespace amdgpu; Device::Device() { if (!shader::spv::validate(g_rdna_semantic_spirv)) { shader::spv::dump(g_rdna_semantic_spirv, true); rx::die("builtin semantic validation failed"); } if (auto sem = shader::spv::deserialize( shaderSemanticContext, g_rdna_semantic_spirv, shaderSemanticContext.getUnknownLocation())) { auto shaderSemantic = *sem; shader::gcn::canonicalizeSemantic(shaderSemanticContext, shaderSemantic); shader::gcn::collectSemanticModuleInfo(gcnSemanticModuleInfo, shaderSemantic); gcnSemantic = shader::gcn::collectSemanticInfo(gcnSemanticModuleInfo); } else { rx::die("failed to deserialize builtin semantics\n"); } for (int index = 0; auto &cache : caches) { cache.vmId = index++; } for (auto &pipe : graphicsPipes) { pipe.device = this; } // for (auto &pipe : computePipes) { // pipe.device = this; // } } Device::~Device() { for (auto fd : dmemFd) { if (fd >= 0) { ::close(fd); } } for (auto &[pid, info] : processInfo) { if (info.vmFd >= 0) { ::close(info.vmFd); } } } void Device::mapProcess(std::int64_t pid, int vmId, const char *shmName) { auto &process = processInfo[pid]; process.vmId = vmId; auto memory = amdgpu::RemoteMemory{vmId}; std::string pidVmName = shmName; pidVmName += '-'; pidVmName += std::to_string(pid); int memoryFd = ::shm_open(pidVmName.c_str(), O_RDWR, S_IRUSR | S_IWUSR); process.vmFd = memoryFd; if (memoryFd < 0) { std::printf("failed to process %x shared memory\n", (int)pid); std::abort(); } for (auto [startAddress, endAddress, slot] : process.vmTable) { auto gpuProt = slot.prot >> 4; if (gpuProt == 0) { continue; } auto devOffset = slot.offset + startAddress - slot.baseAddress; int mapFd = memoryFd; if (slot.memoryType >= 0) { mapFd = dmemFd[slot.memoryType]; } auto mmapResult = ::mmap(memory.getPointer(startAddress), endAddress - startAddress, gpuProt, MAP_FIXED | MAP_SHARED, mapFd, devOffset); if (mmapResult == MAP_FAILED) { std::printf("failed to map process %x memory, address %lx-%lx, type %x\n", (int)pid, startAddress, endAddress, slot.memoryType); std::abort(); } handleProtectChange(vmId, startAddress, endAddress - startAddress, slot.prot); } } void Device::unmapProcess(std::int64_t pid) { auto &process = processInfo[pid]; auto startAddress = static_cast(process.vmId) << 40; auto size = static_cast(1) << 40; rx::mem::reserve(reinterpret_cast(startAddress), size); ::close(process.vmFd); process.vmFd = -1; process.vmId = -1; } void Device::protectMemory(int pid, std::uint64_t address, std::uint64_t size, int prot) { auto &process = processInfo[pid]; auto vmSlotIt = process.vmTable.queryArea(address); if (vmSlotIt == process.vmTable.end()) { std::abort(); } auto vmSlot = (*vmSlotIt).payload; process.vmTable.map(address, address + size, VmMapSlot{ .memoryType = vmSlot.memoryType, .prot = static_cast(prot), .offset = vmSlot.offset, .baseAddress = vmSlot.baseAddress, }); if (process.vmId >= 0) { auto memory = amdgpu::RemoteMemory{process.vmId}; rx::mem::protect(memory.getPointer(address), size, prot >> 4); handleProtectChange(process.vmId, address, size, prot); } } void Device::onCommandBuffer(std::int64_t pid, int cmdHeader, std::uint64_t address, std::uint64_t size) { auto &process = processInfo[pid]; if (process.vmId < 0) { 
void Device::onCommandBuffer(std::int64_t pid, int cmdHeader,
                             std::uint64_t address, std::uint64_t size) {
  auto &process = processInfo[pid];
  if (process.vmId < 0) {
    return;
  }

  auto memory = RemoteMemory{process.vmId};

  // The opcode field of the PM4 header selects the target queue: indirect
  // constant buffers feed the constant engine (CE), plain indirect buffers
  // feed the draw engine (DE).
  auto op = rx::getBits(cmdHeader, 15, 8);

  if (op == gnm::IT_INDIRECT_BUFFER_CNST) {
    graphicsPipes[0].setCeQueue(Queue::createFromRange(
        process.vmId, memory.getPointer<std::uint32_t>(address),
        size / sizeof(std::uint32_t)));
  } else if (op == gnm::IT_INDIRECT_BUFFER) {
    graphicsPipes[0].setDeQueue(
        Queue::createFromRange(process.vmId,
                               memory.getPointer<std::uint32_t>(address),
                               size / sizeof(std::uint32_t)),
        1);
  } else {
    rx::die("unimplemented command buffer %x", cmdHeader);
  }
}

bool Device::processPipes() {
  bool allProcessed = true;

  // for (auto &pipe : computePipes) {
  //   if (!pipe.processAllRings()) {
  //     allProcessed = false;
  //   }
  // }

  for (auto &pipe : graphicsPipes) {
    if (!pipe.processAllRings()) {
      allProcessed = false;
    }
  }

  return allProcessed;
}

static void transitionImageLayout(
    VkCommandBuffer commandBuffer, VkImage image, VkImageLayout oldLayout,
    VkImageLayout newLayout, const VkImageSubresourceRange &subresourceRange) {
  VkImageMemoryBarrier barrier{};
  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
  barrier.oldLayout = oldLayout;
  barrier.newLayout = newLayout;
  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.image = image;
  barrier.subresourceRange = subresourceRange;

  auto layoutToStageAccess = [](VkImageLayout layout)
      -> std::pair<VkPipelineStageFlags, VkAccessFlags> {
    switch (layout) {
    case VK_IMAGE_LAYOUT_UNDEFINED:
    case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR:
    case VK_IMAGE_LAYOUT_GENERAL:
      return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0};

    case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL:
      return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT};

    case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL:
      return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT};

    case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL:
      return {VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
              VK_ACCESS_SHADER_READ_BIT};

    case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL:
      return {VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT,
              VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
                  VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT};

    case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL:
      return {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
              VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
                  VK_ACCESS_COLOR_ATTACHMENT_READ_BIT};

    default:
      std::abort();
    }
  };

  auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout);
  auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout);

  barrier.srcAccessMask = sourceAccess;
  barrier.dstAccessMask = destinationAccess;

  vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0,
                       nullptr, 0, nullptr, 1, &barrier);
}
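// transitionImageLayout derives both halves of the barrier from the same
// layout table: the old layout supplies the source stage/access pair and the
// new layout the destination pair. A typical call, mirroring the ones in
// flip() below, moves a swapchain image into a renderable state:
//
//   transitionImageLayout(commandBuffer, swapchainImage,
//                         VK_IMAGE_LAYOUT_UNDEFINED,
//                         VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
//                         {.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
//                          .levelCount = 1,
//                          .layerCount = 1});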
bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
                  VkCommandBuffer commandBuffer, VkImage swapchainImage,
                  VkImageView swapchainImageView, VkFence fence) {
  auto &pipe = graphicsPipes[0];
  auto &scheduler = pipe.scheduler;
  auto &process = processInfo[pid];
  if (process.vmId < 0) {
    return false;
  }

  if (bufferIndex < 0) {
    // Negative index: update the flip bookkeeping without presenting.
    bridge->flipBuffer[process.vmId] = bufferIndex;
    bridge->flipArg[process.vmId] = arg;
    bridge->flipCount[process.vmId] = bridge->flipCount[process.vmId] + 1;
    return false;
  }

  auto &buffer = process.buffers[bufferIndex];
  auto &bufferAttr = process.bufferAttributes[buffer.attrId];

  gnm::DataFormat dfmt;
  gnm::NumericFormat nfmt;
  auto flipType = FlipType::Alt;

  switch (bufferAttr.pixelFormat) {
  case 0x80000000:
    dfmt = gnm::kDataFormat8_8_8_8;
    nfmt = gnm::kNumericFormatSrgb;
    break;

  case 0x80002200:
    dfmt = gnm::kDataFormat8_8_8_8;
    nfmt = gnm::kNumericFormatSrgb;
    flipType = FlipType::Std;
    break;

  case 0x88740000:
  case 0x88060000:
    dfmt = gnm::kDataFormat2_10_10_10;
    nfmt = gnm::kNumericFormatSNorm;
    break;

  case 0xc1060000:
    dfmt = gnm::kDataFormat16_16_16_16;
    nfmt = gnm::kNumericFormatFloat;
    break;

  default:
    rx::die("unimplemented color buffer format %x", bufferAttr.pixelFormat);
  }

  // std::printf("displaying buffer %lx\n", buffer.address);

  VkCommandBufferBeginInfo beginInfo{};
  beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
  beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

  vkBeginCommandBuffer(commandBuffer, &beginInfo);
  auto cacheTag = getCacheTag(process.vmId, scheduler);

  transitionImageLayout(commandBuffer, swapchainImage,
                        VK_IMAGE_LAYOUT_UNDEFINED,
                        VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
                        {
                            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                            .levelCount = 1,
                            .layerCount = 1,
                        });

  amdgpu::flip(
      cacheTag, commandBuffer, vk::context->swapchainExtent, buffer.address,
      swapchainImageView, {bufferAttr.width, bufferAttr.height}, flipType,
      getDefaultTileModes()[bufferAttr.tilingMode == 1 ? 10 : 8], dfmt, nfmt);

  transitionImageLayout(commandBuffer, swapchainImage,
                        VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
                        VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
                        {
                            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                            .levelCount = 1,
                            .layerCount = 1,
                        });

  auto submitCompleteTask = scheduler.createExternalSubmit();

  {
    vkEndCommandBuffer(commandBuffer);

    VkSemaphoreSubmitInfo signalSemSubmitInfos[] = {
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = vk::context->renderCompleteSemaphore,
            .value = 1,
            .stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
        },
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = scheduler.getSemaphoreHandle(),
            .value = submitCompleteTask,
            .stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
        },
    };

    VkSemaphoreSubmitInfo waitSemSubmitInfos[] = {
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = vk::context->presentCompleteSemaphore,
            .value = 1,
            .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
        },
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = scheduler.getSemaphoreHandle(),
            .value = submitCompleteTask - 1,
            .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
        },
    };

    VkCommandBufferSubmitInfo cmdBufferSubmitInfo{
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
        .commandBuffer = commandBuffer,
    };

    VkSubmitInfo2 submitInfo{
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
        // Only the first wait entry is submitted; the scheduler wait declared
        // above is not consumed here.
        .waitSemaphoreInfoCount = 1,
        .pWaitSemaphoreInfos = waitSemSubmitInfos,
        .commandBufferInfoCount = 1,
        .pCommandBufferInfos = &cmdBufferSubmitInfo,
        .signalSemaphoreInfoCount = 2,
        .pSignalSemaphoreInfos = signalSemSubmitInfos,
    };

    vkQueueSubmit2(vk::context->presentQueue, 1, &submitInfo, fence);
    vkQueueWaitIdle(vk::context->presentQueue);
  }

  scheduler.then([=, this, cacheTag = std::move(cacheTag)] {
    bridge->flipBuffer[process.vmId] = bufferIndex;
    bridge->flipArg[process.vmId] = arg;
    bridge->flipCount[process.vmId] = bridge->flipCount[process.vmId] + 1;

    auto mem = RemoteMemory{process.vmId};
    // Element type assumed to match the bridge's buffer-in-use array.
    auto bufferInUse = mem.getPointer<std::uint64_t>(
        bridge->bufferInUseAddress[process.vmId]);
    if (bufferInUse != nullptr) {
      bufferInUse[bufferIndex] = 0;
    }
  });

  return true;
}
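// The submit in flip() above chains the present onto the scheduler's
// semaphore: waiting on `submitCompleteTask - 1` appears to order the flip
// after all previously scheduled GPU work, and signaling `submitCompleteTask`
// lets the scheduler.then() callback run the bridge bookkeeping once the
// present finishes. The exact timeline-semaphore semantics live in Scheduler;
// this is an inference from createExternalSubmit() and the wait/signal values
// used here.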
void Device::mapMemory(std::int64_t pid, std::uint64_t address,
                       std::uint64_t size, int memoryType, int dmemIndex,
                       int prot, std::int64_t offset) {
  auto &process = processInfo[pid];

  process.vmTable.map(address, address + size,
                      VmMapSlot{
                          .memoryType = memoryType >= 0 ? dmemIndex : -1,
                          .prot = prot,
                          .offset = offset,
                          .baseAddress = address,
                      });

  if (process.vmId < 0) {
    return;
  }

  auto memory = amdgpu::RemoteMemory{process.vmId};

  int mapFd = process.vmFd;

  if (memoryType >= 0) {
    mapFd = dmemFd[dmemIndex];
  }

  auto mmapResult = ::mmap(memory.getPointer(address), size, prot >> 4,
                           MAP_FIXED | MAP_SHARED, mapFd, offset);

  if (mmapResult == MAP_FAILED) {
    rx::die("failed to map process %x memory, address %lx-%lx, type %x",
            (int)pid, address, address + size, memoryType);
  }

  handleProtectChange(process.vmId, address, size, prot);
}

void Device::registerBuffer(std::int64_t pid, bridge::CmdBuffer buffer) {
  auto &process = processInfo[pid];

  if (buffer.attrId >= 10 || buffer.index >= 10) {
    rx::die("out of buffers %u, %u", buffer.attrId, buffer.index);
  }

  process.buffers[buffer.index] = buffer;
}

void Device::registerBufferAttribute(std::int64_t pid,
                                     bridge::CmdBufferAttribute attr) {
  auto &process = processInfo[pid];

  if (attr.attrId >= 10) {
    rx::die("out of buffer attributes %u", attr.attrId);
  }

  process.bufferAttributes[attr.attrId] = attr;
}

void Device::handleProtectChange(int vmId, std::uint64_t address,
                                 std::uint64_t size, int prot) {}
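// Rough lifecycle sketch, for orientation only (hypothetical caller; the
// argument values are placeholders, not part of this file's API surface):
//
//   Device device;
//   device.mapProcess(pid, /*vmId=*/0, "shm-name");  // attach guest VM slice
//   device.mapMemory(pid, address, size, memoryType, dmemIndex, prot, offset);
//   device.onCommandBuffer(pid, cmdHeader, cmdAddress, cmdSize);
//   while (!device.processPipes()) {
//     // drain graphics rings until every queue reports fully processed
//   }
//   device.flip(pid, bufferIndex, flipArg, commandBuffer, swapchainImage,
//               swapchainImageView, fence);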