// rpcsx/rpcsx-gpu/Device.cpp

#include "Device.hpp"
#include "FlipPipeline.hpp"
#include "Renderer.hpp"
#include "amdgpu/tiler.hpp"
#include "gnm/constants.hpp"
#include "gnm/pm4.hpp"
#include "rx/bits.hpp"
#include "rx/die.hpp"
#include "rx/mem.hpp"
#include "shader/spv.hpp"
#include "shaders/rdna-semantic-spirv.hpp"
#include "vk.hpp"
#include <fcntl.h>
#include <sys/mman.h>
using namespace amdgpu;
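
// Validates and deserializes the builtin RDNA semantic SPIR-V module used to
// translate GCN shaders, then assigns a VM id to each cache slot and points
// every graphics pipe back at this device.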
Device::Device() {
  if (!shader::spv::validate(g_rdna_semantic_spirv)) {
    shader::spv::dump(g_rdna_semantic_spirv, true);
    rx::die("builtin semantic validation failed");
  }

  if (auto sem = shader::spv::deserialize(
          shaderSemanticContext, g_rdna_semantic_spirv,
          shaderSemanticContext.getUnknownLocation())) {
    auto shaderSemantic = *sem;
    shader::gcn::canonicalizeSemantic(shaderSemanticContext, shaderSemantic);
    shader::gcn::collectSemanticModuleInfo(gcnSemanticModuleInfo,
                                           shaderSemantic);
    gcnSemantic = shader::gcn::collectSemanticInfo(gcnSemanticModuleInfo);
  } else {
    rx::die("failed to deserialize builtin semantics");
  }

  for (int index = 0; auto &cache : caches) {
    cache.vmId = index++;
  }

  for (auto &pipe : graphicsPipes) {
    pipe.device = this;
  }

  // for (auto &pipe : computePipes) {
  //   pipe.device = this;
  // }
}
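
// Releases the file descriptors owned by the device: one per direct-memory
// (dmem) pool and one per mapped process.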
Device::~Device() {
  for (auto fd : dmemFd) {
    if (fd >= 0) {
      ::close(fd);
    }
  }

  for (auto &[pid, info] : processInfo) {
    if (info.vmFd >= 0) {
      ::close(info.vmFd);
    }
  }
}
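
// Attaches a process to a GPU VM slot: opens its per-pid shared memory
// object and replays every mapping recorded in its VM table into this
// device's address space.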
void Device::mapProcess(std::int64_t pid, int vmId, const char *shmName) {
  auto &process = processInfo[pid];
  process.vmId = vmId;

  auto memory = amdgpu::RemoteMemory{vmId};

  std::string pidVmName = shmName;
  pidVmName += '-';
  pidVmName += std::to_string(pid);
  int memoryFd = ::shm_open(pidVmName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
  process.vmFd = memoryFd;

  if (memoryFd < 0) {
    std::printf("failed to open shared memory of process %x\n", (int)pid);
    std::abort();
  }

  for (auto [startAddress, endAddress, slot] : process.vmTable) {
    auto gpuProt = slot.prot >> 4; // upper bits of prot hold the GPU protection
    if (gpuProt == 0) {
      continue;
    }

    auto devOffset = slot.offset + startAddress - slot.baseAddress;
    int mapFd = memoryFd;

    if (slot.memoryType >= 0) {
      mapFd = dmemFd[slot.memoryType];
    }

    auto mmapResult =
        ::mmap(memory.getPointer(startAddress), endAddress - startAddress,
               gpuProt, MAP_FIXED | MAP_SHARED, mapFd, devOffset);

    if (mmapResult == MAP_FAILED) {
      std::printf("failed to map process %x memory, address %lx-%lx, type %x\n",
                  (int)pid, startAddress, endAddress, slot.memoryType);
      std::abort();
    }

    handleProtectChange(vmId, startAddress, endAddress - startAddress,
                        slot.prot);
  }
}
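
// Detaches a process from its VM slot: re-reserves the slot's entire 1 TiB
// window (vmId << 40), replacing the process's mappings, and closes the
// shared-memory fd.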
void Device::unmapProcess(std::int64_t pid) {
  auto &process = processInfo[pid];
  auto startAddress = static_cast<std::uint64_t>(process.vmId) << 40;
  auto size = static_cast<std::uint64_t>(1) << 40;
  rx::mem::reserve(reinterpret_cast<void *>(startAddress), size);

  ::close(process.vmFd);
  process.vmFd = -1;
  process.vmId = -1;
}
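
// Updates the protection of a mapped range: records the new protection in the
// process VM table and, if the process is attached to a VM slot, applies the
// GPU half of the protection bits to the live mapping.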
void Device::protectMemory(int pid, std::uint64_t address, std::uint64_t size,
                           int prot) {
  auto &process = processInfo[pid];

  auto vmSlotIt = process.vmTable.queryArea(address);
  if (vmSlotIt == process.vmTable.end()) {
    std::abort();
  }

  auto vmSlot = (*vmSlotIt).payload;

  process.vmTable.map(address, address + size,
                      VmMapSlot{
                          .memoryType = vmSlot.memoryType,
                          .prot = static_cast<int>(prot),
                          .offset = vmSlot.offset,
                          .baseAddress = vmSlot.baseAddress,
                      });

  if (process.vmId >= 0) {
    auto memory = amdgpu::RemoteMemory{process.vmId};
    rx::mem::protect(memory.getPointer(address), size, prot >> 4);
    handleProtectChange(process.vmId, address, size, prot);
  }
}
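
// Routes an incoming command buffer to graphics pipe 0: constant-engine
// indirect buffers feed the CE queue, ordinary indirect buffers feed the DE
// queue; any other PM4 opcode is fatal.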
void Device::onCommandBuffer(std::int64_t pid, int cmdHeader,
                             std::uint64_t address, std::uint64_t size) {
  auto &process = processInfo[pid];
  if (process.vmId < 0) {
    return;
  }

  auto memory = RemoteMemory{process.vmId};

  auto op = rx::getBits(cmdHeader, 15, 8);

  if (op == gnm::IT_INDIRECT_BUFFER_CNST) {
    graphicsPipes[0].setCeQueue(Queue::createFromRange(
        process.vmId, memory.getPointer<std::uint32_t>(address),
        size / sizeof(std::uint32_t)));
  } else if (op == gnm::IT_INDIRECT_BUFFER) {
    graphicsPipes[0].setDeQueue(
        Queue::createFromRange(process.vmId,
                               memory.getPointer<std::uint32_t>(address),
                               size / sizeof(std::uint32_t)),
        1);
  } else {
    rx::die("unimplemented command buffer %x", cmdHeader);
  }
}
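
// Pumps every pipe once; returns true only if all rings were fully drained,
// so the caller knows whether another pass is required.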
bool Device::processPipes() {
  bool allProcessed = true;

  // for (auto &pipe : computePipes) {
  //   if (!pipe.processAllRings()) {
  //     allProcessed = false;
  //   }
  // }

  for (auto &pipe : graphicsPipes) {
    if (!pipe.processAllRings()) {
      allProcessed = false;
    }
  }

  return allProcessed;
}
static void
transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
                      VkImageLayout oldLayout, VkImageLayout newLayout,
                      const VkImageSubresourceRange &subresourceRange) {
  VkImageMemoryBarrier barrier{};
  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
  barrier.oldLayout = oldLayout;
  barrier.newLayout = newLayout;
  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.image = image;
  barrier.subresourceRange = subresourceRange;

  auto layoutToStageAccess = [](VkImageLayout layout)
      -> std::pair<VkPipelineStageFlags, VkAccessFlags> {
    switch (layout) {
    case VK_IMAGE_LAYOUT_UNDEFINED:
    case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR:
    case VK_IMAGE_LAYOUT_GENERAL:
      return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0};

    case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL:
      return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT};

    case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL:
      return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT};

    case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL:
      return {VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
              VK_ACCESS_SHADER_READ_BIT};

    case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL:
      return {VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT,
              VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
                  VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT};

    case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL:
      return {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
              VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
                  VK_ACCESS_COLOR_ATTACHMENT_READ_BIT};

    default:
      std::abort();
    }
  };

  auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout);
  auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout);

  barrier.srcAccessMask = sourceAccess;
  barrier.dstAccessMask = destinationAccess;

  vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0,
                       nullptr, 0, nullptr, 1, &barrier);
}
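
// Presents a frame: translates the guest buffer attributes to a gnm format,
// records the flip into the swapchain image between two layout transitions,
// submits on the present queue, and, once the scheduler signals completion,
// publishes the new flip state to the bridge.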
bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
                  VkCommandBuffer commandBuffer, VkImage swapchainImage,
                  VkImageView swapchainImageView, VkFence fence) {
  auto &pipe = graphicsPipes[0];
  auto &scheduler = pipe.scheduler;
  auto &process = processInfo[pid];
  if (process.vmId < 0) {
    return false;
  }

  if (bufferIndex < 0) {
    bridge->flipBuffer[process.vmId] = bufferIndex;
    bridge->flipArg[process.vmId] = arg;
    bridge->flipCount[process.vmId] = bridge->flipCount[process.vmId] + 1;
    return false;
  }

  auto &buffer = process.buffers[bufferIndex];
  auto &bufferAttr = process.bufferAttributes[buffer.attrId];

  gnm::DataFormat dfmt;
  gnm::NumericFormat nfmt;
  auto flipType = FlipType::Alt;

  switch (bufferAttr.pixelFormat) {
  case 0x80000000:
    dfmt = gnm::kDataFormat8_8_8_8;
    nfmt = gnm::kNumericFormatSrgb;
    break;

  case 0x80002200:
    dfmt = gnm::kDataFormat8_8_8_8;
    nfmt = gnm::kNumericFormatSrgb;
    flipType = FlipType::Std;
    break;

  case 0x88740000:
  case 0x88060000:
    dfmt = gnm::kDataFormat2_10_10_10;
    nfmt = gnm::kNumericFormatSNorm;
    break;

  case 0xc1060000:
    dfmt = gnm::kDataFormat16_16_16_16;
    nfmt = gnm::kNumericFormatFloat;
    break;

  default:
    rx::die("unimplemented color buffer format %x", bufferAttr.pixelFormat);
  }

  // std::printf("displaying buffer %lx\n", buffer.address);

  VkCommandBufferBeginInfo beginInfo{};
  beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
  beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

  vkBeginCommandBuffer(commandBuffer, &beginInfo);
  auto cacheTag = getCacheTag(process.vmId, scheduler);

  transitionImageLayout(commandBuffer, swapchainImage,
                        VK_IMAGE_LAYOUT_UNDEFINED,
                        VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
                        {
                            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                            .levelCount = 1,
                            .layerCount = 1,
                        });

  amdgpu::flip(
      cacheTag, commandBuffer, vk::context->swapchainExtent, buffer.address,
      swapchainImageView, {bufferAttr.width, bufferAttr.height}, flipType,
      getDefaultTileModes()[bufferAttr.tilingMode == 1 ? 10 : 8], dfmt, nfmt);

  transitionImageLayout(commandBuffer, swapchainImage,
                        VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
                        VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
                        {
                            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                            .levelCount = 1,
                            .layerCount = 1,
                        });

  auto submitCompleteTask = scheduler.createExternalSubmit();

  {
    vkEndCommandBuffer(commandBuffer);

    VkSemaphoreSubmitInfo signalSemSubmitInfos[] = {
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = vk::context->renderCompleteSemaphore,
            .value = 1,
            .stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
        },
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = scheduler.getSemaphoreHandle(),
            .value = submitCompleteTask,
            .stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
        },
    };

    VkSemaphoreSubmitInfo waitSemSubmitInfos[] = {
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = vk::context->presentCompleteSemaphore,
            .value = 1,
            .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
        },
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = scheduler.getSemaphoreHandle(),
            .value = submitCompleteTask - 1,
            .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
        },
    };

    VkCommandBufferSubmitInfo cmdBufferSubmitInfo{
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
        .commandBuffer = commandBuffer,
    };

    VkSubmitInfo2 submitInfo{
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
        .waitSemaphoreInfoCount = 1,
        .pWaitSemaphoreInfos = waitSemSubmitInfos,
        .commandBufferInfoCount = 1,
        .pCommandBufferInfos = &cmdBufferSubmitInfo,
        .signalSemaphoreInfoCount = 2,
        .pSignalSemaphoreInfos = signalSemSubmitInfos,
    };

    vkQueueSubmit2(vk::context->presentQueue, 1, &submitInfo, fence);
    vkQueueWaitIdle(vk::context->presentQueue);
  }

  scheduler.then([=, this, cacheTag = std::move(cacheTag)] {
    bridge->flipBuffer[process.vmId] = bufferIndex;
    bridge->flipArg[process.vmId] = arg;
    bridge->flipCount[process.vmId] = bridge->flipCount[process.vmId] + 1;

    auto mem = RemoteMemory{process.vmId};
    auto bufferInUse =
        mem.getPointer<std::uint64_t>(bridge->bufferInUseAddress[process.vmId]);
    if (bufferInUse != nullptr) {
      bufferInUse[bufferIndex] = 0;
    }
  });

  return true;
}
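
// Records a new mapping in the process VM table and, when the process is
// attached to a VM slot, materializes it immediately with mmap; the backing
// fd is either the per-process shared memory or one of the dmem pools.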
void Device::mapMemory(std::int64_t pid, std::uint64_t address,
                       std::uint64_t size, int memoryType, int dmemIndex,
                       int prot, std::int64_t offset) {
  auto &process = processInfo[pid];

  process.vmTable.map(address, address + size,
                      VmMapSlot{
                          .memoryType = memoryType >= 0 ? dmemIndex : -1,
                          .prot = prot,
                          .offset = offset,
                          .baseAddress = address,
                      });

  if (process.vmId < 0) {
    return;
  }

  auto memory = amdgpu::RemoteMemory{process.vmId};

  int mapFd = process.vmFd;

  if (memoryType >= 0) {
    mapFd = dmemFd[dmemIndex];
  }

  auto mmapResult = ::mmap(memory.getPointer(address), size, prot >> 4,
                           MAP_FIXED | MAP_SHARED, mapFd, offset);

  if (mmapResult == MAP_FAILED) {
    rx::die("failed to map process %x memory, address %lx-%lx, type %x",
            (int)pid, address, address + size, memoryType);
  }

  handleProtectChange(process.vmId, address, size, prot);
}
void Device::registerBuffer(std::int64_t pid, bridge::CmdBuffer buffer) {
  auto &process = processInfo[pid];

  if (buffer.attrId >= 10 || buffer.index >= 10) {
    rx::die("out of buffers %u, %u", buffer.attrId, buffer.index);
  }

  process.buffers[buffer.index] = buffer;
}
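
// Stores the attribute block (pixel format, size, tiling) that display
// buffers reference by attrId.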
void Device::registerBufferAttribute(std::int64_t pid,
                                     bridge::CmdBufferAttribute attr) {
  auto &process = processInfo[pid];

  if (attr.attrId >= 10) {
    rx::die("out of buffer attributes %u", attr.attrId);
  }

  process.bufferAttributes[attr.attrId] = attr;
}
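
// Hook invoked whenever the GPU-visible protection of a range changes;
// currently a no-op.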
void Device::handleProtectChange(int vmId, std::uint64_t address,
                                 std::uint64_t size, int prot) {}