From 6a9924ebd1f1adeea8a3326f9fe7bd80dd05a3cc Mon Sep 17 00:00:00 2001 From: DH Date: Mon, 2 Sep 2024 20:24:16 +0300 Subject: [PATCH] gpu: add multiprocess buffer commits --- .../bridge/include/amdgpu/bridge/bridge.hpp | 73 ++++++++++++++++--- .../device/include/amdgpu/device/device.hpp | 4 +- hw/amdgpu/device/src/device.cpp | 54 +++++++------- rpcsx-gpu/main.cpp | 33 ++++++++- rpcsx-os/iodev/dce.cpp | 38 +++------- 5 files changed, 136 insertions(+), 66 deletions(-) diff --git a/hw/amdgpu/bridge/include/amdgpu/bridge/bridge.hpp b/hw/amdgpu/bridge/include/amdgpu/bridge/bridge.hpp index a05bf1e5a..5d23effed 100644 --- a/hw/amdgpu/bridge/include/amdgpu/bridge/bridge.hpp +++ b/hw/amdgpu/bridge/include/amdgpu/bridge/bridge.hpp @@ -48,6 +48,8 @@ enum class CommandId : std::uint32_t { MapMemory, MapProcess, UnmapProcess, + RegisterBuffer, + RegisterBufferAttribute, }; struct CmdMemoryProt { @@ -64,13 +66,25 @@ struct CmdCommandBuffer { std::uint32_t pid; }; -struct CmdBuffer { - std::uint32_t width; - std::uint32_t height; - std::uint32_t pitch; - std::uint64_t address; +struct CmdBufferAttribute { + std::uint32_t pid; + std::uint8_t attrId; + std::uint8_t submit; + std::uint64_t canary; std::uint32_t pixelFormat; std::uint32_t tilingMode; + std::uint32_t pitch; + std::uint32_t width; + std::uint32_t height; +}; + +struct CmdBuffer { + std::uint64_t canary; + std::uint32_t index; + std::uint32_t attrId; + std::uint64_t address; + std::uint64_t address2; + std::uint32_t pid; }; struct CmdFlip { @@ -118,14 +132,14 @@ struct BridgeHeader { std::uint64_t vmSize; char vmName[32]; PadState kbPadState; - volatile std::uint32_t flipBuffer; - volatile std::uint64_t flipArg; - volatile std::uint64_t flipCount; - volatile std::uint64_t bufferInUseAddress; + volatile std::uint32_t flipBuffer[6]; + volatile std::uint64_t flipArg[6]; + volatile std::uint64_t flipCount[6]; + volatile std::uint64_t bufferInUseAddress[6]; std::uint32_t commandBufferCount; std::uint32_t bufferCount; 
CmdCommandBuffer commandBuffers[32]; - CmdBuffer buffers[10]; + // CmdBuffer buffers[10]; // orbis::shared_mutex cacheCommandMtx; // orbis::shared_cv cacheCommandCv; std::atomic cacheCommands[6][4]; @@ -144,6 +158,7 @@ struct Command { CmdMemoryProt memoryProt; CmdCommandBuffer commandBuffer; CmdBuffer buffer; + CmdBufferAttribute bufferAttribute; CmdFlip flip; CmdMapMemory mapMemory; CmdMapProcess mapProcess; @@ -181,6 +196,23 @@ struct BridgePusher { {pid, memoryType, dmemIndex, address, size, prot, offset}); } + void sendRegisterBuffer(std::uint32_t pid, std::uint64_t canary, + std::uint32_t index, std::uint32_t attrId, + std::uint64_t address, std::uint64_t address2) { + sendCommand(CommandId::RegisterBuffer, + {pid, canary, index, attrId, address, address2}); + } + void sendRegisterBufferAttribute(std::uint32_t pid, std::uint8_t attrId, + std::uint8_t submit, std::uint64_t canary, + std::uint32_t pixelFormat, + std::uint32_t tilingMode, + std::uint32_t pitch, std::uint32_t width, + std::uint32_t height) { + sendCommand(CommandId::RegisterBufferAttribute, + {pid, attrId, submit, canary, pixelFormat, tilingMode, pitch, + width, height}); + } + void sendCommandBuffer(std::uint32_t pid, std::uint64_t queue, std::uint64_t address, std::uint64_t size) { sendCommand(CommandId::CommandBuffer, {pid, queue, address, size}); @@ -335,6 +367,27 @@ private: case CommandId::UnmapProcess: result.unmapProcess.pid = args[0]; return result; + + case CommandId::RegisterBufferAttribute: + result.bufferAttribute.pid = args[0]; + result.bufferAttribute.attrId = args[1]; + result.bufferAttribute.submit = args[2]; + result.bufferAttribute.canary = args[3]; + result.bufferAttribute.pixelFormat = args[4]; + result.bufferAttribute.tilingMode = args[5]; + result.bufferAttribute.pitch = args[6]; + result.bufferAttribute.width = args[7]; + result.bufferAttribute.height = args[8]; + return result; + + case CommandId::RegisterBuffer: + result.buffer.pid = args[0]; + result.buffer.canary = 
args[1]; + result.buffer.index = args[2]; + result.buffer.attrId = args[3]; + result.buffer.address = args[4]; + result.buffer.address2 = args[5]; + return result; } __builtin_trap(); diff --git a/hw/amdgpu/device/include/amdgpu/device/device.hpp b/hw/amdgpu/device/include/amdgpu/device/device.hpp index 2749cc86b..909ff2fc1 100644 --- a/hw/amdgpu/device/include/amdgpu/device/device.hpp +++ b/hw/amdgpu/device/include/amdgpu/device/device.hpp @@ -1311,7 +1311,9 @@ struct AmdgpuDevice { TaskChain &initTaskChain, std::uint32_t bufferIndex, std::uint64_t arg, VkImage targetImage, VkExtent2D targetExtent, VkSemaphore waitSemaphore, - VkSemaphore signalSemaphore, VkFence fence); + VkSemaphore signalSemaphore, VkFence fence, + bridge::CmdBuffer *buffers, + bridge::CmdBufferAttribute *bufferAttributes); AmdgpuDevice(amdgpu::bridge::BridgeHeader *bridge); diff --git a/hw/amdgpu/device/src/device.cpp b/hw/amdgpu/device/src/device.cpp index d692ce018..024796135 100644 --- a/hw/amdgpu/device/src/device.cpp +++ b/hw/amdgpu/device/src/device.cpp @@ -2666,9 +2666,10 @@ struct CacheLine { std::mutex writeBackTableMtx; util::MemoryTableWithPayload> writeBackTable; - CacheLine(std::uint64_t areaAddress, std::uint64_t areaSize) - : areaAddress(areaAddress), areaSize(areaSize) { + CacheLine(RemoteMemory memory, std::uint64_t areaAddress, std::uint64_t areaSize) + :memory(memory), areaAddress(areaAddress), areaSize(areaSize) { memoryOverlay = new MemoryOverlay(); + memoryOverlay->memory = memory; hostSyncTable.map(areaAddress, areaAddress + areaSize, {1, memoryOverlay}); } @@ -3631,8 +3632,7 @@ private: assert(address >= area.beginAddress && address + size < area.endAddress); it = cacheLines.emplace_hint( it, std::piecewise_construct, std::tuple{area.beginAddress}, - std::tuple{area.beginAddress, area.endAddress}); - it->second.memory = memory; + std::tuple{memory, area.beginAddress, area.endAddress}); } return it->second; @@ -4817,8 +4817,8 @@ void 
amdgpu::device::AmdgpuDevice::handleProtectMemory(RemoteMemory memory, protStr = "unknown"; break; } - std::fprintf(stderr, "Allocated area at %zx, size %lx, prot %s\n", address, - size, protStr); + std::fprintf(stderr, "Allocated area at %zx, size %lx, prot %s, vmid %u\n", address, + size, protStr, memory.vmId); } else { memoryAreaTable[memory.vmId].unmap(beginPage, endPage); std::fprintf(stderr, "Unmapped area at %zx, size %lx\n", address, size); @@ -4888,12 +4888,13 @@ bool amdgpu::device::AmdgpuDevice::handleFlip( RemoteMemory memory, VkQueue queue, VkCommandBuffer cmdBuffer, TaskChain &taskChain, std::uint32_t bufferIndex, std::uint64_t arg, VkImage targetImage, VkExtent2D targetExtent, VkSemaphore waitSemaphore, - VkSemaphore signalSemaphore, VkFence fence) { + VkSemaphore signalSemaphore, VkFence fence, bridge::CmdBuffer *buffers, + bridge::CmdBufferAttribute *bufferAttributes) { if (bufferIndex == ~static_cast(0)) { - g_bridge->flipBuffer = bufferIndex; - g_bridge->flipArg = arg; - g_bridge->flipCount = g_bridge->flipCount + 1; + g_bridge->flipBuffer[memory.vmId] = bufferIndex; + g_bridge->flipArg[memory.vmId] = arg; + g_bridge->flipCount[memory.vmId] = g_bridge->flipCount[memory.vmId] + 1; // black surface, ignore for now return false; @@ -4904,9 +4905,10 @@ bool amdgpu::device::AmdgpuDevice::handleFlip( // std::fprintf(stderr, "host visible memory: "); // getHostVisibleMemory().dump(); - auto buffer = g_bridge->buffers[bufferIndex]; + auto buffer = buffers[bufferIndex]; + auto bufferAttr = bufferAttributes[buffer.attrId]; - if (buffer.pitch == 0 || buffer.height == 0 || buffer.address == 0) { + if (bufferAttr.pitch == 0 || bufferAttr.height == 0 || buffer.address == 0) { std::printf("Attempt to flip unallocated buffer\n"); return false; } @@ -4925,7 +4927,7 @@ bool amdgpu::device::AmdgpuDevice::handleFlip( SurfaceFormat surfFormat; TextureChannelType channelType; - switch (buffer.pixelFormat) { + switch (bufferAttr.pixelFormat) { case 0x80000000: // bgra 
surfFormat = kSurfaceFormat8_8_8_8; @@ -4946,18 +4948,18 @@ bool amdgpu::device::AmdgpuDevice::handleFlip( default: util::unreachable("unimplemented color buffer format %x", - buffer.pixelFormat); + bufferAttr.pixelFormat); } auto &cache = getCache(memory); auto tag = cache.createTag(); - imageRef = - cache.getImage(tag, taskChain, buffer.address, surfFormat, channelType, - buffer.tilingMode == 1 ? kTileModeDisplay_2dThin - : kTileModeDisplay_LinearAligned, - buffer.width, buffer.height, 1, buffer.pitch, 4, 5, 6, 7, - shader::AccessOp::Load); + imageRef = cache.getImage( + tag, taskChain, buffer.address, surfFormat, channelType, + bufferAttr.tilingMode == 1 ? kTileModeDisplay_2dThin + : kTileModeDisplay_LinearAligned, + bufferAttr.width, bufferAttr.height, 1, bufferAttr.pitch, 4, 5, 6, 7, + shader::AccessOp::Load); auto initTask = taskChain.getLastTaskId(); @@ -4972,8 +4974,8 @@ bool amdgpu::device::AmdgpuDevice::handleFlip( .baseArrayLayer = 0, .layerCount = 1}, .srcOffsets = {{}, - {static_cast(buffer.width), - static_cast(buffer.height), 1}}, + {static_cast(bufferAttr.width), + static_cast(bufferAttr.height), 1}}, .dstSubresource = {.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, .mipLevel = 0, .baseArrayLayer = 0, @@ -5064,11 +5066,11 @@ bool amdgpu::device::AmdgpuDevice::handleFlip( taskChain.add(submitCompleteTask, [=] { imageRef->unlock(tag); - g_bridge->flipBuffer = bufferIndex; - g_bridge->flipArg = arg; - g_bridge->flipCount = g_bridge->flipCount + 1; + g_bridge->flipBuffer[memory.vmId] = bufferIndex; + g_bridge->flipArg[memory.vmId] = arg; + g_bridge->flipCount[memory.vmId] = g_bridge->flipCount[memory.vmId] + 1; auto bufferInUse = - memory.getPointer(g_bridge->bufferInUseAddress); + memory.getPointer(g_bridge->bufferInUseAddress[memory.vmId]); if (bufferInUse != nullptr) { bufferInUse[bufferIndex] = 0; } diff --git a/rpcsx-gpu/main.cpp b/rpcsx-gpu/main.cpp index a6f943bc3..87e129d8f 100644 --- a/rpcsx-gpu/main.cpp +++ b/rpcsx-gpu/main.cpp @@ -763,6 +763,8 @@ 
int main(int argc, const char *argv[]) { struct ProcessInfo { int vmId = -1; int vmFd = -1; + amdgpu::bridge::CmdBufferAttribute bufferAttributes[10]; + amdgpu::bridge::CmdBuffer buffers[10]; rx::MemoryTableWithPayload vmTable; }; @@ -1033,7 +1035,8 @@ int main(int argc, const char *argv[]) { rx::mem::protect(memory.getPointer(cmd.memoryProt.address), cmd.memoryProt.size, cmd.memoryProt.prot >> 4); device.handleProtectMemory(memory, cmd.memoryProt.address, - cmd.memoryProt.size, cmd.memoryProt.prot); + cmd.memoryProt.size, + cmd.memoryProt.prot); } break; } @@ -1075,7 +1078,8 @@ int main(int argc, const char *argv[]) { *flipTaskChain[imageIndex].get(), cmd.flip.bufferIndex, cmd.flip.arg, swapchainImages[imageIndex], swapchainExtent, presentCompleteSemaphore, renderCompleteSemaphore, - inFlightFences[imageIndex])) { + inFlightFences[imageIndex], process.buffers, + process.bufferAttributes)) { VkPresentInfoKHR presentInfo{ .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, .waitSemaphoreCount = 1, @@ -1096,7 +1100,8 @@ int main(int argc, const char *argv[]) { } case amdgpu::bridge::CommandId::MapProcess: { - mapProcess(cmd.mapProcess.pid, cmd.mapProcess.vmId, processInfo[cmd.mapProcess.pid]); + mapProcess(cmd.mapProcess.pid, cmd.mapProcess.vmId, + processInfo[cmd.mapProcess.pid]); break; } case amdgpu::bridge::CommandId::UnmapProcess: { @@ -1147,6 +1152,28 @@ int main(int argc, const char *argv[]) { break; } + case amdgpu::bridge::CommandId::RegisterBuffer: { + auto &process = processInfo[cmd.buffer.pid]; + + if (cmd.buffer.attrId >= 10 || cmd.buffer.index >= 10) { + std::abort(); + } + + process.buffers[cmd.buffer.index] = cmd.buffer; + break; + } + + case amdgpu::bridge::CommandId::RegisterBufferAttribute: { + auto &process = processInfo[cmd.bufferAttribute.pid]; + if (cmd.bufferAttribute.attrId >= 10) { + std::abort(); + } + + process.bufferAttributes[cmd.bufferAttribute.attrId] = + cmd.bufferAttribute; + break; + } + default: util::unreachable("Unexpected command id 
%u\n", (unsigned)cmd.id); } diff --git a/rpcsx-os/iodev/dce.cpp b/rpcsx-os/iodev/dce.cpp index 6526b9b68..680d27c1c 100644 --- a/rpcsx-os/iodev/dce.cpp +++ b/rpcsx-os/iodev/dce.cpp @@ -252,8 +252,6 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request, void *argp, orbis::Thread *thread) { auto device = static_cast(file->device.get()); - std::lock_guard lock(device->mtx); - if (request == 0xc0308203) { // returns: // PERM @@ -298,11 +296,11 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request, FlipControlStatus flipStatus{}; // TODO: lock bridge header - flipStatus.flipArg = rx::bridge.header->flipArg; - flipStatus.count = rx::bridge.header->flipCount; + flipStatus.flipArg = rx::bridge.header->flipArg[thread->tproc->vmId]; + flipStatus.count = rx::bridge.header->flipCount[thread->tproc->vmId]; flipStatus.processTime = 0; // TODO flipStatus.tsc = 0; // TODO - flipStatus.currentBuffer = rx::bridge.header->flipBuffer; + flipStatus.currentBuffer = rx::bridge.header->flipBuffer[thread->tproc->vmId]; flipStatus.flipPendingNum0 = 0; // TODO flipStatus.gcQueueNum = 0; // TODO flipStatus.flipPendingNum1 = 0; // TODO @@ -332,8 +330,8 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request, *(std::uint64_t *)args->size = kDceControlMemorySize; // size } else if (args->id == 31) { if ((std::uint64_t)args->ptr == 0xc) { - rx::bridge.header->bufferInUseAddress = args->size; - } else { + rx::bridge.header->bufferInUseAddress[thread->tproc->vmId] = args->size; + } else if ((std::uint64_t)args->ptr != 1) { ORBIS_LOG_ERROR("buffer in use", args->ptr, args->size); thread->where(); } @@ -361,20 +359,8 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request, ORBIS_LOG_ERROR("dce: RegisterBuffer", args->canary, args->index, args->address, args->address2); - if (args->index >= std::size(rx::bridge.header->buffers)) { - // TODO - ORBIS_LOG_FATAL("dce: out of buffers!", args->index); - return 
orbis::ErrorCode::NOMEM; - } - - // TODO: lock bridge header - rx::bridge.header->buffers[args->index] = { - .width = device->bufferAttributes.width, - .height = device->bufferAttributes.height, - .pitch = device->bufferAttributes.pitch, - .address = args->address, - .pixelFormat = device->bufferAttributes.pixelFormat, - .tilingMode = device->bufferAttributes.tilingMode}; + rx::bridge.sendRegisterBuffer(thread->tproc->pid, args->canary, args->index, + args->attrid, args->address, args->address2); return {}; } @@ -387,11 +373,11 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request, args->unk4_zero, args->unk5_zero, args->options, args->reserved1, args->reserved2); - device->bufferAttributes.pixelFormat = args->pixelFormat; - device->bufferAttributes.tilingMode = args->tilingMode; - device->bufferAttributes.pitch = args->pitch; - device->bufferAttributes.width = args->width; - device->bufferAttributes.height = args->height; + rx::bridge.sendRegisterBufferAttribute( + thread->tproc->pid, args->attrid, args->submit, args->canary, + args->pixelFormat, args->tilingMode, args->pitch, args->width, + args->height); + return {}; }