gpu: split command and gfx queues

2026-02-16 04:35:58 +01:00 · 2024-10-19 15:44:32 +03:00 · 2024-10-19 15:44:32 +03:00 · deb09371bc
parent 5ce8d5147a
commit deb09371bc
13 changed files with 190 additions and 108 deletions
--- a/rpcsx/gpu/Cache.cpp
+++ b/rpcsx/gpu/Cache.cpp
@ -469,13 +469,16 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
  barrier.image = image;
  barrier.subresourceRange = subresourceRange;

-  auto layoutToStageAccess = [](VkImageLayout layout)
-      -> std::pair<VkPipelineStageFlags, VkAccessFlags> {
+  auto layoutToStageAccess =
+      [](VkImageLayout layout,
+         bool isSrc) -> std::pair<VkPipelineStageFlags, VkAccessFlags> {
    switch (layout) {
    case VK_IMAGE_LAYOUT_UNDEFINED:
    case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR:
    case VK_IMAGE_LAYOUT_GENERAL:
-      return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0};
+      return {isSrc ? VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT
+                    : VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+              0};

    case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL:
      return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT};
@ -501,8 +504,9 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
    }
  };

-  auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout);
-  auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout);
+  auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout, true);
+  auto [destinationStage, destinationAccess] =
+      layoutToStageAccess(newLayout, false);

  barrier.srcAccessMask = sourceAccess;
  barrier.dstAccessMask = destinationAccess;
--- a/rpcsx/gpu/Device.cpp
+++ b/rpcsx/gpu/Device.cpp
@ -236,6 +236,14 @@ Device::Device() : vkContext(createVkContext(this)) {
    }
  });

+  commandPipe.device = this;
+  commandPipe.ring = {
+      .base = cmdRing,
+      .size = std::size(cmdRing),
+      .rptr = cmdRing,
+      .wptr = cmdRing,
+  };
+
  for (auto &pipe : computePipes) {
    pipe.device = this;
  }
@ -244,7 +252,7 @@ Device::Device() : vkContext(createVkContext(this)) {
    graphicsPipes[i].setDeQueue(
        Ring{
            .base = mainGfxRings[i],
-            .size = sizeof(mainGfxRings[i]) / sizeof(mainGfxRings[i][0]),
+            .size = std::size(mainGfxRings[i]),
            .rptr = mainGfxRings[i],
            .wptr = mainGfxRings[i],
        },
@ -621,6 +629,8 @@ void Device::onCommandBuffer(std::uint32_t pid, int cmdHeader,
 bool Device::processPipes() {
  bool allProcessed = true;

+  commandPipe.processAllRings();
+
  for (auto &pipe : computePipes) {
    if (!pipe.processAllRings()) {
      allProcessed = false;
@ -649,13 +659,16 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
  barrier.image = image;
  barrier.subresourceRange = subresourceRange;

-  auto layoutToStageAccess = [](VkImageLayout layout)
-      -> std::pair<VkPipelineStageFlags, VkAccessFlags> {
+  auto layoutToStageAccess =
+      [](VkImageLayout layout,
+         bool isSrc) -> std::pair<VkPipelineStageFlags, VkAccessFlags> {
    switch (layout) {
    case VK_IMAGE_LAYOUT_UNDEFINED:
    case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR:
    case VK_IMAGE_LAYOUT_GENERAL:
-      return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0};
+      return {isSrc ? VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT
+                    : VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+              0};

    case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL:
      return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT};
@ -681,8 +694,9 @@ transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
    }
  };

-  auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout);
-  auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout);
+  auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout, true);
+  auto [destinationStage, destinationAccess] =
+      layoutToStageAccess(newLayout, false);

  barrier.srcAccessMask = sourceAccess;
  barrier.dstAccessMask = destinationAccess;
@ -783,13 +797,13 @@ bool Device::flip(std::uint32_t pid, int bufferIndex, std::uint64_t arg,
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = vk::context->presentCompleteSemaphore,
            .value = 1,
-            .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+            .stageMask = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
        },
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = scheduler.getSemaphoreHandle(),
            .value = submitCompleteTask - 1,
-            .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+            .stageMask = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
        },
    };

--- a/rpcsx/gpu/Device.hpp
+++ b/rpcsx/gpu/Device.hpp
@ -78,6 +78,7 @@ struct Device : orbis::RcBase, DeviceContext {
  GpuTiler tiler;
  GraphicsPipe graphicsPipes[kGfxPipeCount]{0, 1};
  ComputePipe computePipes[kComputePipeCount]{0, 1, 2, 3, 4, 5, 6, 7};
+  CommandPipe commandPipe;
  FlipPipeline flipPipeline;

  orbis::shared_mutex writeCommandMtx;
@ -94,6 +95,7 @@ struct Device : orbis::RcBase, DeviceContext {
  };

  std::uint32_t mainGfxRings[kGfxPipeCount][0x4000 / sizeof(std::uint32_t)];
+  std::uint32_t cmdRing[0x4000 / sizeof(std::uint32_t)];

  Device();
  ~Device();
--- a/rpcsx/gpu/DeviceCtl.cpp
+++ b/rpcsx/gpu/DeviceCtl.cpp
@ -54,12 +54,6 @@ void DeviceCtl::submitGfxCommand(int gfxPipe, int vmId,
 void DeviceCtl::submitSwitchBuffer(int gfxPipe) {
  mDevice->submitGfxCommand(gfxPipe, createPm4Packet(gnm::IT_SWITCH_BUFFER, 0));
 }
-void DeviceCtl::submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex,
-                           std::uint64_t flipArg) {
-  mDevice->submitGfxCommand(gfxPipe, createPm4Packet(IT_FLIP, bufferIndex,
-                                                     flipArg & 0xffff'ffff,
-                                                     flipArg >> 32, pid));
-}

 orbis::ErrorCode DeviceCtl::submitWriteEop(int gfxPipe, std::uint32_t waitMode,
                                           std::uint64_t eopValue) {
@ -107,40 +101,47 @@ orbis::ErrorCode DeviceCtl::submitFlipOnEop(int gfxPipe, std::uint32_t pid,

  return {};
 }
+void DeviceCtl::submitFlip(std::uint32_t pid, int bufferIndex,
+                           std::uint64_t flipArg) { // queues an IT_FLIP packet on the device's command ring
+  mDevice->submitCommand(mDevice->commandPipe.ring,
+                         createPm4Packet(IT_FLIP, bufferIndex,
+                                         flipArg & 0xffff'ffff, flipArg >> 32, // 64-bit flipArg travels as low/high 32-bit halves
+                                         pid));
+}

-void DeviceCtl::submitMapMemory(int gfxPipe, std::uint32_t pid,
-                                std::uint64_t address, std::uint64_t size,
-                                int memoryType, int dmemIndex, int prot,
-                                std::int64_t offset) {
-  mDevice->submitGfxCommand(
-      gfxPipe,
+void DeviceCtl::submitMapMemory(std::uint32_t pid, std::uint64_t address,
+                                std::uint64_t size, int memoryType,
+                                int dmemIndex, int prot, std::int64_t offset) {
+  mDevice->submitCommand(
+      mDevice->commandPipe.ring,
      createPm4Packet(IT_MAP_MEMORY, pid, address & 0xffff'ffff, address >> 32,
                      size & 0xffff'ffff, size >> 32, memoryType, dmemIndex,
                      prot, offset & 0xffff'ffff, offset >> 32));
 }
-void DeviceCtl::submitUnmapMemory(int gfxPipe, std::uint32_t pid,
-                                  std::uint64_t address, std::uint64_t size) {
-  mDevice->submitGfxCommand(
-      gfxPipe, createPm4Packet(IT_UNMAP_MEMORY, pid, address & 0xffff'ffff,
-                               address >> 32, size & 0xffff'ffff, size >> 32));
+void DeviceCtl::submitUnmapMemory(std::uint32_t pid, std::uint64_t address,
+                                  std::uint64_t size) {
+  mDevice->submitCommand(mDevice->commandPipe.ring,
+                         createPm4Packet(IT_UNMAP_MEMORY, pid,
+                                         address & 0xffff'ffff, address >> 32,
+                                         size & 0xffff'ffff, size >> 32));
 }

-void DeviceCtl::submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId) {
-  mDevice->submitGfxCommand(gfxPipe,
-                            createPm4Packet(gnm::IT_MAP_PROCESS, pid, vmId));
+void DeviceCtl::submitMapProcess(std::uint32_t pid, int vmId) {
+  mDevice->submitCommand(mDevice->commandPipe.ring,
+                         createPm4Packet(gnm::IT_MAP_PROCESS, pid, vmId));
 }

-void DeviceCtl::submitUnmapProcess(int gfxPipe, std::uint32_t pid) {
-  mDevice->submitGfxCommand(gfxPipe, createPm4Packet(IT_UNMAP_PROCESS, pid));
+void DeviceCtl::submitUnmapProcess(std::uint32_t pid) {
+  mDevice->submitCommand(mDevice->commandPipe.ring,
+                         createPm4Packet(IT_UNMAP_PROCESS, pid));
 }

-void DeviceCtl::submitProtectMemory(int gfxPipe, std::uint32_t pid,
-                                    std::uint64_t address, std::uint64_t size,
-                                    int prot) {
-  mDevice->submitGfxCommand(
-      gfxPipe,
-      createPm4Packet(IT_PROTECT_MEMORY, pid, address & 0xffff'ffff,
-                      address >> 32, size & 0xffff'ffff, size >> 32, prot));
+void DeviceCtl::submitProtectMemory(std::uint32_t pid, std::uint64_t address,
+                                    std::uint64_t size, int prot) {
+  mDevice->submitCommand(mDevice->commandPipe.ring,
+                         createPm4Packet(IT_PROTECT_MEMORY, pid,
+                                         address & 0xffff'ffff, address >> 32,
+                                         size & 0xffff'ffff, size >> 32, prot));
 }

 void DeviceCtl::registerBuffer(std::uint32_t pid, Buffer buffer) {
--- a/rpcsx/gpu/DeviceCtl.hpp
+++ b/rpcsx/gpu/DeviceCtl.hpp
@ -28,22 +28,21 @@ public:
  void submitGfxCommand(int gfxPipe, int vmId,
                        std::span<const std::uint32_t> command);
  void submitSwitchBuffer(int gfxPipe);
-  void submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex,
-                  std::uint64_t flipArg);
  orbis::ErrorCode submitWriteEop(int gfxPipe, std::uint32_t waitMode,
                                  std::uint64_t eopValue);
  orbis::ErrorCode submitFlipOnEop(int gfxPipe, std::uint32_t pid,
                                   int bufferIndex, std::uint64_t flipArg,
                                   std::uint64_t eopValue);
-  void submitMapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address,
+  void submitFlip(std::uint32_t pid, int bufferIndex, std::uint64_t flipArg);
+  void submitMapMemory(std::uint32_t pid, std::uint64_t address,
                       std::uint64_t size, int memoryType, int dmemIndex,
                       int prot, std::int64_t offset);
-  void submitUnmapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address,
+  void submitUnmapMemory(std::uint32_t pid, std::uint64_t address,
                         std::uint64_t size);
-  void submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId);
-  void submitUnmapProcess(int gfxPipe, std::uint32_t pid);
-  void submitProtectMemory(int gfxPipe, std::uint32_t pid,
-                           std::uint64_t address, std::uint64_t size, int prot);
+  void submitMapProcess(std::uint32_t pid, int vmId);
+  void submitUnmapProcess(std::uint32_t pid);
+  void submitProtectMemory(std::uint32_t pid, std::uint64_t address,
+                           std::uint64_t size, int prot);
  void registerBuffer(std::uint32_t pid, Buffer buffer);
  void registerBufferAttribute(std::uint32_t pid, BufferAttribute attr);

--- a/rpcsx/gpu/FlipPipeline.cpp
+++ b/rpcsx/gpu/FlipPipeline.cpp
@ -161,8 +161,8 @@ FlipPipeline::FlipPipeline() {
        .pAttachments = &blendAttachmentState};

    VkDynamicState dynamicStates[] = {
-        VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT,
-        VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT,
+        VK_DYNAMIC_STATE_VIEWPORT,
+        VK_DYNAMIC_STATE_SCISSOR,
    };

    VkPipelineDynamicStateCreateInfo dynamicState{
@ -183,6 +183,7 @@ FlipPipeline::FlipPipeline() {
        {
            .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
            .pNext = &info,
+            .flags = VK_PIPELINE_CREATE_DESCRIPTOR_BUFFER_BIT_EXT,
            .stageCount = std::size(stagesStd),
            .pStages = stagesStd,
            .pVertexInputState = &vertexInputState,
@ -198,6 +199,7 @@ FlipPipeline::FlipPipeline() {
        {
            .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
            .pNext = &info,
+            .flags = VK_PIPELINE_CREATE_DESCRIPTOR_BUFFER_BIT_EXT,
            .stageCount = std::size(stagesAlt),
            .pStages = stagesAlt,
            .pVertexInputState = &vertexInputState,
--- a/rpcsx/gpu/Pipe.cpp
+++ b/rpcsx/gpu/Pipe.cpp
@ -540,13 +540,8 @@ GraphicsPipe::GraphicsPipe(int index) : scheduler(createGfxScheduler(index)) {
  // IT_WAIT_ON_AVAIL_BUFFER
  mainHandlers[gnm::IT_SWITCH_BUFFER] = &GraphicsPipe::switchBuffer;
  // IT_SET_RESOURCES
-  mainHandlers[gnm::IT_MAP_PROCESS] = &GraphicsPipe::mapProcess;
  mainHandlers[gnm::IT_MAP_QUEUES] = &GraphicsPipe::mapQueues;
  mainHandlers[gnm::IT_UNMAP_QUEUES] = &GraphicsPipe::unmapQueues;
-  mainHandlers[IT_MAP_MEMORY] = &GraphicsPipe::mapMemory;
-  mainHandlers[IT_UNMAP_MEMORY] = &GraphicsPipe::unmapMemory;
-  mainHandlers[IT_PROTECT_MEMORY] = &GraphicsPipe::protectMemory;
-  mainHandlers[IT_UNMAP_PROCESS] = &GraphicsPipe::unmapProcess;
  // IT_QUERY_STATUS
  // IT_RUN_LIST
  // IT_DISPATCH_DRAW_PREAMBLE
@ -558,8 +553,6 @@ GraphicsPipe::GraphicsPipe(int index) : scheduler(createGfxScheduler(index)) {
  ceHandlers[gnm::IT_LOAD_CONST_RAM] = &GraphicsPipe::loadConstRam;
  ceHandlers[gnm::IT_WRITE_CONST_RAM] = &GraphicsPipe::writeConstRam;
  ceHandlers[gnm::IT_DUMP_CONST_RAM] = &GraphicsPipe::dumpConstRam;
-
-  mainHandlers[IT_FLIP] = &GraphicsPipe::flip;
 }

 void GraphicsPipe::setCeQueue(Ring ring) {
@ -1601,14 +1594,6 @@ bool GraphicsPipe::switchBuffer(Ring &ring) {
  return true;
 }

-bool GraphicsPipe::mapProcess(Ring &ring) {
-  auto pid = ring.rptr[1];
-  int vmId = ring.rptr[2];
-
-  device->mapProcess(pid, vmId);
-  return true;
-}
-
 bool GraphicsPipe::mapQueues(Ring &ring) {
  // FIXME: implement
  return true;
@ -1619,7 +1604,59 @@ bool GraphicsPipe::unmapQueues(Ring &ring) {
  return true;
 }

-bool GraphicsPipe::mapMemory(Ring &ring) {
+CommandPipe::CommandPipe() { // builds the opcode -> handler dispatch table
+  for (auto &handler : commandHandlers) {
+    handler = &CommandPipe::unknownPacket; // default for every opcode; unknownPacket reports it through rx::die
+  }
+
+  commandHandlers[gnm::IT_MAP_PROCESS] = &CommandPipe::mapProcess; // opcodes below are the ones this pipe accepts
+  commandHandlers[IT_MAP_MEMORY] = &CommandPipe::mapMemory;
+  commandHandlers[IT_UNMAP_MEMORY] = &CommandPipe::unmapMemory;
+  commandHandlers[IT_PROTECT_MEMORY] = &CommandPipe::protectMemory;
+  commandHandlers[IT_UNMAP_PROCESS] = &CommandPipe::unmapProcess;
+  commandHandlers[IT_FLIP] = &CommandPipe::flip;
+}
+
+void CommandPipe::processAllRings() { processRing(ring); } // forwards to processRing for the pipe's own `ring` member
+
+void CommandPipe::processRing(Ring &ring) { // consumes packets until the read pointer reaches the write pointer
+  while (ring.rptr != ring.wptr) {
+    if (ring.rptr >= ring.base + ring.size) { // read pointer is at/past the end of the buffer: wrap to the start
+      ring.rptr = ring.base;
+      continue;
+    }
+
+    auto header = *ring.rptr; // first dword of a packet is its header
+    auto type = rx::getBits(header, 31, 30); // packet type field (header bits 31..30)
+
+    if (type == 3) { // type 3: opcode-carrying packet dispatched through commandHandlers
+      auto op = rx::getBits(header, 15, 8); // 8-bit opcode field (header bits 15..8)
+      auto len = rx::getBits(header, 29, 16) + 2; // how far rptr advances for this packet: count field (bits 29..16) + 2
+
+      if (op == gnm::IT_COND_EXEC) {
+        rx::die("unimplemented COND_EXEC");
+      }
+
+      auto handler = commandHandlers[op]; // op is an 8-bit value, so the table needs a slot for every value 0..255
+      (this->*handler)(ring); // handlers read their operands relative to ring.rptr, so rptr is advanced only afterwards
+
+      ring.rptr += len; // step over the whole packet
+      continue;
+    }
+
+    if (type == 2) { // type 2: nothing to execute, skip the single header dword
+      ++ring.rptr;
+      continue;
+    }
+
+    rx::die("cmd pipe: unexpected pm4 packet type %u, ring %u, header %u, rptr "
+            "%p, wptr "
+            "%p, base %p",
+            type, ring.indirectLevel, header, ring.rptr, ring.wptr, ring.base); // any other packet type is reported via rx::die
+  }
+}
+
+void CommandPipe::mapMemory(Ring &ring) {
  auto pid = ring.rptr[1];
  auto addressLo = ring.rptr[2];
  auto addressHi = ring.rptr[3];
@ -1636,9 +1673,8 @@ bool GraphicsPipe::mapMemory(Ring &ring) {
  auto offset = offsetLo | (static_cast<std::uint64_t>(offsetHi) << 32);

  device->mapMemory(pid, address, size, memoryType, dmemIndex, prot, offset);
-  return true;
 }
-bool GraphicsPipe::unmapMemory(Ring &ring) {
+void CommandPipe::unmapMemory(Ring &ring) {
  auto pid = ring.rptr[1];
  auto addressLo = ring.rptr[2];
  auto addressHi = ring.rptr[3];
@ -1648,9 +1684,8 @@ bool GraphicsPipe::unmapMemory(Ring &ring) {
  auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
  auto size = sizeLo | (static_cast<std::uint64_t>(sizeHi) << 32);
  device->unmapMemory(pid, address, size);
-  return true;
 }
-bool GraphicsPipe::protectMemory(Ring &ring) {
+void CommandPipe::protectMemory(Ring &ring) {
  auto pid = ring.rptr[1];
  auto addressLo = ring.rptr[2];
  auto addressHi = ring.rptr[3];
@ -1661,15 +1696,19 @@ bool GraphicsPipe::protectMemory(Ring &ring) {
  auto size = sizeLo | (static_cast<std::uint64_t>(sizeHi) << 32);

  device->protectMemory(pid, address, size, prot);
-  return true;
 }
-bool GraphicsPipe::unmapProcess(Ring &ring) {
+void CommandPipe::mapProcess(Ring &ring) { // handler registered for gnm::IT_MAP_PROCESS
+  auto pid = ring.rptr[1]; // rptr[0] is the packet header; operands start at index 1
+  int vmId = ring.rptr[2]; // second operand, packed as vmId by DeviceCtl::submitMapProcess
+
+  device->mapProcess(pid, vmId);
+}
+void CommandPipe::unmapProcess(Ring &ring) {
  auto pid = ring.rptr[1];
  device->unmapProcess(pid);
-  return true;
 }

-bool GraphicsPipe::flip(Ring &ring) {
+void CommandPipe::flip(Ring &ring) {
  auto buffer = ring.rptr[1];
  auto dataLo = ring.rptr[2];
  auto dataHi = ring.rptr[3];
@ -1677,5 +1716,11 @@ bool GraphicsPipe::flip(Ring &ring) {
  auto data = dataLo | (static_cast<std::uint64_t>(dataHi) << 32);

  device->flip(pid, buffer, data);
-  return true;
+}
+
+void CommandPipe::unknownPacket(Ring &ring) { // default handler for opcodes with no registered handler
+  auto op = rx::getBits(ring.rptr[0], 15, 8); // re-extract the opcode from the packet header for the message
+
+  rx::die("unexpected command pm4 packet: %s, queue %u\n",
+          gnm::pm4OpcodeToString(op), ring.indirectLevel);
 }
--- a/rpcsx/gpu/Pipe.hpp
+++ b/rpcsx/gpu/Pipe.hpp
@ -164,19 +164,33 @@ struct GraphicsPipe {
  bool setShReg(Ring &ring);
  bool setUConfigReg(Ring &ring);
  bool setContextReg(Ring &ring);
-
-  bool unknownPacket(Ring &ring);
-
-  bool switchBuffer(Ring &ring);
-  bool mapProcess(Ring &ring);
  bool mapQueues(Ring &ring);
  bool unmapQueues(Ring &ring);
-  bool mapMemory(Ring &ring);
-  bool unmapMemory(Ring &ring);
-  bool protectMemory(Ring &ring);
-  bool unmapProcess(Ring &ring);
-  bool flip(Ring &ring);
+
+  bool unknownPacket(Ring &ring);
+  bool switchBuffer(Ring &ring);

  std::uint32_t *getMmRegister(std::uint32_t dwAddress);
 };
+
+struct CommandPipe { // consumes process/memory-mapping and flip packets from a single ring
+  Ring ring; // the ring drained by processAllRings()
+  Device *device; // not set by the constructor; must be assigned before any packet is processed
+  using CommandHandler = void (CommandPipe::*)(Ring &);
+  CommandHandler commandHandlers[256]; // indexed by the 8-bit packet opcode, so all 256 values need a slot
+
+  CommandPipe(); // fills commandHandlers: unknownPacket by default, then the supported opcodes
+
+  void processAllRings(); // processes `ring`
+  void processRing(Ring &ring); // dispatches packets until rptr == wptr
+
+  void mapProcess(Ring &ring); // handlers below read their operands from ring.rptr[1...]
+  void mapMemory(Ring &ring);
+  void unmapMemory(Ring &ring);
+  void protectMemory(Ring &ring);
+  void unmapProcess(Ring &ring);
+  void flip(Ring &ring);
+
+  void unknownPacket(Ring &ring); // default handler: reports the opcode via rx::die
+};
 } // namespace amdgpu
--- a/rpcsx/gpu/lib/vk/include/Scheduler.hpp
+++ b/rpcsx/gpu/lib/vk/include/Scheduler.hpp
@ -180,7 +180,7 @@ private:
        auto value = mSemaphore.getCounterValue();
        auto endIt = mTasks.upper_bound(value);

-        for (auto it = mTasks.begin(); it != mTasks.end();
+        for (auto it = mTasks.begin(); it != endIt;
             it = mTasks.erase(it)) {
          taskList.reserve(taskList.size() + it->second.size());
          for (auto &&fn : it->second) {
--- a/rpcsx/iodev/dce.cpp
+++ b/rpcsx/iodev/dce.cpp
@ -100,8 +100,8 @@ struct ResolutionStatus {
  std::uint32_t heigth;
  std::uint32_t paneWidth;
  std::uint32_t paneHeight;
-  std::uint32_t refreshHz;        // float
-  std::uint32_t screenSizeInInch; // float
+  float refreshHz;        // float
+  float screenSizeInInch; // float
  std::byte padding[20];
 };

@ -316,8 +316,8 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
      status->heigth = 1080;
      status->paneWidth = 1920;
      status->paneHeight = 1080;
-      status->refreshHz = 0x426fc28f;        //( 59.94)
-      status->screenSizeInInch = 0x42500000; //( 52.00)
+      status->refreshHz = 59.94f;
+      status->screenSizeInInch = 52.0f;
    } else if (args->id == 9) {
      ORBIS_LOG_NOTICE("dce: FlipControl allocate", args->id, args->arg2,
                       args->ptr, args->size);
@ -393,8 +393,8 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
    auto args = reinterpret_cast<FlipRequestArgs *>(argp);

    if (args->eop_nz == 0) {
-      gpu.submitFlip(thread->tproc->gfxRing, thread->tproc->pid,
-                     args->displayBufferIndex, args->flipArg);
+      gpu.submitFlip(thread->tproc->pid, args->displayBufferIndex,
+                     args->flipArg);
    } else if (args->eop_nz == 1) {
      std::uint64_t eopValue = args->canary;
      eopValue ^= 0xff00'0000;
@ -473,7 +473,7 @@ void DceDevice::initializeProcess(orbis::Process *process) {
    std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
    {
      auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
-      gpu.submitMapProcess(process->gfxRing, process->pid, vmId);
+      gpu.submitMapProcess(process->pid, vmId);
      process->vmId = vmId;
    }

--- a/rpcsx/iodev/dmem.cpp
+++ b/rpcsx/iodev/dmem.cpp
@ -69,10 +69,9 @@ orbis::ErrorCode DmemDevice::mmap(void **address, std::uint64_t len,
  }

  if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
-    gpu.submitMapMemory(orbis::g_currentThread->tproc->gfxRing,
-                         orbis::g_currentThread->tproc->pid,
-                         reinterpret_cast<std::uint64_t>(result), len,
-                         memoryType, index, prot, directMemoryStart);
+    gpu.submitMapMemory(orbis::g_currentThread->tproc->pid,
+                        reinterpret_cast<std::uint64_t>(result), len,
+                        memoryType, index, prot, directMemoryStart);
  }

  *address = result;
--- a/rpcsx/iodev/gc.cpp
+++ b/rpcsx/iodev/gc.cpp
@ -143,13 +143,16 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
  }

  case 0xc0048116: { // submit done?
+    break;
+  }
+
+  case 0xc0048117:
    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
      gpu.waitForIdle();
    } else {
      return orbis::ErrorCode::BUSY;
    }
    break;
-  }

  case 0xc00c8110: {
    // set gs ring sizes
--- a/rpcsx/vm.cpp
+++ b/rpcsx/vm.cpp
@ -933,8 +933,8 @@ void *vm::map(void *addr, std::uint64_t len, std::int32_t prot,
  if (auto thr = orbis::g_currentThread) {
    std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
-      gpu.submitMapMemory(thr->tproc->gfxRing, thr->tproc->pid, address, len,
-                          -1, -1, prot, address - kMinAddress);
+      gpu.submitMapMemory(thr->tproc->pid, address, len, -1, -1, prot,
+                          address - kMinAddress);
    }
  }

@ -990,11 +990,11 @@ bool vm::unmap(void *addr, std::uint64_t size) {
  if (auto thr = orbis::g_currentThread) {
    std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
-      gpu.submitUnmapMemory(thr->tproc->gfxRing, thr->tproc->pid, address,
-                            size);
+      gpu.submitUnmapMemory(thr->tproc->pid, address, size);
    }
  } else {
-    std::println(stderr, "ignoring mapping {:x}-{:x}", address, address + size);
+    std::println(stderr, "ignoring unmapping {:x}-{:x}", address,
+                 address + size);
  }
  return rx::mem::unmap(addr, size);
 }
@ -1032,10 +1032,9 @@ bool vm::protect(void *addr, std::uint64_t size, std::int32_t prot) {
    std::println("memory prot: {:x}", prot);
    std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
-      gpu.submitProtectMemory(thr->tproc->gfxRing, thr->tproc->pid, address,
-                               size, prot);
+      gpu.submitProtectMemory(thr->tproc->pid, address, size, prot);
    }
-  } else {
+  } else if (prot >> 4) {
    std::println(stderr, "ignoring mapping {:x}-{:x}", address, address + size);
  }
  return ::mprotect(addr, size, prot & kMapProtCpuAll) == 0;