From 4fe857485c5508b49b0b9d600477c4c6f2240b71 Mon Sep 17 00:00:00 2001
From: DH <dh.rpcs3@gmail.com>
Date: Tue, 15 Oct 2024 18:35:17 +0300
Subject: [PATCH] gpu: implement compute queue

---
 orbis-kernel/include/orbis/KernelContext.hpp |   1 +
 rpcsx/gpu/Device.cpp                         |   8 +-
 rpcsx/gpu/Device.hpp                         |   2 +-
 rpcsx/gpu/DeviceCtl.cpp                      |  44 +
 rpcsx/gpu/DeviceCtl.hpp                      |   9 +
 rpcsx/gpu/Pipe.cpp                           | 902 ++++++++++++-------
 rpcsx/gpu/Pipe.hpp                           | 161 ++--
 rpcsx/gpu/Registers.hpp                      |  29 +-
 rpcsx/iodev/dce.cpp                          |  44 +-
 rpcsx/iodev/dce.hpp                          |  25 +
 rpcsx/iodev/gc.cpp                           |  89 +-
 rpcsx/main.cpp                               |   5 +-
 12 files changed, 836 insertions(+), 483 deletions(-)
 create mode 100644 rpcsx/iodev/dce.hpp
diff --git a/orbis-kernel/include/orbis/KernelContext.hpp b/orbis-kernel/include/orbis/KernelContext.hpp
index b6b80689f..76d5cdc01 100644
--- a/orbis-kernel/include/orbis/KernelContext.hpp
+++ b/orbis-kernel/include/orbis/KernelContext.hpp
@@ -181,6 +181,7 @@ public:
   Ref<RcBase> blockpoolDevice;
   shared_mutex gpuDeviceMtx;
   Ref<RcBase> gpuDevice;
+  Ref<RcBase> dceDevice;
   uint sdkVersion{};
   uint fwSdkVersion{};
   uint safeMode{};
diff --git a/rpcsx/gpu/Device.cpp b/rpcsx/gpu/Device.cpp
index 64af24036..8efee25c5 100644
--- a/rpcsx/gpu/Device.cpp
+++ b/rpcsx/gpu/Device.cpp
@@ -236,7 +236,7 @@ Device::Device() : vkContext(createVkContext(this)) {
 
   for (int i = 0; i < kGfxPipeCount; ++i) {
     graphicsPipes[i].setDeQueue(
-        Queue{
+        Ring{
             .base = mainGfxRings[i],
             .size = sizeof(mainGfxRings[i]) / sizeof(mainGfxRings[i][0]),
             .rptr = mainGfxRings[i],
@@ -474,7 +474,7 @@ void Device::start() {
   }
 }
 
-void Device::submitCommand(Queue &ring,
+void Device::submitCommand(Ring &ring,
                            std::span<const std::uint32_t> command) {
   std::scoped_lock lock(writeCommandMtx);
   if (ring.wptr + command.size() > ring.base + ring.size) {
@@ -599,12 +599,12 @@ void Device::onCommandBuffer(std::uint32_t pid, int cmdHeader,
   auto op = rx::getBits(cmdHeader, 15, 8);
 
   if (op == gnm::IT_INDIRECT_BUFFER_CNST) {
-    graphicsPipes[0].setCeQueue(Queue::createFromRange(
+    graphicsPipes[0].setCeQueue(Ring::createFromRange(
         process.vmId, memory.getPointer<std::uint32_t>(address),
         size / sizeof(std::uint32_t)));
   } else if (op == gnm::IT_INDIRECT_BUFFER) {
     graphicsPipes[0].setDeQueue(
-        Queue::createFromRange(process.vmId,
+        Ring::createFromRange(process.vmId,
                                memory.getPointer<std::uint32_t>(address),
                                size / sizeof(std::uint32_t)),
         1);
diff --git a/rpcsx/gpu/Device.hpp b/rpcsx/gpu/Device.hpp
index 89417c6ae..7bc4979d0 100644
--- a/rpcsx/gpu/Device.hpp
+++ b/rpcsx/gpu/Device.hpp
@@ -112,7 +112,7 @@ struct Device : orbis::RcBase, DeviceContext {
     return caches[vmId].createComputeTag(scheduler);
   }
 
-  void submitCommand(Queue &ring, std::span<const std::uint32_t> command);
+  void submitCommand(Ring &ring, std::span<const std::uint32_t> command);
   void submitGfxCommand(int gfxPipe, std::span<const std::uint32_t> command);
 
   void mapProcess(std::uint32_t pid, int vmId);
diff --git a/rpcsx/gpu/DeviceCtl.cpp b/rpcsx/gpu/DeviceCtl.cpp
index 26f527370..1c3c9de6a 100644
--- a/rpcsx/gpu/DeviceCtl.cpp
+++ b/rpcsx/gpu/DeviceCtl.cpp
@@ -117,5 +117,49 @@ void DeviceCtl::registerBufferAttribute(std::uint32_t pid,
   process.bufferAttributes[attr.attrId] = attr;
 }
 
+// Maps the ring at `ringBaseAddress` in VM `vmId` onto a compute pipe queue.
+void DeviceCtl::mapComputeQueue(int vmId, std::uint32_t meId,
+                                std::uint32_t pipeId, std::uint32_t queueId,
+                                std::uint32_t vqueueId,
+                                orbis::uint64_t ringBaseAddress,
+                                orbis::uint64_t readPtrAddress,
+                                orbis::uint64_t doorbell,
+                                orbis::uint64_t ringSize) {
+  if (meId != 1) {
+    rx::die("unexpected ME %d", meId);
+  }
+  auto &pipe = mDevice->computePipes[pipeId];
+  auto lock = pipe.lockQueue(queueId);
+  auto memory = RemoteMemory{vmId};
+  auto base = memory.getPointer<std::uint32_t>(ringBaseAddress);
+  pipe.mapQueue(queueId,
+                Ring{
+                    .vmId = vmId,
+                    .indirectLevel = 0,
+                    .doorbell = memory.getPointer<std::uint32_t>(doorbell),
+                    .base = base,
+                    .size = ringSize,
+                    .rptr = base,
+                    .wptr = base,
+                    .rptrReportLocation =
+                        memory.getPointer<std::uint32_t>(readPtrAddress),
+                },
+                lock);
+  // `doorbell` is a VM address: translate it like every other pointer here.
+  auto config = memory.getPointer<amdgpu::Registers::ComputeConfig>(doorbell);
+  config->state = 1;
+}
+
+void DeviceCtl::submitComputeQueue(std::uint32_t meId, std::uint32_t pipeId,
+                                   std::uint32_t queueId,
+                                   std::uint64_t offset) {
+  if (meId != 1) { // any other ME value is fatal
+    rx::die("unexpected ME %d", meId);
+  }
+  // NOTE: ComputePipe::submit takes a 32-bit offset, so `offset` narrows here.
+  auto &pipe = mDevice->computePipes[pipeId];
+  pipe.submit(queueId, offset);
+}
+
 void DeviceCtl::start() { mDevice->start(); }
 void DeviceCtl::waitForIdle() { mDevice->waitForIdle(); }
diff --git a/rpcsx/gpu/DeviceCtl.hpp b/rpcsx/gpu/DeviceCtl.hpp
index 7a370a4cf..d8ea308d6 100644
--- a/rpcsx/gpu/DeviceCtl.hpp
+++ b/rpcsx/gpu/DeviceCtl.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "DeviceContext.hpp"
+#include "orbis-config.hpp"
 #include "orbis/utils/Rc.hpp"
 #include <cstdint>
 #include <span>
@@ -40,6 +41,14 @@ public:
                            std::uint64_t address, std::uint64_t size, int prot);
   void registerBuffer(std::uint32_t pid, Buffer buffer);
   void registerBufferAttribute(std::uint32_t pid, BufferAttribute attr);
+
+  void mapComputeQueue(int vmId, std::uint32_t meId, std::uint32_t pipeId,
+                       std::uint32_t queueId, std::uint32_t vqueueId,
+                       orbis::uint64_t ringBaseAddress,
+                       orbis::uint64_t readPtrAddress, orbis::uint64_t doorbell,
+                       orbis::uint64_t ringSize);
+  void submitComputeQueue(std::uint32_t meId, std::uint32_t pipeId,
+                        std::uint32_t queueId, std::uint64_t offset);
   void start();
   void waitForIdle();
 
diff --git a/rpcsx/gpu/Pipe.cpp b/rpcsx/gpu/Pipe.cpp
index 5c625c9bf..fdf5b2f1d 100644
--- a/rpcsx/gpu/Pipe.cpp
+++ b/rpcsx/gpu/Pipe.cpp
@@ -8,6 +8,7 @@
 #include "vk.hpp"
 #include <bit>
 #include <cstdio>
+#include <mutex>
 #include <print>
 #include <rx/bits.hpp>
 #include <rx/die.hpp>
@@ -81,77 +82,336 @@ static bool compare(int cmpFn, std::uint32_t poll, std::uint32_t mask,
   return false;
 }
 
-ComputePipe::ComputePipe(int index) : scheduler(createComputeScheduler(index)) {
+ComputePipe::ComputePipe(int index)
+    : scheduler(createComputeScheduler(index)), index(index) {
   for (auto &handler : commandHandlers) {
     handler = &ComputePipe::unknownPacket;
   }
 
   commandHandlers[gnm::IT_NOP] = &ComputePipe::handleNop;
+  commandHandlers[gnm::IT_SET_SH_REG] = &ComputePipe::setShReg;
+  commandHandlers[gnm::IT_DISPATCH_DIRECT] = &ComputePipe::dispatchDirect;
+  commandHandlers[gnm::IT_DISPATCH_INDIRECT] = &ComputePipe::dispatchIndirect;
+  commandHandlers[gnm::IT_RELEASE_MEM] = &ComputePipe::releaseMem;
+  commandHandlers[gnm::IT_WAIT_REG_MEM] = &ComputePipe::waitRegMem;
+  commandHandlers[gnm::IT_WRITE_DATA] = &ComputePipe::writeData;
 }
 
 bool ComputePipe::processAllRings() {
   bool allProcessed = true;
 
-  for (auto &ring : queues) {
-    processRing(ring);
+  for (auto &queue : queues) {
+    std::lock_guard lock(queueMtx[&queue - queues]);
 
-    if (ring.rptr != ring.wptr) {
-      allProcessed = false;
-      break;
+    for (auto &ring : queue) {
+      if (!processRing(ring)) {
+        allProcessed = false;
+      }
     }
   }
 
   return allProcessed;
 }
 
-void ComputePipe::processRing(Queue &queue) {
-  while (queue.rptr != queue.wptr) {
-    if (queue.rptr >= queue.base + queue.size) {
-      queue.rptr = queue.base;
-    }
-
-    auto header = *queue.rptr;
-    auto type = rx::getBits(header, 31, 30);
-
-    if (type == 3) {
-      auto op = rx::getBits(header, 15, 8);
-      auto len = rx::getBits(header, 29, 16) + 2;
-
-      // std::fprintf(stderr, "queue %d: %s\n", queue.indirectLevel,
-      //              gnm::pm4OpcodeToString(op));
-
-      if (op == gnm::IT_COND_EXEC) {
-        rx::die("unimplemented COND_EXEC");
-      }
-
-      auto handler = commandHandlers[op];
-      if (!(this->*handler)(queue)) {
-        return;
-      }
-
-      queue.rptr += len;
-      continue;
-    }
-
-    if (type == 2) {
-      ++queue.rptr;
-      continue;
-    }
-
-    rx::die("unexpected pm4 packet type %u", type);
+// Executes PM4 packets from `ring` until rptr reaches wptr or a handler asks
+// to retry. Returns true when the ring has zero size or was drained, false
+// when a handler returned false (rptr then still points at that packet).
+bool ComputePipe::processRing(Ring &ring) {
+  if (ring.size == 0) {
+    return true;
+  }
-}
 
-bool ComputePipe::unknownPacket(Queue &queue) {
-  auto op = rx::getBits(queue.rptr[0], 15, 8);
+  // Resume from the read offset stored at the report location, if mapped.
+  if (ring.rptrReportLocation != nullptr) {
+    // FIXME: verify
+    ring.rptr = ring.base + *ring.rptrReportLocation;
+  }
 
-  rx::die("unimplemented compute pm4 packet: %s, queue %u\n",
-          gnm::pm4OpcodeToString(op), queue.indirectLevel);
+  while (ring.rptr != ring.wptr) {
+    if (ring.rptr >= ring.base + ring.size) {
+      ring.rptr = ring.base;
+      continue;
+    }
+
+    auto header = *ring.rptr;
+    auto type = rx::getBits(header, 31, 30);
+
+    if (type == 3) {
+      auto op = rx::getBits(header, 15, 8);
+      auto len = rx::getBits(header, 29, 16) + 2;
+
+      if (op == gnm::IT_COND_EXEC) {
+        rx::die("unimplemented COND_EXEC");
+      }
+
+      auto handler = commandHandlers[op];
+      if (!(this->*handler)(ring)) {
+        if (ring.rptrReportLocation != nullptr) {
+          *ring.rptrReportLocation = ring.rptr - ring.base;
+        }
+        return false;
+      }
+
+      ring.rptr += len;
+      continue;
+    }
+
+    if (type == 2) {
+      ++ring.rptr;
+      continue;
+    }
+
+    rx::die("unexpected pm4 packet type %u", type);
+  }
+
+  // Drained: publish the final read offset and return instead of spinning.
+  if (ring.rptrReportLocation != nullptr) {
+    *ring.rptrReportLocation = ring.rptr - ring.base;
+  }
 
   return true;
 }
 
-bool ComputePipe::handleNop(Queue &queue) { return true; }
+// Installs `ring` as queue `queueId`, waiting for a level-0 ring to drain.
+void ComputePipe::mapQueue(int queueId, Ring ring,
+                           std::unique_lock<orbis::shared_mutex> &lock) {
+  if (ring.indirectLevel < 0 || ring.indirectLevel > 1) {
+    rx::die("unexpected compute ring indirect level %d", ring.indirectLevel);
+  }
+  if (ring.indirectLevel == 0) {
+    waitForIdle(queueId, lock);
+  }
+
+  std::println("mapQueue: {}, {}, {}", (void *)ring.base, (void *)ring.wptr,
+               ring.size);
+  // Level 0 is stored in row 1 of `queues`, level 1 in row 0.
+  queues[1 - ring.indirectLevel][queueId] = ring;
+}
+
+// Polls until queue `queueId`'s level-0 ring has zero size or rptr == wptr.
+void ComputePipe::waitForIdle(int queueId,
+                              std::unique_lock<orbis::shared_mutex> &lock) {
+  auto &ring = queues[1][queueId];
+  while (true) {
+    if (ring.size == 0) {
+      return;
+    }
+
+    if (ring.rptr == ring.wptr) {
+      return;
+    }
+    // Drop the lock while sleeping, then re-check with it held again.
+    lock.unlock();
+    std::this_thread::sleep_for(std::chrono::microseconds(10));
+    lock.lock();
+  }
+}
+
+void ComputePipe::submit(int queueId, std::uint32_t offset) {
+  auto &ring = queues[1][queueId]; // level-0 ring; NOTE(review): no lock taken
+  ring.wptr = ring.base + offset;  // offset counts dwords from the ring base
+}
+
+// SET_SH_REG: copies the packet's dwords into the ring's ComputeConfig block.
+bool ComputePipe::setShReg(Ring &ring) {
+  auto len = rx::getBits(ring.rptr[0], 29, 16);
+  auto offset = ring.rptr[1] & 0xffff;
+  auto data = ring.rptr + 2;
+
+  if (Registers::ShaderConfig::kMmioOffset + offset <
+      Registers::ComputeConfig::kMmioOffset) {
+    rx::die(
+        "unexpected compute pipe offset %x %s", offset,
+        gnm::mmio::registerName(Registers::ShaderConfig::kMmioOffset + offset));
+  }
+  // Rebase `offset` from the ShaderConfig window onto the ComputeConfig block.
+  offset -= Registers::ComputeConfig::kMmioOffset -
+            Registers::ShaderConfig::kMmioOffset;
+
+  rx::dieIf(
+      (offset + len) * sizeof(std::uint32_t) > sizeof(Registers::ComputeConfig),
+      "out of compute regs, offset: %x, count %u, %s\n", offset, len,
+      gnm::mmio::registerName(Registers::ComputeConfig::kMmioOffset + offset));
+
+  for (std::size_t i = 0; i < len; ++i) {
+    std::fprintf(stderr, "writing to %s value %x\n",
+                 gnm::mmio::registerName(Registers::ComputeConfig::kMmioOffset +
+                                         offset + i),
+                 data[i]);
+  }
+
+  std::memcpy(ring.doorbell + offset, data, sizeof(std::uint32_t) * len);
+
+  return true;
+}
+
+// DISPATCH_DIRECT: runs a dispatch with dimensions taken from the packet.
+bool ComputePipe::dispatchDirect(Ring &ring) {
+  auto config = std::bit_cast<Registers::ComputeConfig *>(ring.doorbell);
+  auto dimX = ring.rptr[1];
+  auto dimY = ring.rptr[2];
+  auto dimZ = ring.rptr[3];
+  auto dispatchInitiator = ring.rptr[4];
+  config->computeDispatchInitiator = dispatchInitiator;
+  amdgpu::dispatch(device->caches[ring.vmId], scheduler, *config, dimX, dimY,
+                   dimZ);
+  return true;
+}
+
+// DISPATCH_INDIRECT: dispatch whose dimensions are read from VM memory.
+bool ComputePipe::dispatchIndirect(Ring &ring) {
+  auto config = std::bit_cast<Registers::ComputeConfig *>(ring.doorbell);
+  auto offset = ring.rptr[1];
+  auto dispatchInitiator = ring.rptr[2];
+  config->computeDispatchInitiator = dispatchInitiator;
+  auto buffer = RemoteMemory{ring.vmId}.getPointer<std::uint32_t>(
+      drawIndexIndirPatchBase + offset);
+  // The buffer holds three consecutive dwords: X, Y, Z.
+  auto dimX = buffer[0];
+  auto dimY = buffer[1];
+  auto dimZ = buffer[2];
+
+  amdgpu::dispatch(device->caches[ring.vmId], scheduler, *config, dimX, dimY,
+                   dimZ);
+  return true;
+}
+
+// RELEASE_MEM: stores the selected value at `address`; may emit an event.
+bool ComputePipe::releaseMem(Ring &ring) {
+  auto eventCntl = ring.rptr[1];
+  auto dataCntl = ring.rptr[2];
+  auto addressLo = ring.rptr[3] & ~3;
+  auto addressHi = ring.rptr[4] & ((1 << 16) - 1);
+  auto dataLo = ring.rptr[5];
+  auto dataHi = ring.rptr[6];
+  auto eventIndex = rx::getBits(eventCntl, 11, 8);
+  auto eventType = rx::getBits(eventCntl, 5, 0);
+  auto dataSel = rx::getBits(dataCntl, 31, 29);
+  auto intSel = rx::getBits(dataCntl, 25, 24);
+
+  auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
+  auto pointer = RemoteMemory{ring.vmId}.getPointer<std::uint64_t>(address);
+
+  switch (dataSel) {
+  case 0: // none
+    break;
+  case 1: // 32 bit, low
+    *reinterpret_cast<std::uint32_t *>(pointer) = dataLo;
+    break;
+  case 2: // 64 bit
+    *pointer = dataLo | (static_cast<std::uint64_t>(dataHi) << 32);
+    break;
+  case 3: // 64 bit, global GPU clock
+    *pointer = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                   std::chrono::system_clock::now().time_since_epoch())
+                   .count();
+    break;
+  case 4: // 64 bit, perf counter
+    *pointer = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                   std::chrono::steady_clock::now().time_since_epoch())
+                   .count();
+    break;
+
+  default:
+    rx::die("unimplemented event release mem data %#x", dataSel);
+  }
+  // A non-zero intSel also emits this pipe's RelMem graphics-core event.
+  if (intSel) {
+    orbis::g_context.deviceEventEmitter->emit(orbis::kEvFiltGraphicsCore, 0,
+                                              kGcEventCompute0RelMem + index);
+  }
+
+  return true;
+}
+
+// WAIT_REG_MEM: reads a register (memSpace 0) or VM memory and compares it.
+bool ComputePipe::waitRegMem(Ring &ring) {
+  auto engine = rx::getBit(ring.rptr[1], 8);
+  auto memSpace = rx::getBit(ring.rptr[1], 4);
+  auto function = rx::getBits(ring.rptr[1], 2, 0);
+  auto pollAddressLo = ring.rptr[2];
+  auto pollAddressHi = ring.rptr[3] & ((1 << 16) - 1);
+  auto reference = ring.rptr[4];
+  auto mask = ring.rptr[5];
+  auto pollInterval = ring.rptr[6];
+  std::uint32_t pollData;
+
+  if (memSpace == 0) {
+    pollData = *getMmRegister(ring, pollAddressLo & ((1 << 16) - 1));
+  } else {
+    auto pollAddress = (pollAddressLo & ~3) |
+                       (static_cast<std::uint64_t>(pollAddressHi) << 32);
+    pollData = *RemoteMemory{ring.vmId}.getPointer<std::uint32_t>(pollAddress);
+  }
+  // A false result makes processRing return without advancing past this packet.
+  return compare(function, pollData, mask, reference);
+}
+
+// WRITE_DATA: copies the packet's inline dwords to a register or VM memory.
+bool ComputePipe::writeData(Ring &ring) {
+  auto len = rx::getBits(ring.rptr[0], 29, 16) - 1;
+  auto control = ring.rptr[1];
+  auto dstAddressLo = ring.rptr[2];
+  auto dstAddressHi = ring.rptr[3];
+  auto data = ring.rptr + 4;
+  // NOTE(review): payload is count - 2 dwords (4 precede it); confirm the -1.
+  auto engineSel = rx::getBits(control, 31, 30);
+  auto wrConfirm = rx::getBit(control, 20);
+  auto wrOneAddress = rx::getBit(control, 16);
+  auto dstSel = rx::getBits(control, 11, 8);
+
+  std::uint32_t *dstPointer = nullptr;
+  switch (dstSel) {
+  case 0: // memory mapped register
+    dstPointer = getMmRegister(ring, dstAddressLo & ((1 << 16) - 1));
+    break;
+
+  case 1:   // memory sync
+  case 2:   // TC L2
+  case 5: { // memory async
+    auto address =
+        (dstAddressLo & ~3) | (static_cast<std::uint64_t>(dstAddressHi) << 32);
+    dstPointer = RemoteMemory{ring.vmId}.getPointer<std::uint32_t>(address);
+    break;
+  }
+
+  default:
+    rx::die("unimplemented write data, dst sel = %#x", dstSel);
+  }
+  // wrOneAddress: every dword goes to the same location; else a linear copy.
+  if (wrOneAddress) {
+    for (std::uint32_t i = 0; i < len; ++i) {
+      *dstPointer = data[i];
+    }
+  } else {
+    std::memcpy(dstPointer, data, len * sizeof(std::uint32_t));
+  }
+
+  return true;
+}
+
+// Fallback handler: aborts via rx::die, naming the unimplemented opcode.
+bool ComputePipe::unknownPacket(Ring &ring) {
+  auto op = rx::getBits(ring.rptr[0], 15, 8);
+  rx::die("unimplemented compute pm4 packet: %s, indirect level %u\n",
+          gnm::pm4OpcodeToString(op), ring.indirectLevel);
+
+  return true;
+}
+
+bool ComputePipe::handleNop(Ring &ring) { return true; } // no work to do
+
+// Maps an MMIO dword address inside ComputeConfig to the ring's registers.
+std::uint32_t *ComputePipe::getMmRegister(Ring &ring, std::uint32_t dwAddress) {
+  if (dwAddress >= Registers::ComputeConfig::kMmioOffset &&
+      dwAddress <
+          Registers::ComputeConfig::kMmioOffset +
+              sizeof(Registers::ComputeConfig) / sizeof(std::uint32_t)) {
+    return ring.doorbell + (dwAddress - Registers::ComputeConfig::kMmioOffset);
+  }
+  rx::die("unexpected memory mapped compute register address %x, %s", dwAddress,
+          gnm::mmio::registerName(dwAddress));
+}
 
 GraphicsPipe::GraphicsPipe(int index) : scheduler(createGfxScheduler(index)) {
   for (auto &processorHandlers : commandHandlers) {
@@ -263,15 +523,15 @@ GraphicsPipe::GraphicsPipe(int index) : scheduler(createGfxScheduler(index)) {
   mainHandlers[IT_FLIP] = &GraphicsPipe::flip;
 }
 
-void GraphicsPipe::setCeQueue(Queue queue) {
-  queue.indirectLevel = -1;
-  ceQueue = queue;
+void GraphicsPipe::setCeQueue(Ring ring) {
+  ring.indirectLevel = -1;
+  ceQueue = ring;
 }
 
-void GraphicsPipe::setDeQueue(Queue queue, int ring) {
-  rx::dieIf(ring > 2, "out of indirect gfx rings, %u", ring);
-  queue.indirectLevel = ring;
-  deQueues[2 - ring] = queue;
+void GraphicsPipe::setDeQueue(Ring ring, int indirectLevel) {
+  rx::dieIf(indirectLevel > 2, "out of indirect gfx rings, %u", indirectLevel);
+  ring.indirectLevel = indirectLevel;
+  deQueues[2 - indirectLevel] = ring;
 }
 
 std::uint32_t *GraphicsPipe::getMmRegister(std::uint32_t dwAddress) {
@@ -318,14 +578,14 @@ bool GraphicsPipe::processAllRings() {
     }
   }
 
-  for (auto &queue : deQueues) {
-    if (queue.rptr == queue.wptr) {
+  for (auto &ring : deQueues) {
+    if (ring.rptr == ring.wptr) {
       continue;
     }
 
-    processRing(queue);
+    processRing(ring);
 
-    if (queue.rptr != queue.wptr) {
+    if (ring.rptr != ring.wptr) {
       allProcessed = false;
       break;
     }
@@ -334,21 +594,21 @@ bool GraphicsPipe::processAllRings() {
   return allProcessed;
 }
 
-void GraphicsPipe::processRing(Queue &queue) {
+void GraphicsPipe::processRing(Ring &ring) {
   int cp;
-  if (queue.indirectLevel < 0) {
+  if (ring.indirectLevel < 0) {
     cp = 0;
   } else {
-    cp = queue.indirectLevel + 1;
+    cp = ring.indirectLevel + 1;
   }
 
-  while (queue.rptr != queue.wptr) {
-    if (queue.rptr >= queue.base + queue.size) {
-      queue.rptr = queue.base;
+  while (ring.rptr != ring.wptr) {
+    if (ring.rptr >= ring.base + ring.size) {
+      ring.rptr = ring.base;
       continue;
     }
 
-    auto header = *queue.rptr;
+    auto header = *ring.rptr;
     auto type = rx::getBits(header, 31, 30);
 
     if (type == 3) {
@@ -356,9 +616,9 @@ void GraphicsPipe::processRing(Queue &queue) {
       auto len = rx::getBits(header, 29, 16) + 2;
 
       // if (auto str = gnm::pm4OpcodeToString(op)) {
-      //   std::println(stderr, "queue {}: {}", queue.indirectLevel, str);
+      //   std::println(stderr, "queue {}: {}", ring.indirectLevel, str);
       // } else {
-      //   std::println(stderr, "queue {}: {:x}", queue.indirectLevel, op);
+      //   std::println(stderr, "queue {}: {:x}", ring.indirectLevel, op);
       // }
 
       if (op == gnm::IT_COND_EXEC) {
@@ -366,11 +626,11 @@ void GraphicsPipe::processRing(Queue &queue) {
       }
 
       auto handler = commandHandlers[cp][op];
-      if (!(this->*handler)(queue)) {
+      if (!(this->*handler)(ring)) {
         return;
       }
 
-      queue.rptr += len;
+      ring.rptr += len;
 
       if (op == gnm::IT_INDIRECT_BUFFER || op == gnm::IT_INDIRECT_BUFFER_CNST) {
         break;
@@ -380,34 +640,33 @@ void GraphicsPipe::processRing(Queue &queue) {
     }
 
     if (type == 2) {
-      ++queue.rptr;
+      ++ring.rptr;
       continue;
     }
 
     rx::die("unexpected pm4 packet type %u, ring %u, header %u, rptr %p, wptr "
             "%p, base %p",
-            type, queue.indirectLevel, header, queue.rptr, queue.wptr,
-            queue.base);
+            type, ring.indirectLevel, header, ring.rptr, ring.wptr, ring.base);
   }
 }
 
-bool GraphicsPipe::handleNop(Queue &queue) { return true; }
+bool GraphicsPipe::handleNop(Ring &ring) { return true; }
 
-bool GraphicsPipe::setBase(Queue &queue) {
-  auto baseIndex = queue.rptr[1] & 0xf;
+bool GraphicsPipe::setBase(Ring &ring) {
+  auto baseIndex = ring.rptr[1] & 0xf;
 
   switch (baseIndex) {
   case 0: {
-    auto address0 = queue.rptr[2] & ~3;
-    auto address1 = queue.rptr[3] & ((1 << 16) - 1);
+    auto address0 = ring.rptr[2] & ~3;
+    auto address1 = ring.rptr[3] & ((1 << 16) - 1);
 
     displayListPatchBase =
         address0 | (static_cast<std::uint64_t>(address1) << 32);
     break;
   }
   case 1: {
-    auto address0 = queue.rptr[2] & ~3;
-    auto address1 = queue.rptr[3] & ((1 << 16) - 1);
+    auto address0 = ring.rptr[2] & ~3;
+    auto address1 = ring.rptr[3] & ((1 << 16) - 1);
 
     drawIndexIndirPatchBase =
         address0 | (static_cast<std::uint64_t>(address1) << 32);
@@ -415,16 +674,16 @@ bool GraphicsPipe::setBase(Queue &queue) {
   }
 
   case 2: {
-    auto cs1Index = queue.rptr[2] & ((1 << 16) - 1);
-    auto cs2Index = queue.rptr[3] & ((1 << 16) - 1);
+    auto cs1Index = ring.rptr[2] & ((1 << 16) - 1);
+    auto cs2Index = ring.rptr[3] & ((1 << 16) - 1);
     gdsPartitionBases[0] = cs1Index;
     gdsPartitionBases[1] = cs2Index;
     break;
   }
 
   case 3: {
-    auto cs1Index = queue.rptr[2] & ((1 << 16) - 1);
-    auto cs2Index = queue.rptr[3] & ((1 << 16) - 1);
+    auto cs1Index = ring.rptr[2] & ((1 << 16) - 1);
+    auto cs2Index = ring.rptr[3] & ((1 << 16) - 1);
     cePartitionBases[0] = cs1Index;
     cePartitionBases[1] = cs2Index;
     break;
@@ -437,7 +696,7 @@ bool GraphicsPipe::setBase(Queue &queue) {
   return true;
 }
 
-bool GraphicsPipe::clearState(Queue &queue) {
+bool GraphicsPipe::clearState(Ring &ring) {
   auto paScClipRectRule = context.paScClipRectRule.value;
   auto cbTargetMask = context.cbTargetMask.raw;
   auto cbShaderMask = context.cbShaderMask.raw;
@@ -460,15 +719,15 @@ bool GraphicsPipe::clearState(Queue &queue) {
   return true;
 }
 
-bool GraphicsPipe::contextControl(Queue &queue) { return true; }
-bool GraphicsPipe::acquireMem(Queue &queue) { return true; }
-bool GraphicsPipe::releaseMem(Queue &queue) {
-  auto eventCntl = queue.rptr[1];
-  auto dataCntl = queue.rptr[2];
-  auto addressLo = queue.rptr[3] & ~3;
-  auto addressHi = queue.rptr[3] & ~3;
-  auto dataLo = queue.rptr[4];
-  auto dataHi = queue.rptr[5];
+bool GraphicsPipe::contextControl(Ring &ring) { return true; }
+bool GraphicsPipe::acquireMem(Ring &ring) { return true; }
+bool GraphicsPipe::releaseMem(Ring &ring) {
+  auto eventCntl = ring.rptr[1];
+  auto dataCntl = ring.rptr[2];
+  auto addressLo = ring.rptr[3] & ~3;
+  auto addressHi = ring.rptr[4] & ((1 << 16) - 1);
+  auto dataLo = ring.rptr[5];
+  auto dataHi = ring.rptr[6];
 
   auto eventIndex = rx::getBits(eventCntl, 11, 8);
   auto eventType = rx::getBits(eventCntl, 5, 0);
@@ -476,7 +735,7 @@ bool GraphicsPipe::releaseMem(Queue &queue) {
   auto intSel = rx::getBits(dataCntl, 25, 24);
 
   auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
-  auto pointer = RemoteMemory{queue.vmId}.getPointer<std::uint64_t>(address);
+  auto pointer = RemoteMemory{ring.vmId}.getPointer<std::uint64_t>(address);
 
   context.vgtEventInitiator = eventType;
 
@@ -507,43 +766,43 @@ bool GraphicsPipe::releaseMem(Queue &queue) {
   return true;
 }
 
-bool GraphicsPipe::drawPreamble(Queue &queue) { return true; }
+bool GraphicsPipe::drawPreamble(Ring &ring) { return true; }
 
-bool GraphicsPipe::indexBufferSize(Queue &queue) {
-  vgtIndexBufferSize = queue.rptr[1];
+bool GraphicsPipe::indexBufferSize(Ring &ring) {
+  vgtIndexBufferSize = ring.rptr[1];
   return true;
 }
-bool GraphicsPipe::dispatchDirect(Queue &queue) {
-  auto dimX = queue.rptr[1];
-  auto dimY = queue.rptr[2];
-  auto dimZ = queue.rptr[3];
-  auto dispatchInitiator = queue.rptr[4];
+bool GraphicsPipe::dispatchDirect(Ring &ring) {
+  auto dimX = ring.rptr[1];
+  auto dimY = ring.rptr[2];
+  auto dimZ = ring.rptr[3];
+  auto dispatchInitiator = ring.rptr[4];
   sh.compute.computeDispatchInitiator = dispatchInitiator;
 
-  amdgpu::dispatch(device->caches[queue.vmId], scheduler, sh.compute, dimX,
-                   dimY, dimZ);
+  amdgpu::dispatch(device->caches[ring.vmId], scheduler, sh.compute, dimX, dimY,
+                   dimZ);
   return true;
 }
-bool GraphicsPipe::dispatchIndirect(Queue &queue) {
-  auto offset = queue.rptr[1];
-  auto dispatchInitiator = queue.rptr[2];
+bool GraphicsPipe::dispatchIndirect(Ring &ring) {
+  auto offset = ring.rptr[1];
+  auto dispatchInitiator = ring.rptr[2];
 
   sh.compute.computeDispatchInitiator = dispatchInitiator;
-  auto buffer = RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(
+  auto buffer = RemoteMemory{ring.vmId}.getPointer<std::uint32_t>(
       drawIndexIndirPatchBase + offset);
 
   auto dimX = buffer[0];
   auto dimY = buffer[1];
   auto dimZ = buffer[2];
 
-  amdgpu::dispatch(device->caches[queue.vmId], scheduler, sh.compute, dimX,
-                   dimY, dimZ);
+  amdgpu::dispatch(device->caches[ring.vmId], scheduler, sh.compute, dimX, dimY,
+                   dimZ);
   return true;
 }
 
-bool GraphicsPipe::setPredication(Queue &queue) {
-  auto startAddressLo = queue.rptr[1] & ~0xf;
-  auto predProperties = queue.rptr[2];
+bool GraphicsPipe::setPredication(Ring &ring) {
+  auto startAddressLo = ring.rptr[1] & ~0xf;
+  auto predProperties = ring.rptr[2];
 
   auto startAddressHi = rx::getBits(predProperties, 15, 0);
   auto predBool = rx::getBit(predProperties, 8);
@@ -562,15 +821,15 @@ bool GraphicsPipe::setPredication(Queue &queue) {
 
   return true;
 }
-bool GraphicsPipe::drawIndirect(Queue &queue) {
-  auto dataOffset = queue.rptr[1];
-  auto baseVtxLoc = queue.rptr[2] & ((1 << 16) - 1);
-  auto startInstLoc = queue.rptr[3] & ((1 << 16) - 1);
-  auto drawInitiator = queue.rptr[4];
+bool GraphicsPipe::drawIndirect(Ring &ring) {
+  auto dataOffset = ring.rptr[1];
+  auto baseVtxLoc = ring.rptr[2] & ((1 << 16) - 1);
+  auto startInstLoc = ring.rptr[3] & ((1 << 16) - 1);
+  auto drawInitiator = ring.rptr[4];
 
   context.vgtDrawInitiator = drawInitiator;
 
-  auto buffer = RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(
+  auto buffer = RemoteMemory{ring.vmId}.getPointer<std::uint32_t>(
       drawIndexIndirPatchBase + dataOffset);
 
   std::uint32_t vertexCountPerInstance = buffer[0];
@@ -578,16 +837,16 @@ bool GraphicsPipe::drawIndirect(Queue &queue) {
   std::uint32_t startVertexLocation = buffer[2];
   std::uint32_t startInstanceLocation = buffer[3];
 
-  draw(*this, queue.vmId, startVertexLocation, vertexCountPerInstance,
+  draw(*this, ring.vmId, startVertexLocation, vertexCountPerInstance,
        startInstanceLocation, instanceCount, 0, 0, 0);
   return true;
 }
-bool GraphicsPipe::drawIndexIndirect(Queue &queue) {
-  auto dataOffset = queue.rptr[1];
-  auto baseVtxLoc = queue.rptr[2] & ((1 << 16) - 1);
-  auto drawInitiator = queue.rptr[3];
+bool GraphicsPipe::drawIndexIndirect(Ring &ring) {
+  auto dataOffset = ring.rptr[1];
+  auto baseVtxLoc = ring.rptr[2] & ((1 << 16) - 1);
+  auto drawInitiator = ring.rptr[3];
 
-  auto buffer = RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(
+  auto buffer = RemoteMemory{ring.vmId}.getPointer<std::uint32_t>(
       drawIndexIndirPatchBase + dataOffset);
 
   context.vgtDrawInitiator = drawInitiator;
@@ -598,24 +857,24 @@ bool GraphicsPipe::drawIndexIndirect(Queue &queue) {
   std::uint32_t baseVertexLocation = buffer[3];
   std::uint32_t startInstanceLocation = buffer[4];
 
-  draw(*this, queue.vmId, baseVertexLocation, indexCountPerInstance,
+  draw(*this, ring.vmId, baseVertexLocation, indexCountPerInstance,
        startInstanceLocation, instanceCount, vgtIndexBase, startIndexLocation,
        indexCountPerInstance);
   return true;
 }
-bool GraphicsPipe::indexBase(Queue &queue) {
-  auto addressLo = queue.rptr[1] & ~1;
-  auto addressHi = queue.rptr[2] & ((1 << 16) - 1);
+bool GraphicsPipe::indexBase(Ring &ring) {
+  auto addressLo = ring.rptr[1] & ~1;
+  auto addressHi = ring.rptr[2] & ((1 << 16) - 1);
   auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
   vgtIndexBase = address;
   return true;
 }
-bool GraphicsPipe::drawIndex2(Queue &queue) {
-  auto maxSize = queue.rptr[1];
-  auto indexBaseLo = queue.rptr[2] & ~1;
-  auto indexBaseHi = queue.rptr[3] & ((1 << 16) - 1);
-  auto indexCount = queue.rptr[4];
-  auto drawInitiator = queue.rptr[5];
+bool GraphicsPipe::drawIndex2(Ring &ring) {
+  auto maxSize = ring.rptr[1];
+  auto indexBaseLo = ring.rptr[2] & ~1;
+  auto indexBaseHi = ring.rptr[3] & ((1 << 16) - 1);
+  auto indexCount = ring.rptr[4];
+  auto drawInitiator = ring.rptr[5];
 
   context.vgtDrawInitiator = drawInitiator;
   uConfig.vgtNumIndices = indexCount;
@@ -623,32 +882,32 @@ bool GraphicsPipe::drawIndex2(Queue &queue) {
   auto indexBase =
       indexBaseLo | (static_cast<std::uint64_t>(indexBaseHi) << 32);
 
-  draw(*this, queue.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, indexBase,
+  draw(*this, ring.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, indexBase,
        0, maxSize);
   return true;
 }
-bool GraphicsPipe::indexType(Queue &queue) {
-  uConfig.vgtIndexType = static_cast<gnm::IndexType>(queue.rptr[1] & 1);
+bool GraphicsPipe::indexType(Ring &ring) {
+  uConfig.vgtIndexType = static_cast<gnm::IndexType>(ring.rptr[1] & 1);
   return true;
 }
-bool GraphicsPipe::drawIndexAuto(Queue &queue) {
-  auto indexCount = queue.rptr[1];
-  auto drawInitiator = queue.rptr[2];
+bool GraphicsPipe::drawIndexAuto(Ring &ring) {
+  auto indexCount = ring.rptr[1];
+  auto drawInitiator = ring.rptr[2];
 
   uConfig.vgtNumIndices = indexCount;
   context.vgtDrawInitiator = drawInitiator;
 
-  draw(*this, queue.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, 0, 0, 0);
+  draw(*this, ring.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, 0, 0, 0);
   return true;
 }
-bool GraphicsPipe::numInstances(Queue &queue) {
-  uConfig.vgtNumInstances = std::max(queue.rptr[1], 1u);
+bool GraphicsPipe::numInstances(Ring &ring) {
+  uConfig.vgtNumInstances = std::max(ring.rptr[1], 1u);
   return true;
 }
-bool GraphicsPipe::drawIndexMultiAuto(Queue &queue) {
-  auto primCount = queue.rptr[1];
-  auto drawInitiator = queue.rptr[2];
-  auto control = queue.rptr[3];
+bool GraphicsPipe::drawIndexMultiAuto(Ring &ring) {
+  auto primCount = ring.rptr[1];
+  auto drawInitiator = ring.rptr[2];
+  auto control = ring.rptr[3];
 
   auto indexOffset = rx::getBits(control, 15, 0);
   auto primType = rx::getBits(control, 20, 16);
@@ -658,27 +917,27 @@ bool GraphicsPipe::drawIndexMultiAuto(Queue &queue) {
   uConfig.vgtPrimitiveType = static_cast<gnm::PrimitiveType>(primType);
   uConfig.vgtNumIndices = indexCount;
 
-  draw(*this, queue.vmId, 0, primCount, 0, uConfig.vgtNumInstances,
-       vgtIndexBase, indexOffset, indexCount);
+  draw(*this, ring.vmId, 0, primCount, 0, uConfig.vgtNumInstances, vgtIndexBase,
+       indexOffset, indexCount);
   return true;
 }
-bool GraphicsPipe::drawIndexOffset2(Queue &queue) {
-  auto maxSize = queue.rptr[1];
-  auto indexOffset = queue.rptr[2];
-  auto indexCount = queue.rptr[3];
-  auto drawInitiator = queue.rptr[4];
+bool GraphicsPipe::drawIndexOffset2(Ring &ring) {
+  auto maxSize = ring.rptr[1];
+  auto indexOffset = ring.rptr[2];
+  auto indexCount = ring.rptr[3];
+  auto drawInitiator = ring.rptr[4];
 
   context.vgtDrawInitiator = drawInitiator;
-  draw(*this, queue.vmId, 0, indexCount, 0, uConfig.vgtNumInstances,
+  draw(*this, ring.vmId, 0, indexCount, 0, uConfig.vgtNumInstances,
        vgtIndexBase, indexOffset, maxSize);
   return true;
 }
-bool GraphicsPipe::writeData(Queue &queue) {
-  auto len = rx::getBits(queue.rptr[0], 29, 16) - 1;
-  auto control = queue.rptr[1];
-  auto dstAddressLo = queue.rptr[2];
-  auto dstAddressHi = queue.rptr[3];
-  auto data = queue.rptr + 4;
+bool GraphicsPipe::writeData(Ring &ring) {
+  auto len = rx::getBits(ring.rptr[0], 29, 16) - 1;
+  auto control = ring.rptr[1];
+  auto dstAddressLo = ring.rptr[2];
+  auto dstAddressHi = ring.rptr[3];
+  auto data = ring.rptr + 4;
 
   auto engineSel = rx::getBits(control, 31, 30);
   auto wrConfirm = rx::getBit(control, 20);
@@ -697,7 +956,7 @@ bool GraphicsPipe::writeData(Queue &queue) {
   case 5: { // memory async
     auto address =
         (dstAddressLo & ~3) | (static_cast<std::uint64_t>(dstAddressHi) << 32);
-    dstPointer = RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(address);
+    dstPointer = RemoteMemory{ring.vmId}.getPointer<std::uint32_t>(address);
     break;
   }
 
@@ -715,19 +974,19 @@ bool GraphicsPipe::writeData(Queue &queue) {
 
   return true;
 }
-bool GraphicsPipe::memSemaphore(Queue &queue) {
+bool GraphicsPipe::memSemaphore(Ring &ring) {
   // FIXME
   return true;
 }
-bool GraphicsPipe::waitRegMem(Queue &queue) {
-  auto engine = rx::getBit(queue.rptr[1], 8);
-  auto memSpace = rx::getBit(queue.rptr[1], 4);
-  auto function = rx::getBits(queue.rptr[1], 2, 0);
-  auto pollAddressLo = queue.rptr[2];
-  auto pollAddressHi = queue.rptr[3] & ((1 << 16) - 1);
-  auto reference = queue.rptr[4];
-  auto mask = queue.rptr[5];
-  auto pollInterval = queue.rptr[6];
+bool GraphicsPipe::waitRegMem(Ring &ring) {
+  auto engine = rx::getBit(ring.rptr[1], 8);
+  auto memSpace = rx::getBit(ring.rptr[1], 4);
+  auto function = rx::getBits(ring.rptr[1], 2, 0);
+  auto pollAddressLo = ring.rptr[2];
+  auto pollAddressHi = ring.rptr[3] & ((1 << 16) - 1);
+  auto reference = ring.rptr[4];
+  auto mask = ring.rptr[5];
+  auto pollInterval = ring.rptr[6];
 
   std::uint32_t pollData;
 
@@ -736,61 +995,60 @@ bool GraphicsPipe::waitRegMem(Queue &queue) {
   } else {
     auto pollAddress = (pollAddressLo & ~3) |
                        (static_cast<std::uint64_t>(pollAddressHi) << 32);
-    pollData = *RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(pollAddress);
+    pollData = *RemoteMemory{ring.vmId}.getPointer<std::uint32_t>(pollAddress);
   }
 
   return compare(function, pollData, mask, reference);
 }
 
-bool GraphicsPipe::indirectBufferConst(Queue &queue) {
-  rx::dieIf(queue.indirectLevel < 0, "unexpected indirect buffer from CP");
+bool GraphicsPipe::indirectBufferConst(Ring &ring) {
+  rx::dieIf(ring.indirectLevel < 0, "unexpected indirect buffer from CP");
 
-  auto addressLo = queue.rptr[1] & ~3;
-  auto addressHi = queue.rptr[2] & ((1 << 8) - 1);
-  int vmId = queue.rptr[3] >> 24;
-  auto ibSize = queue.rptr[3] & ((1 << 20) - 1);
+  auto addressLo = ring.rptr[1] & ~3;
+  auto addressHi = ring.rptr[2] & ((1 << 8) - 1);
+  int vmId = ring.rptr[3] >> 24;
+  auto ibSize = ring.rptr[3] & ((1 << 20) - 1);
   auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
 
-  if (queue.indirectLevel != 0) {
-    vmId = queue.vmId;
+  if (ring.indirectLevel != 0) {
+    vmId = ring.vmId;
   }
 
   auto rptr = RemoteMemory{vmId}.getPointer<std::uint32_t>(address);
-  setCeQueue(Queue::createFromRange(vmId, rptr, ibSize));
+  setCeQueue(Ring::createFromRange(vmId, rptr, ibSize));
   return true;
 }
-bool GraphicsPipe::indirectBuffer(Queue &queue) {
-  rx::dieIf(queue.indirectLevel < 0, "unexpected indirect buffer from CP");
+bool GraphicsPipe::indirectBuffer(Ring &ring) {
+  rx::dieIf(ring.indirectLevel < 0, "unexpected indirect buffer from CP");
 
-  auto addressLo = queue.rptr[1] & ~3;
-  auto addressHi = queue.rptr[2] & ((1 << 8) - 1);
-  int vmId = queue.rptr[3] >> 24;
-  auto ibSize = queue.rptr[3] & ((1 << 20) - 1);
+  auto addressLo = ring.rptr[1] & ~3;
+  auto addressHi = ring.rptr[2] & ((1 << 8) - 1);
+  int vmId = ring.rptr[3] >> 24;
+  auto ibSize = ring.rptr[3] & ((1 << 20) - 1);
   auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
 
-  if (queue.indirectLevel != 0) {
-    vmId = queue.vmId;
+  if (ring.indirectLevel != 0) {
+    vmId = ring.vmId;
   }
   auto rptr = RemoteMemory{vmId}.getPointer<std::uint32_t>(address);
-  setDeQueue(Queue::createFromRange(vmId, rptr, ibSize),
-             queue.indirectLevel + 1);
+  setDeQueue(Ring::createFromRange(vmId, rptr, ibSize), ring.indirectLevel + 1);
   return true;
 }
-bool GraphicsPipe::pfpSyncMe(Queue &queue) {
+bool GraphicsPipe::pfpSyncMe(Ring &ring) {
   // TODO
   return true;
 }
-bool GraphicsPipe::condWrite(Queue &queue) {
-  auto writeSpace = rx::getBit(queue.rptr[1], 8);
-  auto pollSpace = rx::getBit(queue.rptr[1], 4);
-  auto function = rx::getBits(queue.rptr[1], 2, 0);
-  auto pollAddressLo = queue.rptr[2];
-  auto pollAddressHi = queue.rptr[3] & ((1 << 16) - 1);
-  auto reference = queue.rptr[4];
-  auto mask = queue.rptr[5];
-  auto writeAddressLo = queue.rptr[6];
-  auto writeAddressHi = queue.rptr[7] & ((1 << 16) - 1);
-  auto writeData = queue.rptr[8];
+bool GraphicsPipe::condWrite(Ring &ring) {
+  auto writeSpace = rx::getBit(ring.rptr[1], 8);
+  auto pollSpace = rx::getBit(ring.rptr[1], 4);
+  auto function = rx::getBits(ring.rptr[1], 2, 0);
+  auto pollAddressLo = ring.rptr[2];
+  auto pollAddressHi = ring.rptr[3] & ((1 << 16) - 1);
+  auto reference = ring.rptr[4];
+  auto mask = ring.rptr[5];
+  auto writeAddressLo = ring.rptr[6];
+  auto writeAddressHi = ring.rptr[7] & ((1 << 16) - 1);
+  auto writeData = ring.rptr[8];
 
   std::uint32_t pollData;
 
@@ -799,7 +1057,7 @@ bool GraphicsPipe::condWrite(Queue &queue) {
   } else {
     auto pollAddress = (pollAddressLo & ~3) |
                        (static_cast<std::uint64_t>(pollAddressHi) << 32);
-    pollData = *RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(pollAddress);
+    pollData = *RemoteMemory{ring.vmId}.getPointer<std::uint32_t>(pollAddress);
   }
 
   if (compare(function, pollData, mask, reference)) {
@@ -809,7 +1067,7 @@ bool GraphicsPipe::condWrite(Queue &queue) {
       auto writeAddress = (writeAddressLo & ~3) |
                           (static_cast<std::uint64_t>(writeAddressHi) << 32);
 
-      *RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(writeAddress) =
+      *RemoteMemory{ring.vmId}.getPointer<std::uint32_t>(writeAddress) =
           writeData;
     }
   }
@@ -817,7 +1075,7 @@ bool GraphicsPipe::condWrite(Queue &queue) {
   return true;
 }
 
-bool GraphicsPipe::eventWrite(Queue &queue) {
+bool GraphicsPipe::eventWrite(Ring &ring) {
   enum {
     kEventZPassDone = 1,
     kEventSamplePipelineStat = 2,
@@ -825,7 +1083,7 @@ bool GraphicsPipe::eventWrite(Queue &queue) {
     kEventPartialFlush = 4,
   };
 
-  auto eventCntl = queue.rptr[1];
+  auto eventCntl = ring.rptr[1];
   auto invL2 = rx::getBit(eventCntl, 20);
   auto eventIndex = rx::getBits(eventCntl, 11, 8);
   auto eventType = rx::getBits(eventCntl, 5, 0);
@@ -834,8 +1092,8 @@ bool GraphicsPipe::eventWrite(Queue &queue) {
 
   if (eventIndex == kEventZPassDone || eventIndex == kEventSamplePipelineStat ||
       eventIndex == kEventSampleStreamOutStat) {
-    auto addressLo = queue.rptr[2] & ~7;
-    auto addressHi = queue.rptr[3] & ((1 << 16) - 1);
+    auto addressLo = ring.rptr[2] & ~7;
+    auto addressHi = ring.rptr[3] & ((1 << 16) - 1);
     auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
     rx::die("unimplemented event write, event index %#x, address %lx",
             eventIndex, address);
@@ -846,12 +1104,12 @@ bool GraphicsPipe::eventWrite(Queue &queue) {
   return true;
 }
 
-bool GraphicsPipe::eventWriteEop(Queue &queue) {
-  auto eventCntl = queue.rptr[1];
-  auto addressLo = queue.rptr[2] & ~3;
-  auto dataCntl = queue.rptr[3];
-  auto dataLo = queue.rptr[4];
-  auto dataHi = queue.rptr[5];
+bool GraphicsPipe::eventWriteEop(Ring &ring) {
+  auto eventCntl = ring.rptr[1];
+  auto addressLo = ring.rptr[2] & ~3;
+  auto dataCntl = ring.rptr[3];
+  auto dataLo = ring.rptr[4];
+  auto dataHi = ring.rptr[5];
 
   auto invL2 = rx::getBit(eventCntl, 20);
   auto eventIndex = rx::getBits(eventCntl, 11, 8);
@@ -861,7 +1119,7 @@ bool GraphicsPipe::eventWriteEop(Queue &queue) {
   auto addressHi = rx::getBits(dataCntl, 15, 0);
 
   auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
-  auto pointer = RemoteMemory{queue.vmId}.getPointer<std::uint64_t>(address);
+  auto pointer = RemoteMemory{ring.vmId}.getPointer<std::uint64_t>(address);
 
   context.vgtEventInitiator = eventType;
 
@@ -897,11 +1155,11 @@ bool GraphicsPipe::eventWriteEop(Queue &queue) {
   return true;
 }
 
-bool GraphicsPipe::eventWriteEos(Queue &queue) {
-  auto eventCntl = queue.rptr[1];
-  auto addressLo = queue.rptr[2] & ~3;
-  auto cmdInfo = queue.rptr[3];
-  auto dataInfo = queue.rptr[4];
+bool GraphicsPipe::eventWriteEos(Ring &ring) {
+  auto eventCntl = ring.rptr[1];
+  auto addressLo = ring.rptr[2] & ~3;
+  auto cmdInfo = ring.rptr[3];
+  auto dataInfo = ring.rptr[4];
 
   auto eventIndex = rx::getBits(eventCntl, 11, 8);
   auto eventType = rx::getBits(eventCntl, 5, 0);
@@ -909,10 +1167,10 @@ bool GraphicsPipe::eventWriteEos(Queue &queue) {
   auto addressHi = rx::getBits(cmdInfo, 15, 0);
 
   auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
-  auto pointer = RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(address);
+  auto pointer = RemoteMemory{ring.vmId}.getPointer<std::uint32_t>(address);
 
   context.vgtEventInitiator = eventType;
-  auto &cache = device->caches[queue.vmId];
+  auto &cache = device->caches[ring.vmId];
 
   switch (cmd) {
   case 1: { // store GDS data to memory
@@ -940,14 +1198,14 @@ bool GraphicsPipe::eventWriteEos(Queue &queue) {
   return true;
 }
 
-bool GraphicsPipe::dmaData(Queue &queue) {
-  auto control = queue.rptr[1];
-  auto srcAddressLo = queue.rptr[2];
+bool GraphicsPipe::dmaData(Ring &ring) {
+  auto control = ring.rptr[1];
+  auto srcAddressLo = ring.rptr[2];
   auto data = srcAddressLo;
-  auto srcAddressHi = queue.rptr[3];
-  auto dstAddressLo = queue.rptr[4];
-  auto dstAddressHi = queue.rptr[5];
-  auto cmdSize = queue.rptr[6];
+  auto srcAddressHi = ring.rptr[3];
+  auto dstAddressLo = ring.rptr[4];
+  auto dstAddressHi = ring.rptr[5];
+  auto cmdSize = ring.rptr[6];
   auto size = rx::getBits(cmdSize, 20, 0);
 
   auto engine = rx::getBit(control, 0);
@@ -1000,8 +1258,8 @@ bool GraphicsPipe::dmaData(Queue &queue) {
     if (dstSel == 3 || das == 0) {
       auto dstAddress =
           dstAddressLo | (static_cast<std::uint64_t>(dstAddressHi) << 32);
-      dst = amdgpu::RemoteMemory{queue.vmId}.getPointer(dstAddress);
-      device->caches[queue.vmId].invalidate(
+      dst = amdgpu::RemoteMemory{ring.vmId}.getPointer(dstAddress);
+      device->caches[ring.vmId].invalidate(
           scheduler, rx::AddressRange::fromBeginSize(dstAddress, size));
     } else {
       dst = getMmRegister(dstAddressLo / sizeof(std::uint32_t));
@@ -1009,7 +1267,7 @@ bool GraphicsPipe::dmaData(Queue &queue) {
     break;
 
   case 1:
-    dst = device->caches[queue.vmId].getGdsBuffer().getData() + dstAddressLo;
+    dst = device->caches[ring.vmId].getGdsBuffer().getData() + dstAddressLo;
     break;
 
   default:
@@ -1024,8 +1282,8 @@ bool GraphicsPipe::dmaData(Queue &queue) {
     if (srcSel == 3 || sas == 0) {
       auto srcAddress =
           srcAddressLo | (static_cast<std::uint64_t>(srcAddressHi) << 32);
-      src = amdgpu::RemoteMemory{queue.vmId}.getPointer(srcAddress);
-      device->caches[queue.vmId].flush(
+      src = amdgpu::RemoteMemory{ring.vmId}.getPointer(srcAddress);
+      device->caches[ring.vmId].flush(
           scheduler, rx::AddressRange::fromBeginSize(srcAddress, size));
     } else {
       src = getMmRegister(srcAddressLo / sizeof(std::uint32_t));
@@ -1034,7 +1292,7 @@ bool GraphicsPipe::dmaData(Queue &queue) {
     srcSize = ~0;
     break;
   case 1:
-    src = device->caches[queue.vmId].getGdsBuffer().getData() + srcAddressLo;
+    src = device->caches[ring.vmId].getGdsBuffer().getData() + srcAddressLo;
     srcSize = ~0;
     break;
 
@@ -1072,10 +1330,10 @@ bool GraphicsPipe::dmaData(Queue &queue) {
   return true;
 }
 
-bool GraphicsPipe::setConfigReg(Queue &queue) {
-  auto len = rx::getBits(queue.rptr[0], 29, 16);
-  auto offset = queue.rptr[1] & 0xffff;
-  auto data = queue.rptr + 2;
+bool GraphicsPipe::setConfigReg(Ring &ring) {
+  auto len = rx::getBits(ring.rptr[0], 29, 16);
+  auto offset = ring.rptr[1] & 0xffff;
+  auto data = ring.rptr + 2;
 
   rx::dieIf(
       (offset + len) * sizeof(std::uint32_t) > sizeof(device->config),
@@ -1088,11 +1346,11 @@ bool GraphicsPipe::setConfigReg(Queue &queue) {
   return true;
 }
 
-bool GraphicsPipe::setShReg(Queue &queue) {
-  auto len = rx::getBits(queue.rptr[0], 29, 16);
-  auto offset = queue.rptr[1] & 0xffff;
-  auto index = queue.rptr[1] >> 26;
-  auto data = queue.rptr + 2;
+bool GraphicsPipe::setShReg(Ring &ring) {
+  auto len = rx::getBits(ring.rptr[0], 29, 16);
+  auto offset = ring.rptr[1] & 0xffff;
+  auto index = ring.rptr[1] >> 26;
+  auto data = ring.rptr + 2;
 
   rx::dieIf((offset + len) * sizeof(std::uint32_t) > sizeof(sh),
             "out of SH regs, offset: %x, count %u, %s\n", offset, len,
@@ -1109,11 +1367,11 @@ bool GraphicsPipe::setShReg(Queue &queue) {
   return true;
 }
 
-bool GraphicsPipe::setUConfigReg(Queue &queue) {
-  auto len = rx::getBits(queue.rptr[0], 29, 16);
-  auto offset = queue.rptr[1] & 0xffff;
-  auto index = queue.rptr[1] >> 26;
-  auto data = queue.rptr + 2;
+bool GraphicsPipe::setUConfigReg(Ring &ring) {
+  auto len = rx::getBits(ring.rptr[0], 29, 16);
+  auto offset = ring.rptr[1] & 0xffff;
+  auto index = ring.rptr[1] >> 26;
+  auto data = ring.rptr + 2;
 
   if (index != 0) {
     {
@@ -1150,11 +1408,11 @@ bool GraphicsPipe::setUConfigReg(Queue &queue) {
   return true;
 }
 
-bool GraphicsPipe::setContextReg(Queue &queue) {
-  auto len = rx::getBits(queue.rptr[0], 29, 16);
-  auto offset = queue.rptr[1] & 0xffff;
-  auto index = queue.rptr[1] >> 26;
-  auto data = queue.rptr + 2;
+bool GraphicsPipe::setContextReg(Ring &ring) {
+  auto len = rx::getBits(ring.rptr[0], 29, 16);
+  auto offset = ring.rptr[1] & 0xffff;
+  auto index = ring.rptr[1] >> 26;
+  auto data = ring.rptr + 2;
 
   if (index != 0) {
     {
@@ -1192,114 +1450,114 @@ bool GraphicsPipe::setContextReg(Queue &queue) {
   return true;
 }
 
-bool GraphicsPipe::setCeDeCounters(Queue &queue) {
-  auto counterLo = queue.rptr[1];
-  auto counterHi = queue.rptr[2];
+bool GraphicsPipe::setCeDeCounters(Ring &ring) {
+  auto counterLo = ring.rptr[1];
+  auto counterHi = ring.rptr[2];
   auto counter = counterLo | (static_cast<std::uint64_t>(counterHi) << 32);
   deCounter = counter;
   ceCounter = counter;
   return true;
 }
 
-bool GraphicsPipe::waitOnCeCounter(Queue &queue) {
-  auto counterLo = queue.rptr[1];
-  auto counterHi = queue.rptr[2];
+bool GraphicsPipe::waitOnCeCounter(Ring &ring) {
+  auto counterLo = ring.rptr[1];
+  auto counterHi = ring.rptr[2];
   auto counter = counterLo | (static_cast<std::uint64_t>(counterHi) << 32);
   return deCounter >= counter;
 }
 
-bool GraphicsPipe::waitOnDeCounterDiff(Queue &queue) {
-  auto waitDiff = queue.rptr[1];
+bool GraphicsPipe::waitOnDeCounterDiff(Ring &ring) {
+  auto waitDiff = ring.rptr[1];
   auto diff = ceCounter - deCounter;
   return diff < waitDiff;
 }
 
-bool GraphicsPipe::incrementCeCounter(Queue &) {
+bool GraphicsPipe::incrementCeCounter(Ring &) {
   ceCounter++;
   return true;
 }
 
-bool GraphicsPipe::incrementDeCounter(Queue &) {
+bool GraphicsPipe::incrementDeCounter(Ring &) {
   deCounter++;
   return true;
 }
 
-bool GraphicsPipe::loadConstRam(Queue &queue) {
-  std::uint32_t addressLo = queue.rptr[1];
-  std::uint32_t addressHi = queue.rptr[2];
-  std::uint32_t numDw = queue.rptr[3] & ((1 << 15) - 1);
+bool GraphicsPipe::loadConstRam(Ring &ring) {
+  std::uint32_t addressLo = ring.rptr[1];
+  std::uint32_t addressHi = ring.rptr[2];
+  std::uint32_t numDw = ring.rptr[3] & ((1 << 15) - 1);
   std::uint32_t offset =
-      (queue.rptr[4] & ((1 << 16) - 1)) / sizeof(std::uint32_t);
+      (ring.rptr[4] & ((1 << 16) - 1)) / sizeof(std::uint32_t);
   auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
   std::memcpy(constantMemory + offset,
-              RemoteMemory{queue.vmId}.getPointer(address),
+              RemoteMemory{ring.vmId}.getPointer(address),
               numDw * sizeof(std::uint32_t));
 
   return true;
 }
 
-bool GraphicsPipe::writeConstRam(Queue &queue) {
+bool GraphicsPipe::writeConstRam(Ring &ring) {
   std::uint32_t offset =
-      (queue.rptr[1] & ((1 << 16) - 1)) / sizeof(std::uint32_t);
-  std::uint32_t data = queue.rptr[2];
+      (ring.rptr[1] & ((1 << 16) - 1)) / sizeof(std::uint32_t);
+  std::uint32_t data = ring.rptr[2];
   std::memcpy(constantMemory + offset, &data, sizeof(std::uint32_t));
   return true;
 }
 
-bool GraphicsPipe::dumpConstRam(Queue &queue) {
+bool GraphicsPipe::dumpConstRam(Ring &ring) {
   std::uint32_t offset =
-      (queue.rptr[1] & ((1 << 16) - 1)) / sizeof(std::uint32_t);
-  std::uint32_t numDw = queue.rptr[2] & ((1 << 15) - 1);
-  std::uint32_t addressLo = queue.rptr[3];
-  std::uint32_t addressHi = queue.rptr[4];
+      (ring.rptr[1] & ((1 << 16) - 1)) / sizeof(std::uint32_t);
+  std::uint32_t numDw = ring.rptr[2] & ((1 << 15) - 1);
+  std::uint32_t addressLo = ring.rptr[3];
+  std::uint32_t addressHi = ring.rptr[4];
   auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
-  std::memcpy(RemoteMemory{queue.vmId}.getPointer(address),
+  std::memcpy(RemoteMemory{ring.vmId}.getPointer(address),
               constantMemory + offset, numDw * sizeof(std::uint32_t));
 
   return true;
 }
 
-bool GraphicsPipe::unknownPacket(Queue &queue) {
-  auto op = rx::getBits(queue.rptr[0], 15, 8);
+bool GraphicsPipe::unknownPacket(Ring &ring) {
+  auto op = rx::getBits(ring.rptr[0], 15, 8);
 
   rx::die("unimplemented gfx pm4 packet: %s, queue %u\n",
-          gnm::pm4OpcodeToString(op), queue.indirectLevel);
+          gnm::pm4OpcodeToString(op), ring.indirectLevel);
 }
 
-bool GraphicsPipe::switchBuffer(Queue &queue) {
+bool GraphicsPipe::switchBuffer(Ring &ring) {
   // FIXME: implement
   return true;
 }
 
-bool GraphicsPipe::mapProcess(Queue &queue) {
-  auto pid = queue.rptr[1];
-  int vmId = queue.rptr[2];
+bool GraphicsPipe::mapProcess(Ring &ring) {
+  auto pid = ring.rptr[1];
+  int vmId = ring.rptr[2];
 
   device->mapProcess(pid, vmId);
   return true;
 }
 
-bool GraphicsPipe::mapQueues(Queue &queue) {
+bool GraphicsPipe::mapQueues(Ring &ring) {
   // FIXME: implement
   return true;
 }
 
-bool GraphicsPipe::unmapQueues(Queue &queue) {
+bool GraphicsPipe::unmapQueues(Ring &ring) {
   // FIXME: implement
   return true;
 }
 
-bool GraphicsPipe::mapMemory(Queue &queue) {
-  auto pid = queue.rptr[1];
-  auto addressLo = queue.rptr[2];
-  auto addressHi = queue.rptr[3];
-  auto sizeLo = queue.rptr[4];
-  auto sizeHi = queue.rptr[5];
-  auto memoryType = queue.rptr[6];
-  auto dmemIndex = queue.rptr[7];
-  auto prot = queue.rptr[8];
-  auto offsetLo = queue.rptr[9];
-  auto offsetHi = queue.rptr[10];
+bool GraphicsPipe::mapMemory(Ring &ring) {
+  auto pid = ring.rptr[1];
+  auto addressLo = ring.rptr[2];
+  auto addressHi = ring.rptr[3];
+  auto sizeLo = ring.rptr[4];
+  auto sizeHi = ring.rptr[5];
+  auto memoryType = ring.rptr[6];
+  auto dmemIndex = ring.rptr[7];
+  auto prot = ring.rptr[8];
+  auto offsetLo = ring.rptr[9];
+  auto offsetHi = ring.rptr[10];
 
   auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
   auto size = sizeLo | (static_cast<std::uint64_t>(sizeHi) << 32);
@@ -1308,42 +1566,42 @@ bool GraphicsPipe::mapMemory(Queue &queue) {
   device->mapMemory(pid, address, size, memoryType, dmemIndex, prot, offset);
   return true;
 }
-bool GraphicsPipe::unmapMemory(Queue &queue) {
-  auto pid = queue.rptr[1];
-  auto addressLo = queue.rptr[2];
-  auto addressHi = queue.rptr[3];
-  auto sizeLo = queue.rptr[4];
-  auto sizeHi = queue.rptr[5];
+bool GraphicsPipe::unmapMemory(Ring &ring) {
+  auto pid = ring.rptr[1];
+  auto addressLo = ring.rptr[2];
+  auto addressHi = ring.rptr[3];
+  auto sizeLo = ring.rptr[4];
+  auto sizeHi = ring.rptr[5];
 
   auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
   auto size = sizeLo | (static_cast<std::uint64_t>(sizeHi) << 32);
   device->unmapMemory(pid, address, size);
   return true;
 }
-bool GraphicsPipe::protectMemory(Queue &queue) {
-  auto pid = queue.rptr[1];
-  auto addressLo = queue.rptr[2];
-  auto addressHi = queue.rptr[3];
-  auto sizeLo = queue.rptr[4];
-  auto sizeHi = queue.rptr[5];
-  auto prot = queue.rptr[6];
+bool GraphicsPipe::protectMemory(Ring &ring) {
+  auto pid = ring.rptr[1];
+  auto addressLo = ring.rptr[2];
+  auto addressHi = ring.rptr[3];
+  auto sizeLo = ring.rptr[4];
+  auto sizeHi = ring.rptr[5];
+  auto prot = ring.rptr[6];
   auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
   auto size = sizeLo | (static_cast<std::uint64_t>(sizeHi) << 32);
 
   device->protectMemory(pid, address, size, prot);
   return true;
 }
-bool GraphicsPipe::unmapProcess(Queue &queue) {
-  auto pid = queue.rptr[1];
+bool GraphicsPipe::unmapProcess(Ring &ring) {
+  auto pid = ring.rptr[1];
   device->unmapProcess(pid);
   return true;
 }
 
-bool GraphicsPipe::flip(Queue &queue) {
-  auto buffer = queue.rptr[1];
-  auto dataLo = queue.rptr[2];
-  auto dataHi = queue.rptr[3];
-  auto pid = queue.rptr[4];
+bool GraphicsPipe::flip(Ring &ring) {
+  auto buffer = ring.rptr[1];
+  auto dataLo = ring.rptr[2];
+  auto dataHi = ring.rptr[3];
+  auto pid = ring.rptr[4];
   auto data = dataLo | (static_cast<std::uint64_t>(dataHi) << 32);
 
   device->flip(pid, buffer, data);
diff --git a/rpcsx/gpu/Pipe.hpp b/rpcsx/gpu/Pipe.hpp
index 1dc9e449f..18ab4ecdb 100644
--- a/rpcsx/gpu/Pipe.hpp
+++ b/rpcsx/gpu/Pipe.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include "Registers.hpp"
 #include "Scheduler.hpp"
+#include "orbis/utils/SharedMutex.hpp"
 
 #include <cstdint>
 #include <vulkan/vulkan_core.h>
@@ -8,7 +9,7 @@
 namespace amdgpu {
 struct Device;
 
-struct Queue {
+struct Ring {
   int vmId = -1;
   int indirectLevel = -1;
   std::uint32_t *doorbell{};
@@ -16,11 +17,12 @@ struct Queue {
   std::uint64_t size{};
   std::uint32_t *rptr{};
   std::uint32_t *wptr{};
+  std::uint32_t *rptrReportLocation{};
 
-  static Queue createFromRange(int vmId, std::uint32_t *base,
-                               std::uint64_t size, int indirectLevel = 0,
-                               std::uint32_t *doorbell = nullptr) {
-    Queue result;
+  static Ring createFromRange(int vmId, std::uint32_t *base, std::uint64_t size,
+                              int indirectLevel = 0,
+                              std::uint32_t *doorbell = nullptr) {
+    Ring result;
     result.vmId = vmId;
     result.indirectLevel = indirectLevel;
     result.doorbell = doorbell;
@@ -36,20 +38,35 @@ struct ComputePipe {
   Device *device;
   Scheduler scheduler;
 
-  using CommandHandler = bool (ComputePipe::*)(Queue &);
+  using CommandHandler = bool (ComputePipe::*)(Ring &);
   CommandHandler commandHandlers[255];
-  Queue queues[8];
-  Registers::ComputeConfig computeConfig;
+  orbis::shared_mutex queueMtx[8];
+  int index;
+  Ring queues[2][8];
+  std::uint64_t drawIndexIndirPatchBase = 0;
 
   ComputePipe(int index);
 
   bool processAllRings();
-  void processRing(Queue &queue);
-  void mapQueue(int queueId, Queue queue);
+  bool processRing(Ring &ring);
+  void mapQueue(int queueId, Ring ring, std::unique_lock<orbis::shared_mutex> &lock);
+  void waitForIdle(int queueId, std::unique_lock<orbis::shared_mutex> &lock);
+  void submit(int queueId, std::uint32_t offset);
 
-  bool setShReg(Queue &queue);
-  bool unknownPacket(Queue &queue);
-  bool handleNop(Queue &queue);
+  std::unique_lock<orbis::shared_mutex> lockQueue(int queueId) {
+    return std::unique_lock<orbis::shared_mutex>(queueMtx[queueId]);
+  }
+
+  bool setShReg(Ring &ring);
+  bool dispatchDirect(Ring &ring);
+  bool dispatchIndirect(Ring &ring);
+  bool releaseMem(Ring &ring);
+  bool waitRegMem(Ring &ring);
+  bool writeData(Ring &ring);
+  bool unknownPacket(Ring &ring);
+  bool handleNop(Ring &ring);
+
+  std::uint32_t *getMmRegister(Ring &ring, std::uint32_t dwAddress);
 };
 
 struct GraphicsPipe {
@@ -71,75 +88,75 @@ struct GraphicsPipe {
   Registers::Context context;
   Registers::UConfig uConfig;
 
-  Queue deQueues[3];
-  Queue ceQueue;
+  Ring deQueues[3];
+  Ring ceQueue;
 
-  using CommandHandler = bool (GraphicsPipe::*)(Queue &);
+  using CommandHandler = bool (GraphicsPipe::*)(Ring &);
   CommandHandler commandHandlers[4][255];
 
   GraphicsPipe(int index);
 
-  void setCeQueue(Queue queue);
-  void setDeQueue(Queue queue, int ring);
+  void setCeQueue(Ring ring);
+  void setDeQueue(Ring ring, int indirectLevel);
 
   bool processAllRings();
-  void processRing(Queue &queue);
+  void processRing(Ring &ring);
 
-  bool drawPreamble(Queue &queue);
-  bool indexBufferSize(Queue &queue);
-  bool handleNop(Queue &queue);
-  bool contextControl(Queue &queue);
-  bool acquireMem(Queue &queue);
-  bool releaseMem(Queue &queue);
-  bool dispatchDirect(Queue &queue);
-  bool dispatchIndirect(Queue &queue);
-  bool writeData(Queue &queue);
-  bool memSemaphore(Queue &queue);
-  bool waitRegMem(Queue &queue);
-  bool indirectBufferConst(Queue &queue);
-  bool indirectBuffer(Queue &queue);
-  bool condWrite(Queue &queue);
-  bool eventWrite(Queue &queue);
-  bool eventWriteEop(Queue &queue);
-  bool eventWriteEos(Queue &queue);
-  bool dmaData(Queue &queue);
-  bool setBase(Queue &queue);
-  bool clearState(Queue &queue);
-  bool setPredication(Queue &queue);
-  bool drawIndirect(Queue &queue);
-  bool drawIndexIndirect(Queue &queue);
-  bool indexBase(Queue &queue);
-  bool drawIndex2(Queue &queue);
-  bool indexType(Queue &queue);
-  bool drawIndexAuto(Queue &queue);
-  bool numInstances(Queue &queue);
-  bool drawIndexMultiAuto(Queue &queue);
-  bool drawIndexOffset2(Queue &queue);
-  bool pfpSyncMe(Queue &queue);
-  bool setCeDeCounters(Queue &queue);
-  bool waitOnCeCounter(Queue &queue);
-  bool waitOnDeCounterDiff(Queue &queue);
-  bool incrementCeCounter(Queue &queue);
-  bool incrementDeCounter(Queue &queue);
-  bool loadConstRam(Queue &queue);
-  bool writeConstRam(Queue &queue);
-  bool dumpConstRam(Queue &queue);
-  bool setConfigReg(Queue &queue);
-  bool setShReg(Queue &queue);
-  bool setUConfigReg(Queue &queue);
-  bool setContextReg(Queue &queue);
+  bool drawPreamble(Ring &ring);
+  bool indexBufferSize(Ring &ring);
+  bool handleNop(Ring &ring);
+  bool contextControl(Ring &ring);
+  bool acquireMem(Ring &ring);
+  bool releaseMem(Ring &ring);
+  bool dispatchDirect(Ring &ring);
+  bool dispatchIndirect(Ring &ring);
+  bool writeData(Ring &ring);
+  bool memSemaphore(Ring &ring);
+  bool waitRegMem(Ring &ring);
+  bool indirectBufferConst(Ring &ring);
+  bool indirectBuffer(Ring &ring);
+  bool condWrite(Ring &ring);
+  bool eventWrite(Ring &ring);
+  bool eventWriteEop(Ring &ring);
+  bool eventWriteEos(Ring &ring);
+  bool dmaData(Ring &ring);
+  bool setBase(Ring &ring);
+  bool clearState(Ring &ring);
+  bool setPredication(Ring &ring);
+  bool drawIndirect(Ring &ring);
+  bool drawIndexIndirect(Ring &ring);
+  bool indexBase(Ring &ring);
+  bool drawIndex2(Ring &ring);
+  bool indexType(Ring &ring);
+  bool drawIndexAuto(Ring &ring);
+  bool numInstances(Ring &ring);
+  bool drawIndexMultiAuto(Ring &ring);
+  bool drawIndexOffset2(Ring &ring);
+  bool pfpSyncMe(Ring &ring);
+  bool setCeDeCounters(Ring &ring);
+  bool waitOnCeCounter(Ring &ring);
+  bool waitOnDeCounterDiff(Ring &ring);
+  bool incrementCeCounter(Ring &ring);
+  bool incrementDeCounter(Ring &ring);
+  bool loadConstRam(Ring &ring);
+  bool writeConstRam(Ring &ring);
+  bool dumpConstRam(Ring &ring);
+  bool setConfigReg(Ring &ring);
+  bool setShReg(Ring &ring);
+  bool setUConfigReg(Ring &ring);
+  bool setContextReg(Ring &ring);
 
-  bool unknownPacket(Queue &queue);
+  bool unknownPacket(Ring &ring);
 
-  bool switchBuffer(Queue &queue);
-  bool mapProcess(Queue &queue);
-  bool mapQueues(Queue &queue);
-  bool unmapQueues(Queue &queue);
-  bool mapMemory(Queue &queue);
-  bool unmapMemory(Queue &queue);
-  bool protectMemory(Queue &queue);
-  bool unmapProcess(Queue &queue);
-  bool flip(Queue &queue);
+  bool switchBuffer(Ring &ring);
+  bool mapProcess(Ring &ring);
+  bool mapQueues(Ring &ring);
+  bool unmapQueues(Ring &ring);
+  bool mapMemory(Ring &ring);
+  bool unmapMemory(Ring &ring);
+  bool protectMemory(Ring &ring);
+  bool unmapProcess(Ring &ring);
+  bool flip(Ring &ring);
 
   std::uint32_t *getMmRegister(std::uint32_t dwAddress);
 };
diff --git a/rpcsx/gpu/Registers.hpp b/rpcsx/gpu/Registers.hpp
index 15e96d875..33461d49d 100644
--- a/rpcsx/gpu/Registers.hpp
+++ b/rpcsx/gpu/Registers.hpp
@@ -399,10 +399,10 @@ struct DbDepthSize {
     std::uint32_t raw;
   };
 
-  std::uint32_t getPitch() const {
+  [[nodiscard]] std::uint32_t getPitch() const {
     return (pitchTileMax + 1) * 8;
   }
-  std::uint32_t getHeight() const {
+  [[nodiscard]] std::uint32_t getHeight() const {
     return (heightTileMax + 1) * 8;
   }
 };
@@ -591,8 +591,12 @@ struct Registers {
         };
       };
 
-      std::uint8_t getVGprCount() const { return (vgprs + 1) * 4; }
-      std::uint8_t getSGprCount() const { return (sgprs + 1) * 8; }
+      [[nodiscard]] std::uint8_t getVGprCount() const {
+        return (vgprs + 1) * 4;
+      }
+      [[nodiscard]] std::uint8_t getSGprCount() const {
+        return (sgprs + 1) * 8;
+      }
     } rsrc1;
     struct {
       union {
@@ -613,7 +617,9 @@ struct Registers {
         };
       };
 
-      std::uint32_t getLdsDwordsCount() const { return ldsSize * 64; }
+      [[nodiscard]] std::uint32_t getLdsDwordsCount() const {
+        return ldsSize * 64;
+      }
     } rsrc2;
     std::uint32_t _pad3[1];
 
@@ -624,20 +630,25 @@ struct Registers {
           std::uint32_t wavesPerSh : 6;
           std::uint32_t : 6;
           std::uint32_t tgPerCu : 4;
-          std::uint32_t lockThreshold: 6;
+          std::uint32_t lockThreshold : 6;
           std::uint32_t simdDestCntl : 1;
         };
-
       };
-      std::uint32_t getWavesPerSh() const { return wavesPerSh << 4; }
+      [[nodiscard]] std::uint32_t getWavesPerSh() const {
+        return wavesPerSh << 4;
+      }
     } resourceLimits;
     std::uint32_t staticThreadMgmtSe0;
     std::uint32_t staticThreadMgmtSe1;
     std::uint32_t tmpRingSize;
-    std::uint32_t _pad4[39];
+    std::uint32_t _unk0[5];
+    std::uint32_t state;
+    std::uint32_t _unk1[33];
     std::array<std::uint32_t, 16> userData;
   };
 
+  static_assert(sizeof(ComputeConfig) == 320);
+
   struct ShaderConfig {
     static constexpr auto kMmioOffset = 0x2c00;
 
diff --git a/rpcsx/iodev/dce.cpp b/rpcsx/iodev/dce.cpp
index f620c0e29..3e91a463f 100644
--- a/rpcsx/iodev/dce.cpp
+++ b/rpcsx/iodev/dce.cpp
@@ -1,3 +1,4 @@
+#include "dce.hpp"
 #include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "iodev/dmem.hpp"
@@ -8,7 +9,6 @@
 #include "orbis/thread/Process.hpp"
 #include "orbis/thread/Thread.hpp"
 #include "orbis/utils/Logs.hpp"
-#include "orbis/utils/SharedMutex.hpp"
 #include "rx/mem.hpp"
 #include "rx/watchdog.hpp"
 #include "vm.hpp"
@@ -192,32 +192,21 @@ static void runBridge(int vmId) {
   }}.detach();
 }
 
-static constexpr auto kVmIdCount = 6;
 struct DceFile : public orbis::File {};
 
-struct DceDevice : IoDevice {
-  orbis::shared_mutex mtx;
-  std::uint32_t freeVmIds = (1 << (kVmIdCount + 1)) - 1;
-  orbis::uint64_t dmemOffset = ~static_cast<std::uint64_t>(0);
+int DceDevice::allocateVmId() {
+  int id = std::countr_zero(freeVmIds);
 
-  orbis::ErrorCode open(orbis::Ref<orbis::File> *file, const char *path,
-                        std::uint32_t flags, std::uint32_t mode,
-                        orbis::Thread *thread) override;
+  if (id >= kVmIdCount) {
+    std::println(stderr, "out of vm slots");
+    std::abort();
+  }
 
-  int allocateVmId() {
-    int id = std::countr_zero(freeVmIds);
+  freeVmIds &= ~(1 << id);
+  return id;
+}
 
-    if (id >= kVmIdCount) {
-      std::println(stderr, "out of vm slots");
-      std::abort();
-    }
-
-    freeVmIds &= ~(1 << id);
-    return id;
-  };
-
-  void deallocateVmId(int vmId) { freeVmIds |= (1 << vmId); };
-};
+void DceDevice::deallocateVmId(int vmId) { freeVmIds |= (1 << vmId); }
 
 static void initDceMemory(DceDevice *device) {
   if (device->dmemOffset + 1) {
@@ -466,21 +455,24 @@ orbis::ErrorCode DceDevice::open(orbis::Ref<orbis::File> *file,
   newFile->device = this;
   newFile->ops = &ops;
   *file = newFile;
+  initializeProcess(thread->tproc);
+  return {};
+}
 
-  if (thread->tproc->vmId == -1) {
+void DceDevice::initializeProcess(orbis::Process *process) {
+  if (process->vmId == -1) {
     createGpu();
     auto vmId = allocateVmId();
 
     std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
     {
       auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
-      gpu.submitMapProcess(thread->tproc->gfxRing, thread->tproc->pid, vmId);
-      thread->tproc->vmId = vmId;
+      gpu.submitMapProcess(process->gfxRing, process->pid, vmId);
+      process->vmId = vmId;
     }
 
     runBridge(vmId);
   }
-  return {};
 }
 
 IoDevice *createDceCharacterDevice() { return orbis::knew<DceDevice>(); }
diff --git a/rpcsx/iodev/dce.hpp b/rpcsx/iodev/dce.hpp
new file mode 100644
index 000000000..d3cfb81a9
--- /dev/null
+++ b/rpcsx/iodev/dce.hpp
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "io-device.hpp"
+#include "orbis-config.hpp"
+#include "orbis/error/ErrorCode.hpp"
+#include "orbis/file.hpp"
+#include "orbis/thread/Process.hpp"
+#include "orbis/utils/Rc.hpp"
+#include "orbis/utils/SharedMutex.hpp"
+#include <cstdint>
+
+// Number of VM ids handed out by DceDevice::allocateVmId(); valid ids are
+// 0 .. kVmIdCount - 1.
+inline constexpr int kVmIdCount = 6;
+
+// Device node registered as "dce". It also tracks which VM ids are free and
+// assigns one to each process that needs GPU access.
+struct DceDevice : IoDevice {
+  orbis::shared_mutex mtx;
+  // Bit N set means VM id N is free; only the low kVmIdCount bits are used.
+  std::uint32_t freeVmIds = (1u << kVmIdCount) - 1;
+  // Starts with all bits set; initDceMemory() tests `dmemOffset + 1`, so this
+  // appears to be the "not assigned yet" sentinel.
+  orbis::uint64_t dmemOffset = ~static_cast<std::uint64_t>(0);
+
+  orbis::ErrorCode open(orbis::Ref<orbis::File> *file, const char *path,
+                        std::uint32_t flags, std::uint32_t mode,
+                        orbis::Thread *thread) override;
+
+  // Claims the lowest free VM id; aborts the process when none is left.
+  int allocateVmId();
+  // Marks vmId as free again.
+  void deallocateVmId(int vmId);
+  // If the process has no VM id yet (vmId == -1), allocates one, maps the
+  // process on the GPU and starts its bridge; otherwise does nothing.
+  void initializeProcess(orbis::Process *process);
+};
diff --git a/rpcsx/iodev/gc.cpp b/rpcsx/iodev/gc.cpp
index e2d5cc476..d898fee40 100644
--- a/rpcsx/iodev/gc.cpp
+++ b/rpcsx/iodev/gc.cpp
@@ -1,5 +1,6 @@
 #include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
+#include "iodev/dce.hpp"
 #include "iodev/dmem.hpp"
 #include "orbis/KernelAllocator.hpp"
 #include "orbis/KernelContext.hpp"
@@ -87,11 +88,11 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
     if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
       for (unsigned i = 0; i < args->count; ++i) {
         gpu.submitGfxCommand(gcFile->gfxPipe,
-                              orbis::g_currentThread->tproc->vmId,
-                              {args->cmds + i * 4, 4});
+                             orbis::g_currentThread->tproc->vmId,
+                             {args->cmds + i * 4, 4});
       }
     } else {
-      return orbis::ErrorCode::INVAL;
+      return orbis::ErrorCode::BUSY;
     }
     break;
   }
@@ -106,7 +107,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
     if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
       gpu.submitSwitchBuffer(orbis::g_currentThread->tproc->vmId);
     } else {
-      return orbis::ErrorCode::INVAL;
+      return orbis::ErrorCode::BUSY;
     }
 
     // ORBIS_LOG_ERROR("gc ioctl 0xc0088101", args->arg0, args->arg1);
@@ -127,11 +128,11 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
     if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
       for (unsigned i = 0; i < args->count; ++i) {
         gpu.submitGfxCommand(gcFile->gfxPipe,
-                              orbis::g_currentThread->tproc->vmId,
-                              {args->cmds + i * 4, 4});
+                             orbis::g_currentThread->tproc->vmId,
+                             {args->cmds + i * 4, 4});
       }
     } else {
-      return orbis::ErrorCode::INVAL;
+      return orbis::ErrorCode::BUSY;
     }
 
     // orbis::bridge.sendDoFlip();
@@ -142,7 +143,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
     if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
       gpu.waitForIdle();
     } else {
-      return orbis::ErrorCode::INVAL;
+      return orbis::ErrorCode::BUSY;
     }
     break;
   }
@@ -193,64 +194,59 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
 
   case 0xc030810d: { // map compute queue
     struct Args {
-      std::uint32_t pipeHi;
-      std::uint32_t pipeLo;
-      std::uint32_t queueId;
-      std::uint32_t offset;
-      std::uint64_t ringBaseAddress;
-      std::uint64_t readPtrAddress;
-      std::uint64_t dingDongPtr;
-      std::uint32_t lenLog2;
+      orbis::uint32_t meId;
+      orbis::uint32_t pipeId;
+      orbis::uint32_t queueId;
+      orbis::uint32_t vqueueId;
+      orbis::uintptr_t ringBaseAddress;
+      orbis::uintptr_t readPtrAddress;
+      orbis::uintptr_t doorbell;
+      orbis::uint32_t ringSize;
     };
 
     auto args = reinterpret_cast<Args *>(argp);
 
-    ORBIS_LOG_ERROR("gc ioctl map compute queue", args->pipeHi, args->pipeLo,
-                    args->queueId, args->offset, args->ringBaseAddress,
-                    args->readPtrAddress, args->dingDongPtr, args->lenLog2);
+    ORBIS_LOG_ERROR("gc ioctl map compute queue", args->meId, args->pipeId,
+                    args->queueId, args->vqueueId, args->ringBaseAddress,
+                    args->readPtrAddress, args->doorbell, args->ringSize);
+
+    // ringSize is used below as a shift count for a 64-bit value; a count of
+    // 64 or more is undefined behaviour, so reject it up front.
+    if (args->ringSize >= 64) {
+      return orbis::ErrorCode::INVAL;
+    }
 
-    rx::die("gc ioctl map compute queue");
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      gpu.mapComputeQueue(thread->tproc->vmId, args->meId, args->pipeId,
+                          args->queueId, args->vqueueId, args->ringBaseAddress,
+                          args->readPtrAddress, args->doorbell,
+                          static_cast<std::uint64_t>(1) << args->ringSize);
 
-    // auto id = ((args->pipeHi * 4) + args->pipeLo) * 8 + args->queueId;
-    // device->computeQueues[id] = {
-    //     .ringBaseAddress = args->ringBaseAddress,
-    //     .readPtrAddress = args->readPtrAddress,
-    //     .dingDongPtr = args->dingDongPtr,
-    //     .len = static_cast<std::uint64_t>(1) << args->lenLog2,
-    // };
-    // args->pipeHi = 0x769c766;
-    // args->pipeLo = 0x72e8e3c1;
-    // args->queueId = -0x248d50d8;
-    // args->offset = 0xd245ed58;
-
-    // ((std::uint64_t *)args->dingDongPtr)[0xf0 / sizeof(std::uint64_t)] = 1;
+    } else {
+      return orbis::ErrorCode::BUSY;
+    }
     break;
   }
 
   case 0xc010811c: {
     // ding dong for workload
     struct Args {
-      std::uint32_t pipeHi;
-      std::uint32_t pipeLo;
+      std::uint32_t meId;
+      std::uint32_t pipeId;
       std::uint32_t queueId;
       std::uint32_t nextStartOffsetInDw;
     };
 
     auto args = reinterpret_cast<Args *>(argp);
-    ORBIS_LOG_ERROR("gc ioctl ding dong for workload", args->pipeHi,
-                    args->pipeLo, args->queueId, args->nextStartOffsetInDw);
-    rx::die("gc ioctl ding dong for workload");
+    ORBIS_LOG_ERROR("gc ioctl ding dong for workload", args->meId, args->pipeId,
+                    args->queueId, args->nextStartOffsetInDw);
 
-    // auto id = ((args->pipeHi * 4) + args->pipeLo) * 8 + args->queueId;
-
-    // auto queue = device->computeQueues.at(id);
-    // auto address = (queue.ringBaseAddress + queue.offset);
-    // auto endOffset = static_cast<std::uint64_t>(args->nextStartOffsetInDw) <<
-    // 2; auto size = endOffset - queue.offset;
-
-    // rx::bridge.sendCommandBuffer(thread->tproc->pid, id, address, size);
-
-    // queue.offset = endOffset;
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      gpu.submitComputeQueue(args->meId, args->pipeId, args->queueId,
+                             args->nextStartOffsetInDw);
+    } else {
+      return orbis::ErrorCode::BUSY;
+    }
     break;
   }
 
@@ -336,6 +326,15 @@ orbis::ErrorCode GcDevice::open(orbis::Ref<orbis::File> *file, const char *path,
 }
 
 void GcDevice::addClient(orbis::Process *process) {
+  // Make sure the process has a VM id even if it never opened the dce device.
+  // dceDevice is only assigned by platform init code, so guard against it
+  // being absent instead of dereferencing a null pointer.
+  if (auto dce = orbis::g_context.dceDevice.rawStaticCast<DceDevice>()) {
+    dce->initializeProcess(process);
+  } else {
+    ORBIS_LOG_ERROR("gc: dce device is not registered", process->pid);
+  }
+
   std::lock_guard lock(mtx);
   auto &client = clients[process->pid];
   ++client;
diff --git a/rpcsx/main.cpp b/rpcsx/main.cpp
index 6a980f5e1..7a83ec519 100644
--- a/rpcsx/main.cpp
+++ b/rpcsx/main.cpp
@@ -321,6 +321,9 @@ static void ps4InitDev() {
   auto dmem1 = createDmemCharacterDevice(1);
   orbis::g_context.dmemDevice = dmem1;
 
+  auto dce = createDceCharacterDevice();
+  orbis::g_context.dceDevice = dce;
+
   auto ttyFd = ::open("tty.txt", O_CREAT | O_TRUNC | O_WRONLY, 0666);
   auto consoleDev = createConsoleCharacterDevice(STDIN_FILENO, ttyFd);
   auto mbus = static_cast<MBusDevice *>(createMBusCharacterDevice());
@@ -357,7 +360,7 @@ static void ps4InitDev() {
   vfs::addDevice("zero", createZeroCharacterDevice());
   vfs::addDevice("null", createNullCharacterDevice());
   vfs::addDevice("dipsw", createDipswCharacterDevice());
-  vfs::addDevice("dce", createDceCharacterDevice());
+  vfs::addDevice("dce", dce);
   vfs::addDevice("hmd_cmd", createHmdCmdCharacterDevice());
   vfs::addDevice("hmd_snsr", createHmdSnsrCharacterDevice());
   vfs::addDevice("hmd_3da", createHmd3daCharacterDevice());