From 988212a91e118c46618c60503a3f77a8551e81c6 Mon Sep 17 00:00:00 2001
From: DH <dh.rpcs3@gmail.com>
Date: Sat, 12 Oct 2024 12:36:39 +0300
Subject: [PATCH] gpu: expose public API, fix release build

---
 rpcsx/core/src/watchdog.cpp |  10 +--
 rpcsx/gpu/CMakeLists.txt    |   1 +
 rpcsx/gpu/Device.cpp        |  84 -------------------------
 rpcsx/gpu/Device.hpp        |  90 +--------------------------
 rpcsx/gpu/DeviceContext.hpp |  79 +++++++++++++++++++++++
 rpcsx/gpu/DeviceCtl.cpp     | 121 ++++++++++++++++++++++++++++++++++++
 rpcsx/gpu/DeviceCtl.hpp     |  48 ++++++++++++++
 rpcsx/iodev/dce.cpp         |  69 ++++++++++----------
 rpcsx/iodev/dmem.cpp        |   6 +-
 rpcsx/iodev/gc.cpp          |  20 +++---
 rpcsx/iodev/hid.cpp         |  10 +--
 rpcsx/main.cpp              |  13 ++--
 rpcsx/vm.cpp                |  18 +++---
 13 files changed, 326 insertions(+), 243 deletions(-)
 create mode 100644 rpcsx/gpu/DeviceContext.hpp
 create mode 100644 rpcsx/gpu/DeviceCtl.cpp
 create mode 100644 rpcsx/gpu/DeviceCtl.hpp
diff --git a/rpcsx/core/src/watchdog.cpp b/rpcsx/core/src/watchdog.cpp
index a460e988e..85b9dd5d3 100644
--- a/rpcsx/core/src/watchdog.cpp
+++ b/rpcsx/core/src/watchdog.cpp
@@ -1,5 +1,5 @@
 #include "rx/watchdog.hpp"
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "orbis/KernelContext.hpp"
 #include <chrono>
 #include <csignal>
@@ -38,7 +38,7 @@ static void runGPU() {
     return;
   }
 
-  amdgpu::Device *gpu;
+  amdgpu::DeviceCtl gpu;
   {
     pthread_setname_np(pthread_self(), "rpcsx-gpu");
     std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
@@ -52,11 +52,11 @@ static void runGPU() {
     dup2(logFd, 2);
     ::close(logFd);
 
-    gpu = orbis::knew<amdgpu::Device>();
-    orbis::g_context.gpuDevice = gpu;
+    gpu = amdgpu::DeviceCtl::createDevice();
+    orbis::g_context.gpuDevice = gpu.getOpaque();
   }
 
-  gpu->start();
+  gpu.start();
   std::exit(0);
 }
 
diff --git a/rpcsx/gpu/CMakeLists.txt b/rpcsx/gpu/CMakeLists.txt
index 90ffc5dba..21edf646f 100644
--- a/rpcsx/gpu/CMakeLists.txt
+++ b/rpcsx/gpu/CMakeLists.txt
@@ -12,6 +12,7 @@ add_library(rpcsx-gpu
 STATIC
     Cache.cpp
     Device.cpp
+    DeviceCtl.cpp
     FlipPipeline.cpp
     Pipe.cpp
     Registers.cpp
diff --git a/rpcsx/gpu/Device.cpp b/rpcsx/gpu/Device.cpp
index 215b463d7..7fed4a4db 100644
--- a/rpcsx/gpu/Device.cpp
+++ b/rpcsx/gpu/Device.cpp
@@ -476,71 +476,6 @@ void Device::submitGfxCommand(int gfxPipe,
   submitCommand(ring, command);
 }
 
-void Device::submitGfxCommand(int gfxPipe, int vmId,
-                              std::span<const std::uint32_t> command) {
-  auto op = rx::getBits(command[0], 15, 8);
-  auto type = rx::getBits(command[0], 31, 30);
-  auto len = rx::getBits(command[0], 29, 16) + 2;
-
-  if ((op != gnm::IT_INDIRECT_BUFFER && op != gnm::IT_INDIRECT_BUFFER_CNST) ||
-      type != 3 || len != 4 || command.size() != len) {
-    std::println(stderr, "unexpected gfx command for main ring: {}, {}, {}", op,
-                 type, len);
-    rx::die("");
-  }
-
-  std::vector<std::uint32_t> patchedCommand{command.data(),
-                                            command.data() + command.size()};
-  patchedCommand[3] &= ~(~0 << 24);
-  patchedCommand[3] |= vmId << 24;
-
-  submitGfxCommand(gfxPipe, patchedCommand);
-}
-
-void Device::submitSwitchBuffer(int gfxPipe) {
-  submitGfxCommand(gfxPipe, createPm4Packet(gnm::IT_SWITCH_BUFFER, 0));
-}
-void Device::submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex,
-                        std::uint64_t flipArg) {
-  submitGfxCommand(gfxPipe,
-                   createPm4Packet(IT_FLIP, bufferIndex, flipArg & 0xffff'ffff,
-                                   flipArg >> 32, pid));
-}
-
-void Device::submitMapMemory(int gfxPipe, std::uint32_t pid,
-                             std::uint64_t address, std::uint64_t size,
-                             int memoryType, int dmemIndex, int prot,
-                             std::int64_t offset) {
-  submitGfxCommand(gfxPipe,
-                   createPm4Packet(IT_MAP_MEMORY, pid, address & 0xffff'ffff,
-                                   address >> 32, size & 0xffff'ffff,
-                                   size >> 32, memoryType, dmemIndex, prot,
-                                   offset & 0xffff'ffff, offset >> 32));
-}
-void Device::submitUnmapMemory(int gfxPipe, std::uint32_t pid,
-                               std::uint64_t address, std::uint64_t size) {
-  submitGfxCommand(
-      gfxPipe, createPm4Packet(IT_UNMAP_MEMORY, pid, address & 0xffff'ffff,
-                               address >> 32, size & 0xffff'ffff, size >> 32));
-}
-
-void Device::submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId) {
-  submitGfxCommand(gfxPipe, createPm4Packet(gnm::IT_MAP_PROCESS, pid, vmId));
-}
-
-void Device::submitUnmapProcess(int gfxPipe, std::uint32_t pid) {
-  submitGfxCommand(gfxPipe, createPm4Packet(IT_UNMAP_PROCESS, pid));
-}
-
-void Device::submitProtectMemory(int gfxPipe, std::uint32_t pid,
-                                 std::uint64_t address, std::uint64_t size,
-                                 int prot) {
-  submitGfxCommand(gfxPipe,
-                   createPm4Packet(IT_PROTECT_MEMORY, pid,
-                                   address & 0xffff'ffff, address >> 32,
-                                   size & 0xffff'ffff, size >> 32, prot));
-}
-
 void Device::mapProcess(std::uint32_t pid, int vmId) {
   auto &process = processInfo[pid];
   process.vmId = vmId;
@@ -987,25 +922,6 @@ void Device::unmapMemory(std::uint32_t pid, std::uint64_t address,
   protectMemory(pid, address, size, 0);
 }
 
-void Device::registerBuffer(std::uint32_t pid, Buffer buffer) {
-  auto &process = processInfo[pid];
-
-  if (buffer.attrId >= 10 || buffer.index >= 10) {
-    rx::die("out of buffers %u, %u", buffer.attrId, buffer.index);
-  }
-
-  process.buffers[buffer.index] = buffer;
-}
-
-void Device::registerBufferAttribute(std::uint32_t pid, BufferAttribute attr) {
-  auto &process = processInfo[pid];
-  if (attr.attrId >= 10) {
-    rx::die("out of buffer attributes %u", attr.attrId);
-  }
-
-  process.bufferAttributes[attr.attrId] = attr;
-}
-
 static void notifyPageChanges(Device *device, int vmId, std::uint32_t firstPage,
                               std::uint32_t pageCount) {
   std::uint64_t command =
diff --git a/rpcsx/gpu/Device.hpp b/rpcsx/gpu/Device.hpp
index 8e62162ac..89417c6ae 100644
--- a/rpcsx/gpu/Device.hpp
+++ b/rpcsx/gpu/Device.hpp
@@ -1,5 +1,6 @@
 #pragma once
 #include "Cache.hpp"
+#include "DeviceContext.hpp"
 #include "FlipPipeline.hpp"
 #include "Pipe.hpp"
 #include "amdgpu/tiler_vulkan.hpp"
@@ -13,7 +14,6 @@
 #include <GLFW/glfw3.h>
 #include <array>
 #include <thread>
-#include <unordered_map>
 #include <vulkan/vulkan_core.h>
 
 namespace amdgpu {
@@ -44,25 +44,6 @@ struct VmMapSlot {
   auto operator<=>(const VmMapSlot &) const = default;
 };
 
-struct BufferAttribute {
-  std::uint8_t attrId;
-  std::uint8_t submit;
-  std::uint64_t canary;
-  std::uint32_t pixelFormat;
-  std::uint32_t tilingMode;
-  std::uint32_t pitch;
-  std::uint32_t width;
-  std::uint32_t height;
-};
-
-struct Buffer {
-  std::uint64_t canary;
-  std::uint32_t index;
-  std::uint32_t attrId;
-  std::uint64_t address;
-  std::uint64_t address2;
-};
-
 struct ProcessInfo {
   int vmId = -1;
   int vmFd = -1;
@@ -71,46 +52,6 @@ struct ProcessInfo {
   rx::MemoryTableWithPayload<VmMapSlot> vmTable;
 };
 
-enum {
-  kPageWriteWatch = 1 << 0,
-  kPageReadWriteLock = 1 << 1,
-  kPageInvalidated = 1 << 2,
-  kPageLazyLock = 1 << 3
-};
-
-struct PadState {
-  std::uint64_t timestamp;
-  std::uint32_t unk;
-  std::uint32_t buttons;
-  std::uint8_t leftStickX;
-  std::uint8_t leftStickY;
-  std::uint8_t rightStickX;
-  std::uint8_t rightStickY;
-  std::uint8_t l2;
-  std::uint8_t r2;
-};
-
-enum {
-  kPadBtnL3 = 1 << 1,
-  kPadBtnR3 = 1 << 2,
-  kPadBtnOptions = 1 << 3,
-  kPadBtnUp = 1 << 4,
-  kPadBtnRight = 1 << 5,
-  kPadBtnDown = 1 << 6,
-  kPadBtnLeft = 1 << 7,
-  kPadBtnL2 = 1 << 8,
-  kPadBtnR2 = 1 << 9,
-  kPadBtnL1 = 1 << 10,
-  kPadBtnR1 = 1 << 11,
-  kPadBtnTriangle = 1 << 12,
-  kPadBtnCircle = 1 << 13,
-  kPadBtnCross = 1 << 14,
-  kPadBtnSquare = 1 << 15,
-  kPadBtnPs = 1 << 16,
-  kPadBtnTouchPad = 1 << 20,
-  kPadBtnIntercepted = 1 << 31,
-};
-
 struct RemoteMemory {
   int vmId;
 
@@ -121,10 +62,9 @@ struct RemoteMemory {
   }
 };
 
-struct Device : orbis::RcBase {
+struct Device : orbis::RcBase, DeviceContext {
   static constexpr auto kComputePipeCount = 8;
   static constexpr auto kGfxPipeCount = 2;
-  static constexpr auto kMaxProcessCount = 6;
 
   shader::SemanticInfo gcnSemantic;
   shader::spv::Context shaderSemanticContext;
@@ -153,16 +93,6 @@ struct Device : orbis::RcBase {
       {this, 0}, {this, 1}, {this, 2}, {this, 3}, {this, 4}, {this, 5},
   };
 
-  PadState kbPadState;
-  std::atomic<std::uint64_t> cacheCommands[kMaxProcessCount][4];
-  std::atomic<std::uint32_t> gpuCacheCommand[kMaxProcessCount];
-  std::atomic<std::uint8_t> *cachePages[kMaxProcessCount];
-
-  volatile std::uint32_t flipBuffer[kMaxProcessCount];
-  volatile std::uint64_t flipArg[kMaxProcessCount];
-  volatile std::uint64_t flipCount[kMaxProcessCount];
-  volatile std::uint64_t bufferInUseAddress[kMaxProcessCount];
-
   std::uint32_t mainGfxRings[kGfxPipeCount][0x4000 / sizeof(std::uint32_t)];
 
   Device();
@@ -184,20 +114,6 @@ struct Device : orbis::RcBase {
 
   void submitCommand(Queue &ring, std::span<const std::uint32_t> command);
   void submitGfxCommand(int gfxPipe, std::span<const std::uint32_t> command);
-  void submitGfxCommand(int gfxPipe, int vmId,
-                        std::span<const std::uint32_t> command);
-  void submitSwitchBuffer(int gfxPipe);
-  void submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex,
-                  std::uint64_t flipArg);
-  void submitMapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address,
-                       std::uint64_t size, int memoryType, int dmemIndex,
-                       int prot, std::int64_t offset);
-  void submitUnmapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address,
-                         std::uint64_t size);
-  void submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId);
-  void submitUnmapProcess(int gfxPipe, std::uint32_t pid);
-  void submitProtectMemory(int gfxPipe, std::uint32_t pid,
-                           std::uint64_t address, std::uint64_t size, int prot);
 
   void mapProcess(std::uint32_t pid, int vmId);
   void unmapProcess(std::uint32_t pid);
@@ -214,8 +130,6 @@ struct Device : orbis::RcBase {
                  int memoryType, int dmemIndex, int prot, std::int64_t offset);
   void unmapMemory(std::uint32_t pid, std::uint64_t address,
                    std::uint64_t size);
-  void registerBuffer(std::uint32_t pid, Buffer buffer);
-  void registerBufferAttribute(std::uint32_t pid, BufferAttribute attr);
   void watchWrites(int vmId, std::uint64_t address, std::uint64_t size);
   void lockReadWrite(int vmId, std::uint64_t address, std::uint64_t size,
                      bool isLazy);
diff --git a/rpcsx/gpu/DeviceContext.hpp b/rpcsx/gpu/DeviceContext.hpp
new file mode 100644
index 000000000..70c36ac55
--- /dev/null
+++ b/rpcsx/gpu/DeviceContext.hpp
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+
+namespace amdgpu {
+struct BufferAttribute {
+  std::uint8_t attrId;
+  std::uint8_t submit;
+  std::uint64_t canary;
+  std::uint32_t pixelFormat;
+  std::uint32_t tilingMode;
+  std::uint32_t pitch;
+  std::uint32_t width;
+  std::uint32_t height;
+};
+
+struct Buffer {
+  std::uint64_t canary;
+  std::uint32_t index;
+  std::uint32_t attrId;
+  std::uint64_t address;
+  std::uint64_t address2;
+};
+
+enum {
+  kPageWriteWatch = 1 << 0,
+  kPageReadWriteLock = 1 << 1,
+  kPageInvalidated = 1 << 2,
+  kPageLazyLock = 1 << 3
+};
+
+struct PadState {
+  std::uint64_t timestamp;
+  std::uint32_t unk;
+  std::uint32_t buttons;
+  std::uint8_t leftStickX;
+  std::uint8_t leftStickY;
+  std::uint8_t rightStickX;
+  std::uint8_t rightStickY;
+  std::uint8_t l2;
+  std::uint8_t r2;
+};
+
+enum {
+  kPadBtnL3 = 1 << 1,
+  kPadBtnR3 = 1 << 2,
+  kPadBtnOptions = 1 << 3,
+  kPadBtnUp = 1 << 4,
+  kPadBtnRight = 1 << 5,
+  kPadBtnDown = 1 << 6,
+  kPadBtnLeft = 1 << 7,
+  kPadBtnL2 = 1 << 8,
+  kPadBtnR2 = 1 << 9,
+  kPadBtnL1 = 1 << 10,
+  kPadBtnR1 = 1 << 11,
+  kPadBtnTriangle = 1 << 12,
+  kPadBtnCircle = 1 << 13,
+  kPadBtnCross = 1 << 14,
+  kPadBtnSquare = 1 << 15,
+  kPadBtnPs = 1 << 16,
+  kPadBtnTouchPad = 1 << 20,
+  kPadBtnIntercepted = 1 << 31,
+};
+
+struct DeviceContext {
+  static constexpr auto kMaxProcessCount = 6;
+
+  PadState kbPadState;
+  std::atomic<std::uint64_t> cacheCommands[kMaxProcessCount][4];
+  std::atomic<std::uint32_t> gpuCacheCommand[kMaxProcessCount];
+  std::atomic<std::uint8_t> *cachePages[kMaxProcessCount];
+
+  volatile std::uint32_t flipBuffer[kMaxProcessCount];
+  volatile std::uint64_t flipArg[kMaxProcessCount];
+  volatile std::uint64_t flipCount[kMaxProcessCount];
+  volatile std::uint64_t bufferInUseAddress[kMaxProcessCount];
+};
+} // namespace amdgpu
diff --git a/rpcsx/gpu/DeviceCtl.cpp b/rpcsx/gpu/DeviceCtl.cpp
new file mode 100644
index 000000000..26f527370
--- /dev/null
+++ b/rpcsx/gpu/DeviceCtl.cpp
@@ -0,0 +1,121 @@
+#include "DeviceCtl.hpp"
+#include "Device.hpp"
+#include "gnm/pm4.hpp"
+#include "rx/bits.hpp"
+#include "rx/die.hpp"
+#include "shader/dialect.hpp"
+#include <cstdio>
+#include <print>
+#include <vector>
+
+using namespace amdgpu;
+
+DeviceCtl::DeviceCtl() noexcept = default;
+DeviceCtl::DeviceCtl(orbis::Ref<orbis::RcBase> device) noexcept
+    : mDevice(device.rawStaticCast<Device>()) {}
+DeviceCtl::DeviceCtl(DeviceCtl &&) noexcept = default;
+DeviceCtl::DeviceCtl(const DeviceCtl &) = default;
+DeviceCtl &DeviceCtl::operator=(DeviceCtl &&) noexcept = default;
+DeviceCtl &DeviceCtl::operator=(const DeviceCtl &) = default;
+
+DeviceCtl::~DeviceCtl() = default;
+
+DeviceCtl DeviceCtl::createDevice() {
+  DeviceCtl result;
+  result.mDevice = orbis::knew<Device>();
+  return result;
+}
+
+DeviceContext &DeviceCtl::getContext() { return *mDevice.get(); }
+orbis::Ref<orbis::RcBase> DeviceCtl::getOpaque() { return mDevice; }
+
+void DeviceCtl::submitGfxCommand(int gfxPipe, int vmId,
+                                 std::span<const std::uint32_t> command) {
+  auto op = rx::getBits(command[0], 15, 8);
+  auto type = rx::getBits(command[0], 31, 30);
+  auto len = rx::getBits(command[0], 29, 16) + 2;
+
+  if ((op != gnm::IT_INDIRECT_BUFFER && op != gnm::IT_INDIRECT_BUFFER_CNST) ||
+      type != 3 || len != 4 || command.size() != len) {
+    std::println(stderr, "unexpected gfx command for main ring: {}, {}, {}", op,
+                 type, len);
+    rx::die("");
+  }
+
+  std::vector<std::uint32_t> patchedCommand{command.data(),
+                                            command.data() + command.size()};
+  patchedCommand[3] &= ~(~0u << 24); // unsigned mask: keep bits 23..0 only
+  patchedCommand[3] |= static_cast<std::uint32_t>(vmId) << 24; // bits 31..24
+
+  mDevice->submitGfxCommand(gfxPipe, patchedCommand);
+}
+
+void DeviceCtl::submitSwitchBuffer(int gfxPipe) {
+  mDevice->submitGfxCommand(gfxPipe, createPm4Packet(gnm::IT_SWITCH_BUFFER, 0));
+}
+void DeviceCtl::submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex,
+                           std::uint64_t flipArg) {
+  mDevice->submitGfxCommand(gfxPipe, createPm4Packet(IT_FLIP, bufferIndex,
+                                                     flipArg & 0xffff'ffff,
+                                                     flipArg >> 32, pid));
+}
+
+void DeviceCtl::submitMapMemory(int gfxPipe, std::uint32_t pid,
+                                std::uint64_t address, std::uint64_t size,
+                                int memoryType, int dmemIndex, int prot,
+                                std::int64_t offset) {
+  mDevice->submitGfxCommand(
+      gfxPipe,
+      createPm4Packet(IT_MAP_MEMORY, pid, address & 0xffff'ffff, address >> 32,
+                      size & 0xffff'ffff, size >> 32, memoryType, dmemIndex,
+                      prot, offset & 0xffff'ffff, offset >> 32));
+}
+void DeviceCtl::submitUnmapMemory(int gfxPipe, std::uint32_t pid,
+                                  std::uint64_t address, std::uint64_t size) {
+  mDevice->submitGfxCommand(
+      gfxPipe, createPm4Packet(IT_UNMAP_MEMORY, pid, address & 0xffff'ffff,
+                               address >> 32, size & 0xffff'ffff, size >> 32));
+}
+
+void DeviceCtl::submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId) {
+  mDevice->submitGfxCommand(gfxPipe,
+                            createPm4Packet(gnm::IT_MAP_PROCESS, pid, vmId));
+}
+
+void DeviceCtl::submitUnmapProcess(int gfxPipe, std::uint32_t pid) {
+  mDevice->submitGfxCommand(gfxPipe, createPm4Packet(IT_UNMAP_PROCESS, pid));
+}
+
+void DeviceCtl::submitProtectMemory(int gfxPipe, std::uint32_t pid,
+                                    std::uint64_t address, std::uint64_t size,
+                                    int prot) {
+  mDevice->submitGfxCommand(
+      gfxPipe,
+      createPm4Packet(IT_PROTECT_MEMORY, pid, address & 0xffff'ffff,
+                      address >> 32, size & 0xffff'ffff, size >> 32, prot));
+}
+
+void DeviceCtl::registerBuffer(std::uint32_t pid, Buffer buffer) {
+  // FIXME: submit command
+  auto &process = mDevice->processInfo[pid];
+
+  if (buffer.attrId >= 10 || buffer.index >= 10) {
+    rx::die("out of buffers %u, %u", buffer.attrId, buffer.index);
+  }
+
+  process.buffers[buffer.index] = buffer;
+}
+
+void DeviceCtl::registerBufferAttribute(std::uint32_t pid,
+                                        BufferAttribute attr) {
+  // FIXME: submit command
+  auto &process = mDevice->processInfo[pid];
+  if (attr.attrId >= 10) {
+    rx::die("out of buffer attributes %u", attr.attrId);
+  }
+
+  process.bufferAttributes[attr.attrId] = attr;
+}
+
+void DeviceCtl::start() { mDevice->start(); }
+void DeviceCtl::waitForIdle() { mDevice->waitForIdle(); }
diff --git a/rpcsx/gpu/DeviceCtl.hpp b/rpcsx/gpu/DeviceCtl.hpp
new file mode 100644
index 000000000..7a370a4cf
--- /dev/null
+++ b/rpcsx/gpu/DeviceCtl.hpp
@@ -0,0 +1,48 @@
+#pragma once
+
+#include "DeviceContext.hpp"
+#include "orbis/utils/Rc.hpp"
+#include <cstdint>
+#include <span>
+
+namespace amdgpu {
+struct Device; // class-key matches the `struct Device` definition in Device.hpp
+
+class DeviceCtl {
+  orbis::Ref<Device> mDevice;
+
+public:
+  DeviceCtl() noexcept;
+  DeviceCtl(orbis::Ref<orbis::RcBase> device) noexcept;
+  DeviceCtl(DeviceCtl &&) noexcept;
+  DeviceCtl(const DeviceCtl &);
+  DeviceCtl &operator=(DeviceCtl &&) noexcept;
+  DeviceCtl &operator=(const DeviceCtl &);
+  ~DeviceCtl();
+
+  static DeviceCtl createDevice();
+  DeviceContext &getContext();
+  orbis::Ref<orbis::RcBase> getOpaque();
+
+  void submitGfxCommand(int gfxPipe, int vmId,
+                        std::span<const std::uint32_t> command);
+  void submitSwitchBuffer(int gfxPipe);
+  void submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex,
+                  std::uint64_t flipArg);
+  void submitMapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address,
+                       std::uint64_t size, int memoryType, int dmemIndex,
+                       int prot, std::int64_t offset);
+  void submitUnmapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address,
+                         std::uint64_t size);
+  void submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId);
+  void submitUnmapProcess(int gfxPipe, std::uint32_t pid);
+  void submitProtectMemory(int gfxPipe, std::uint32_t pid,
+                           std::uint64_t address, std::uint64_t size, int prot);
+  void registerBuffer(std::uint32_t pid, Buffer buffer);
+  void registerBufferAttribute(std::uint32_t pid, BufferAttribute attr);
+  void start();
+  void waitForIdle();
+
+  explicit operator bool() const { return mDevice != nullptr; }
+};
+} // namespace amdgpu
diff --git a/rpcsx/iodev/dce.cpp b/rpcsx/iodev/dce.cpp
index 6aeff72a0..f620c0e29 100644
--- a/rpcsx/iodev/dce.cpp
+++ b/rpcsx/iodev/dce.cpp
@@ -1,4 +1,4 @@
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "iodev/dmem.hpp"
 #include "orbis/KernelAllocator.hpp"
@@ -134,12 +134,13 @@ static void runBridge(int vmId) {
   std::thread{[=] {
     pthread_setname_np(pthread_self(), "Bridge");
 
-    auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>();
+    auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
+    auto &gpuCtx = gpu.getContext();
     std::vector<std::uint64_t> fetchedCommands;
-    fetchedCommands.reserve(std::size(gpu->cacheCommands));
+    fetchedCommands.reserve(std::size(gpuCtx.cacheCommands));
 
     while (true) {
-      for (auto &command : gpu->cacheCommands) {
+      for (auto &command : gpuCtx.cacheCommands) {
         std::uint64_t value = command[vmId].load(std::memory_order::relaxed);
 
         if (value != 0) {
@@ -157,7 +158,7 @@ static void runBridge(int vmId) {
         auto count = static_cast<std::uint32_t>(command >> 32) + 1;
 
         auto pageFlags =
-            gpu->cachePages[vmId][page].load(std::memory_order::relaxed);
+            gpuCtx.cachePages[vmId][page].load(std::memory_order::relaxed);
 
         auto address = static_cast<std::uint64_t>(page) * rx::mem::pageSize;
         auto origVmProt = vm::getPageProtection(address);
@@ -253,7 +254,9 @@ static void initDceMemory(DceDevice *device) {
 static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
                                   void *argp, orbis::Thread *thread) {
   auto device = static_cast<DceDevice *>(file->device.get());
-  auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>();
+
+  auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
+  auto &gpuCtx = gpu.getContext();
 
   if (request == 0xc0308203) {
     // returns:
@@ -299,11 +302,11 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
 
       FlipControlStatus flipStatus{};
       // TODO: lock bridge header
-      flipStatus.flipArg = gpu->flipArg[thread->tproc->vmId];
-      flipStatus.count = gpu->flipCount[thread->tproc->vmId];
+      flipStatus.flipArg = gpuCtx.flipArg[thread->tproc->vmId];
+      flipStatus.count = gpuCtx.flipCount[thread->tproc->vmId];
       flipStatus.processTime = 0; // TODO
       flipStatus.tsc = 0;         // TODO
-      flipStatus.currentBuffer = gpu->flipBuffer[thread->tproc->vmId];
+      flipStatus.currentBuffer = gpuCtx.flipBuffer[thread->tproc->vmId];
       flipStatus.flipPendingNum0 = 0; // TODO
       flipStatus.gcQueueNum = 0;      // TODO
       flipStatus.flipPendingNum1 = 0; // TODO
@@ -333,7 +336,7 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
       *(std::uint64_t *)args->size = kDceControlMemorySize;  // size
     } else if (args->id == 31) {
       if ((std::uint64_t)args->ptr == 0xc) {
-        gpu->bufferInUseAddress[thread->tproc->vmId] = args->size;
+        gpuCtx.bufferInUseAddress[thread->tproc->vmId] = args->size;
       } else if ((std::uint64_t)args->ptr != 1) {
         ORBIS_LOG_ERROR("buffer in use", args->ptr, args->size);
         thread->where();
@@ -362,13 +365,13 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
     ORBIS_LOG_ERROR("dce: RegisterBuffer", args->canary, args->index,
                     args->address, args->address2);
 
-    gpu->registerBuffer(thread->tproc->pid, {
-                                                .canary = args->canary,
-                                                .index = args->index,
-                                                .attrId = args->attrid,
-                                                .address = args->address,
-                                                .address2 = args->address2,
-                                            });
+    gpu.registerBuffer(thread->tproc->pid, {
+                                               .canary = args->canary,
+                                               .index = args->index,
+                                               .attrId = args->attrid,
+                                               .address = args->address,
+                                               .address2 = args->address2,
+                                           });
     return {};
   }
 
@@ -381,17 +384,17 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
                     args->unk4_zero, args->unk5_zero, args->options,
                     args->reserved1, args->reserved2);
 
-    gpu->registerBufferAttribute(thread->tproc->pid,
-                                 {
-                                     .attrId = args->attrid,
-                                     .submit = args->submit,
-                                     .canary = args->canary,
-                                     .pixelFormat = args->pixelFormat,
-                                     .tilingMode = args->tilingMode,
-                                     .pitch = args->pitch,
-                                     .width = args->width,
-                                     .height = args->height,
-                                 });
+    gpu.registerBufferAttribute(thread->tproc->pid,
+                                {
+                                    .attrId = args->attrid,
+                                    .submit = args->submit,
+                                    .canary = args->canary,
+                                    .pixelFormat = args->pixelFormat,
+                                    .tilingMode = args->tilingMode,
+                                    .pitch = args->pitch,
+                                    .width = args->width,
+                                    .height = args->height,
+                                });
 
     return {};
   }
@@ -404,9 +407,9 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
     //                 args->displayBufferIndex, args->flipMode, args->unk1,
     //                 args->flipArg, args->flipArg2, args->eop_nz, args->unk2,
     //                 args->eop_val, args->unk3, args->unk4, args->rout);
-    gpu->submitFlip(thread->tproc->gfxRing, thread->tproc->pid,
-                    args->displayBufferIndex,
-                    /*args->flipMode,*/ args->flipArg);
+    gpu.submitFlip(thread->tproc->gfxRing, thread->tproc->pid,
+                   args->displayBufferIndex,
+                   /*args->flipMode,*/ args->flipArg);
 
     // *args->rout = 0;
     return {};
@@ -470,8 +473,8 @@ orbis::ErrorCode DceDevice::open(orbis::Ref<orbis::File> *file,
 
     std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
     {
-      auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>();
-      gpu->submitMapProcess(thread->tproc->gfxRing, thread->tproc->pid, vmId);
+      auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
+      gpu.submitMapProcess(thread->tproc->gfxRing, thread->tproc->pid, vmId);
       thread->tproc->vmId = vmId;
     }
 
diff --git a/rpcsx/iodev/dmem.cpp b/rpcsx/iodev/dmem.cpp
index d171e01d9..58db33d12 100644
--- a/rpcsx/iodev/dmem.cpp
+++ b/rpcsx/iodev/dmem.cpp
@@ -1,5 +1,5 @@
 #include "dmem.hpp"
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "orbis/KernelAllocator.hpp"
 #include "orbis/KernelContext.hpp"
@@ -68,8 +68,8 @@ orbis::ErrorCode DmemDevice::mmap(void **address, std::uint64_t len,
     return orbis::ErrorCode::INVAL;
   }
 
-  if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-    gpu->submitMapMemory(orbis::g_currentThread->tproc->gfxRing,
+  if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+    gpu.submitMapMemory(orbis::g_currentThread->tproc->gfxRing,
                          orbis::g_currentThread->tproc->pid,
                          reinterpret_cast<std::uint64_t>(result), len,
                          memoryType, index, prot, directMemoryStart);
diff --git a/rpcsx/iodev/gc.cpp b/rpcsx/iodev/gc.cpp
index bf518c421..e2d5cc476 100644
--- a/rpcsx/iodev/gc.cpp
+++ b/rpcsx/iodev/gc.cpp
@@ -1,4 +1,4 @@
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "iodev/dmem.hpp"
 #include "orbis/KernelAllocator.hpp"
@@ -8,12 +8,12 @@
 #include "orbis/thread/Thread.hpp"
 #include "orbis/utils/Logs.hpp"
 #include "orbis/utils/SharedMutex.hpp"
+#include "rx/die.hpp"
 #include "vm.hpp"
 #include <cstdio>
 #include <mutex>
 #include <print>
 #include <sys/mman.h>
-#include <unordered_map>
 
 struct ComputeQueue {
   std::uint64_t ringBaseAddress{};
@@ -84,9 +84,9 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
     };
 
     auto args = reinterpret_cast<Args *>(argp);
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
       for (unsigned i = 0; i < args->count; ++i) {
-        gpu->submitGfxCommand(gcFile->gfxPipe,
+        gpu.submitGfxCommand(gcFile->gfxPipe,
                               orbis::g_currentThread->tproc->vmId,
                               {args->cmds + i * 4, 4});
       }
@@ -103,8 +103,8 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
     };
 
     auto args = reinterpret_cast<Args *>(argp);
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-      gpu->submitSwitchBuffer(orbis::g_currentThread->tproc->vmId);
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      gpu.submitSwitchBuffer(orbis::g_currentThread->tproc->vmId);
     } else {
       return orbis::ErrorCode::INVAL;
     }
@@ -124,9 +124,9 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
 
     auto args = reinterpret_cast<Args *>(argp);
 
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
       for (unsigned i = 0; i < args->count; ++i) {
-        gpu->submitGfxCommand(gcFile->gfxPipe,
+        gpu.submitGfxCommand(gcFile->gfxPipe,
                               orbis::g_currentThread->tproc->vmId,
                               {args->cmds + i * 4, 4});
       }
@@ -139,8 +139,8 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
   }
 
   case 0xc0048116: { // submit done?
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-      gpu->waitForIdle();
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      gpu.waitForIdle();
     } else {
       return orbis::ErrorCode::INVAL;
     }
diff --git a/rpcsx/iodev/hid.cpp b/rpcsx/iodev/hid.cpp
index 6d5757af7..140bfc79e 100644
--- a/rpcsx/iodev/hid.cpp
+++ b/rpcsx/iodev/hid.cpp
@@ -1,4 +1,4 @@
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "orbis/KernelAllocator.hpp"
 #include "orbis/KernelContext.hpp"
@@ -51,8 +51,8 @@ static orbis::ErrorCode hid_ioctl(orbis::File *file, std::uint64_t request,
     // ORBIS_LOG_ERROR("hid read state", args.hidId, args.unk0, args.state,
     //                 args.unk2, args.connected, args.unk4, args.unk5);
 
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-      *args.state = gpu->kbPadState;
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      *args.state = gpu.getContext().kbPadState;
       *args.connected = 1;
       *args.unk4 = 1; // is wireless?
       thread->retval[0] = 1;
@@ -75,9 +75,9 @@ static orbis::ErrorCode hid_ioctl(orbis::File *file, std::uint64_t request,
       orbis::uint padding;
       orbis::ptr<orbis::uint> unk5;
     };
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
       auto args = *reinterpret_cast<MiniReadStateArgs *>(argp);
-      *args.state = gpu->kbPadState;
+      *args.state = gpu.getContext().kbPadState;
       thread->retval[0] = 1;
     }
     return {};
diff --git a/rpcsx/main.cpp b/rpcsx/main.cpp
index 7657d3779..a69604caf 100644
--- a/rpcsx/main.cpp
+++ b/rpcsx/main.cpp
@@ -1,6 +1,6 @@
 #include "audio/AlsaDevice.hpp"
 #include "backtrace.hpp"
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "io-devices.hpp"
 #include "iodev/mbus.hpp"
@@ -77,22 +77,23 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
       prot |= PROT_EXEC;
     }
 
-    auto gpuDevice = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>();
+    auto gpuDevice = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
 
     if (gpuDevice && (prot & (isWrite ? PROT_WRITE : PROT_READ)) != 0) {
+      auto &gpuContext = gpuDevice.getContext();
       while (true) {
         auto flags =
-            gpuDevice->cachePages[vmid][page].load(std::memory_order::relaxed);
+            gpuContext.cachePages[vmid][page].load(std::memory_order::relaxed);
 
         if ((flags & amdgpu::kPageReadWriteLock) != 0) {
           if ((flags & amdgpu::kPageLazyLock) != 0) {
             if (std::uint32_t gpuCommand = 0;
-                !gpuDevice->gpuCacheCommand[vmid].compare_exchange_weak(
+                !gpuContext.gpuCacheCommand[vmid].compare_exchange_weak(
                     gpuCommand, page)) {
               continue;
             }
 
-            while (!gpuDevice->cachePages[vmid][page].compare_exchange_weak(
+            while (!gpuContext.cachePages[vmid][page].compare_exchange_weak(
                 flags, flags & ~amdgpu::kPageLazyLock,
                 std::memory_order::relaxed)) {
             }
@@ -109,7 +110,7 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
           break;
         }
 
-        if (gpuDevice->cachePages[vmid][page].compare_exchange_weak(
+        if (gpuContext.cachePages[vmid][page].compare_exchange_weak(
                 flags, amdgpu::kPageInvalidated, std::memory_order::relaxed)) {
           break;
         }
diff --git a/rpcsx/vm.cpp b/rpcsx/vm.cpp
index 1065f50c9..8fd5f6f64 100644
--- a/rpcsx/vm.cpp
+++ b/rpcsx/vm.cpp
@@ -1,5 +1,5 @@
 #include "vm.hpp"
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "iodev/dmem.hpp"
 #include "orbis/KernelContext.hpp"
@@ -932,9 +932,9 @@ void *vm::map(void *addr, std::uint64_t len, std::int32_t prot,
 
   if (auto thr = orbis::g_currentThread) {
     std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-      gpu->submitMapMemory(thr->tproc->gfxRing, thr->tproc->pid, address, len,
-                           -1, -1, prot, address - kMinAddress);
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      gpu.submitMapMemory(thr->tproc->gfxRing, thr->tproc->pid, address, len,
+                          -1, -1, prot, address - kMinAddress);
     }
   }
 
@@ -989,9 +989,9 @@ bool vm::unmap(void *addr, std::uint64_t size) {
       (address & kBlockMask) >> kPageShift, pages, ~0);
   if (auto thr = orbis::g_currentThread) {
     std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-      gpu->submitUnmapMemory(thr->tproc->gfxRing, thr->tproc->pid, address,
-                             size);
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      gpu.submitUnmapMemory(thr->tproc->gfxRing, thr->tproc->pid, address,
+                            size);
     }
   } else {
     std::println(stderr, "ignoring mapping {:x}-{:x}", address, address + size);
@@ -1031,8 +1031,8 @@ bool vm::protect(void *addr, std::uint64_t size, std::int32_t prot) {
   if (auto thr = orbis::g_currentThread) {
     std::println("memory prot: {:x}", prot);
     std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-      gpu->submitProtectMemory(thr->tproc->gfxRing, thr->tproc->pid, address,
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      gpu.submitProtectMemory(thr->tproc->gfxRing, thr->tproc->pid, address,
                                size, prot);
     }
   } else {