gpu: expose public API

fix release build
2026-03-28 16:15:19 +01:00 · 2024-10-12 12:36:39 +03:00 · 2024-10-12 12:36:39 +03:00 · 988212a91e
parent 0c16e294d4
commit 988212a91e
13 changed files with 326 additions and 243 deletions
--- a/rpcsx/core/src/watchdog.cpp
+++ b/rpcsx/core/src/watchdog.cpp
@ -1,5 +1,5 @@
 #include "rx/watchdog.hpp"
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "orbis/KernelContext.hpp"
 #include <chrono>
 #include <csignal>
@ -38,7 +38,7 @@ static void runGPU() {
    return;
  }

-  amdgpu::Device *gpu;
+  amdgpu::DeviceCtl gpu;
  {
    pthread_setname_np(pthread_self(), "rpcsx-gpu");
    std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
@ -52,11 +52,11 @@ static void runGPU() {
    dup2(logFd, 2);
    ::close(logFd);

-    gpu = orbis::knew<amdgpu::Device>();
-    orbis::g_context.gpuDevice = gpu;
+    gpu = amdgpu::DeviceCtl::createDevice();
+    orbis::g_context.gpuDevice = gpu.getOpaque();
  }

-  gpu->start();
+  gpu.start();
  std::exit(0);
 }

--- a/rpcsx/gpu/CMakeLists.txt
+++ b/rpcsx/gpu/CMakeLists.txt
@ -12,6 +12,7 @@ add_library(rpcsx-gpu
 STATIC
    Cache.cpp
    Device.cpp
+    DeviceCtl.cpp
    FlipPipeline.cpp
    Pipe.cpp
    Registers.cpp
--- a/rpcsx/gpu/Device.cpp
+++ b/rpcsx/gpu/Device.cpp
@ -476,71 +476,6 @@ void Device::submitGfxCommand(int gfxPipe,
  submitCommand(ring, command);
 }

-void Device::submitGfxCommand(int gfxPipe, int vmId,
-                              std::span<const std::uint32_t> command) {
-  auto op = rx::getBits(command[0], 15, 8);
-  auto type = rx::getBits(command[0], 31, 30);
-  auto len = rx::getBits(command[0], 29, 16) + 2;
-
-  if ((op != gnm::IT_INDIRECT_BUFFER && op != gnm::IT_INDIRECT_BUFFER_CNST) ||
-      type != 3 || len != 4 || command.size() != len) {
-    std::println(stderr, "unexpected gfx command for main ring: {}, {}, {}", op,
-                 type, len);
-    rx::die("");
-  }
-
-  std::vector<std::uint32_t> patchedCommand{command.data(),
-                                            command.data() + command.size()};
-  patchedCommand[3] &= ~(~0 << 24);
-  patchedCommand[3] |= vmId << 24;
-
-  submitGfxCommand(gfxPipe, patchedCommand);
-}
-
-void Device::submitSwitchBuffer(int gfxPipe) {
-  submitGfxCommand(gfxPipe, createPm4Packet(gnm::IT_SWITCH_BUFFER, 0));
-}
-void Device::submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex,
-                        std::uint64_t flipArg) {
-  submitGfxCommand(gfxPipe,
-                   createPm4Packet(IT_FLIP, bufferIndex, flipArg & 0xffff'ffff,
-                                   flipArg >> 32, pid));
-}
-
-void Device::submitMapMemory(int gfxPipe, std::uint32_t pid,
-                             std::uint64_t address, std::uint64_t size,
-                             int memoryType, int dmemIndex, int prot,
-                             std::int64_t offset) {
-  submitGfxCommand(gfxPipe,
-                   createPm4Packet(IT_MAP_MEMORY, pid, address & 0xffff'ffff,
-                                   address >> 32, size & 0xffff'ffff,
-                                   size >> 32, memoryType, dmemIndex, prot,
-                                   offset & 0xffff'ffff, offset >> 32));
-}
-void Device::submitUnmapMemory(int gfxPipe, std::uint32_t pid,
-                               std::uint64_t address, std::uint64_t size) {
-  submitGfxCommand(
-      gfxPipe, createPm4Packet(IT_UNMAP_MEMORY, pid, address & 0xffff'ffff,
-                               address >> 32, size & 0xffff'ffff, size >> 32));
-}
-
-void Device::submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId) {
-  submitGfxCommand(gfxPipe, createPm4Packet(gnm::IT_MAP_PROCESS, pid, vmId));
-}
-
-void Device::submitUnmapProcess(int gfxPipe, std::uint32_t pid) {
-  submitGfxCommand(gfxPipe, createPm4Packet(IT_UNMAP_PROCESS, pid));
-}
-
-void Device::submitProtectMemory(int gfxPipe, std::uint32_t pid,
-                                 std::uint64_t address, std::uint64_t size,
-                                 int prot) {
-  submitGfxCommand(gfxPipe,
-                   createPm4Packet(IT_PROTECT_MEMORY, pid,
-                                   address & 0xffff'ffff, address >> 32,
-                                   size & 0xffff'ffff, size >> 32, prot));
-}
-
 void Device::mapProcess(std::uint32_t pid, int vmId) {
  auto &process = processInfo[pid];
  process.vmId = vmId;
@ -987,25 +922,6 @@ void Device::unmapMemory(std::uint32_t pid, std::uint64_t address,
  protectMemory(pid, address, size, 0);
 }

-void Device::registerBuffer(std::uint32_t pid, Buffer buffer) {
-  auto &process = processInfo[pid];
-
-  if (buffer.attrId >= 10 || buffer.index >= 10) {
-    rx::die("out of buffers %u, %u", buffer.attrId, buffer.index);
-  }
-
-  process.buffers[buffer.index] = buffer;
-}
-
-void Device::registerBufferAttribute(std::uint32_t pid, BufferAttribute attr) {
-  auto &process = processInfo[pid];
-  if (attr.attrId >= 10) {
-    rx::die("out of buffer attributes %u", attr.attrId);
-  }
-
-  process.bufferAttributes[attr.attrId] = attr;
-}
-
 static void notifyPageChanges(Device *device, int vmId, std::uint32_t firstPage,
                              std::uint32_t pageCount) {
  std::uint64_t command =
--- a/rpcsx/gpu/Device.hpp
+++ b/rpcsx/gpu/Device.hpp
@ -1,5 +1,6 @@
 #pragma once
 #include "Cache.hpp"
+#include "DeviceContext.hpp"
 #include "FlipPipeline.hpp"
 #include "Pipe.hpp"
 #include "amdgpu/tiler_vulkan.hpp"
@ -13,7 +14,6 @@
 #include <GLFW/glfw3.h>
 #include <array>
 #include <thread>
-#include <unordered_map>
 #include <vulkan/vulkan_core.h>

 namespace amdgpu {
@ -44,25 +44,6 @@ struct VmMapSlot {
  auto operator<=>(const VmMapSlot &) const = default;
 };

-struct BufferAttribute {
-  std::uint8_t attrId;
-  std::uint8_t submit;
-  std::uint64_t canary;
-  std::uint32_t pixelFormat;
-  std::uint32_t tilingMode;
-  std::uint32_t pitch;
-  std::uint32_t width;
-  std::uint32_t height;
-};
-
-struct Buffer {
-  std::uint64_t canary;
-  std::uint32_t index;
-  std::uint32_t attrId;
-  std::uint64_t address;
-  std::uint64_t address2;
-};
-
 struct ProcessInfo {
  int vmId = -1;
  int vmFd = -1;
@ -71,46 +52,6 @@ struct ProcessInfo {
  rx::MemoryTableWithPayload<VmMapSlot> vmTable;
 };

-enum {
-  kPageWriteWatch = 1 << 0,
-  kPageReadWriteLock = 1 << 1,
-  kPageInvalidated = 1 << 2,
-  kPageLazyLock = 1 << 3
-};
-
-struct PadState {
-  std::uint64_t timestamp;
-  std::uint32_t unk;
-  std::uint32_t buttons;
-  std::uint8_t leftStickX;
-  std::uint8_t leftStickY;
-  std::uint8_t rightStickX;
-  std::uint8_t rightStickY;
-  std::uint8_t l2;
-  std::uint8_t r2;
-};
-
-enum {
-  kPadBtnL3 = 1 << 1,
-  kPadBtnR3 = 1 << 2,
-  kPadBtnOptions = 1 << 3,
-  kPadBtnUp = 1 << 4,
-  kPadBtnRight = 1 << 5,
-  kPadBtnDown = 1 << 6,
-  kPadBtnLeft = 1 << 7,
-  kPadBtnL2 = 1 << 8,
-  kPadBtnR2 = 1 << 9,
-  kPadBtnL1 = 1 << 10,
-  kPadBtnR1 = 1 << 11,
-  kPadBtnTriangle = 1 << 12,
-  kPadBtnCircle = 1 << 13,
-  kPadBtnCross = 1 << 14,
-  kPadBtnSquare = 1 << 15,
-  kPadBtnPs = 1 << 16,
-  kPadBtnTouchPad = 1 << 20,
-  kPadBtnIntercepted = 1 << 31,
-};
-
 struct RemoteMemory {
  int vmId;

@ -121,10 +62,9 @@ struct RemoteMemory {
  }
 };

-struct Device : orbis::RcBase {
+struct Device : orbis::RcBase, DeviceContext {
  static constexpr auto kComputePipeCount = 8;
  static constexpr auto kGfxPipeCount = 2;
-  static constexpr auto kMaxProcessCount = 6;

  shader::SemanticInfo gcnSemantic;
  shader::spv::Context shaderSemanticContext;
@ -153,16 +93,6 @@ struct Device : orbis::RcBase {
      {this, 0}, {this, 1}, {this, 2}, {this, 3}, {this, 4}, {this, 5},
  };

-  PadState kbPadState;
-  std::atomic<std::uint64_t> cacheCommands[kMaxProcessCount][4];
-  std::atomic<std::uint32_t> gpuCacheCommand[kMaxProcessCount];
-  std::atomic<std::uint8_t> *cachePages[kMaxProcessCount];
-
-  volatile std::uint32_t flipBuffer[kMaxProcessCount];
-  volatile std::uint64_t flipArg[kMaxProcessCount];
-  volatile std::uint64_t flipCount[kMaxProcessCount];
-  volatile std::uint64_t bufferInUseAddress[kMaxProcessCount];
-
  std::uint32_t mainGfxRings[kGfxPipeCount][0x4000 / sizeof(std::uint32_t)];

  Device();
@ -184,20 +114,6 @@ struct Device : orbis::RcBase {

  void submitCommand(Queue &ring, std::span<const std::uint32_t> command);
  void submitGfxCommand(int gfxPipe, std::span<const std::uint32_t> command);
-  void submitGfxCommand(int gfxPipe, int vmId,
-                        std::span<const std::uint32_t> command);
-  void submitSwitchBuffer(int gfxPipe);
-  void submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex,
-                  std::uint64_t flipArg);
-  void submitMapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address,
-                       std::uint64_t size, int memoryType, int dmemIndex,
-                       int prot, std::int64_t offset);
-  void submitUnmapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address,
-                         std::uint64_t size);
-  void submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId);
-  void submitUnmapProcess(int gfxPipe, std::uint32_t pid);
-  void submitProtectMemory(int gfxPipe, std::uint32_t pid,
-                           std::uint64_t address, std::uint64_t size, int prot);

  void mapProcess(std::uint32_t pid, int vmId);
  void unmapProcess(std::uint32_t pid);
@ -214,8 +130,6 @@ struct Device : orbis::RcBase {
                 int memoryType, int dmemIndex, int prot, std::int64_t offset);
  void unmapMemory(std::uint32_t pid, std::uint64_t address,
                   std::uint64_t size);
-  void registerBuffer(std::uint32_t pid, Buffer buffer);
-  void registerBufferAttribute(std::uint32_t pid, BufferAttribute attr);
  void watchWrites(int vmId, std::uint64_t address, std::uint64_t size);
  void lockReadWrite(int vmId, std::uint64_t address, std::uint64_t size,
                     bool isLazy);
--- a/rpcsx/gpu/DeviceContext.hpp
+++ b/rpcsx/gpu/DeviceContext.hpp
@ -0,0 +1,79 @@
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+
+namespace amdgpu {
+// Display buffer attribute set, addressed by `attrId`.
+// Filled field-by-field from the guest's DCE ioctl arguments and stored per
+// process by DeviceCtl::registerBufferAttribute().
+struct BufferAttribute {
+  std::uint8_t attrId; // slot index; registration dies on values >= 10
+  std::uint8_t submit; // copied from the guest request; meaning not shown here
+  std::uint64_t canary; // copied from the guest request; meaning not shown here
+  std::uint32_t pixelFormat; // presumably a guest pixel format id - TODO confirm
+  std::uint32_t tilingMode;
+  std::uint32_t pitch; // unit (pixels vs bytes) is not established here
+  std::uint32_t width;
+  std::uint32_t height;
+};
+
+// Display buffer descriptor. DeviceCtl::registerBuffer() stores it in the
+// owning process' buffer table at slot `index`.
+struct Buffer {
+  std::uint64_t canary; // copied from the guest request; meaning not shown here
+  std::uint32_t index; // slot in the per-process table; must be < 10
+  // must be < 10; presumably selects a BufferAttribute slot - TODO confirm
+  std::uint32_t attrId;
+  std::uint64_t address;
+  std::uint64_t address2; // second guest-supplied address; purpose not shown here
+};
+
+// Bit flags kept in the per-page atomic bytes of DeviceContext::cachePages.
+// The full protocol between the GPU side and the fault handler is not visible
+// in this header, so the per-flag notes only state what the visible users do.
+enum {
+  kPageWriteWatch = 1 << 0, // presumably: writes to the page are watched - TODO confirm
+  kPageReadWriteLock = 1 << 1, // tested by the signal handler on a faulting access
+  kPageInvalidated = 1 << 2, // stored by the signal handler via compare-exchange
+  kPageLazyLock = 1 << 3 // cleared by the signal handler once it claimed gpuCacheCommand
+};
+
+// Snapshot of the emulated gamepad. DeviceContext::kbPadState holds one
+// instance, which the HID ioctl handlers copy out to the guest unchanged.
+struct PadState {
+  std::uint64_t timestamp;
+  std::uint32_t unk; // unknown field
+  std::uint32_t buttons; // presumably a mask of kPadBtn* bits - TODO confirm
+  std::uint8_t leftStickX;
+  std::uint8_t leftStickY;
+  std::uint8_t rightStickX;
+  std::uint8_t rightStickY;
+  std::uint8_t l2;
+  std::uint8_t r2;
+};
+
+// Gamepad button bit masks (presumably for PadState::buttons - TODO confirm).
+// Bit 0, bits 17-19 and bits 21-30 have no enumerator in this list.
+enum {
+  kPadBtnL3 = 1 << 1,
+  kPadBtnR3 = 1 << 2,
+  kPadBtnOptions = 1 << 3,
+  kPadBtnUp = 1 << 4,
+  kPadBtnRight = 1 << 5,
+  kPadBtnDown = 1 << 6,
+  kPadBtnLeft = 1 << 7,
+  kPadBtnL2 = 1 << 8,
+  kPadBtnR2 = 1 << 9,
+  kPadBtnL1 = 1 << 10,
+  kPadBtnR1 = 1 << 11,
+  kPadBtnTriangle = 1 << 12,
+  kPadBtnCircle = 1 << 13,
+  kPadBtnCross = 1 << 14,
+  kPadBtnSquare = 1 << 15,
+  kPadBtnPs = 1 << 16,
+  kPadBtnTouchPad = 1 << 20,
+  // NOTE(review): with a 32-bit int, 1 << 31 lands on the sign bit, so this
+  // enumerator is a negative int value; keep that in mind when combining it
+  // with the unsigned `buttons` field.
+  kPadBtnIntercepted = 1 << 31,
+};
+
+// State block shared between the GPU device and the rest of the emulator.
+// Device derives from it and DeviceCtl::getContext() returns it, so callers
+// can reach these fields without including Device.hpp.
+// The arrays sized kMaxProcessCount are indexed by vmId at the visible call
+// sites.
+struct DeviceContext {
+  static constexpr auto kMaxProcessCount = 6;
+
+  PadState kbPadState; // copied out to the guest by the HID ioctl handlers
+  // NOTE(review): the bridge thread iterates the outer dimension and indexes
+  // the inner one with vmId (cacheCommands[i][vmId]) although the inner extent
+  // is 4 - confirm the intended index order.
+  std::atomic<std::uint64_t> cacheCommands[kMaxProcessCount][4];
+  // Compare-exchanged from 0 to a page number by the signal handler.
+  std::atomic<std::uint32_t> gpuCacheCommand[kMaxProcessCount];
+  // Per-vmId pointer to an array of kPage* flag bytes, indexed by page number.
+  std::atomic<std::uint8_t> *cachePages[kMaxProcessCount];
+
+  // Flip bookkeeping read by the DCE ioctl handler; bufferInUseAddress is
+  // written by it.
+  // NOTE(review): volatile provides no inter-thread synchronization; if these
+  // are updated from another thread (not visible here) atomics may be needed.
+  volatile std::uint32_t flipBuffer[kMaxProcessCount];
+  volatile std::uint64_t flipArg[kMaxProcessCount];
+  volatile std::uint64_t flipCount[kMaxProcessCount];
+  volatile std::uint64_t bufferInUseAddress[kMaxProcessCount];
+};
+} // namespace amdgpu
--- a/rpcsx/gpu/DeviceCtl.cpp
+++ b/rpcsx/gpu/DeviceCtl.cpp
@ -0,0 +1,121 @@
+#include "DeviceCtl.hpp"
+#include "Device.hpp"
+#include "gnm/pm4.hpp"
+#include "rx/bits.hpp"
+#include "rx/die.hpp"
+#include "shader/dialect.hpp"
+#include <cstdio>
+#include <print>
+#include <vector>
+
+using namespace amdgpu;
+
+// The special members are defined out of line in this translation unit, which
+// includes Device.hpp; DeviceCtl.hpp itself only forward-declares Device.
+DeviceCtl::DeviceCtl() noexcept = default;
+// Wraps a device that is passed around as an opaque RcBase reference (the form
+// stored in orbis::g_context.gpuDevice).
+// NOTE(review): rawStaticCast shows no type check here - the reference is
+// assumed to point at an amdgpu::Device.
+DeviceCtl::DeviceCtl(orbis::Ref<orbis::RcBase> device) noexcept
+    : mDevice(device.rawStaticCast<Device>()) {}
+DeviceCtl::DeviceCtl(DeviceCtl &&) noexcept = default;
+DeviceCtl::DeviceCtl(const DeviceCtl &) = default;
+DeviceCtl &DeviceCtl::operator=(DeviceCtl &&) noexcept = default;
+DeviceCtl &DeviceCtl::operator=(const DeviceCtl &) = default;
+
+DeviceCtl::~DeviceCtl() = default;
+
+// Allocates a new Device through orbis::knew and returns a controller bound
+// to it.
+DeviceCtl DeviceCtl::createDevice() {
+  DeviceCtl ctl;
+  ctl.mDevice = orbis::knew<Device>();
+  return ctl;
+}
+
+// Returns the DeviceContext part of the wrapped device.
+// NOTE(review): mDevice is dereferenced unconditionally; no null check is
+// performed here, so callers are expected to hold a non-empty handle.
+DeviceContext &DeviceCtl::getContext() { return *mDevice.get(); }
+// Returns the wrapped device as a type-erased RcBase reference, the form in
+// which it is stored in orbis::g_context.gpuDevice.
+orbis::Ref<orbis::RcBase> DeviceCtl::getOpaque() { return mDevice; }
+
+// Forwards a guest command to the device's gfx pipe `gfxPipe`, tagging it
+// with the submitting process' `vmId`.
+//
+// Only a single 4-dword type-3 IT_INDIRECT_BUFFER / IT_INDIRECT_BUFFER_CNST
+// packet is accepted; anything else is reported on stderr and handed to
+// rx::die(). Before forwarding, the top 8 bits of the last dword are replaced
+// with `vmId`.
+void DeviceCtl::submitGfxCommand(int gfxPipe, int vmId,
+                                 std::span<const std::uint32_t> command) {
+  // The packet header is command[0]; reject an empty span before decoding it
+  // so the header read never goes past the end of the span.
+  // (Like the check below, this relies on rx::die() not returning.)
+  if (command.empty()) {
+    std::println(stderr, "unexpected empty gfx command for main ring");
+    rx::die("");
+  }
+
+  auto op = rx::getBits(command[0], 15, 8);
+  auto type = rx::getBits(command[0], 31, 30);
+  auto len = rx::getBits(command[0], 29, 16) + 2;
+
+  if ((op != gnm::IT_INDIRECT_BUFFER && op != gnm::IT_INDIRECT_BUFFER_CNST) ||
+      type != 3 || len != 4 || command.size() != len) {
+    std::println(stderr, "unexpected gfx command for main ring: {}, {}, {}", op,
+                 type, len);
+    rx::die("");
+  }
+
+  // The packet is known to be exactly 4 dwords at this point, so the patched
+  // copy lives in a fixed-size buffer instead of a heap-allocated vector.
+  // Mask and shift are done on unsigned values: keep the low 24 bits of the
+  // last dword and place vmId in bits 31..24.
+  const std::uint32_t patchedCommand[4] = {
+      command[0],
+      command[1],
+      command[2],
+      (command[3] & 0x00ff'ffffu) | (static_cast<std::uint32_t>(vmId) << 24),
+  };
+
+  mDevice->submitGfxCommand(gfxPipe, patchedCommand);
+}
+
+// Builds an IT_SWITCH_BUFFER packet (with a single 0 argument) and submits it
+// on `gfxPipe`.
+void DeviceCtl::submitSwitchBuffer(int gfxPipe) {
+  const auto packet = createPm4Packet(gnm::IT_SWITCH_BUFFER, 0);
+  mDevice->submitGfxCommand(gfxPipe, packet);
+}
+// Builds an IT_FLIP packet for process `pid` and submits it on `gfxPipe`.
+// The 64-bit flipArg is passed as its low 32 bits followed by its high 32
+// bits.
+void DeviceCtl::submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex,
+                           std::uint64_t flipArg) {
+  const auto packet = createPm4Packet(IT_FLIP, bufferIndex,
+                                      flipArg & 0xffff'ffff, // low half
+                                      flipArg >> 32,         // high half
+                                      pid);
+  mDevice->submitGfxCommand(gfxPipe, packet);
+}
+
+// Builds an IT_MAP_MEMORY packet for process `pid` and submits it on
+// `gfxPipe`. address, size and offset are each passed as low 32 bits followed
+// by high 32 bits.
+void DeviceCtl::submitMapMemory(int gfxPipe, std::uint32_t pid,
+                                std::uint64_t address, std::uint64_t size,
+                                int memoryType, int dmemIndex, int prot,
+                                std::int64_t offset) {
+  const auto packet =
+      createPm4Packet(IT_MAP_MEMORY, pid,
+                      address & 0xffff'ffff, address >> 32, // address lo/hi
+                      size & 0xffff'ffff, size >> 32,       // size lo/hi
+                      memoryType, dmemIndex, prot,
+                      offset & 0xffff'ffff, offset >> 32);  // offset lo/hi
+  mDevice->submitGfxCommand(gfxPipe, packet);
+}
+// Builds an IT_UNMAP_MEMORY packet for process `pid` covering
+// [address, address + size) and submits it on `gfxPipe`.
+void DeviceCtl::submitUnmapMemory(int gfxPipe, std::uint32_t pid,
+                                  std::uint64_t address, std::uint64_t size) {
+  const auto packet =
+      createPm4Packet(IT_UNMAP_MEMORY, pid,
+                      address & 0xffff'ffff, address >> 32, // address lo/hi
+                      size & 0xffff'ffff, size >> 32);      // size lo/hi
+  mDevice->submitGfxCommand(gfxPipe, packet);
+}
+
+// Builds an IT_MAP_PROCESS packet carrying (pid, vmId) and submits it on
+// `gfxPipe`.
+void DeviceCtl::submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId) {
+  const auto packet = createPm4Packet(gnm::IT_MAP_PROCESS, pid, vmId);
+  mDevice->submitGfxCommand(gfxPipe, packet);
+}
+
+// Builds an IT_UNMAP_PROCESS packet for `pid` and submits it on `gfxPipe`.
+void DeviceCtl::submitUnmapProcess(int gfxPipe, std::uint32_t pid) {
+  const auto packet = createPm4Packet(IT_UNMAP_PROCESS, pid);
+  mDevice->submitGfxCommand(gfxPipe, packet);
+}
+
+// Builds an IT_PROTECT_MEMORY packet that applies `prot` to
+// [address, address + size) of process `pid` and submits it on `gfxPipe`.
+void DeviceCtl::submitProtectMemory(int gfxPipe, std::uint32_t pid,
+                                    std::uint64_t address, std::uint64_t size,
+                                    int prot) {
+  const auto packet =
+      createPm4Packet(IT_PROTECT_MEMORY, pid,
+                      address & 0xffff'ffff, address >> 32, // address lo/hi
+                      size & 0xffff'ffff, size >> 32,       // size lo/hi
+                      prot);
+  mDevice->submitGfxCommand(gfxPipe, packet);
+}
+
+// Stores `buffer` in the buffer table of process `pid` at slot buffer.index.
+// Both buffer.attrId and buffer.index must be below 10; otherwise rx::die()
+// is invoked.
+void DeviceCtl::registerBuffer(std::uint32_t pid, Buffer buffer) {
+  // FIXME: submit command
+  auto &procInfo = mDevice->processInfo[pid];
+
+  const bool inRange = buffer.attrId < 10 && buffer.index < 10;
+  if (!inRange) {
+    rx::die("out of buffers %u, %u", buffer.attrId, buffer.index);
+  }
+
+  procInfo.buffers[buffer.index] = buffer;
+}
+
+// Stores `attr` in the attribute table of process `pid` at slot attr.attrId.
+// attr.attrId must be below 10; otherwise rx::die() is invoked.
+void DeviceCtl::registerBufferAttribute(std::uint32_t pid,
+                                        BufferAttribute attr) {
+  // FIXME: submit command
+  auto &procInfo = mDevice->processInfo[pid];
+
+  const bool inRange = attr.attrId < 10;
+  if (!inRange) {
+    rx::die("out of buffer attributes %u", attr.attrId);
+  }
+
+  procInfo.bufferAttributes[attr.attrId] = attr;
+}
+
+// Forwards to Device::start() on the wrapped device (no null check).
+void DeviceCtl::start() { mDevice->start(); }
+// Forwards to Device::waitForIdle() on the wrapped device (no null check).
+void DeviceCtl::waitForIdle() { mDevice->waitForIdle(); }
--- a/rpcsx/gpu/DeviceCtl.hpp
+++ b/rpcsx/gpu/DeviceCtl.hpp
@ -0,0 +1,48 @@
+#pragma once
+
+#include "DeviceContext.hpp"
+#include "orbis/utils/Rc.hpp"
+#include <cstdint>
+#include <span>
+
+namespace amdgpu {
+// Device.hpp defines Device with the `struct` class-key; the forward
+// declaration uses the same key so compilers do not report mismatched tags
+// (and so ABIs that encode the class key see one consistent name).
+struct Device;
+
+// Handle exposing the GPU device's public operations to code that should not
+// include Device.hpp. It holds an orbis::Ref to the device, so copies of a
+// handle refer to the same device.
+class DeviceCtl {
+  orbis::Ref<Device> mDevice;
+
+public:
+  // Creates a handle without a device.
+  DeviceCtl() noexcept;
+  // Wraps a device handed around as an opaque RcBase reference, such as
+  // orbis::g_context.gpuDevice. The reference is assumed to be a Device.
+  DeviceCtl(orbis::Ref<orbis::RcBase> device) noexcept;
+  DeviceCtl(DeviceCtl &&) noexcept;
+  DeviceCtl(const DeviceCtl &);
+  DeviceCtl &operator=(DeviceCtl &&) noexcept;
+  DeviceCtl &operator=(const DeviceCtl &);
+  ~DeviceCtl();
+
+  // Allocates a new device and returns a handle bound to it.
+  static DeviceCtl createDevice();
+  // Returns the shared state block of the device. Requires a non-empty
+  // handle; the implementation does not check for null.
+  DeviceContext &getContext();
+  // Returns the device as a type-erased reference suitable for storing in
+  // orbis::g_context.gpuDevice.
+  orbis::Ref<orbis::RcBase> getOpaque();
+
+  // Submits a 4-dword indirect-buffer packet on `gfxPipe`, with the top 8 bits
+  // of its last dword replaced by `vmId`. Other packets are fatal.
+  void submitGfxCommand(int gfxPipe, int vmId,
+                        std::span<const std::uint32_t> command);
+  // The submit* helpers below each build one PM4 packet from their arguments
+  // and submit it on `gfxPipe`; 64-bit values are split into low/high halves.
+  void submitSwitchBuffer(int gfxPipe);
+  void submitFlip(int gfxPipe, std::uint32_t pid, int bufferIndex,
+                  std::uint64_t flipArg);
+  void submitMapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address,
+                       std::uint64_t size, int memoryType, int dmemIndex,
+                       int prot, std::int64_t offset);
+  void submitUnmapMemory(int gfxPipe, std::uint32_t pid, std::uint64_t address,
+                         std::uint64_t size);
+  void submitMapProcess(int gfxPipe, std::uint32_t pid, int vmId);
+  void submitUnmapProcess(int gfxPipe, std::uint32_t pid);
+  void submitProtectMemory(int gfxPipe, std::uint32_t pid,
+                           std::uint64_t address, std::uint64_t size, int prot);
+  // Store a buffer / buffer attribute in the tables of process `pid`.
+  // Slot ids of 10 or more are fatal. These write device state directly
+  // rather than going through a submitted command.
+  void registerBuffer(std::uint32_t pid, Buffer buffer);
+  void registerBufferAttribute(std::uint32_t pid, BufferAttribute attr);
+  // Forward to the device's start() / waitForIdle().
+  void start();
+  void waitForIdle();
+
+  // True when the handle refers to a device.
+  explicit operator bool() const { return mDevice != nullptr; }
+};
+} // namespace amdgpu
--- a/rpcsx/iodev/dce.cpp
+++ b/rpcsx/iodev/dce.cpp
@ -1,4 +1,4 @@
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "iodev/dmem.hpp"
 #include "orbis/KernelAllocator.hpp"
@ -134,12 +134,13 @@ static void runBridge(int vmId) {
  std::thread{[=] {
    pthread_setname_np(pthread_self(), "Bridge");

-    auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>();
+    auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
+    auto &gpuCtx = gpu.getContext();
    std::vector<std::uint64_t> fetchedCommands;
-    fetchedCommands.reserve(std::size(gpu->cacheCommands));
+    fetchedCommands.reserve(std::size(gpuCtx.cacheCommands));

    while (true) {
-      for (auto &command : gpu->cacheCommands) {
+      for (auto &command : gpuCtx.cacheCommands) {
        std::uint64_t value = command[vmId].load(std::memory_order::relaxed);

        if (value != 0) {
@ -157,7 +158,7 @@ static void runBridge(int vmId) {
        auto count = static_cast<std::uint32_t>(command >> 32) + 1;

        auto pageFlags =
-            gpu->cachePages[vmId][page].load(std::memory_order::relaxed);
+            gpuCtx.cachePages[vmId][page].load(std::memory_order::relaxed);

        auto address = static_cast<std::uint64_t>(page) * rx::mem::pageSize;
        auto origVmProt = vm::getPageProtection(address);
@ -253,7 +254,9 @@ static void initDceMemory(DceDevice *device) {
 static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
                                  void *argp, orbis::Thread *thread) {
  auto device = static_cast<DceDevice *>(file->device.get());
-  auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>();
+
+  auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
+  auto &gpuCtx = gpu.getContext();

  if (request == 0xc0308203) {
    // returns:
@ -299,11 +302,11 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,

      FlipControlStatus flipStatus{};
      // TODO: lock bridge header
-      flipStatus.flipArg = gpu->flipArg[thread->tproc->vmId];
-      flipStatus.count = gpu->flipCount[thread->tproc->vmId];
+      flipStatus.flipArg = gpuCtx.flipArg[thread->tproc->vmId];
+      flipStatus.count = gpuCtx.flipCount[thread->tproc->vmId];
      flipStatus.processTime = 0; // TODO
      flipStatus.tsc = 0;         // TODO
-      flipStatus.currentBuffer = gpu->flipBuffer[thread->tproc->vmId];
+      flipStatus.currentBuffer = gpuCtx.flipBuffer[thread->tproc->vmId];
      flipStatus.flipPendingNum0 = 0; // TODO
      flipStatus.gcQueueNum = 0;      // TODO
      flipStatus.flipPendingNum1 = 0; // TODO
@ -333,7 +336,7 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
      *(std::uint64_t *)args->size = kDceControlMemorySize;  // size
    } else if (args->id == 31) {
      if ((std::uint64_t)args->ptr == 0xc) {
-        gpu->bufferInUseAddress[thread->tproc->vmId] = args->size;
+        gpuCtx.bufferInUseAddress[thread->tproc->vmId] = args->size;
      } else if ((std::uint64_t)args->ptr != 1) {
        ORBIS_LOG_ERROR("buffer in use", args->ptr, args->size);
        thread->where();
@ -362,13 +365,13 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
    ORBIS_LOG_ERROR("dce: RegisterBuffer", args->canary, args->index,
                    args->address, args->address2);

-    gpu->registerBuffer(thread->tproc->pid, {
-                                                .canary = args->canary,
-                                                .index = args->index,
-                                                .attrId = args->attrid,
-                                                .address = args->address,
-                                                .address2 = args->address2,
-                                            });
+    gpu.registerBuffer(thread->tproc->pid, {
+                                               .canary = args->canary,
+                                               .index = args->index,
+                                               .attrId = args->attrid,
+                                               .address = args->address,
+                                               .address2 = args->address2,
+                                           });
    return {};
  }

@ -381,17 +384,17 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
                    args->unk4_zero, args->unk5_zero, args->options,
                    args->reserved1, args->reserved2);

-    gpu->registerBufferAttribute(thread->tproc->pid,
-                                 {
-                                     .attrId = args->attrid,
-                                     .submit = args->submit,
-                                     .canary = args->canary,
-                                     .pixelFormat = args->pixelFormat,
-                                     .tilingMode = args->tilingMode,
-                                     .pitch = args->pitch,
-                                     .width = args->width,
-                                     .height = args->height,
-                                 });
+    gpu.registerBufferAttribute(thread->tproc->pid,
+                                {
+                                    .attrId = args->attrid,
+                                    .submit = args->submit,
+                                    .canary = args->canary,
+                                    .pixelFormat = args->pixelFormat,
+                                    .tilingMode = args->tilingMode,
+                                    .pitch = args->pitch,
+                                    .width = args->width,
+                                    .height = args->height,
+                                });

    return {};
  }
@ -404,9 +407,9 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
    //                 args->displayBufferIndex, args->flipMode, args->unk1,
    //                 args->flipArg, args->flipArg2, args->eop_nz, args->unk2,
    //                 args->eop_val, args->unk3, args->unk4, args->rout);
-    gpu->submitFlip(thread->tproc->gfxRing, thread->tproc->pid,
-                    args->displayBufferIndex,
-                    /*args->flipMode,*/ args->flipArg);
+    gpu.submitFlip(thread->tproc->gfxRing, thread->tproc->pid,
+                   args->displayBufferIndex,
+                   /*args->flipMode,*/ args->flipArg);

    // *args->rout = 0;
    return {};
@ -470,8 +473,8 @@ orbis::ErrorCode DceDevice::open(orbis::Ref<orbis::File> *file,

    std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
    {
-      auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>();
-      gpu->submitMapProcess(thread->tproc->gfxRing, thread->tproc->pid, vmId);
+      auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
+      gpu.submitMapProcess(thread->tproc->gfxRing, thread->tproc->pid, vmId);
      thread->tproc->vmId = vmId;
    }

--- a/rpcsx/iodev/dmem.cpp
+++ b/rpcsx/iodev/dmem.cpp
@ -1,5 +1,5 @@
 #include "dmem.hpp"
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "orbis/KernelAllocator.hpp"
 #include "orbis/KernelContext.hpp"
@ -68,8 +68,8 @@ orbis::ErrorCode DmemDevice::mmap(void **address, std::uint64_t len,
    return orbis::ErrorCode::INVAL;
  }

-  if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-    gpu->submitMapMemory(orbis::g_currentThread->tproc->gfxRing,
+  if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+    gpu.submitMapMemory(orbis::g_currentThread->tproc->gfxRing,
                         orbis::g_currentThread->tproc->pid,
                         reinterpret_cast<std::uint64_t>(result), len,
                         memoryType, index, prot, directMemoryStart);
--- a/rpcsx/iodev/gc.cpp
+++ b/rpcsx/iodev/gc.cpp
@ -1,4 +1,4 @@
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "iodev/dmem.hpp"
 #include "orbis/KernelAllocator.hpp"
@ -8,12 +8,12 @@
 #include "orbis/thread/Thread.hpp"
 #include "orbis/utils/Logs.hpp"
 #include "orbis/utils/SharedMutex.hpp"
+#include "rx/die.hpp"
 #include "vm.hpp"
 #include <cstdio>
 #include <mutex>
 #include <print>
 #include <sys/mman.h>
-#include <unordered_map>

 struct ComputeQueue {
  std::uint64_t ringBaseAddress{};
@ -84,9 +84,9 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
    };

    auto args = reinterpret_cast<Args *>(argp);
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
      for (unsigned i = 0; i < args->count; ++i) {
-        gpu->submitGfxCommand(gcFile->gfxPipe,
+        gpu.submitGfxCommand(gcFile->gfxPipe,
                              orbis::g_currentThread->tproc->vmId,
                              {args->cmds + i * 4, 4});
      }
@ -103,8 +103,8 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
    };

    auto args = reinterpret_cast<Args *>(argp);
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-      gpu->submitSwitchBuffer(orbis::g_currentThread->tproc->vmId);
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      gpu.submitSwitchBuffer(orbis::g_currentThread->tproc->vmId);
    } else {
      return orbis::ErrorCode::INVAL;
    }
@ -124,9 +124,9 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,

    auto args = reinterpret_cast<Args *>(argp);

-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
      for (unsigned i = 0; i < args->count; ++i) {
-        gpu->submitGfxCommand(gcFile->gfxPipe,
+        gpu.submitGfxCommand(gcFile->gfxPipe,
                              orbis::g_currentThread->tproc->vmId,
                              {args->cmds + i * 4, 4});
      }
@ -139,8 +139,8 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
  }

  case 0xc0048116: { // submit done?
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-      gpu->waitForIdle();
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      gpu.waitForIdle();
    } else {
      return orbis::ErrorCode::INVAL;
    }
--- a/rpcsx/iodev/hid.cpp
+++ b/rpcsx/iodev/hid.cpp
@ -1,4 +1,4 @@
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "orbis/KernelAllocator.hpp"
 #include "orbis/KernelContext.hpp"
@ -51,8 +51,8 @@ static orbis::ErrorCode hid_ioctl(orbis::File *file, std::uint64_t request,
    // ORBIS_LOG_ERROR("hid read state", args.hidId, args.unk0, args.state,
    //                 args.unk2, args.connected, args.unk4, args.unk5);

-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-      *args.state = gpu->kbPadState;
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      *args.state = gpu.getContext().kbPadState;
      *args.connected = 1;
      *args.unk4 = 1; // is wireless?
      thread->retval[0] = 1;
@ -75,9 +75,9 @@ static orbis::ErrorCode hid_ioctl(orbis::File *file, std::uint64_t request,
      orbis::uint padding;
      orbis::ptr<orbis::uint> unk5;
    };
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
      auto args = *reinterpret_cast<MiniReadStateArgs *>(argp);
-      *args.state = gpu->kbPadState;
+      *args.state = gpu.getContext().kbPadState;
      thread->retval[0] = 1;
    }
    return {};
--- a/rpcsx/main.cpp
+++ b/rpcsx/main.cpp
@ -1,6 +1,6 @@
 #include "audio/AlsaDevice.hpp"
 #include "backtrace.hpp"
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "io-devices.hpp"
 #include "iodev/mbus.hpp"
@ -77,22 +77,23 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
      prot |= PROT_EXEC;
    }

-    auto gpuDevice = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>();
+    auto gpuDevice = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};

    if (gpuDevice && (prot & (isWrite ? PROT_WRITE : PROT_READ)) != 0) {
+      auto &gpuContext = gpuDevice.getContext();
      while (true) {
        auto flags =
-            gpuDevice->cachePages[vmid][page].load(std::memory_order::relaxed);
+            gpuContext.cachePages[vmid][page].load(std::memory_order::relaxed);

        if ((flags & amdgpu::kPageReadWriteLock) != 0) {
          if ((flags & amdgpu::kPageLazyLock) != 0) {
            if (std::uint32_t gpuCommand = 0;
-                !gpuDevice->gpuCacheCommand[vmid].compare_exchange_weak(
+                !gpuContext.gpuCacheCommand[vmid].compare_exchange_weak(
                    gpuCommand, page)) {
              continue;
            }

-            while (!gpuDevice->cachePages[vmid][page].compare_exchange_weak(
+            while (!gpuContext.cachePages[vmid][page].compare_exchange_weak(
                flags, flags & ~amdgpu::kPageLazyLock,
                std::memory_order::relaxed)) {
            }
@ -109,7 +110,7 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
          break;
        }

-        if (gpuDevice->cachePages[vmid][page].compare_exchange_weak(
+        if (gpuContext.cachePages[vmid][page].compare_exchange_weak(
                flags, amdgpu::kPageInvalidated, std::memory_order::relaxed)) {
          break;
        }
--- a/rpcsx/vm.cpp
+++ b/rpcsx/vm.cpp
@ -1,5 +1,5 @@
 #include "vm.hpp"
-#include "gpu/Device.hpp"
+#include "gpu/DeviceCtl.hpp"
 #include "io-device.hpp"
 #include "iodev/dmem.hpp"
 #include "orbis/KernelContext.hpp"
@ -932,9 +932,9 @@ void *vm::map(void *addr, std::uint64_t len, std::int32_t prot,

  if (auto thr = orbis::g_currentThread) {
    std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-      gpu->submitMapMemory(thr->tproc->gfxRing, thr->tproc->pid, address, len,
-                           -1, -1, prot, address - kMinAddress);
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      gpu.submitMapMemory(thr->tproc->gfxRing, thr->tproc->pid, address, len,
+                          -1, -1, prot, address - kMinAddress);
    }
  }

@ -989,9 +989,9 @@ bool vm::unmap(void *addr, std::uint64_t size) {
      (address & kBlockMask) >> kPageShift, pages, ~0);
  if (auto thr = orbis::g_currentThread) {
    std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-      gpu->submitUnmapMemory(thr->tproc->gfxRing, thr->tproc->pid, address,
-                             size);
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      gpu.submitUnmapMemory(thr->tproc->gfxRing, thr->tproc->pid, address,
+                            size);
    }
  } else {
    std::println(stderr, "ignoring mapping {:x}-{:x}", address, address + size);
@ -1031,8 +1031,8 @@ bool vm::protect(void *addr, std::uint64_t size, std::int32_t prot) {
  if (auto thr = orbis::g_currentThread) {
    std::println("memory prot: {:x}", prot);
    std::lock_guard lock(orbis::g_context.gpuDeviceMtx);
-    if (auto gpu = orbis::g_context.gpuDevice.staticCast<amdgpu::Device>()) {
-      gpu->submitProtectMemory(thr->tproc->gfxRing, thr->tproc->pid, address,
+    if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) {
+      gpu.submitProtectMemory(thr->tproc->gfxRing, thr->tproc->pid, address,
                               size, prot);
    }
  } else {