rpcsx-gpu: add multiprocess support

This commit is contained in:
DH 2024-09-01 17:43:45 +03:00
parent f77376c1e3
commit 2c781626d3
14 changed files with 746 additions and 450 deletions

View file

@ -1,14 +1,12 @@
#pragma once
#include <orbis/utils/SharedMutex.hpp>
#include <atomic>
#include <cstdint>
#include <cstring>
#include <initializer_list>
#include <orbis/utils/SharedMutex.hpp>
namespace amdgpu::bridge {
extern std::uint32_t expGpuPid;
struct PadState {
std::uint64_t timestamp;
std::uint32_t unk;
@ -47,7 +45,9 @@ enum class CommandId : std::uint32_t {
ProtectMemory,
CommandBuffer,
Flip,
MapDmem,
MapMemory,
MapProcess,
UnmapProcess,
};
struct CmdMemoryProt {
@ -79,15 +79,25 @@ struct CmdFlip {
std::uint64_t arg;
};
struct CmdMapDmem {
std::uint64_t offset;
// Bridge command payload for CommandId::MapMemory: describes a guest memory
// mapping that the GPU process must mirror. Filled by the command decoder
// from the argument list packed by BridgePusher::sendMapMemory.
struct CmdMapMemory {
// Signed offset into the backing store; presumably negative means "no
// backing offset" — TODO confirm against the receiver.
std::int64_t offset;
std::uint64_t address; // guest virtual address of the mapping
std::uint64_t size;    // mapping length in bytes
std::uint32_t prot;    // protection flags; encoding not visible in this header
std::uint32_t pid;     // guest process the mapping belongs to
std::int32_t memoryType; // platform memory type; semantics not visible here
std::uint32_t dmemIndex; // direct-memory (dmem) area index
};
// Bridge command payload for CommandId::MapProcess: attaches a guest process
// (pid) to a GPU-side virtual-memory slot (vmId).
struct CmdMapProcess {
std::uint64_t pid;
// GPU-side VM slot index. NOTE(review): sendMapProcess passes `unsigned`
// while this field is signed `int` — confirm the intended range.
int vmId;
};
// Bridge command payload for CommandId::UnmapProcess: detaches a guest
// process previously registered via MapProcess.
struct CmdUnmapProcess {
std::uint64_t pid;
};
enum {
kPageWriteWatch = 1 << 0,
kPageReadWriteLock = 1 << 1,
@ -112,17 +122,15 @@ struct BridgeHeader {
volatile std::uint64_t flipArg;
volatile std::uint64_t flipCount;
volatile std::uint64_t bufferInUseAddress;
std::uint32_t memoryAreaCount;
std::uint32_t commandBufferCount;
std::uint32_t bufferCount;
CmdMemoryProt memoryAreas[512];
CmdCommandBuffer commandBuffers[32];
CmdBuffer buffers[10];
// orbis::shared_mutex cacheCommandMtx;
// orbis::shared_cv cacheCommandCv;
std::atomic<std::uint64_t> cacheCommands[4];
std::atomic<std::uint32_t> gpuCacheCommand;
std::atomic<std::uint8_t> cachePages[0x100'0000'0000 / kHostPageSize];
std::atomic<std::uint64_t> cacheCommands[6][4];
std::atomic<std::uint32_t> gpuCacheCommand[6];
std::atomic<std::uint8_t> cachePages[6][0x100'0000'0000 / kHostPageSize];
volatile std::uint64_t pull;
volatile std::uint64_t push;
@ -137,7 +145,9 @@ struct Command {
CmdCommandBuffer commandBuffer;
CmdBuffer buffer;
CmdFlip flip;
CmdMapDmem mapDmem;
CmdMapMemory mapMemory;
CmdMapProcess mapProcess;
CmdUnmapProcess unmapProcess;
};
};
@ -160,29 +170,32 @@ struct BridgePusher {
void sendMemoryProtect(std::uint32_t pid, std::uint64_t address,
std::uint64_t size, std::uint32_t prot) {
if (pid == expGpuPid) {
sendCommand(CommandId::ProtectMemory, {pid, address, size, prot});
}
sendCommand(CommandId::ProtectMemory, {pid, address, size, prot});
}
void sendMapDmem(std::uint32_t pid, std::uint32_t dmemIndex, std::uint64_t address, std::uint64_t size, std::uint32_t prot, std::uint64_t offset) {
// if (pid == expGpuPid) {
sendCommand(CommandId::MapDmem, {pid, dmemIndex, address, size, prot, offset});
// }
// Enqueue a MapMemory command on the bridge ring. Arguments are flattened
// into the uint64 argument list in exactly the order the receiving decoder
// reads them back: pid, memoryType, dmemIndex, address, size, prot, offset.
void sendMapMemory(std::uint32_t pid, std::uint32_t memoryType,
std::uint32_t dmemIndex, std::uint64_t address,
std::uint64_t size, std::uint32_t prot,
std::uint64_t offset) {
sendCommand(CommandId::MapMemory,
{pid, memoryType, dmemIndex, address, size, prot, offset});
}
void sendCommandBuffer(std::uint32_t pid, std::uint64_t queue,
std::uint64_t address, std::uint64_t size) {
// if (pid == expGpuPid) {
sendCommand(CommandId::CommandBuffer, {pid, queue, address, size});
// }
sendCommand(CommandId::CommandBuffer, {pid, queue, address, size});
}
void sendFlip(std::uint32_t pid, std::uint32_t bufferIndex,
std::uint64_t arg) {
// if (pid == expGpuPid) {
sendCommand(CommandId::Flip, {pid, bufferIndex, arg});
// }
sendCommand(CommandId::Flip, {pid, bufferIndex, arg});
}
// Enqueue a MapProcess command binding guest process `pid` to GPU VM slot
// `vmId`; the decoder unpacks the arguments in this order (pid, vmId).
void sendMapProcess(std::uint32_t pid, unsigned vmId) {
sendCommand(CommandId::MapProcess, {pid, vmId});
}
// Enqueue an UnmapProcess command releasing whatever VM slot was assigned
// to guest process `pid` by a prior MapProcess.
void sendUnmapProcess(std::uint32_t pid) {
sendCommand(CommandId::UnmapProcess, {pid});
}
void wait() {
@ -198,7 +211,8 @@ private:
void sendCommand(CommandId id, std::initializer_list<std::uint64_t> args) {
std::uint64_t exp = 0;
while (!header->lock.compare_exchange_weak(exp, 1, std::memory_order::acquire, std::memory_order::relaxed)) {
while (!header->lock.compare_exchange_weak(
exp, 1, std::memory_order::acquire, std::memory_order::relaxed)) {
exp = 0;
}
@ -303,13 +317,23 @@ private:
result.flip.arg = args[2];
return result;
case CommandId::MapDmem:
result.mapDmem.pid = args[0];
result.mapDmem.dmemIndex = args[1];
result.mapDmem.address = args[2];
result.mapDmem.size = args[3];
result.mapDmem.prot = args[4];
result.mapDmem.offset = args[5];
case CommandId::MapMemory:
result.mapMemory.pid = args[0];
result.mapMemory.memoryType = args[1];
result.mapMemory.dmemIndex = args[2];
result.mapMemory.address = args[3];
result.mapMemory.size = args[4];
result.mapMemory.prot = args[5];
result.mapMemory.offset = args[6];
return result;
case CommandId::MapProcess:
result.mapProcess.pid = args[0];
result.mapProcess.vmId = args[1];
return result;
case CommandId::UnmapProcess:
result.unmapProcess.pid = args[0];
return result;
}

View file

@ -8,8 +8,6 @@
static int gShmFd = -1;
static constexpr std::size_t kShmSize = sizeof(amdgpu::bridge::BridgeHeader) +
(sizeof(std::uint64_t) * 256);
std::uint32_t amdgpu::bridge::expGpuPid = 0;
amdgpu::bridge::BridgeHeader *
amdgpu::bridge::createShmCommandBuffer(const char *name) {
if (gShmFd != -1) {

View file

@ -1,5 +1,6 @@
#pragma once
#include "amdgpu/RemoteMemory.hpp"
#include "amdgpu/bridge/bridge.hpp"
#include "amdgpu/shader/Instruction.hpp"
#include "gpu-scheduler.hpp"
@ -1259,6 +1260,42 @@ struct GnmTBuffer {
static_assert(sizeof(GnmTBuffer) == sizeof(std::uint64_t) * 4);
// GNM sampler state descriptor: four 32-bit words of packed bitfields
// (size pinned to 16 bytes by the static_assert below).
// NOTE(review): the widths appear to mirror a hardware sampler register
// layout — do not reorder or repack fields. Comparisons are presumably used
// to deduplicate/cache sampler states — confirm at call sites.
struct GnmSSampler {
// --- word 0 (bits sum to 32) ---
int32_t clamp_x : 3;
int32_t clamp_y : 3;
int32_t clamp_z : 3;
int32_t max_aniso_ratio : 3;
int32_t depth_compare_func : 3;
int32_t force_unorm_coords : 1;
int32_t aniso_threshold : 3;
int32_t mc_coord_trunc : 1;
int32_t force_degamma : 1;
int32_t aniso_bias : 6;
int32_t trunc_coord : 1;
int32_t disable_cube_wrap : 1;
int32_t filter_mode : 2;
int32_t : 1; // reserved/padding
// --- word 1 ---
int32_t min_lod : 12;
int32_t max_lod : 12;
int32_t perf_mip : 4;
int32_t perf_z : 4;
// --- word 2 ---
int32_t lod_bias : 14;
int32_t lod_bias_sec : 6;
int32_t xy_mag_filter : 2;
int32_t xy_min_filter : 2;
int32_t z_filter : 2;
int32_t mip_filter : 2;
int32_t : 4; // reserved/padding
// --- word 3 ---
int32_t border_color_ptr : 12;
int32_t : 18; // reserved/padding
int32_t border_color_type : 2;
// Defaulted comparisons: member-wise ordering/equality over all fields.
auto operator<=>(const GnmSSampler &) const = default;
bool operator==(const GnmSSampler &) const = default;
};
static_assert(sizeof(GnmSSampler) == sizeof(std::uint32_t) * 4);
constexpr auto kPageSize = 0x4000;
void setVkDevice(VkDevice device,
@ -1266,11 +1303,11 @@ void setVkDevice(VkDevice device,
VkPhysicalDeviceProperties devProperties);
struct AmdgpuDevice {
void handleProtectMemory(std::uint64_t address, std::uint64_t size,
std::uint32_t prot);
void handleCommandBuffer(std::uint64_t queueId, std::uint64_t address,
std::uint64_t size);
bool handleFlip(VkQueue queue, VkCommandBuffer cmdBuffer,
void handleProtectMemory(RemoteMemory memory, std::uint64_t address,
std::uint64_t size, std::uint32_t prot);
void handleCommandBuffer(RemoteMemory memory, std::uint64_t queueId,
std::uint64_t address, std::uint64_t size);
bool handleFlip(RemoteMemory memory, VkQueue queue, VkCommandBuffer cmdBuffer,
TaskChain &initTaskChain, std::uint32_t bufferIndex,
std::uint64_t arg, VkImage targetImage,
VkExtent2D targetExtent, VkSemaphore waitSemaphore,

File diff suppressed because it is too large Load diff

View file

@ -3,10 +3,11 @@
namespace amdgpu {
struct RemoteMemory {
char *shmPointer;
int vmId;
template <typename T = void> T *getPointer(std::uint64_t address) const {
return address ? reinterpret_cast<T *>(shmPointer + address - 0x40000)
return address ? reinterpret_cast<T *>(
static_cast<std::uint64_t>(vmId) << 40 | address)
: nullptr;
}
};