rpcsx-gpu: add multiprocess support

This commit is contained in:
DH 2024-09-01 17:43:45 +03:00
parent f77376c1e3
commit 2c781626d3
14 changed files with 746 additions and 450 deletions

View file

@ -1,14 +1,12 @@
#pragma once
#include <orbis/utils/SharedMutex.hpp>
#include <atomic>
#include <cstdint>
#include <cstring>
#include <initializer_list>
#include <orbis/utils/SharedMutex.hpp>
namespace amdgpu::bridge {
extern std::uint32_t expGpuPid;
struct PadState {
std::uint64_t timestamp;
std::uint32_t unk;
@ -47,7 +45,9 @@ enum class CommandId : std::uint32_t {
ProtectMemory,
CommandBuffer,
Flip,
MapDmem,
MapMemory,
MapProcess,
UnmapProcess,
};
struct CmdMemoryProt {
@ -79,15 +79,25 @@ struct CmdFlip {
std::uint64_t arg;
};
struct CmdMapDmem {
std::uint64_t offset;
// Request to map a region of guest memory into the GPU process.
// Produced by the OS side (sendMapMemory) and consumed by the rpcsx-gpu
// MapMemory command handler, which mmaps the backing fd at the guest address.
struct CmdMapMemory {
std::int64_t offset; // offset into the backing fd (dmem or per-process shm)
std::uint64_t address; // guest virtual address of the mapping
std::uint64_t size; // length of the mapping in bytes
std::uint32_t prot; // protection flags; GPU bits are taken from the high nibble (prot >> 4)
std::uint32_t pid; // guest process that owns the mapping
std::int32_t memoryType; // backing type; negative means process shm rather than dmem
std::uint32_t dmemIndex; // dmem device index, consulted when memoryType >= 0
};
// Notifies the GPU process that a guest process was assigned a VM slot,
// so its shared-memory view can be mapped into the slot's address window.
struct CmdMapProcess {
std::uint64_t pid; // guest process id
int vmId; // assigned VM slot index (selects the per-process bridge arrays)
};
// Notifies the GPU process that a guest process released its VM slot;
// the handler unmaps the slot's window and closes the per-process shm fd.
struct CmdUnmapProcess {
std::uint64_t pid; // guest process id whose slot is being released
};
enum {
kPageWriteWatch = 1 << 0,
kPageReadWriteLock = 1 << 1,
@ -112,17 +122,15 @@ struct BridgeHeader {
volatile std::uint64_t flipArg;
volatile std::uint64_t flipCount;
volatile std::uint64_t bufferInUseAddress;
std::uint32_t memoryAreaCount;
std::uint32_t commandBufferCount;
std::uint32_t bufferCount;
CmdMemoryProt memoryAreas[512];
CmdCommandBuffer commandBuffers[32];
CmdBuffer buffers[10];
// orbis::shared_mutex cacheCommandMtx;
// orbis::shared_cv cacheCommandCv;
std::atomic<std::uint64_t> cacheCommands[4];
std::atomic<std::uint32_t> gpuCacheCommand;
std::atomic<std::uint8_t> cachePages[0x100'0000'0000 / kHostPageSize];
std::atomic<std::uint64_t> cacheCommands[6][4];
std::atomic<std::uint32_t> gpuCacheCommand[6];
std::atomic<std::uint8_t> cachePages[6][0x100'0000'0000 / kHostPageSize];
volatile std::uint64_t pull;
volatile std::uint64_t push;
@ -137,7 +145,9 @@ struct Command {
CmdCommandBuffer commandBuffer;
CmdBuffer buffer;
CmdFlip flip;
CmdMapDmem mapDmem;
CmdMapMemory mapMemory;
CmdMapProcess mapProcess;
CmdUnmapProcess unmapProcess;
};
};
@ -160,29 +170,32 @@ struct BridgePusher {
void sendMemoryProtect(std::uint32_t pid, std::uint64_t address,
std::uint64_t size, std::uint32_t prot) {
if (pid == expGpuPid) {
sendCommand(CommandId::ProtectMemory, {pid, address, size, prot});
}
sendCommand(CommandId::ProtectMemory, {pid, address, size, prot});
}
void sendMapDmem(std::uint32_t pid, std::uint32_t dmemIndex, std::uint64_t address, std::uint64_t size, std::uint32_t prot, std::uint64_t offset) {
// if (pid == expGpuPid) {
sendCommand(CommandId::MapDmem, {pid, dmemIndex, address, size, prot, offset});
// }
// Queues a MapMemory command describing a new guest mapping.
// The argument order must match the unpacking in the CommandId::MapMemory
// decoder: pid, memoryType, dmemIndex, address, size, prot, offset.
void sendMapMemory(std::uint32_t pid, std::uint32_t memoryType,
std::uint32_t dmemIndex, std::uint64_t address,
std::uint64_t size, std::uint32_t prot,
std::uint64_t offset) {
sendCommand(CommandId::MapMemory,
{pid, memoryType, dmemIndex, address, size, prot, offset});
}
void sendCommandBuffer(std::uint32_t pid, std::uint64_t queue,
std::uint64_t address, std::uint64_t size) {
// if (pid == expGpuPid) {
sendCommand(CommandId::CommandBuffer, {pid, queue, address, size});
// }
sendCommand(CommandId::CommandBuffer, {pid, queue, address, size});
}
void sendFlip(std::uint32_t pid, std::uint32_t bufferIndex,
std::uint64_t arg) {
// if (pid == expGpuPid) {
sendCommand(CommandId::Flip, {pid, bufferIndex, arg});
// }
sendCommand(CommandId::Flip, {pid, bufferIndex, arg});
}
// Announces that `pid` acquired VM slot `vmId` (see CmdMapProcess).
void sendMapProcess(std::uint32_t pid, unsigned vmId) {
sendCommand(CommandId::MapProcess, {pid, vmId});
}
// Announces that `pid` released its VM slot (see CmdUnmapProcess).
void sendUnmapProcess(std::uint32_t pid) {
sendCommand(CommandId::UnmapProcess, {pid});
}
void wait() {
@ -198,7 +211,8 @@ private:
void sendCommand(CommandId id, std::initializer_list<std::uint64_t> args) {
std::uint64_t exp = 0;
while (!header->lock.compare_exchange_weak(exp, 1, std::memory_order::acquire, std::memory_order::relaxed)) {
while (!header->lock.compare_exchange_weak(
exp, 1, std::memory_order::acquire, std::memory_order::relaxed)) {
exp = 0;
}
@ -303,13 +317,23 @@ private:
result.flip.arg = args[2];
return result;
case CommandId::MapDmem:
result.mapDmem.pid = args[0];
result.mapDmem.dmemIndex = args[1];
result.mapDmem.address = args[2];
result.mapDmem.size = args[3];
result.mapDmem.prot = args[4];
result.mapDmem.offset = args[5];
case CommandId::MapMemory:
result.mapMemory.pid = args[0];
result.mapMemory.memoryType = args[1];
result.mapMemory.dmemIndex = args[2];
result.mapMemory.address = args[3];
result.mapMemory.size = args[4];
result.mapMemory.prot = args[5];
result.mapMemory.offset = args[6];
return result;
case CommandId::MapProcess:
result.mapProcess.pid = args[0];
result.mapProcess.vmId = args[1];
return result;
case CommandId::UnmapProcess:
result.unmapProcess.pid = args[0];
return result;
}

View file

@ -8,8 +8,6 @@
static int gShmFd = -1;
static constexpr std::size_t kShmSize = sizeof(amdgpu::bridge::BridgeHeader) +
(sizeof(std::uint64_t) * 256);
std::uint32_t amdgpu::bridge::expGpuPid = 0;
amdgpu::bridge::BridgeHeader *
amdgpu::bridge::createShmCommandBuffer(const char *name) {
if (gShmFd != -1) {

View file

@ -1,5 +1,6 @@
#pragma once
#include "amdgpu/RemoteMemory.hpp"
#include "amdgpu/bridge/bridge.hpp"
#include "amdgpu/shader/Instruction.hpp"
#include "gpu-scheduler.hpp"
@ -1259,6 +1260,42 @@ struct GnmTBuffer {
static_assert(sizeof(GnmTBuffer) == sizeof(std::uint64_t) * 4);
// GCN S# sampler descriptor, 4 dwords wide (enforced by the static_assert
// below). The bitfield layout appears to mirror the hardware sampler
// register words — TODO confirm against the AMD GCN/Sea Islands ISA docs.
struct GnmSSampler {
// word 0: addressing, anisotropy and filtering mode control
int32_t clamp_x : 3;
int32_t clamp_y : 3;
int32_t clamp_z : 3;
int32_t max_aniso_ratio : 3;
int32_t depth_compare_func : 3;
int32_t force_unorm_coords : 1;
int32_t aniso_threshold : 3;
int32_t mc_coord_trunc : 1;
int32_t force_degamma : 1;
int32_t aniso_bias : 6;
int32_t trunc_coord : 1;
int32_t disable_cube_wrap : 1;
int32_t filter_mode : 2;
int32_t : 1; // reserved
// word 1: LOD clamping range plus performance tuning fields
int32_t min_lod : 12;
int32_t max_lod : 12;
int32_t perf_mip : 4;
int32_t perf_z : 4;
// word 2: LOD bias and the per-axis/mip filter selectors
int32_t lod_bias : 14;
int32_t lod_bias_sec : 6;
int32_t xy_mag_filter : 2;
int32_t xy_min_filter : 2;
int32_t z_filter : 2;
int32_t mip_filter : 2;
int32_t : 4; // reserved
// word 3: border color selection
int32_t border_color_ptr : 12;
int32_t : 18; // reserved
int32_t border_color_type : 2;
// Defaulted comparisons let whole descriptors be compared bitwise-by-field,
// e.g. for sampler-state deduplication/caching.
auto operator<=>(const GnmSSampler &) const = default;
bool operator==(const GnmSSampler &) const = default;
};
static_assert(sizeof(GnmSSampler) == sizeof(std::uint32_t) * 4);
constexpr auto kPageSize = 0x4000;
void setVkDevice(VkDevice device,
@ -1266,11 +1303,11 @@ void setVkDevice(VkDevice device,
VkPhysicalDeviceProperties devProperties);
struct AmdgpuDevice {
void handleProtectMemory(std::uint64_t address, std::uint64_t size,
std::uint32_t prot);
void handleCommandBuffer(std::uint64_t queueId, std::uint64_t address,
std::uint64_t size);
bool handleFlip(VkQueue queue, VkCommandBuffer cmdBuffer,
void handleProtectMemory(RemoteMemory memory, std::uint64_t address,
std::uint64_t size, std::uint32_t prot);
void handleCommandBuffer(RemoteMemory memory, std::uint64_t queueId,
std::uint64_t address, std::uint64_t size);
bool handleFlip(RemoteMemory memory, VkQueue queue, VkCommandBuffer cmdBuffer,
TaskChain &initTaskChain, std::uint32_t bufferIndex,
std::uint64_t arg, VkImage targetImage,
VkExtent2D targetExtent, VkSemaphore waitSemaphore,

File diff suppressed because it is too large Load diff

View file

@ -3,10 +3,11 @@
namespace amdgpu {
struct RemoteMemory {
char *shmPointer;
int vmId;
template <typename T = void> T *getPointer(std::uint64_t address) const {
return address ? reinterpret_cast<T *>(shmPointer + address - 0x40000)
return address ? reinterpret_cast<T *>(
static_cast<std::uint64_t>(vmId) << 40 | address)
: nullptr;
}
};

View file

@ -7,7 +7,14 @@ struct AuthInfo {
uint64_t unk0;
uint64_t caps[4];
uint64_t attrs[4];
uint64_t unk[8];
uint64_t ucred[8];
bool hasUseHp3dPipeCapability() const {
return ucred[2] == 0x3800000000000009;
}
bool hasMmapSelfCapability() const { return ((ucred[4] >> 0x3a) & 1) != 1; }
bool hasSystemCapability() const { return ((ucred[3] >> 0x3e) & 1) != 0; }
bool hasSceProgramAttribute() const { return ((ucred[3] >> 0x1f) & 1) != 0; }
};
static_assert(sizeof(AuthInfo) == 136);

View file

@ -51,6 +51,7 @@ struct Process final {
ProcessState state = ProcessState::NEW;
Process *parentProcess = nullptr;
shared_mutex mtx;
int vmId = -1;
void (*onSysEnter)(Thread *thread, int id, uint64_t *args,
int argsCount) = nullptr;
void (*onSysExit)(Thread *thread, int id, uint64_t *args, int argsCount,

View file

@ -8,4 +8,5 @@ add_executable(rpcsx-gpu
target_include_directories(rpcsx-gpu PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
target_link_libraries(rpcsx-gpu PUBLIC amdgpu::bridge amdgpu::device glfw Vulkan::Vulkan rx)
set_target_properties(rpcsx-gpu PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
# NOTE(review): this sets link options on rpcsx-os from within the rpcsx-gpu
# CMakeLists — verify the target name. rpcsx-gpu reserves guest VA
# [0x40000, 0x60000000000) at runtime, so placing an executable's text segment
# above that range is what this option appears intended for.
target_link_options(rpcsx-os PUBLIC "LINKER:-Ttext-segment,0x0000060000000000")
install(TARGETS rpcsx-gpu RUNTIME DESTINATION bin)

View file

@ -1,7 +1,9 @@
#include "amdgpu/RemoteMemory.hpp"
#include "amdgpu/device/gpu-scheduler.hpp"
#include "amdgpu/device/vk.hpp"
#include "rx/MemoryTable.hpp"
#include "rx/Version.hpp"
#include "rx/mem.hpp"
#include "util/unreachable.hpp"
#include <algorithm>
#include <amdgpu/bridge/bridge.hpp>
@ -16,18 +18,14 @@
#include <sys/stat.h>
#include <thread>
#include <unistd.h>
#include <unordered_map>
#include <unordered_set>
#include <util/VerifyVulkan.hpp>
#include <vulkan/vulkan.h>
#include <vulkan/vulkan_core.h>
#include <GLFW/glfw3.h> // TODO: make in optional
// TODO
// extern void *g_rwMemory;
extern std::size_t g_memorySize;
extern std::uint64_t g_memoryBase;
extern amdgpu::RemoteMemory g_hostMemory;
static void usage(std::FILE *out, const char *argv0) {
std::fprintf(out, "usage: %s [options...]\n", argv0);
std::fprintf(out, " options:\n");
@ -159,6 +157,11 @@ int main(int argc, const char *argv[]) {
return 1;
}
if (!rx::mem::reserve((void *)0x40000, 0x60000000000 - 0x40000)) {
std::fprintf(stderr, "failed to reserve virtual memory\n");
return 1;
}
glfwInit();
glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API);
auto window = glfwCreateWindow(1280, 720, "RPCSX", nullptr, nullptr);
@ -725,20 +728,6 @@ int main(int argc, const char *argv[]) {
amdgpu::bridge::BridgePuller bridgePuller{bridge};
amdgpu::bridge::Command commandsBuffer[1];
if (!std::filesystem::exists(std::string("/dev/shm") + shmName)) {
std::printf("Waiting for OS\n");
while (!std::filesystem::exists(std::string("/dev/shm") + shmName)) {
std::this_thread::sleep_for(std::chrono::milliseconds(300));
}
}
int memoryFd = ::shm_open(shmName, O_RDWR, S_IRUSR | S_IWUSR);
if (memoryFd < 0) {
std::printf("failed to open shared memory\n");
return 1;
}
int dmemFd[3];
for (std::size_t i = 0; i < std::size(dmemFd); ++i) {
@ -759,26 +748,80 @@ int main(int argc, const char *argv[]) {
}
}
struct stat memoryStat;
::fstat(memoryFd, &memoryStat);
amdgpu::RemoteMemory memory{(char *)::mmap(
nullptr, memoryStat.st_size, PROT_NONE, MAP_SHARED, memoryFd, 0)};
// extern void *g_rwMemory;
g_memorySize = memoryStat.st_size;
g_memoryBase = 0x40000;
// g_rwMemory = ::mmap(nullptr, g_memorySize, PROT_READ | PROT_WRITE, MAP_SHARED,
// memoryFd, 0);
g_hostMemory = memory;
{
amdgpu::device::AmdgpuDevice device(bridgePuller.header);
for (std::uint32_t end = bridge->memoryAreaCount, i = 0; i < end; ++i) {
auto area = bridge->memoryAreas[i];
device.handleProtectMemory(area.address, area.size, area.prot);
}
// Bookkeeping record for one mapped range of a guest process's VM,
// stored as the payload of the per-process MemoryTable.
struct VmMapSlot {
int memoryType; // negative: backed by the process shm fd; >= 0: dmem-backed
int prot; // guest protection; GPU bits live in the high nibble (prot >> 4)
std::int64_t offset; // fd offset recorded at baseAddress
std::uint64_t baseAddress; // guest address the offset corresponds to
auto operator<=>(const VmMapSlot &) const = default;
};
// Per-guest-process state tracked by the GPU process.
struct ProcessInfo {
int vmId = -1; // assigned VM slot; -1 while the process is not mapped
int vmFd = -1; // fd of the process's shared-memory region; -1 when closed
rx::MemoryTableWithPayload<VmMapSlot> vmTable; // known guest mappings
};
// Attaches a guest process to VM slot `vmId`: opens its per-pid shared
// memory ("<shmName>-<pid>") and replays every recorded mapping that has GPU
// protection bits into the slot's address window, notifying the device.
auto mapProcess = [&](std::int64_t pid, int vmId, ProcessInfo &process) {
process.vmId = vmId;
auto memory = amdgpu::RemoteMemory{vmId};
std::string pidVmName = shmName;
pidVmName += '-';
pidVmName += std::to_string(pid);
int memoryFd = ::shm_open(pidVmName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
process.vmFd = memoryFd;
if (memoryFd < 0) {
// NOTE(review): message likely meant "failed to open process %x shared memory".
std::printf("failed to process %x shared memory\n", (int)pid);
std::abort();
}
for (auto [startAddress, endAddress, slot] : process.vmTable) {
// Skip ranges the GPU was never granted access to (high-nibble bits).
auto gpuProt = slot.prot >> 4;
if (gpuProt == 0) {
continue;
}
// Recompute the fd offset for this sub-range of the original mapping.
auto devOffset = slot.offset + startAddress - slot.baseAddress;
int mapFd = memoryFd;
if (slot.memoryType >= 0) {
mapFd = dmemFd[slot.memoryType];
}
auto mmapResult =
::mmap(memory.getPointer(startAddress), endAddress - startAddress,
gpuProt, MAP_FIXED | MAP_SHARED, mapFd, devOffset);
if (mmapResult == MAP_FAILED) {
std::printf(
"failed to map process %x memory, address %lx-%lx, type %x\n",
(int)pid, startAddress, endAddress, slot.memoryType);
std::abort();
}
device.handleProtectMemory(memory, startAddress,
endAddress - startAddress, slot.prot);
}
};
// Detaches a guest process from its VM slot: re-reserves the slot's whole
// 1 TiB address window (dropping all mappings inside it) and closes the
// per-process shared-memory fd.
auto unmapProcess = [&](ProcessInfo &process) {
auto startAddress = static_cast<std::uint64_t>(process.vmId) << 40;
auto size = static_cast<std::uint64_t>(1) << 40;
rx::mem::reserve(reinterpret_cast<void *>(startAddress), size);
::close(process.vmFd);
process.vmFd = -1;
process.vmId = -1;
};
std::unordered_map<std::int64_t, ProcessInfo> processInfo;
std::vector<VkCommandBuffer> presentCmdBuffers(swapchainImages.size());
@ -966,66 +1009,141 @@ int main(int argc, const char *argv[]) {
for (auto cmd : std::span(commandsBuffer, pulledCount)) {
switch (cmd.id) {
case amdgpu::bridge::CommandId::ProtectMemory:
device.handleProtectMemory(cmd.memoryProt.address,
cmd.memoryProt.size, cmd.memoryProt.prot);
break;
case amdgpu::bridge::CommandId::CommandBuffer:
device.handleCommandBuffer(cmd.commandBuffer.queue,
cmd.commandBuffer.address,
cmd.commandBuffer.size);
break;
case amdgpu::bridge::CommandId::Flip: {
if (!isImageAcquired) {
Verify() << vkAcquireNextImageKHR(vkDevice, swapchain, UINT64_MAX,
presentCompleteSemaphore, nullptr,
&imageIndex);
case amdgpu::bridge::CommandId::ProtectMemory: {
auto &process = processInfo[cmd.memoryProt.pid];
vkWaitForFences(vkDevice, 1, &inFlightFences[imageIndex], VK_TRUE,
UINT64_MAX);
vkResetFences(vkDevice, 1, &inFlightFences[imageIndex]);
auto vmSlotIt = process.vmTable.queryArea(cmd.memoryProt.address);
if (vmSlotIt == process.vmTable.end()) {
std::abort();
}
isImageAcquired = false;
auto vmSlot = (*vmSlotIt).payload;
vkResetCommandBuffer(presentCmdBuffers[imageIndex], 0);
VkCommandBufferBeginInfo beginInfo{};
beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
process.vmTable.map(cmd.memoryProt.address,
cmd.memoryProt.address + cmd.memoryProt.size,
VmMapSlot{
.memoryType = vmSlot.memoryType,
.prot = static_cast<int>(cmd.memoryProt.prot),
.offset = vmSlot.offset,
.baseAddress = vmSlot.baseAddress,
});
vkBeginCommandBuffer(presentCmdBuffers[imageIndex], &beginInfo);
if (process.vmId >= 0) {
auto memory = amdgpu::RemoteMemory{process.vmId};
rx::mem::protect(memory.getPointer(cmd.memoryProt.address),
cmd.memoryProt.size, cmd.memoryProt.prot >> 4);
device.handleProtectMemory(memory, cmd.mapMemory.address,
cmd.mapMemory.size, cmd.mapMemory.prot);
}
break;
}
case amdgpu::bridge::CommandId::CommandBuffer: {
auto &process = processInfo[cmd.commandBuffer.pid];
if (process.vmId >= 0) {
device.handleCommandBuffer(
amdgpu::RemoteMemory{process.vmId}, cmd.commandBuffer.queue,
cmd.commandBuffer.address, cmd.commandBuffer.size);
}
break;
}
case amdgpu::bridge::CommandId::Flip: {
auto &process = processInfo[cmd.flip.pid];
if (device.handleFlip(
presentQueue, presentCmdBuffers[imageIndex],
*flipTaskChain[imageIndex].get(), cmd.flip.bufferIndex,
cmd.flip.arg, swapchainImages[imageIndex], swapchainExtent,
presentCompleteSemaphore, renderCompleteSemaphore,
inFlightFences[imageIndex])) {
VkPresentInfoKHR presentInfo{
.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
.waitSemaphoreCount = 1,
.pWaitSemaphores = &renderCompleteSemaphore,
.swapchainCount = 1,
.pSwapchains = &swapchain,
.pImageIndices = &imageIndex,
};
if (vkQueuePresentKHR(presentQueue, &presentInfo) != VK_SUCCESS) {
std::printf("swapchain was invalidated\n");
createSwapchain();
if (process.vmId >= 0) {
if (!isImageAcquired) {
Verify() << vkAcquireNextImageKHR(vkDevice, swapchain, UINT64_MAX,
presentCompleteSemaphore,
nullptr, &imageIndex);
vkWaitForFences(vkDevice, 1, &inFlightFences[imageIndex], VK_TRUE,
UINT64_MAX);
vkResetFences(vkDevice, 1, &inFlightFences[imageIndex]);
}
isImageAcquired = false;
vkResetCommandBuffer(presentCmdBuffers[imageIndex], 0);
VkCommandBufferBeginInfo beginInfo{};
beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
vkBeginCommandBuffer(presentCmdBuffers[imageIndex], &beginInfo);
if (device.handleFlip(
amdgpu::RemoteMemory{process.vmId}, presentQueue,
presentCmdBuffers[imageIndex],
*flipTaskChain[imageIndex].get(), cmd.flip.bufferIndex,
cmd.flip.arg, swapchainImages[imageIndex], swapchainExtent,
presentCompleteSemaphore, renderCompleteSemaphore,
inFlightFences[imageIndex])) {
VkPresentInfoKHR presentInfo{
.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
.waitSemaphoreCount = 1,
.pWaitSemaphores = &renderCompleteSemaphore,
.swapchainCount = 1,
.pSwapchains = &swapchain,
.pImageIndices = &imageIndex,
};
if (vkQueuePresentKHR(presentQueue, &presentInfo) != VK_SUCCESS) {
std::printf("swapchain was invalidated\n");
createSwapchain();
}
} else {
isImageAcquired = true;
}
} else {
isImageAcquired = true;
}
break;
}
case amdgpu::bridge::CommandId::MapDmem: {
auto addr = g_hostMemory.getPointer(cmd.mapDmem.address);
auto mapping = ::mmap(addr, cmd.mapDmem.size,
PROT_READ | PROT_WRITE /*TODO: cmd.mapDmem.prot >> 4*/,
MAP_FIXED | MAP_SHARED, dmemFd[cmd.mapDmem.dmemIndex],
cmd.mapDmem.offset);
device.handleProtectMemory(cmd.mapDmem.address, cmd.mapDmem.size, 0x33 /*TODO: cmd.mapDmem.prot*/);
case amdgpu::bridge::CommandId::MapProcess: {
mapProcess(cmd.mapProcess.pid, cmd.mapProcess.vmId, processInfo[cmd.mapProcess.pid]);
break;
}
case amdgpu::bridge::CommandId::UnmapProcess: {
  // Read the pid from the UnmapProcess payload. The previous code read
  // cmd.mapProcess.pid, which only worked because both union members
  // happen to place pid at offset 0.
  unmapProcess(processInfo[cmd.unmapProcess.pid]);
  break;
}
case amdgpu::bridge::CommandId::MapMemory: {
auto &process = processInfo[cmd.mapMemory.pid];
// Record the mapping so it can be replayed later by mapProcess.
process.vmTable.map(
cmd.mapMemory.address, cmd.mapMemory.address + cmd.mapMemory.size,
VmMapSlot{
// The slot's memoryType field stores the dmem index (or -1 for
// shm-backed memory), matching its use as a dmemFd[] index below
// and in mapProcess.
.memoryType = static_cast<int>(cmd.mapMemory.memoryType >= 0
? cmd.mapMemory.dmemIndex
: -1),
.prot = static_cast<int>(cmd.mapMemory.prot),
.offset = cmd.mapMemory.offset,
.baseAddress = cmd.mapMemory.address,
});
// If the process is already attached to a VM slot, materialize the
// mapping immediately; otherwise mapProcess will replay it on attach.
if (process.vmId >= 0) {
auto memory = amdgpu::RemoteMemory{process.vmId};
int mapFd = process.vmFd;
if (cmd.mapMemory.memoryType >= 0) {
mapFd = dmemFd[cmd.mapMemory.dmemIndex];
}
auto mmapResult =
::mmap(memory.getPointer(cmd.mapMemory.address),
cmd.mapMemory.size, cmd.mapMemory.prot >> 4,
MAP_FIXED | MAP_SHARED, mapFd, cmd.mapMemory.offset);
if (mmapResult == MAP_FAILED) {
std::printf(
"failed to map process %x memory, address %lx-%lx, type %x\n",
(int)cmd.mapMemory.pid, cmd.mapMemory.address,
cmd.mapMemory.address + cmd.mapMemory.size,
cmd.mapMemory.memoryType);
std::abort();
}
device.handleProtectMemory(memory, cmd.mapMemory.address,
cmd.mapMemory.size, cmd.mapMemory.prot);
}
break;
}

View file

@ -44,6 +44,12 @@ orbis::ErrorCode DmemDevice::mmap(void **address, std::uint64_t len,
rx::vm::kMapProtGpuAll;
}
auto allocationInfoIt = allocations.queryArea(directMemoryStart);
if (allocationInfoIt == allocations.end()) {
std::abort();
}
auto allocationInfo = *allocationInfoIt;
auto result =
rx::vm::map(*address, len, prot, flags, rx::vm::kMapInternalReserveOnly,
this, directMemoryStart);
@ -60,9 +66,10 @@ orbis::ErrorCode DmemDevice::mmap(void **address, std::uint64_t len,
return orbis::ErrorCode::INVAL;
}
rx::bridge.sendMapDmem(orbis::g_currentThread->tproc->pid, index,
reinterpret_cast<std::uint64_t>(result), len, prot,
directMemoryStart);
rx::bridge.sendMapMemory(orbis::g_currentThread->tproc->pid,
allocationInfo.payload.memoryType, index,
reinterpret_cast<std::uint64_t>(result), len, prot,
directMemoryStart);
*address = result;

View file

@ -10,6 +10,7 @@
#include <cstdio>
#include <mutex>
#include <sys/mman.h>
#include <unordered_map>
struct ComputeQueue {
std::uint64_t ringBaseAddress{};
@ -19,14 +20,104 @@ struct ComputeQueue {
std::uint64_t len{};
};
// Starts a detached worker thread that services GPU cache commands for one
// VM slot: it polls the bridge's per-vmId command mailboxes and applies the
// requested host page-protection changes (write-watch / read-write lock)
// on top of the guest's own protection.
// NOTE(review): the loop busy-spins with no backoff when idle, pinning a
// core per vmId; the thread also never terminates — confirm this is intended.
static void runBridge(int vmId) {
std::thread{[=] {
pthread_setname_np(pthread_self(), "Bridge");
auto bridge = rx::bridge.header;
std::vector<std::uint64_t> fetchedCommands;
fetchedCommands.reserve(std::size(bridge->cacheCommands));
while (true) {
// Drain this vmId's mailbox in each command slot; 0 means "empty".
for (auto &command : bridge->cacheCommands) {
std::uint64_t value = command[vmId].load(std::memory_order::relaxed);
if (value != 0) {
fetchedCommands.push_back(value);
command[vmId].store(0, std::memory_order::relaxed);
}
}
if (fetchedCommands.empty()) {
continue;
}
for (auto command : fetchedCommands) {
// Command encoding: low 32 bits = first page index,
// high 32 bits = page count minus one.
auto page = static_cast<std::uint32_t>(command);
auto count = static_cast<std::uint32_t>(command >> 32) + 1;
auto pageFlags =
bridge->cachePages[vmId][page].load(std::memory_order::relaxed);
auto address =
static_cast<std::uint64_t>(page) * amdgpu::bridge::kHostPageSize;
// Start from the guest's CPU protection for the page...
auto origVmProt = rx::vm::getPageProtection(address);
int prot = 0;
if (origVmProt & rx::vm::kMapProtCpuRead) {
prot |= PROT_READ;
}
if (origVmProt & rx::vm::kMapProtCpuWrite) {
prot |= PROT_WRITE;
}
if (origVmProt & rx::vm::kMapProtCpuExec) {
prot |= PROT_EXEC;
}
// ...then strip access bits per the GPU's cache-tracking request, so
// the next guest access faults and is handled by the signal handler.
if (pageFlags & amdgpu::bridge::kPageReadWriteLock) {
prot &= ~(PROT_READ | PROT_WRITE);
} else if (pageFlags & amdgpu::bridge::kPageWriteWatch) {
prot &= ~PROT_WRITE;
}
// std::fprintf(stderr, "protection %lx-%lx\n", address,
// address + amdgpu::bridge::kHostPageSize * count);
if (::mprotect(reinterpret_cast<void *>(address),
amdgpu::bridge::kHostPageSize * count, prot)) {
perror("protection failed");
std::abort();
}
}
fetchedCommands.clear();
}
}}.detach();
}
static constexpr auto kVmIdCount = 6;
// /dev/gc character device: tracks client processes and hands out the GPU
// VM slots (vmId) that select the per-process views of the bridge shared
// memory.
struct GcDevice : public IoDevice {
  // Bitmask of free VM slots, one bit per id in [0, kVmIdCount).
  // Fixed: was (1 << (kVmIdCount + 1)) - 1, which set kVmIdCount + 1 bits
  // and so advertised one slot more than the bridge's per-vmId arrays hold
  // (allocateVmId only rejected the extra id via its abort path).
  std::uint32_t freeVmIds = (1 << kVmIdCount) - 1;
  orbis::shared_mutex mtx; // guards clients and the vm id pool
  orbis::kmap<orbis::pid_t, int> clients; // pid -> open-handle refcount
  orbis::kmap<std::uint64_t, ComputeQueue> computeQueues;
  orbis::ErrorCode open(orbis::Ref<orbis::File> *file, const char *path,
                        std::uint32_t flags, std::uint32_t mode,
                        orbis::Thread *thread) override;
  void addClient(orbis::Process *process);
  void removeClient(orbis::Process *process);

  // Returns the lowest free vm id; aborts when all slots are taken
  // (countr_zero of 0 is 32 >= kVmIdCount). Caller must hold mtx.
  int allocateVmId() {
    int id = std::countr_zero(freeVmIds);
    if (id >= kVmIdCount) {
      std::fprintf(stderr, "out of vm slots\n");
      std::abort();
    }
    freeVmIds &= ~(1u << id);
    return id;
  }

  // Returns a previously allocated vm id to the pool. Caller must hold mtx.
  void deallocateVmId(int vmId) { freeVmIds |= (1u << vmId); }
};
struct GcFile : public orbis::File {};
// Per-open file object for /dev/gc. When the last reference to the handle
// dies, the owning process's client registration is dropped, which may
// release its VM slot (see GcDevice::removeClient).
struct GcFile : public orbis::File {
  orbis::Process *process = nullptr;

  ~GcFile() {
    // Guard: a GcFile destroyed before open() assigned `process` would
    // otherwise dereference null inside removeClient.
    if (process != nullptr) {
      device.staticCast<GcDevice>()->removeClient(process);
    }
  }
};
static std::uint64_t g_submitDoneFlag;
static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
@ -34,7 +125,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
// 0xc00c8110
// 0xc0848119
auto device = static_cast<GcDevice *>(file->device.get());
auto device = file->device.staticCast<GcDevice>();
std::lock_guard lock(device->mtx);
switch (request) {
@ -55,7 +146,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
// flockfile(stderr);
// if (thread->tproc->pid != amdgpu::bridge::expGpuPid) {
// ORBIS_LOG_ERROR("gc ioctl submit", args->arg0, args->count, args->cmds);
// ORBIS_LOG_ERROR("gc ioctl submit", args->arg0, args->count, args->cmds);
// }
for (unsigned i = 0; i < args->count; ++i) {
@ -172,14 +263,20 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
break;
}
case 0xc010810b: { // something like stats masks?
case 0xc010810b: { // get cu masks param
struct Args {
std::uint64_t arg1;
std::uint64_t arg2;
std::uint32_t se0sh0;
std::uint32_t se0sh1;
std::uint32_t se1sh0;
std::uint32_t se1sh1;
};
auto args = reinterpret_cast<Args *>(argp);
ORBIS_LOG_ERROR("gc ioctl stats mask", args->arg1, args->arg2);
// ORBIS_LOG_ERROR("gc ioctl stats mask", args->arg1, args->arg2);
args->se0sh0 = ~0;
args->se0sh1 = ~0;
args->se1sh0 = ~0;
args->se1sh1 = ~0;
break;
}
@ -265,8 +362,14 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
}
case 0xc0048113: {
// get client number
*(std::uint32_t *)argp = 0;
// get num clients
struct Args {
std::uint32_t numClients;
};
auto *args = reinterpret_cast<Args *>(argp);
args->numClients = device->clients.size();
break;
}
@ -312,8 +415,38 @@ orbis::ErrorCode GcDevice::open(orbis::Ref<orbis::File> *file, const char *path,
auto newFile = orbis::knew<GcFile>();
newFile->device = this;
newFile->ops = &ops;
newFile->process = thread->tproc;
addClient(thread->tproc);
*file = newFile;
return {};
}
// Registers an open /dev/gc handle for `process`. The first handle of a
// process allocates a VM slot, announces it to the GPU process, and starts
// the per-slot bridge worker thread.
// NOTE(review): runBridge threads never terminate, so when a vm id is
// reused after deallocateVmId a second worker for the same slot is spawned
// alongside the old one — verify this is handled elsewhere.
void GcDevice::addClient(orbis::Process *process) {
std::lock_guard lock(mtx);
auto &client = clients[process->pid]; // value-initialized to 0 on first use
++client;
if (client == 1) {
auto vmId = allocateVmId();
rx::bridge.sendMapProcess(process->pid, vmId);
process->vmId = vmId;
runBridge(vmId);
}
}
// Drops one /dev/gc handle reference for `process`. When the last handle
// goes away the process is unregistered, the GPU process is told to unmap
// it, and its VM slot returns to the pool.
void GcDevice::removeClient(orbis::Process *process) {
std::lock_guard lock(mtx);
auto clientIt = clients.find(process->pid);
assert(clientIt != clients.end());
assert(clientIt->second != 0);
--clientIt->second;
if (clientIt->second == 0) {
clients.erase(clientIt);
rx::bridge.sendUnmapProcess(process->pid);
deallocateVmId(process->vmId);
process->vmId = -1;
}
}
IoDevice *createGcCharacterDevice() { return orbis::knew<GcDevice>(); }

View file

@ -41,71 +41,6 @@
#include <unordered_map>
static int g_gpuPid;
void runBridge() {
std::thread{[] {
pthread_setname_np(pthread_self(), "Bridge");
auto bridge = rx::bridge.header;
std::vector<std::uint64_t> fetchedCommands;
fetchedCommands.reserve(std::size(bridge->cacheCommands));
while (true) {
for (auto &command : bridge->cacheCommands) {
std::uint64_t value = command.load(std::memory_order::relaxed);
if (value != 0) {
fetchedCommands.push_back(value);
command.store(0, std::memory_order::relaxed);
}
}
if (fetchedCommands.empty()) {
continue;
}
for (auto command : fetchedCommands) {
auto page = static_cast<std::uint32_t>(command);
auto count = static_cast<std::uint32_t>(command >> 32) + 1;
auto pageFlags =
bridge->cachePages[page].load(std::memory_order::relaxed);
auto address =
static_cast<std::uint64_t>(page) * amdgpu::bridge::kHostPageSize;
auto origVmProt = rx::vm::getPageProtection(address);
int prot = 0;
if (origVmProt & rx::vm::kMapProtCpuRead) {
prot |= PROT_READ;
}
if (origVmProt & rx::vm::kMapProtCpuWrite) {
prot |= PROT_WRITE;
}
if (origVmProt & rx::vm::kMapProtCpuExec) {
prot |= PROT_EXEC;
}
if (pageFlags & amdgpu::bridge::kPageReadWriteLock) {
prot &= ~(PROT_READ | PROT_WRITE);
} else if (pageFlags & amdgpu::bridge::kPageWriteWatch) {
prot &= ~PROT_WRITE;
}
// std::fprintf(stderr, "protection %lx-%lx\n", address,
// address + amdgpu::bridge::kHostPageSize * count);
if (::mprotect(reinterpret_cast<void *>(address),
amdgpu::bridge::kHostPageSize * count, prot)) {
perror("protection failed");
std::abort();
}
}
fetchedCommands.clear();
}
}}.detach();
}
extern bool allowMonoDebug;
__attribute__((no_stack_protector)) static void
@ -116,8 +51,9 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
auto signalAddress = reinterpret_cast<std::uintptr_t>(info->si_addr);
if (orbis::g_currentThread != nullptr && sig == SIGSEGV &&
if (orbis::g_currentThread != nullptr && orbis::g_currentThread->tproc->vmId >= 0 && sig == SIGSEGV &&
signalAddress >= 0x40000 && signalAddress < 0x100'0000'0000) {
auto vmid = orbis::g_currentThread->tproc->vmId;
auto ctx = reinterpret_cast<ucontext_t *>(ucontext);
bool isWrite = (ctx->uc_mcontext.gregs[REG_ERR] & 0x2) != 0;
auto origVmProt = rx::vm::getPageProtection(signalAddress);
@ -138,17 +74,17 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
auto bridge = rx::bridge.header;
while (true) {
auto flags = bridge->cachePages[page].load(std::memory_order::relaxed);
auto flags = bridge->cachePages[vmid][page].load(std::memory_order::relaxed);
if ((flags & amdgpu::bridge::kPageReadWriteLock) != 0) {
if ((flags & amdgpu::bridge::kPageLazyLock) != 0) {
if (std::uint32_t gpuCommand = 0;
!bridge->gpuCacheCommand.compare_exchange_weak(gpuCommand,
!bridge->gpuCacheCommand[vmid].compare_exchange_weak(gpuCommand,
page)) {
continue;
}
while (!bridge->cachePages[page].compare_exchange_weak(
while (!bridge->cachePages[vmid][page].compare_exchange_weak(
flags, flags & ~amdgpu::bridge::kPageLazyLock,
std::memory_order::relaxed)) {
}
@ -165,7 +101,7 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
break;
}
if (bridge->cachePages[page].compare_exchange_weak(
if (bridge->cachePages[vmid][page].compare_exchange_weak(
flags, amdgpu::bridge::kPageInvalidated,
std::memory_order::relaxed)) {
break;
@ -188,6 +124,7 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
}
if (orbis::g_currentThread != nullptr) {
orbis::g_currentThread->tproc->exitStatus = sig;
orbis::g_currentThread->tproc->event.emit(orbis::kEvFiltProc,
orbis::kNoteExit, sig);
}
@ -1640,29 +1577,34 @@ int main(int argc, const char *argv[]) {
};
if (isSystem) {
amdgpu::bridge::expGpuPid = isSafeMode ? 20001 : 60001;
orbis::g_context.safeMode = isSafeMode ? 1 : 0;
initProcess->authInfo = {
.unk0 = 0x380000000000000f,
.caps =
{
-1ul,
-1ul,
-1ul,
-1ul,
},
.attrs =
{
0x4000400040000000,
0x4000000000000000,
0x0080000000000002,
0xF0000000FFFF4000,
},
};
initProcess->authInfo = {.unk0 = 0x380000000000000f,
.caps =
{
-1ul,
-1ul,
-1ul,
-1ul,
},
.attrs =
{
0x4000400040000000,
0x4000000000000000,
0x0080000000000002,
0xF0000000FFFF4000,
},
.ucred = {
-1ul,
-1ul,
0x3800000000000022,
-1ul,
(1ul << 0x3a),
-1ul,
-1ul,
}};
initProcess->budgetId = 0;
initProcess->isInSandbox = false;
} else {
amdgpu::bridge::expGpuPid = initProcess->pid;
initProcess->authInfo = {
.unk0 = 0x3100000000000001,
.caps =
@ -1788,7 +1730,6 @@ int main(int argc, const char *argv[]) {
launchDaemon(mainThread, "/system/sys/orbis_audiod.elf",
{"/system/sys/orbis_audiod.elf"}, {});
runBridge();
status = ps4Exec(mainThread, execEnv, std::move(executableModule),
ps4Argv, {});
}

View file

@ -43,7 +43,6 @@ using namespace orbis;
extern bool allowMonoDebug;
extern "C" void __register_frame(const void *);
void runBridge();
void setupSigHandlers();
int ps4Exec(orbis::Thread *mainThread,
orbis::utils::Ref<orbis::Module> executableModule,
@ -828,9 +827,6 @@ SysResult fork(Thread *thread, slong flags) {
dup2(logFd, 1);
dup2(logFd, 2);
if (childPid == amdgpu::bridge::expGpuPid) {
runBridge();
}
return {};
}

View file

@ -958,11 +958,8 @@ void *rx::vm::map(void *addr, std::uint64_t len, std::int32_t prot,
}
if (auto thr = orbis::g_currentThread) {
// std::fprintf(stderr, "sending mapping %lx-%lx, pid %lx\n", address,
// address + len, thr->tproc->pid);
// if (!noOverwrite) {
// rx::bridge.sendMemoryProtect(thr->tproc->pid, address, len, prot);
// }
rx::bridge.sendMapMemory(thr->tproc->pid, -1, -1, address, len, prot,
address - kMinAddress);
} else {
std::fprintf(stderr, "ignoring mapping %lx-%lx\n", address, address + len);
}