rpcsx-gpu: add multiprocess support

2025-12-06 07:12:14 +01:00 · 2024-09-01 17:43:45 +03:00 · 2024-09-01 17:43:45 +03:00 · 2c781626d3
parent f77376c1e3
commit 2c781626d3
14 changed files with 746 additions and 450 deletions
--- a/hw/amdgpu/bridge/include/amdgpu/bridge/bridge.hpp
+++ b/hw/amdgpu/bridge/include/amdgpu/bridge/bridge.hpp
@ -1,14 +1,12 @@
 #pragma once
 #include <orbis/utils/SharedMutex.hpp>
 #include <atomic>
 #include <cstdint>
 #include <cstring>
 #include <initializer_list>
 #include <orbis/utils/SharedMutex.hpp>
 namespace amdgpu::bridge {
 extern std::uint32_t expGpuPid;
 struct PadState {
  std::uint64_t timestamp;
  std::uint32_t unk;
@ -47,7 +45,9 @@ enum class CommandId : std::uint32_t {
  ProtectMemory,
  CommandBuffer,
  Flip,
-  MapDmem,
+  MapMemory,
  MapProcess,
  UnmapProcess,
 };
 struct CmdMemoryProt {
@ -79,15 +79,25 @@ struct CmdFlip {
  std::uint64_t arg;
 };
-struct CmdMapDmem {
+struct CmdMapMemory {
-  std::uint64_t offset;
+  std::int64_t offset;
  std::uint64_t address;
  std::uint64_t size;
  std::uint32_t prot;
  std::uint32_t pid;
  std::int32_t memoryType;
  std::uint32_t dmemIndex;
 };
 // Payload of CommandId::MapProcess: pairs a process id with a VM slot index.
 struct CmdMapProcess {
  std::uint64_t pid; // process id; the command decoder fills it from args[0]
  int vmId;          // VM slot index; the command decoder fills it from args[1]
 };
 // Payload of CommandId::UnmapProcess: identifies the process by pid only.
 struct CmdUnmapProcess {
  std::uint64_t pid; // process id; the command decoder fills it from args[0]
 };
 enum {
  kPageWriteWatch = 1 << 0,
  kPageReadWriteLock = 1 << 1,
@ -112,17 +122,15 @@ struct BridgeHeader {
  volatile std::uint64_t flipArg;
  volatile std::uint64_t flipCount;
  volatile std::uint64_t bufferInUseAddress;
  std::uint32_t memoryAreaCount;
  std::uint32_t commandBufferCount;
  std::uint32_t bufferCount;
  CmdMemoryProt memoryAreas[512];
  CmdCommandBuffer commandBuffers[32];
  CmdBuffer buffers[10];
  // orbis::shared_mutex cacheCommandMtx;
  // orbis::shared_cv cacheCommandCv;
-  std::atomic<std::uint64_t> cacheCommands[4];
+  std::atomic<std::uint64_t> cacheCommands[6][4];
-  std::atomic<std::uint32_t> gpuCacheCommand;
+  std::atomic<std::uint32_t> gpuCacheCommand[6];
-  std::atomic<std::uint8_t> cachePages[0x100'0000'0000 / kHostPageSize];
+  std::atomic<std::uint8_t> cachePages[6][0x100'0000'0000 / kHostPageSize];
  volatile std::uint64_t pull;
  volatile std::uint64_t push;
@ -137,7 +145,9 @@ struct Command {
    CmdCommandBuffer commandBuffer;
    CmdBuffer buffer;
    CmdFlip flip;
-    CmdMapDmem mapDmem;
+    CmdMapMemory mapMemory;
    CmdMapProcess mapProcess;
    CmdUnmapProcess unmapProcess;
  };
 };
@ -160,29 +170,32 @@ struct BridgePusher {
  void sendMemoryProtect(std::uint32_t pid, std::uint64_t address,
                         std::uint64_t size, std::uint32_t prot) {
-    if (pid == expGpuPid) {
+    sendCommand(CommandId::ProtectMemory, {pid, address, size, prot});
      sendCommand(CommandId::ProtectMemory, {pid, address, size, prot});
    }
  }
-  void sendMapDmem(std::uint32_t pid, std::uint32_t dmemIndex, std::uint64_t address, std::uint64_t size, std::uint32_t prot, std::uint64_t offset) {
+  void sendMapMemory(std::uint32_t pid, std::uint32_t memoryType,
-    // if (pid == expGpuPid) {
+                     std::uint32_t dmemIndex, std::uint64_t address,
-      sendCommand(CommandId::MapDmem, {pid, dmemIndex, address, size, prot, offset});
+                     std::uint64_t size, std::uint32_t prot,
-    // }
+                     std::uint64_t offset) {
    sendCommand(CommandId::MapMemory,
                {pid, memoryType, dmemIndex, address, size, prot, offset});
  }
  void sendCommandBuffer(std::uint32_t pid, std::uint64_t queue,
                         std::uint64_t address, std::uint64_t size) {
-    // if (pid == expGpuPid) {
+    sendCommand(CommandId::CommandBuffer, {pid, queue, address, size});
      sendCommand(CommandId::CommandBuffer, {pid, queue, address, size});
    // }
  }
  void sendFlip(std::uint32_t pid, std::uint32_t bufferIndex,
                std::uint64_t arg) {
-    // if (pid == expGpuPid) {
+    sendCommand(CommandId::Flip, {pid, bufferIndex, arg});
-      sendCommand(CommandId::Flip, {pid, bufferIndex, arg});
+  }
-    // }
+
  // Queues a MapProcess command with the argument words {pid, vmId}; both are
  // widened to 64 bits by sendCommand's initializer_list<std::uint64_t>.
  void sendMapProcess(std::uint32_t pid, unsigned vmId) {
    sendCommand(CommandId::MapProcess, {pid, vmId});
  }
  // Queues an UnmapProcess command whose only argument word is the pid.
  void sendUnmapProcess(std::uint32_t pid) {
    sendCommand(CommandId::UnmapProcess, {pid});
  }
  void wait() {
@ -198,7 +211,8 @@ private:
  void sendCommand(CommandId id, std::initializer_list<std::uint64_t> args) {
    std::uint64_t exp = 0;
-    while (!header->lock.compare_exchange_weak(exp, 1, std::memory_order::acquire, std::memory_order::relaxed)) {
+    while (!header->lock.compare_exchange_weak(
        exp, 1, std::memory_order::acquire, std::memory_order::relaxed)) {
      exp = 0;
    }
@ -303,13 +317,23 @@ private:
      result.flip.arg = args[2];
      return result;
-    case CommandId::MapDmem:
+    case CommandId::MapMemory:
-      result.mapDmem.pid = args[0];
+      result.mapMemory.pid = args[0];
-      result.mapDmem.dmemIndex = args[1];
+      result.mapMemory.memoryType = args[1];
-      result.mapDmem.address = args[2];
+      result.mapMemory.dmemIndex = args[2];
-      result.mapDmem.size = args[3];
+      result.mapMemory.address = args[3];
-      result.mapDmem.prot = args[4];
+      result.mapMemory.size = args[4];
-      result.mapDmem.offset = args[5];
+      result.mapMemory.prot = args[5];
      result.mapMemory.offset = args[6];
      return result;
    case CommandId::MapProcess:
      result.mapProcess.pid = args[0];
      result.mapProcess.vmId = args[1];
      return result;
    case CommandId::UnmapProcess:
      result.unmapProcess.pid = args[0];
      return result;
    }
--- a/hw/amdgpu/bridge/src/bridge.cpp
+++ b/hw/amdgpu/bridge/src/bridge.cpp
@ -8,8 +8,6 @@
 static int gShmFd = -1;
 static constexpr std::size_t kShmSize = sizeof(amdgpu::bridge::BridgeHeader) +
                                        (sizeof(std::uint64_t) * 256);
 std::uint32_t amdgpu::bridge::expGpuPid = 0;
 amdgpu::bridge::BridgeHeader *
 amdgpu::bridge::createShmCommandBuffer(const char *name) {
  if (gShmFd != -1) {
--- a/hw/amdgpu/device/include/amdgpu/device/device.hpp
+++ b/hw/amdgpu/device/include/amdgpu/device/device.hpp
@ -1,5 +1,6 @@
 #pragma once
 #include "amdgpu/RemoteMemory.hpp"
 #include "amdgpu/bridge/bridge.hpp"
 #include "amdgpu/shader/Instruction.hpp"
 #include "gpu-scheduler.hpp"
@ -1259,6 +1260,42 @@ struct GnmTBuffer {
 static_assert(sizeof(GnmTBuffer) == sizeof(std::uint64_t) * 4);
 // Sampler descriptor expressed as bit-fields. The declared widths add up to
 // 4 x 32 bits; the static_assert that follows the struct pins the total size
 // to four 32-bit words.
 // NOTE(review): every field is a signed int32_t bit-field, so a 1-bit field
 // presumably reads back as 0 or -1 rather than 0 or 1 - confirm callers
 // expect that.
 struct GnmSSampler {
  // --- 32 bits of fields (presumably dword 0 of the descriptor) ---
  int32_t clamp_x : 3;
  int32_t clamp_y : 3;
  int32_t clamp_z : 3;
  int32_t max_aniso_ratio : 3;
  int32_t depth_compare_func : 3;
  int32_t force_unorm_coords : 1;
  int32_t aniso_threshold : 3;
  int32_t mc_coord_trunc : 1;
  int32_t force_degamma : 1;
  int32_t aniso_bias : 6;
  int32_t trunc_coord : 1;
  int32_t disable_cube_wrap : 1;
  int32_t filter_mode : 2;
  int32_t : 1; // unnamed padding bit
  // --- 32 bits of fields (presumably dword 1) ---
  int32_t min_lod : 12;
  int32_t max_lod : 12;
  int32_t perf_mip : 4;
  int32_t perf_z : 4;
  // --- 32 bits of fields (presumably dword 2) ---
  int32_t lod_bias : 14;
  int32_t lod_bias_sec : 6;
  int32_t xy_mag_filter : 2;
  int32_t xy_min_filter : 2;
  int32_t z_filter : 2;
  int32_t mip_filter : 2;
  int32_t : 4; // unnamed padding bits
  // --- 32 bits of fields (presumably dword 3) ---
  int32_t border_color_ptr : 12;
  int32_t : 18; // unnamed padding bits
  int32_t border_color_type : 2;
  // Member-wise ordering and equality; both are explicitly defaulted.
  auto operator<=>(const GnmSSampler &) const = default;
  bool operator==(const GnmSSampler &) const = default;
 };
 static_assert(sizeof(GnmSSampler) == sizeof(std::uint32_t) * 4);
 constexpr auto kPageSize = 0x4000;
 void setVkDevice(VkDevice device,
@ -1266,11 +1303,11 @@ void setVkDevice(VkDevice device,
                 VkPhysicalDeviceProperties devProperties);
 struct AmdgpuDevice {
-  void handleProtectMemory(std::uint64_t address, std::uint64_t size,
+  void handleProtectMemory(RemoteMemory memory, std::uint64_t address,
-                           std::uint32_t prot);
+                           std::uint64_t size, std::uint32_t prot);
-  void handleCommandBuffer(std::uint64_t queueId, std::uint64_t address,
+  void handleCommandBuffer(RemoteMemory memory, std::uint64_t queueId,
-                           std::uint64_t size);
+                           std::uint64_t address, std::uint64_t size);
-  bool handleFlip(VkQueue queue, VkCommandBuffer cmdBuffer,
+  bool handleFlip(RemoteMemory memory, VkQueue queue, VkCommandBuffer cmdBuffer,
                  TaskChain &initTaskChain, std::uint32_t bufferIndex,
                  std::uint64_t arg, VkImage targetImage,
                  VkExtent2D targetExtent, VkSemaphore waitSemaphore,
--- a/hw/amdgpu/device/src/device.cpp
+++ b/hw/amdgpu/device/src/device.cpp
--- a/hw/amdgpu/include/amdgpu/RemoteMemory.hpp
+++ b/hw/amdgpu/include/amdgpu/RemoteMemory.hpp
@ -3,10 +3,11 @@
 namespace amdgpu {
 struct RemoteMemory {
-  char *shmPointer;
+  int vmId;
  template <typename T = void> T *getPointer(std::uint64_t address) const {
-    return address ? reinterpret_cast<T *>(shmPointer + address - 0x40000)
+    return address ? reinterpret_cast<T *>(
                         static_cast<std::uint64_t>(vmId) << 40 | address)
                   : nullptr;
  }
 };
--- a/orbis-kernel/include/orbis/AuthInfo.hpp
+++ b/orbis-kernel/include/orbis/AuthInfo.hpp
@ -7,7 +7,14 @@ struct AuthInfo {
  uint64_t unk0;
  uint64_t caps[4];
  uint64_t attrs[4];
-  uint64_t unk[8];
+  uint64_t ucred[8];
  bool hasUseHp3dPipeCapability() const {
    return ucred[2] == 0x3800000000000009;
  }
  bool hasMmapSelfCapability() const { return ((ucred[4] >> 0x3a) & 1) != 1; }
  bool hasSystemCapability() const { return ((ucred[3] >> 0x3e) & 1) != 0; }
  bool hasSceProgramAttribute() const { return ((ucred[3] >> 0x1f) & 1) != 0; }
 };
 static_assert(sizeof(AuthInfo) == 136);
--- a/orbis-kernel/include/orbis/thread/Process.hpp
+++ b/orbis-kernel/include/orbis/thread/Process.hpp
@ -51,6 +51,7 @@ struct Process final {
  ProcessState state = ProcessState::NEW;
  Process *parentProcess = nullptr;
  shared_mutex mtx;
  int vmId = -1;
  void (*onSysEnter)(Thread *thread, int id, uint64_t *args,
                     int argsCount) = nullptr;
  void (*onSysExit)(Thread *thread, int id, uint64_t *args, int argsCount,
--- a/rpcsx-gpu/CMakeLists.txt
+++ b/rpcsx-gpu/CMakeLists.txt
@ -8,4 +8,5 @@ add_executable(rpcsx-gpu
 target_include_directories(rpcsx-gpu PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(rpcsx-gpu PUBLIC amdgpu::bridge amdgpu::device glfw Vulkan::Vulkan rx)
 set_target_properties(rpcsx-gpu PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 # rpcsx-gpu's main() reserves [0x40000, 0x60000000000) for guest address
 # spaces, so the rpcsx-gpu image itself is linked at 0x60000000000.
 target_link_options(rpcsx-gpu PUBLIC "LINKER:-Ttext-segment,0x0000060000000000")
 install(TARGETS rpcsx-gpu RUNTIME DESTINATION bin)
--- a/rpcsx-gpu/main.cpp
+++ b/rpcsx-gpu/main.cpp
@ -1,7 +1,9 @@
 #include "amdgpu/RemoteMemory.hpp"
 #include "amdgpu/device/gpu-scheduler.hpp"
 #include "amdgpu/device/vk.hpp"
 #include "rx/MemoryTable.hpp"
 #include "rx/Version.hpp"
 #include "rx/mem.hpp"
 #include "util/unreachable.hpp"
 #include <algorithm>
 #include <amdgpu/bridge/bridge.hpp>
@ -16,18 +18,14 @@
 #include <sys/stat.h>
 #include <thread>
 #include <unistd.h>
 #include <unordered_map>
 #include <unordered_set>
 #include <util/VerifyVulkan.hpp>
 #include <vulkan/vulkan.h>
 #include <vulkan/vulkan_core.h>
 #include <GLFW/glfw3.h> // TODO: make it optional
 // TODO
 // extern void *g_rwMemory;
 extern std::size_t g_memorySize;
 extern std::uint64_t g_memoryBase;
 extern amdgpu::RemoteMemory g_hostMemory;
 static void usage(std::FILE *out, const char *argv0) {
  std::fprintf(out, "usage: %s [options...]\n", argv0);
  std::fprintf(out, "  options:\n");
@ -159,6 +157,11 @@ int main(int argc, const char *argv[]) {
    return 1;
  }
  if (!rx::mem::reserve((void *)0x40000, 0x60000000000 - 0x40000)) {
    std::fprintf(stderr, "failed to reserve virtual memory\n");
    return 1;
  }
  glfwInit();
  glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API);
  auto window = glfwCreateWindow(1280, 720, "RPCSX", nullptr, nullptr);
@ -725,20 +728,6 @@ int main(int argc, const char *argv[]) {
  amdgpu::bridge::BridgePuller bridgePuller{bridge};
  amdgpu::bridge::Command commandsBuffer[1];
  if (!std::filesystem::exists(std::string("/dev/shm") + shmName)) {
    std::printf("Waiting for OS\n");
    while (!std::filesystem::exists(std::string("/dev/shm") + shmName)) {
      std::this_thread::sleep_for(std::chrono::milliseconds(300));
    }
  }
  int memoryFd = ::shm_open(shmName, O_RDWR, S_IRUSR | S_IWUSR);
  if (memoryFd < 0) {
    std::printf("failed to open shared memory\n");
    return 1;
  }
  int dmemFd[3];
  for (std::size_t i = 0; i < std::size(dmemFd); ++i) {
@ -759,26 +748,80 @@ int main(int argc, const char *argv[]) {
    }
  }
  struct stat memoryStat;
  ::fstat(memoryFd, &memoryStat);
  amdgpu::RemoteMemory memory{(char *)::mmap(
      nullptr, memoryStat.st_size, PROT_NONE, MAP_SHARED, memoryFd, 0)};
  // extern void *g_rwMemory;
  g_memorySize = memoryStat.st_size;
  g_memoryBase = 0x40000;
  // g_rwMemory = ::mmap(nullptr, g_memorySize, PROT_READ | PROT_WRITE, MAP_SHARED,
  //                     memoryFd, 0);
  g_hostMemory = memory;
  {
    amdgpu::device::AmdgpuDevice device(bridgePuller.header);
-    for (std::uint32_t end = bridge->memoryAreaCount, i = 0; i < end; ++i) {
+    struct VmMapSlot {
-      auto area = bridge->memoryAreas[i];
+      int memoryType;
-      device.handleProtectMemory(area.address, area.size, area.prot);
+      int prot;
-    }
+      std::int64_t offset;
      std::uint64_t baseAddress;
      auto operator<=>(const VmMapSlot &) const = default;
    };
    // Per-process state kept by the command loop, keyed by pid in processInfo.
    struct ProcessInfo {
      int vmId = -1; // VM slot set by mapProcess; reset to -1 by unmapProcess
      int vmFd = -1; // fd returned by shm_open in mapProcess; -1 once closed
      // Address ranges recorded by MapMemory/ProtectMemory commands.
      rx::MemoryTableWithPayload<VmMapSlot> vmTable;
    };
    // Attaches `process` to VM slot `vmId`.
    //
    // Opens the shared-memory object named "<shmName>-<pid>" and replays every
    // range recorded in process.vmTable at the host address produced by
    // RemoteMemory::getPointer for that slot. Ranges whose GPU protection bits
    // (slot.prot >> 4) are zero are skipped. Any failure aborts the program.
    auto mapProcess = [&](std::int64_t pid, int vmId, ProcessInfo &process) {
      process.vmId = vmId;
      auto memory = amdgpu::RemoteMemory{vmId};
      std::string pidVmName = shmName;
      pidVmName += '-';
      pidVmName += std::to_string(pid);
      int memoryFd = ::shm_open(pidVmName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
      process.vmFd = memoryFd;
      if (memoryFd < 0) {
        // Diagnostics go to stderr: stdout may be fully buffered and abort()
        // is not required to flush it, which would lose the message.
        std::fprintf(stderr, "failed to open process %x shared memory\n",
                     (int)pid);
        std::abort();
      }
      for (auto [startAddress, endAddress, slot] : process.vmTable) {
        auto gpuProt = slot.prot >> 4;
        if (gpuProt == 0) {
          continue;
        }
        // slot.offset is the file offset of slot.baseAddress; shift it by how
        // far this range starts past the base.
        auto devOffset = slot.offset + startAddress - slot.baseAddress;
        // A negative memoryType selects the process's own shared memory,
        // otherwise memoryType indexes dmemFd.
        int mapFd = memoryFd;
        if (slot.memoryType >= 0) {
          if (static_cast<std::size_t>(slot.memoryType) >= std::size(dmemFd)) {
            std::fprintf(stderr, "process %x: invalid direct memory index %d\n",
                         (int)pid, slot.memoryType);
            std::abort();
          }
          mapFd = dmemFd[slot.memoryType];
        }
        auto mmapResult =
            ::mmap(memory.getPointer(startAddress), endAddress - startAddress,
                   gpuProt, MAP_FIXED | MAP_SHARED, mapFd, devOffset);
        if (mmapResult == MAP_FAILED) {
          std::fprintf(
              stderr,
              "failed to map process %x memory, address %lx-%lx, type %x\n",
              (int)pid, startAddress, endAddress, slot.memoryType);
          std::abort();
        }
        device.handleProtectMemory(memory, startAddress,
                                   endAddress - startAddress, slot.prot);
      }
    };
    // Detaches `process` from its VM slot: re-reserves the slot's 1 TiB window
    // (vmId << 40, size 1 << 40), closes the shared-memory fd and marks the
    // process as unmapped. The recorded vmTable is left untouched.
    auto unmapProcess = [&](ProcessInfo &process) {
      if (process.vmId < 0) {
        // Never mapped (or already unmapped): without this guard the window
        // address would be computed from vmId == -1.
        return;
      }
      auto startAddress = static_cast<std::uint64_t>(process.vmId) << 40;
      auto size = static_cast<std::uint64_t>(1) << 40;
      rx::mem::reserve(reinterpret_cast<void *>(startAddress), size);
      if (process.vmFd >= 0) {
        ::close(process.vmFd);
      }
      process.vmFd = -1;
      process.vmId = -1;
    };
    std::unordered_map<std::int64_t, ProcessInfo> processInfo;
    std::vector<VkCommandBuffer> presentCmdBuffers(swapchainImages.size());
@ -966,66 +1009,141 @@ int main(int argc, const char *argv[]) {
      for (auto cmd : std::span(commandsBuffer, pulledCount)) {
        switch (cmd.id) {
-        case amdgpu::bridge::CommandId::ProtectMemory:
+        case amdgpu::bridge::CommandId::ProtectMemory: {
-          device.handleProtectMemory(cmd.memoryProt.address,
+          auto &process = processInfo[cmd.memoryProt.pid];
                                     cmd.memoryProt.size, cmd.memoryProt.prot);
          break;
        case amdgpu::bridge::CommandId::CommandBuffer:
          device.handleCommandBuffer(cmd.commandBuffer.queue,
                                     cmd.commandBuffer.address,
                                     cmd.commandBuffer.size);
          break;
        case amdgpu::bridge::CommandId::Flip: {
          if (!isImageAcquired) {
            Verify() << vkAcquireNextImageKHR(vkDevice, swapchain, UINT64_MAX,
                                              presentCompleteSemaphore, nullptr,
                                              &imageIndex);
-            vkWaitForFences(vkDevice, 1, &inFlightFences[imageIndex], VK_TRUE,
+          auto vmSlotIt = process.vmTable.queryArea(cmd.memoryProt.address);
-                            UINT64_MAX);
+          if (vmSlotIt == process.vmTable.end()) {
-            vkResetFences(vkDevice, 1, &inFlightFences[imageIndex]);
+            std::abort();
          }
-          isImageAcquired = false;
+          auto vmSlot = (*vmSlotIt).payload;
-          vkResetCommandBuffer(presentCmdBuffers[imageIndex], 0);
+          process.vmTable.map(cmd.memoryProt.address,
-          VkCommandBufferBeginInfo beginInfo{};
+                              cmd.memoryProt.address + cmd.memoryProt.size,
-          beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+                              VmMapSlot{
-          beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+                                  .memoryType = vmSlot.memoryType,
                                  .prot = static_cast<int>(cmd.memoryProt.prot),
                                  .offset = vmSlot.offset,
                                  .baseAddress = vmSlot.baseAddress,
                              });
-          vkBeginCommandBuffer(presentCmdBuffers[imageIndex], &beginInfo);
+          if (process.vmId >= 0) {
            auto memory = amdgpu::RemoteMemory{process.vmId};
            // The host protection is taken from the upper bits (prot >> 4).
            rx::mem::protect(memory.getPointer(cmd.memoryProt.address),
                             cmd.memoryProt.size, cmd.memoryProt.prot >> 4);
            // This case handles ProtectMemory, so read cmd.memoryProt like the
            // rest of the case does; cmd.mapMemory belongs to MapMemory.
            device.handleProtectMemory(memory, cmd.memoryProt.address,
                                       cmd.memoryProt.size, cmd.memoryProt.prot);
          }
          break;
        }
        case amdgpu::bridge::CommandId::CommandBuffer: {
          auto &process = processInfo[cmd.commandBuffer.pid];
          if (process.vmId >= 0) {
            device.handleCommandBuffer(
                amdgpu::RemoteMemory{process.vmId}, cmd.commandBuffer.queue,
                cmd.commandBuffer.address, cmd.commandBuffer.size);
          }
          break;
        }
        case amdgpu::bridge::CommandId::Flip: {
          auto &process = processInfo[cmd.flip.pid];
-          if (device.handleFlip(
+          if (process.vmId >= 0) {
-                  presentQueue, presentCmdBuffers[imageIndex],
+            if (!isImageAcquired) {
-                  *flipTaskChain[imageIndex].get(), cmd.flip.bufferIndex,
+              Verify() << vkAcquireNextImageKHR(vkDevice, swapchain, UINT64_MAX,
-                  cmd.flip.arg, swapchainImages[imageIndex], swapchainExtent,
+                                                presentCompleteSemaphore,
-                  presentCompleteSemaphore, renderCompleteSemaphore,
+                                                nullptr, &imageIndex);
-                  inFlightFences[imageIndex])) {
+
-            VkPresentInfoKHR presentInfo{
+              vkWaitForFences(vkDevice, 1, &inFlightFences[imageIndex], VK_TRUE,
-                .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
+                              UINT64_MAX);
-                .waitSemaphoreCount = 1,
+              vkResetFences(vkDevice, 1, &inFlightFences[imageIndex]);
-                .pWaitSemaphores = &renderCompleteSemaphore,
+            }
-                .swapchainCount = 1,
+
-                .pSwapchains = &swapchain,
+            isImageAcquired = false;
-                .pImageIndices = &imageIndex,
+
-            };
+            vkResetCommandBuffer(presentCmdBuffers[imageIndex], 0);
-            if (vkQueuePresentKHR(presentQueue, &presentInfo) != VK_SUCCESS) {
+            VkCommandBufferBeginInfo beginInfo{};
-              std::printf("swapchain was invalidated\n");
+            beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
-              createSwapchain();
+            beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
            vkBeginCommandBuffer(presentCmdBuffers[imageIndex], &beginInfo);
            if (device.handleFlip(
                    amdgpu::RemoteMemory{process.vmId}, presentQueue,
                    presentCmdBuffers[imageIndex],
                    *flipTaskChain[imageIndex].get(), cmd.flip.bufferIndex,
                    cmd.flip.arg, swapchainImages[imageIndex], swapchainExtent,
                    presentCompleteSemaphore, renderCompleteSemaphore,
                    inFlightFences[imageIndex])) {
              VkPresentInfoKHR presentInfo{
                  .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
                  .waitSemaphoreCount = 1,
                  .pWaitSemaphores = &renderCompleteSemaphore,
                  .swapchainCount = 1,
                  .pSwapchains = &swapchain,
                  .pImageIndices = &imageIndex,
              };
              if (vkQueuePresentKHR(presentQueue, &presentInfo) != VK_SUCCESS) {
                std::printf("swapchain was invalidated\n");
                createSwapchain();
              }
            } else {
              isImageAcquired = true;
            }
          } else {
            isImageAcquired = true;
          }
          break;
        }
-        case amdgpu::bridge::CommandId::MapDmem: {
+        case amdgpu::bridge::CommandId::MapProcess: {
-          auto addr = g_hostMemory.getPointer(cmd.mapDmem.address);
+          mapProcess(cmd.mapProcess.pid, cmd.mapProcess.vmId, processInfo[cmd.mapProcess.pid]);
-          auto mapping = ::mmap(addr, cmd.mapDmem.size,
+          break;
-                 PROT_READ | PROT_WRITE /*TODO: cmd.mapDmem.prot >> 4*/,
+        }
-                 MAP_FIXED | MAP_SHARED, dmemFd[cmd.mapDmem.dmemIndex],
+        case amdgpu::bridge::CommandId::UnmapProcess: {
-                 cmd.mapDmem.offset);
+          // The decoder stores the pid of an UnmapProcess command in
+          // cmd.unmapProcess, not cmd.mapProcess.
+          unmapProcess(processInfo[cmd.unmapProcess.pid]);
-          device.handleProtectMemory(cmd.mapDmem.address, cmd.mapDmem.size, 0x33 /*TODO: cmd.mapDmem.prot*/);
+          break;
        }
        case amdgpu::bridge::CommandId::MapMemory: {
          auto &process = processInfo[cmd.mapMemory.pid];
          process.vmTable.map(
              cmd.mapMemory.address, cmd.mapMemory.address + cmd.mapMemory.size,
              VmMapSlot{
                  .memoryType = static_cast<int>(cmd.mapMemory.memoryType >= 0
                                                     ? cmd.mapMemory.dmemIndex
                                                     : -1),
                  .prot = static_cast<int>(cmd.mapMemory.prot),
                  .offset = cmd.mapMemory.offset,
                  .baseAddress = cmd.mapMemory.address,
              });
          if (process.vmId >= 0) {
            auto memory = amdgpu::RemoteMemory{process.vmId};
            int mapFd = process.vmFd;
            if (cmd.mapMemory.memoryType >= 0) {
              mapFd = dmemFd[cmd.mapMemory.dmemIndex];
            }
            auto mmapResult =
                ::mmap(memory.getPointer(cmd.mapMemory.address),
                       cmd.mapMemory.size, cmd.mapMemory.prot >> 4,
                       MAP_FIXED | MAP_SHARED, mapFd, cmd.mapMemory.offset);
            if (mmapResult == MAP_FAILED) {
              std::printf(
                  "failed to map process %x memory, address %lx-%lx, type %x\n",
                  (int)cmd.mapMemory.pid, cmd.mapMemory.address,
                  cmd.mapMemory.address + cmd.mapMemory.size,
                  cmd.mapMemory.memoryType);
              std::abort();
            }
            device.handleProtectMemory(memory, cmd.mapMemory.address,
                                       cmd.mapMemory.size, cmd.mapMemory.prot);
          }
          break;
        }
--- a/rpcsx-os/iodev/dmem.cpp
+++ b/rpcsx-os/iodev/dmem.cpp
@ -44,6 +44,12 @@ orbis::ErrorCode DmemDevice::mmap(void **address, std::uint64_t len,
           rx::vm::kMapProtGpuAll;
  }
  auto allocationInfoIt = allocations.queryArea(directMemoryStart);
  if (allocationInfoIt == allocations.end()) {
    std::abort();
  }
  auto allocationInfo = *allocationInfoIt;
  auto result =
      rx::vm::map(*address, len, prot, flags, rx::vm::kMapInternalReserveOnly,
                  this, directMemoryStart);
@ -60,9 +66,10 @@ orbis::ErrorCode DmemDevice::mmap(void **address, std::uint64_t len,
    return orbis::ErrorCode::INVAL;
  }
-  rx::bridge.sendMapDmem(orbis::g_currentThread->tproc->pid, index,
+  rx::bridge.sendMapMemory(orbis::g_currentThread->tproc->pid,
-                         reinterpret_cast<std::uint64_t>(result), len, prot,
+                           allocationInfo.payload.memoryType, index,
-                         directMemoryStart);
+                           reinterpret_cast<std::uint64_t>(result), len, prot,
                           directMemoryStart);
  *address = result;
--- a/rpcsx-os/iodev/gc.cpp
+++ b/rpcsx-os/iodev/gc.cpp
@ -10,6 +10,7 @@
 #include <cstdio>
 #include <mutex>
 #include <sys/mman.h>
 #include <unordered_map>
 struct ComputeQueue {
  std::uint64_t ringBaseAddress{};
@ -19,14 +20,104 @@ struct ComputeQueue {
  std::uint64_t len{};
 };
 // Starts a detached "Bridge" thread that serves cache-protection requests for
 // one VM slot.
 //
 // The thread polls the command slots of `vmId` in the bridge header. Each
 // non-zero 64-bit command encodes a first page (low 32 bits) and a page count
 // minus one (high 32 bits). For every command the page flags stored in
 // cachePages[vmId][page] are combined with the CPU protection reported by
 // rx::vm::getPageProtection and applied with mprotect. An mprotect failure
 // aborts the process.
 //
 // cacheCommands is declared as [6][4]; the first dimension is treated as the
 // VM slot, matching cachePages[vmId][...]. The previous code iterated the
 // outer dimension and indexed the inner one with vmId, which runs past the
 // 4-element row for vmId 4 and 5.
 // NOTE(review): the producer of these commands must use the same
 // [vmId][slot] indexing - confirm against the GPU-side writer.
 static void runBridge(int vmId) {
  std::thread{[=] {
    pthread_setname_np(pthread_self(), "Bridge");
    auto bridge = rx::bridge.header;
    auto &commandSlots = bridge->cacheCommands[vmId];
    std::vector<std::uint64_t> fetchedCommands;
    fetchedCommands.reserve(std::size(commandSlots));
    while (true) {
      // Drain every pending command of this VM slot; zero marks a free slot.
      for (auto &command : commandSlots) {
        std::uint64_t value = command.load(std::memory_order::relaxed);
        if (value != 0) {
          fetchedCommands.push_back(value);
          command.store(0, std::memory_order::relaxed);
        }
      }
      if (fetchedCommands.empty()) {
        continue;
      }
      for (auto command : fetchedCommands) {
        auto page = static_cast<std::uint32_t>(command);
        auto count = static_cast<std::uint32_t>(command >> 32) + 1;
        auto pageFlags =
            bridge->cachePages[vmId][page].load(std::memory_order::relaxed);
        auto address =
            static_cast<std::uint64_t>(page) * amdgpu::bridge::kHostPageSize;
        // Start from the protection the guest mapping asked for ...
        auto origVmProt = rx::vm::getPageProtection(address);
        int prot = 0;
        if (origVmProt & rx::vm::kMapProtCpuRead) {
          prot |= PROT_READ;
        }
        if (origVmProt & rx::vm::kMapProtCpuWrite) {
          prot |= PROT_WRITE;
        }
        if (origVmProt & rx::vm::kMapProtCpuExec) {
          prot |= PROT_EXEC;
        }
        // ... then remove whatever access the page flags forbid.
        if (pageFlags & amdgpu::bridge::kPageReadWriteLock) {
          prot &= ~(PROT_READ | PROT_WRITE);
        } else if (pageFlags & amdgpu::bridge::kPageWriteWatch) {
          prot &= ~PROT_WRITE;
        }
        // std::fprintf(stderr, "protection %lx-%lx\n", address,
        //              address + amdgpu::bridge::kHostPageSize * count);
        if (::mprotect(reinterpret_cast<void *>(address),
                       amdgpu::bridge::kHostPageSize * count, prot)) {
          perror("protection failed");
          std::abort();
        }
      }
      fetchedCommands.clear();
    }
  }}.detach();
 }
 static constexpr auto kVmIdCount = 6;
 // Device object returned by createGcCharacterDevice. Keeps a per-pid count of
 // open handles and hands out VM slot ids from a bitmask.
 struct GcDevice : public IoDevice {
  // One bit per VM id, set while the id is free. The initial value sets the
  // low kVmIdCount + 1 bits; allocateVmId only returns ids below kVmIdCount.
  std::uint32_t freeVmIds = (1 << (kVmIdCount + 1)) - 1;
  orbis::shared_mutex mtx;
  // pid -> number of open handles held by that process.
  orbis::kmap<orbis::pid_t, int> clients;
  orbis::kmap<std::uint64_t, ComputeQueue> computeQueues;
  orbis::ErrorCode open(orbis::Ref<orbis::File> *file, const char *path,
                        std::uint32_t flags, std::uint32_t mode,
                        orbis::Thread *thread) override;
  void addClient(orbis::Process *process);
  void removeClient(orbis::Process *process);
  // Claims the lowest free VM id and returns it; aborts when none below
  // kVmIdCount is free.
  int allocateVmId() {
    const int slot = std::countr_zero(freeVmIds);
    if (slot < kVmIdCount) {
      freeVmIds &= ~(1 << slot);
      return slot;
    }
    std::fprintf(stderr, "out of vm slots\n");
    std::abort();
  }
  // Marks `vmId` as free again.
  void deallocateVmId(int vmId) { freeVmIds |= (1 << vmId); }
 };
-struct GcFile : public orbis::File {};
+
 // File object handed out by GcDevice::open. It remembers the opening process
 // so the device's client count can be dropped when the file is destroyed.
 struct GcFile : public orbis::File {
  orbis::Process *process = nullptr; // set to thread->tproc by GcDevice::open
  // Releases the client reference that GcDevice::open took via addClient.
  ~GcFile() { device.staticCast<GcDevice>()->removeClient(process); }
 };
 static std::uint64_t g_submitDoneFlag;
 static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
@ -34,7 +125,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
  // 0xc00c8110
  // 0xc0848119
-  auto device = static_cast<GcDevice *>(file->device.get());
+  auto device = file->device.staticCast<GcDevice>();
  std::lock_guard lock(device->mtx);
  switch (request) {
@ -55,7 +146,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
    // flockfile(stderr);
    // if (thread->tproc->pid != amdgpu::bridge::expGpuPid) {
-      // ORBIS_LOG_ERROR("gc ioctl submit", args->arg0, args->count, args->cmds);
+    // ORBIS_LOG_ERROR("gc ioctl submit", args->arg0, args->count, args->cmds);
    // }
    for (unsigned i = 0; i < args->count; ++i) {
@ -172,14 +263,20 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
    break;
  }
-  case 0xc010810b: { // something like stats masks?
+  case 0xc010810b: { // get cu masks param
    struct Args {
-      std::uint64_t arg1;
+      std::uint32_t se0sh0;
-      std::uint64_t arg2;
+      std::uint32_t se0sh1;
      std::uint32_t se1sh0;
      std::uint32_t se1sh1;
    };
    auto args = reinterpret_cast<Args *>(argp);
-    ORBIS_LOG_ERROR("gc ioctl stats mask", args->arg1, args->arg2);
+    // ORBIS_LOG_ERROR("gc ioctl stats mask", args->arg1, args->arg2);
    args->se0sh0 = ~0;
    args->se0sh1 = ~0;
    args->se1sh0 = ~0;
    args->se1sh1 = ~0;
    break;
  }
@ -265,8 +362,14 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request,
  }
  case 0xc0048113: {
-    // get client number
+    // get num clients
-    *(std::uint32_t *)argp = 0;
+
    struct Args {
      std::uint32_t numClients;
    };
    auto *args = reinterpret_cast<Args *>(argp);
    args->numClients = device->clients.size();
    break;
  }
@ -312,8 +415,38 @@ orbis::ErrorCode GcDevice::open(orbis::Ref<orbis::File> *file, const char *path,
  auto newFile = orbis::knew<GcFile>();
  newFile->device = this;
  newFile->ops = &ops;
  newFile->process = thread->tproc;
  addClient(thread->tproc);
  *file = newFile;
  return {};
 }
 // Counts one more open handle for `process`. The first handle of a process
 // allocates a VM id, announces it with a MapProcess command, stores it in
 // process->vmId and starts the bridge thread for that id.
 // NOTE(review): runBridge starts a detached thread on every first open;
 // nothing visible here stops it when the last handle goes away - confirm.
 void GcDevice::addClient(orbis::Process *process) {
  std::lock_guard lock(mtx);
  auto &handleCount = clients[process->pid];
  handleCount += 1;
  if (handleCount != 1) {
    return;
  }
  const auto vmId = allocateVmId();
  rx::bridge.sendMapProcess(process->pid, vmId);
  process->vmId = vmId;
  runBridge(vmId);
 }
 // Drops one open handle of `process`. When the last handle goes away the
 // client entry is erased, an UnmapProcess command is sent, the VM id is
 // returned to the free mask and process->vmId is reset to -1.
 // The process must have a non-zero client count (checked by assert only).
 void GcDevice::removeClient(orbis::Process *process) {
  std::lock_guard lock(mtx);
  auto entry = clients.find(process->pid);
  assert(entry != clients.end());
  assert(entry->second != 0);
  entry->second -= 1;
  if (entry->second != 0) {
    return;
  }
  clients.erase(entry);
  rx::bridge.sendUnmapProcess(process->pid);
  deallocateVmId(process->vmId);
  process->vmId = -1;
 }
 // Factory: allocates a GcDevice with orbis::knew and returns it as IoDevice *.
 IoDevice *createGcCharacterDevice() { return orbis::knew<GcDevice>(); }
--- a/rpcsx-os/main.cpp
+++ b/rpcsx-os/main.cpp
@ -41,71 +41,6 @@
 #include <unordered_map>
 static int g_gpuPid;
 // Spawns a detached thread named "Bridge" that never returns.
 //
 // The thread spins over the `cacheCommands` slots of the bridge header. Every
 // non-zero slot value is taken and the slot is reset to 0. A command packs a
 // page index in its low 32 bits and (page count - 1) in its high 32 bits. For
 // each command the host protection of that page range is recomputed from the
 // VM protection of the first page, restricted by the page's cache flags, and
 // applied with mprotect. An mprotect failure aborts the process.
 void runBridge() {
  std::thread worker{[] {
    pthread_setname_np(pthread_self(), "Bridge");

    auto header = rx::bridge.header;

    // Scratch list reused across iterations; sized for one full sweep.
    std::vector<std::uint64_t> pending;
    pending.reserve(std::size(header->cacheCommands));

    for (;;) {
      // Sweep the slots: collect whatever is non-zero and clear the slot.
      for (auto &slot : header->cacheCommands) {
        std::uint64_t packed = slot.load(std::memory_order::relaxed);
        if (packed == 0) {
          continue;
        }

        pending.push_back(packed);
        slot.store(0, std::memory_order::relaxed);
      }

      // With nothing collected this loop is skipped and the sweep repeats.
      for (auto packed : pending) {
        auto pageIndex = static_cast<std::uint32_t>(packed);
        auto pageCount = static_cast<std::uint32_t>(packed >> 32) + 1;

        auto flags =
            header->cachePages[pageIndex].load(std::memory_order::relaxed);

        auto base = static_cast<std::uint64_t>(pageIndex) *
                    amdgpu::bridge::kHostPageSize;

        // Translate the VM protection of the first page into PROT_* bits.
        auto vmProt = rx::vm::getPageProtection(base);
        int hostProt = 0;
        hostProt |= (vmProt & rx::vm::kMapProtCpuRead) ? PROT_READ : 0;
        hostProt |= (vmProt & rx::vm::kMapProtCpuWrite) ? PROT_WRITE : 0;
        hostProt |= (vmProt & rx::vm::kMapProtCpuExec) ? PROT_EXEC : 0;

        // A read/write lock strips both read and write access; otherwise a
        // write watch strips write access only.
        int denied = 0;
        if (flags & amdgpu::bridge::kPageReadWriteLock) {
          denied = PROT_READ | PROT_WRITE;
        } else if (flags & amdgpu::bridge::kPageWriteWatch) {
          denied = PROT_WRITE;
        }
        hostProt &= ~denied;

        auto length = amdgpu::bridge::kHostPageSize * pageCount;
        if (::mprotect(reinterpret_cast<void *>(base), length, hostProt) !=
            0) {
          perror("protection failed");
          std::abort();
        }
      }

      pending.clear();
    }
  }};

  worker.detach();
 }
 extern bool allowMonoDebug;
 __attribute__((no_stack_protector)) static void
@ -116,8 +51,9 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
  auto signalAddress = reinterpret_cast<std::uintptr_t>(info->si_addr);
-  if (orbis::g_currentThread != nullptr && sig == SIGSEGV &&
+  if (orbis::g_currentThread != nullptr && orbis::g_currentThread->tproc->vmId >= 0 && sig == SIGSEGV &&
      signalAddress >= 0x40000 && signalAddress < 0x100'0000'0000) {
    auto vmid = orbis::g_currentThread->tproc->vmId;
    auto ctx = reinterpret_cast<ucontext_t *>(ucontext);
    bool isWrite = (ctx->uc_mcontext.gregs[REG_ERR] & 0x2) != 0;
    auto origVmProt = rx::vm::getPageProtection(signalAddress);
@ -138,17 +74,17 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
      auto bridge = rx::bridge.header;
      while (true) {
-        auto flags = bridge->cachePages[page].load(std::memory_order::relaxed);
+        auto flags = bridge->cachePages[vmid][page].load(std::memory_order::relaxed);
        if ((flags & amdgpu::bridge::kPageReadWriteLock) != 0) {
          if ((flags & amdgpu::bridge::kPageLazyLock) != 0) {
            if (std::uint32_t gpuCommand = 0;
-                !bridge->gpuCacheCommand.compare_exchange_weak(gpuCommand,
+                !bridge->gpuCacheCommand[vmid].compare_exchange_weak(gpuCommand,
                                                               page)) {
              continue;
            }
-            while (!bridge->cachePages[page].compare_exchange_weak(
+            while (!bridge->cachePages[vmid][page].compare_exchange_weak(
                flags, flags & ~amdgpu::bridge::kPageLazyLock,
                std::memory_order::relaxed)) {
            }
@ -165,7 +101,7 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
          break;
        }
-        if (bridge->cachePages[page].compare_exchange_weak(
+        if (bridge->cachePages[vmid][page].compare_exchange_weak(
                flags, amdgpu::bridge::kPageInvalidated,
                std::memory_order::relaxed)) {
          break;
@ -188,6 +124,7 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) {
  }
  if (orbis::g_currentThread != nullptr) {
    orbis::g_currentThread->tproc->exitStatus = sig;
    orbis::g_currentThread->tproc->event.emit(orbis::kEvFiltProc,
                                              orbis::kNoteExit, sig);
  }
@ -1640,29 +1577,34 @@ int main(int argc, const char *argv[]) {
  };
  if (isSystem) {
    amdgpu::bridge::expGpuPid = isSafeMode ? 20001 : 60001;
    orbis::g_context.safeMode = isSafeMode ? 1 : 0;
-    initProcess->authInfo = {
+    initProcess->authInfo = {.unk0 = 0x380000000000000f,
-        .unk0 = 0x380000000000000f,
+                             .caps =
-        .caps =
+                                 {
-            {
+                                     -1ul,
-                -1ul,
+                                     -1ul,
-                -1ul,
+                                     -1ul,
-                -1ul,
+                                     -1ul,
-                -1ul,
+                                 },
-            },
+                             .attrs =
-        .attrs =
+                                 {
-            {
+                                     0x4000400040000000,
-                0x4000400040000000,
+                                     0x4000000000000000,
-                0x4000000000000000,
+                                     0x0080000000000002,
-                0x0080000000000002,
+                                     0xF0000000FFFF4000,
-                0xF0000000FFFF4000,
+                                 },
-            },
+                             .ucred = {
-    };
+                                 -1ul,
                                 -1ul,
                                 0x3800000000000022,
                                 -1ul,
                                 (1ul << 0x3a),
                                 -1ul,
                                 -1ul,
                             }};
    initProcess->budgetId = 0;
    initProcess->isInSandbox = false;
  } else {
    amdgpu::bridge::expGpuPid = initProcess->pid;
    initProcess->authInfo = {
        .unk0 = 0x3100000000000001,
        .caps =
@ -1788,7 +1730,6 @@ int main(int argc, const char *argv[]) {
      launchDaemon(mainThread, "/system/sys/orbis_audiod.elf",
                   {"/system/sys/orbis_audiod.elf"}, {});
      runBridge();
      status = ps4Exec(mainThread, execEnv, std::move(executableModule),
                       ps4Argv, {});
    }
--- a/rpcsx-os/ops.cpp
+++ b/rpcsx-os/ops.cpp
@ -43,7 +43,6 @@ using namespace orbis;
 extern bool allowMonoDebug;
 extern "C" void __register_frame(const void *);
 void runBridge();
 void setupSigHandlers();
 int ps4Exec(orbis::Thread *mainThread,
            orbis::utils::Ref<orbis::Module> executableModule,
@ -828,9 +827,6 @@ SysResult fork(Thread *thread, slong flags) {
  dup2(logFd, 1);
  dup2(logFd, 2);
  if (childPid == amdgpu::bridge::expGpuPid) {
    runBridge();
  }
  return {};
 }
--- a/rpcsx-os/vm.cpp
+++ b/rpcsx-os/vm.cpp
@ -958,11 +958,8 @@ void *rx::vm::map(void *addr, std::uint64_t len, std::int32_t prot,
  }
  if (auto thr = orbis::g_currentThread) {
-    // std::fprintf(stderr, "sending mapping %lx-%lx, pid %lx\n", address,
+    rx::bridge.sendMapMemory(thr->tproc->pid, -1, -1, address, len, prot,
-    //              address + len, thr->tproc->pid);
+                             address - kMinAddress);
    // if (!noOverwrite) {
    //   rx::bridge.sendMemoryProtect(thr->tproc->pid, address, len, prot);
    // }
  } else {
    std::fprintf(stderr, "ignoring mapping %lx-%lx\n", address, address + len);
  }