gpu: add multiprocess buffer commits

2026-04-20 22:05:12 +00:00 · 2024-09-02 20:24:16 +03:00 · 2024-09-02 20:24:16 +03:00 · 6a9924ebd1
commit 6a9924ebd1
parent d0c9585b62
5 changed files with 136 additions and 66 deletions
--- a/hw/amdgpu/bridge/include/amdgpu/bridge/bridge.hpp
+++ b/hw/amdgpu/bridge/include/amdgpu/bridge/bridge.hpp
@ -48,6 +48,8 @@ enum class CommandId : std::uint32_t {
  MapMemory,
  MapProcess,
  UnmapProcess,
+  RegisterBuffer,
+  RegisterBufferAttribute,
 };

 struct CmdMemoryProt {
@ -64,13 +66,25 @@ struct CmdCommandBuffer {
  std::uint32_t pid;
 };

-struct CmdBuffer {
-  std::uint32_t width;
-  std::uint32_t height;
-  std::uint32_t pitch;
-  std::uint64_t address;
+struct CmdBufferAttribute { // payload of CommandId::RegisterBufferAttribute (see Command::bufferAttribute)
+  std::uint32_t pid; // sender process id; key into processInfo in rpcsx-gpu/main.cpp
+  std::uint8_t attrId; // slot in the per-process attribute table (the handler aborts on ids >= 10)
+  std::uint8_t submit; // forwarded from the dce ioctl arguments; not read elsewhere in this patch
+  std::uint64_t canary; // forwarded from the dce ioctl arguments; not read elsewhere in this patch
  std::uint32_t pixelFormat; // switched on in handleFlip to pick surfFormat/channelType
  std::uint32_t tilingMode; // handleFlip: 1 -> kTileModeDisplay_2dThin, anything else -> kTileModeDisplay_LinearAligned
+  std::uint32_t pitch; // passed to cache.getImage; 0 makes handleFlip report an unallocated buffer
+  std::uint32_t width; // passed to cache.getImage and used for the blit srcOffsets in handleFlip
+  std::uint32_t height; // passed to cache.getImage and srcOffsets; 0 makes handleFlip report an unallocated buffer
+};
+
+struct CmdBuffer { // payload of CommandId::RegisterBuffer (see Command::buffer)
+  std::uint64_t canary; // forwarded from the dce ioctl arguments; not read elsewhere in this patch
+  std::uint32_t index; // slot in the per-process buffer table (the handler aborts on indices >= 10)
+  std::uint32_t attrId; // handleFlip uses it to pick the CmdBufferAttribute describing this buffer
+  std::uint64_t address; // image address given to cache.getImage; 0 makes handleFlip report an unallocated buffer
+  std::uint64_t address2; // forwarded from the dce ioctl arguments; not read elsewhere in this patch
+  std::uint32_t pid; // sender process id; declared last here but sent first (args[0]) by sendRegisterBuffer
+};

 struct CmdFlip {
@ -118,14 +132,14 @@ struct BridgeHeader {
  std::uint64_t vmSize;
  char vmName[32];
  PadState kbPadState;
-  volatile std::uint32_t flipBuffer;
-  volatile std::uint64_t flipArg;
-  volatile std::uint64_t flipCount;
-  volatile std::uint64_t bufferInUseAddress;
+  volatile std::uint32_t flipBuffer[6];
+  volatile std::uint64_t flipArg[6];
+  volatile std::uint64_t flipCount[6];
+  volatile std::uint64_t bufferInUseAddress[6];
  std::uint32_t commandBufferCount;
  std::uint32_t bufferCount;
  CmdCommandBuffer commandBuffers[32];
-  CmdBuffer buffers[10];
+  // CmdBuffer buffers[10];
  // orbis::shared_mutex cacheCommandMtx;
  // orbis::shared_cv cacheCommandCv;
  std::atomic<std::uint64_t> cacheCommands[6][4];
@ -144,6 +158,7 @@ struct Command {
    CmdMemoryProt memoryProt;
    CmdCommandBuffer commandBuffer;
    CmdBuffer buffer;
+    CmdBufferAttribute bufferAttribute;
    CmdFlip flip;
    CmdMapMemory mapMemory;
    CmdMapProcess mapProcess;
@ -181,6 +196,23 @@ struct BridgePusher {
                {pid, memoryType, dmemIndex, address, size, prot, offset});
  }

+  void sendRegisterBuffer(std::uint32_t pid, std::uint64_t canary, // sends a CommandId::RegisterBuffer command carrying these six values
+                          std::uint32_t index, std::uint32_t attrId,
+                          std::uint64_t address, std::uint64_t address2) {
+    sendCommand(CommandId::RegisterBuffer,
+                {pid, canary, index, attrId, address, address2}); // order must match the RegisterBuffer unpack case, which reads args[0]..args[5]
+  }
+  void sendRegisterBufferAttribute(std::uint32_t pid, std::uint8_t attrId, // sends a CommandId::RegisterBufferAttribute command carrying these nine values
+                                   std::uint8_t submit, std::uint64_t canary,
+                                   std::uint32_t pixelFormat,
+                                   std::uint32_t tilingMode,
+                                   std::uint32_t pitch, std::uint32_t width,
+                                   std::uint32_t height) {
+    sendCommand(CommandId::RegisterBufferAttribute,
+                {pid, attrId, submit, canary, pixelFormat, tilingMode, pitch, // order must match the RegisterBufferAttribute unpack case, which reads args[0]..args[8]
+                 width, height});
+  }
+
  void sendCommandBuffer(std::uint32_t pid, std::uint64_t queue,
                         std::uint64_t address, std::uint64_t size) {
    sendCommand(CommandId::CommandBuffer, {pid, queue, address, size});
@ -335,6 +367,27 @@ private:
    case CommandId::UnmapProcess:
      result.unmapProcess.pid = args[0];
      return result;
+
+    case CommandId::RegisterBufferAttribute: // decode into result.bufferAttribute; args are read in the order sendRegisterBufferAttribute lists them
+      result.bufferAttribute.pid = args[0];
+      result.bufferAttribute.attrId = args[1];
+      result.bufferAttribute.submit = args[2];
+      result.bufferAttribute.canary = args[3];
+      result.bufferAttribute.pixelFormat = args[4];
+      result.bufferAttribute.tilingMode = args[5];
+      result.bufferAttribute.pitch = args[6];
+      result.bufferAttribute.width = args[7];
+      result.bufferAttribute.height = args[8];
+      return result;
+
+    case CommandId::RegisterBuffer: // decode into result.buffer; args are read in the order sendRegisterBuffer lists them
+      result.buffer.pid = args[0]; // pid travels first even though it is the last field of CmdBuffer
+      result.buffer.canary = args[1];
+      result.buffer.index = args[2];
+      result.buffer.attrId = args[3];
+      result.buffer.address = args[4];
+      result.buffer.address2 = args[5];
+      return result;
    }

    __builtin_trap();
--- a/hw/amdgpu/device/include/amdgpu/device/device.hpp
+++ b/hw/amdgpu/device/include/amdgpu/device/device.hpp
@ -1311,7 +1311,9 @@ struct AmdgpuDevice {
                  TaskChain &initTaskChain, std::uint32_t bufferIndex,
                  std::uint64_t arg, VkImage targetImage,
                  VkExtent2D targetExtent, VkSemaphore waitSemaphore,
-                  VkSemaphore signalSemaphore, VkFence fence);
+                  VkSemaphore signalSemaphore, VkFence fence,
+                  bridge::CmdBuffer *buffers,
+                  bridge::CmdBufferAttribute *bufferAttributes);

  AmdgpuDevice(amdgpu::bridge::BridgeHeader *bridge);

--- a/hw/amdgpu/device/src/device.cpp
+++ b/hw/amdgpu/device/src/device.cpp
@ -2666,9 +2666,10 @@ struct CacheLine {
  std::mutex writeBackTableMtx;
  util::MemoryTableWithPayload<Ref<AsyncTaskCtl>> writeBackTable;

-  CacheLine(std::uint64_t areaAddress, std::uint64_t areaSize)
-      : areaAddress(areaAddress), areaSize(areaSize) {
+  CacheLine(RemoteMemory memory, std::uint64_t areaAddress, std::uint64_t areaSize)
+      :memory(memory), areaAddress(areaAddress), areaSize(areaSize) {
    memoryOverlay = new MemoryOverlay();
+    memoryOverlay->memory = memory;
    hostSyncTable.map(areaAddress, areaAddress + areaSize, {1, memoryOverlay});
  }

@ -3631,8 +3632,7 @@ private:
      assert(address >= area.beginAddress && address + size < area.endAddress);
      it = cacheLines.emplace_hint(
          it, std::piecewise_construct, std::tuple{area.beginAddress},
-          std::tuple{area.beginAddress, area.endAddress});
-      it->second.memory = memory;
+          std::tuple{memory, area.beginAddress, area.endAddress});
    }

    return it->second;
@ -4817,8 +4817,8 @@ void amdgpu::device::AmdgpuDevice::handleProtectMemory(RemoteMemory memory,
      protStr = "unknown";
      break;
    }
-    std::fprintf(stderr, "Allocated area at %zx, size %lx, prot %s\n", address,
-                 size, protStr);
+    std::fprintf(stderr, "Allocated area at %zx, size %lx, prot %s, vmid %u\n", address,
+                 size, protStr, memory.vmId);
  } else {
    memoryAreaTable[memory.vmId].unmap(beginPage, endPage);
    std::fprintf(stderr, "Unmapped area at %zx, size %lx\n", address, size);
@ -4888,12 +4888,13 @@ bool amdgpu::device::AmdgpuDevice::handleFlip(
    RemoteMemory memory, VkQueue queue, VkCommandBuffer cmdBuffer,
    TaskChain &taskChain, std::uint32_t bufferIndex, std::uint64_t arg,
    VkImage targetImage, VkExtent2D targetExtent, VkSemaphore waitSemaphore,
-    VkSemaphore signalSemaphore, VkFence fence) {
+    VkSemaphore signalSemaphore, VkFence fence, bridge::CmdBuffer *buffers,
+    bridge::CmdBufferAttribute *bufferAttributes) {

  if (bufferIndex == ~static_cast<std::uint32_t>(0)) {
-    g_bridge->flipBuffer = bufferIndex;
-    g_bridge->flipArg = arg;
-    g_bridge->flipCount = g_bridge->flipCount + 1;
+    g_bridge->flipBuffer[memory.vmId] = bufferIndex;
+    g_bridge->flipArg[memory.vmId] = arg;
+    g_bridge->flipCount[memory.vmId] = g_bridge->flipCount[memory.vmId] + 1;

    // black surface, ignore for now
    return false;
@ -4904,9 +4905,10 @@ bool amdgpu::device::AmdgpuDevice::handleFlip(
  // std::fprintf(stderr, "host visible memory: ");
  // getHostVisibleMemory().dump();

-  auto buffer = g_bridge->buffers[bufferIndex];
+  auto buffer = buffers[bufferIndex];
+  auto bufferAttr = bufferAttributes[buffer.attrId];

-  if (buffer.pitch == 0 || buffer.height == 0 || buffer.address == 0) {
+  if (bufferAttr.pitch == 0 || bufferAttr.height == 0 || buffer.address == 0) {
    std::printf("Attempt to flip unallocated buffer\n");
    return false;
  }
@ -4925,7 +4927,7 @@ bool amdgpu::device::AmdgpuDevice::handleFlip(
  SurfaceFormat surfFormat;
  TextureChannelType channelType;

-  switch (buffer.pixelFormat) {
+  switch (bufferAttr.pixelFormat) {
  case 0x80000000:
    // bgra
    surfFormat = kSurfaceFormat8_8_8_8;
@ -4946,18 +4948,18 @@ bool amdgpu::device::AmdgpuDevice::handleFlip(

  default:
    util::unreachable("unimplemented color buffer format %x",
-                      buffer.pixelFormat);
+                      bufferAttr.pixelFormat);
  }

  auto &cache = getCache(memory);
  auto tag = cache.createTag();

-  imageRef =
-      cache.getImage(tag, taskChain, buffer.address, surfFormat, channelType,
-                     buffer.tilingMode == 1 ? kTileModeDisplay_2dThin
-                                            : kTileModeDisplay_LinearAligned,
-                     buffer.width, buffer.height, 1, buffer.pitch, 4, 5, 6, 7,
-                     shader::AccessOp::Load);
+  imageRef = cache.getImage(
+      tag, taskChain, buffer.address, surfFormat, channelType,
+      bufferAttr.tilingMode == 1 ? kTileModeDisplay_2dThin
+                                 : kTileModeDisplay_LinearAligned,
+      bufferAttr.width, bufferAttr.height, 1, bufferAttr.pitch, 4, 5, 6, 7,
+      shader::AccessOp::Load);

  auto initTask = taskChain.getLastTaskId();

@ -4972,8 +4974,8 @@ bool amdgpu::device::AmdgpuDevice::handleFlip(
                           .baseArrayLayer = 0,
                           .layerCount = 1},
        .srcOffsets = {{},
-                       {static_cast<int32_t>(buffer.width),
-                        static_cast<int32_t>(buffer.height), 1}},
+                       {static_cast<int32_t>(bufferAttr.width),
+                        static_cast<int32_t>(bufferAttr.height), 1}},
        .dstSubresource = {.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                           .mipLevel = 0,
                           .baseArrayLayer = 0,
@ -5064,11 +5066,11 @@ bool amdgpu::device::AmdgpuDevice::handleFlip(
  taskChain.add(submitCompleteTask, [=] {
    imageRef->unlock(tag);

-    g_bridge->flipBuffer = bufferIndex;
-    g_bridge->flipArg = arg;
-    g_bridge->flipCount = g_bridge->flipCount + 1;
+    g_bridge->flipBuffer[memory.vmId] = bufferIndex;
+    g_bridge->flipArg[memory.vmId] = arg;
+    g_bridge->flipCount[memory.vmId] = g_bridge->flipCount[memory.vmId] + 1;
    auto bufferInUse =
-        memory.getPointer<std::uint64_t>(g_bridge->bufferInUseAddress);
+        memory.getPointer<std::uint64_t>(g_bridge->bufferInUseAddress[memory.vmId]);
    if (bufferInUse != nullptr) {
      bufferInUse[bufferIndex] = 0;
    }
--- a/rpcsx-gpu/main.cpp
+++ b/rpcsx-gpu/main.cpp
@ -763,6 +763,8 @@ int main(int argc, const char *argv[]) {
    struct ProcessInfo { // state kept per pid; entries are reached through processInfo[pid]
      int vmId = -1;
      int vmFd = -1;
+      amdgpu::bridge::CmdBufferAttribute bufferAttributes[10]; // written by the RegisterBufferAttribute handler, indexed by attrId
+      amdgpu::bridge::CmdBuffer buffers[10]; // written by the RegisterBuffer handler, indexed by buffer index; handed to handleFlip
      rx::MemoryTableWithPayload<VmMapSlot> vmTable;
    };

@ -1033,7 +1035,8 @@ int main(int argc, const char *argv[]) {
            rx::mem::protect(memory.getPointer(cmd.memoryProt.address),
                             cmd.memoryProt.size, cmd.memoryProt.prot >> 4);
            device.handleProtectMemory(memory, cmd.memoryProt.address,
-                                       cmd.memoryProt.size, cmd.memoryProt.prot);
+                                       cmd.memoryProt.size,
+                                       cmd.memoryProt.prot);
          }
          break;
        }
@ -1075,7 +1078,8 @@ int main(int argc, const char *argv[]) {
                    *flipTaskChain[imageIndex].get(), cmd.flip.bufferIndex,
                    cmd.flip.arg, swapchainImages[imageIndex], swapchainExtent,
                    presentCompleteSemaphore, renderCompleteSemaphore,
-                    inFlightFences[imageIndex])) {
+                    inFlightFences[imageIndex], process.buffers,
+                    process.bufferAttributes)) {
              VkPresentInfoKHR presentInfo{
                  .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
                  .waitSemaphoreCount = 1,
@ -1096,7 +1100,8 @@ int main(int argc, const char *argv[]) {
        }

        case amdgpu::bridge::CommandId::MapProcess: {
-          mapProcess(cmd.mapProcess.pid, cmd.mapProcess.vmId, processInfo[cmd.mapProcess.pid]);
+          mapProcess(cmd.mapProcess.pid, cmd.mapProcess.vmId,
+                     processInfo[cmd.mapProcess.pid]);
          break;
        }
        case amdgpu::bridge::CommandId::UnmapProcess: {
@ -1147,6 +1152,28 @@ int main(int argc, const char *argv[]) {
          break;
        }

+        case amdgpu::bridge::CommandId::RegisterBuffer: { // store the received buffer descriptor in the sender's table
+          auto &process = processInfo[cmd.buffer.pid]; // per-process state selected by the pid carried in the command
+
+          if (cmd.buffer.attrId >= 10 || cmd.buffer.index >= 10) { // 10 matches the bufferAttributes/buffers array sizes in ProcessInfo
+            std::abort(); // out-of-range ids abort instead of indexing past the arrays
+          }
+
+          process.buffers[cmd.buffer.index] = cmd.buffer; // overwrites whatever was stored in this slot before
+          break;
+        }
+
+        case amdgpu::bridge::CommandId::RegisterBufferAttribute: { // store the received attribute set in the sender's table
+          auto &process = processInfo[cmd.bufferAttribute.pid]; // per-process state selected by the pid carried in the command
+          if (cmd.bufferAttribute.attrId >= 10) { // 10 matches the bufferAttributes array size in ProcessInfo
+            std::abort(); // out-of-range ids abort instead of indexing past the array
+          }
+
+          process.bufferAttributes[cmd.bufferAttribute.attrId] = // overwrites whatever was stored in this slot before
+              cmd.bufferAttribute;
+          break;
+        }
+
        default:
          util::unreachable("Unexpected command id %u\n", (unsigned)cmd.id);
        }
--- a/rpcsx-os/iodev/dce.cpp
+++ b/rpcsx-os/iodev/dce.cpp
@ -252,8 +252,6 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
                                  void *argp, orbis::Thread *thread) {
  auto device = static_cast<DceDevice *>(file->device.get());

-  std::lock_guard lock(device->mtx);
-
  if (request == 0xc0308203) {
    // returns:
    // PERM
@ -298,11 +296,11 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,

      FlipControlStatus flipStatus{};
      // TODO: lock bridge header
-      flipStatus.flipArg = rx::bridge.header->flipArg;
-      flipStatus.count = rx::bridge.header->flipCount;
+      flipStatus.flipArg = rx::bridge.header->flipArg[thread->tproc->vmId];
+      flipStatus.count = rx::bridge.header->flipCount[thread->tproc->vmId];
      flipStatus.processTime = 0; // TODO
      flipStatus.tsc = 0;         // TODO
-      flipStatus.currentBuffer = rx::bridge.header->flipBuffer;
+      flipStatus.currentBuffer = rx::bridge.header->flipBuffer[thread->tproc->vmId];
      flipStatus.flipPendingNum0 = 0; // TODO
      flipStatus.gcQueueNum = 0;      // TODO
      flipStatus.flipPendingNum1 = 0; // TODO
@ -332,8 +330,8 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
      *(std::uint64_t *)args->size = kDceControlMemorySize;  // size
    } else if (args->id == 31) {
      if ((std::uint64_t)args->ptr == 0xc) {
-        rx::bridge.header->bufferInUseAddress = args->size;
-      } else {
+        rx::bridge.header->bufferInUseAddress[thread->tproc->vmId] = args->size;
+      } else if ((std::uint64_t)args->ptr != 1) {
        ORBIS_LOG_ERROR("buffer in use", args->ptr, args->size);
        thread->where();
      }
@ -361,20 +359,8 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
    ORBIS_LOG_ERROR("dce: RegisterBuffer", args->canary, args->index,
                    args->address, args->address2);

-    if (args->index >= std::size(rx::bridge.header->buffers)) {
-      // TODO
-      ORBIS_LOG_FATAL("dce: out of buffers!", args->index);
-      return orbis::ErrorCode::NOMEM;
-    }
-
-    // TODO: lock bridge header
-    rx::bridge.header->buffers[args->index] = {
-        .width = device->bufferAttributes.width,
-        .height = device->bufferAttributes.height,
-        .pitch = device->bufferAttributes.pitch,
-        .address = args->address,
-        .pixelFormat = device->bufferAttributes.pixelFormat,
-        .tilingMode = device->bufferAttributes.tilingMode};
+    rx::bridge.sendRegisterBuffer(thread->tproc->pid, args->canary, args->index,
+                                  args->attrid, args->address, args->address2);
    return {};
  }

@ -387,11 +373,11 @@ static orbis::ErrorCode dce_ioctl(orbis::File *file, std::uint64_t request,
                    args->unk4_zero, args->unk5_zero, args->options,
                    args->reserved1, args->reserved2);

-    device->bufferAttributes.pixelFormat = args->pixelFormat;
-    device->bufferAttributes.tilingMode = args->tilingMode;
-    device->bufferAttributes.pitch = args->pitch;
-    device->bufferAttributes.width = args->width;
-    device->bufferAttributes.height = args->height;
+    rx::bridge.sendRegisterBufferAttribute(
+        thread->tproc->pid, args->attrid, args->submit, args->canary,
+        args->pixelFormat, args->tilingMode, args->pitch, args->width,
+        args->height);
+
    return {};
  }