gpu: reduce cpu usage on cache commands

2026-02-12 10:44:23 +01:00 · 2024-11-01 09:51:50 +03:00 · 2024-11-01 09:51:50 +03:00 · 4bccf990f3
parent 9558bb7335
commit 4bccf990f3
3 changed files with 42 additions and 11 deletions
--- a/rpcsx/gpu/Device.cpp
+++ b/rpcsx/gpu/Device.cpp
@ -210,7 +210,14 @@ Device::Device() : vkContext(createVkContext(this)) {

  cacheUpdateThread = std::jthread([this](const std::stop_token &stopToken) {
    auto &sched = graphicsPipes[0].scheduler;
+    std::uint32_t prevIdleValue = 0;
    while (!stopToken.stop_requested()) {
+      if (gpuCacheCommandIdle.wait(prevIdleValue) != std::errc{}) {
+        continue;
+      }
+
+      prevIdleValue = gpuCacheCommandIdle.load(std::memory_order::acquire);
+
      for (int vmId = 0; vmId < kMaxProcessCount; ++vmId) {
        auto page = gpuCacheCommand[vmId].load(std::memory_order::relaxed);
        if (page == 0) {
@ -996,11 +1003,16 @@ static void notifyPageChanges(Device *device, int vmId, std::uint32_t firstPage,
      (static_cast<std::uint64_t>(pageCount - 1) << 32) | firstPage;

  while (true) {
-    for (std::size_t i = 0; i < std::size(device->cacheCommands); ++i) {
+    // Try to claim one of the command slots that belong to this vmId.
+    // The bound must be the size of the inner row: std::size() applied to the
+    // whole two-dimensional array yields the number of processes instead of
+    // the number of slots per process, which would index past the row.
+    for (std::size_t i = 0; i < std::size(device->cpuCacheCommands[vmId]);
+         ++i) {
       std::uint64_t expCommand = 0;
-      if (device->cacheCommands[vmId][i].compare_exchange_strong(
-              expCommand, command, std::memory_order::acquire,
+      if (device->cpuCacheCommands[vmId][i].compare_exchange_strong(
+              expCommand, command, std::memory_order::release,
               std::memory_order::relaxed)) {
+        // Bump the per-vm counter and wake one waiter so the published
+        // command gets picked up.
+        device->cpuCacheCommandsIdle[vmId].fetch_add(
+            1, std::memory_order::release);
+        device->cpuCacheCommandsIdle[vmId].notify_one();
+
+        // Busy-wait until the slot is reset to zero, i.e. until the command
+        // has been consumed on the other side.
+        while (device->cpuCacheCommands[vmId][i].load(std::memory_order::acquire) != 0) {}
         return;
       }
     }
--- a/rpcsx/gpu/DeviceContext.hpp
+++ b/rpcsx/gpu/DeviceContext.hpp
@ -1,5 +1,6 @@
 #pragma once

+#include "orbis/utils/SharedAtomic.hpp"
 #include <atomic>
 #include <cstdint>

@ -66,10 +67,12 @@ enum {
 struct DeviceContext {
  static constexpr auto kMaxProcessCount = 6;

-  PadState kbPadState;
-  std::atomic<std::uint64_t> cacheCommands[kMaxProcessCount][4];
-  std::atomic<std::uint32_t> gpuCacheCommand[kMaxProcessCount];
-  std::atomic<std::uint8_t> *cachePages[kMaxProcessCount];
+  PadState kbPadState{};
+  std::atomic<std::uint64_t> cpuCacheCommands[kMaxProcessCount][4]{};
+  orbis::shared_atomic32 cpuCacheCommandsIdle[kMaxProcessCount]{};
+  orbis::shared_atomic32 gpuCacheCommand[kMaxProcessCount]{};
+  orbis::shared_atomic32 gpuCacheCommandIdle{};
+  std::atomic<std::uint8_t> *cachePages[kMaxProcessCount]{};

  volatile std::uint32_t flipBuffer[kMaxProcessCount];
  volatile std::uint64_t flipArg[kMaxProcessCount];
--- a/rpcsx/iodev/dce.cpp
+++ b/rpcsx/iodev/dce.cpp
@ -137,15 +137,26 @@ static void runBridge(int vmId) {
    auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
    auto &gpuCtx = gpu.getContext();
    std::vector<std::uint64_t> fetchedCommands;
-    fetchedCommands.reserve(std::size(gpuCtx.cacheCommands));
+    fetchedCommands.reserve(std::size(gpuCtx.cpuCacheCommands));
+
+    std::vector<std::atomic<std::uint64_t> *> fetchedAtomics;
+    std::uint32_t prevIdleValue = 0;

    while (true) {
-      for (auto &command : gpuCtx.cacheCommands) {
-        std::uint64_t value = command[vmId].load(std::memory_order::relaxed);
+      if (gpuCtx.cpuCacheCommandsIdle[vmId].wait(prevIdleValue) !=
+          std::errc{}) {
+        continue;
+      }
+
+      prevIdleValue =
+          gpuCtx.cpuCacheCommandsIdle[vmId].load(std::memory_order::acquire);
+
+      for (auto &command : gpuCtx.cpuCacheCommands[vmId]) {
+        std::uint64_t value = command.load(std::memory_order::relaxed);

        if (value != 0) {
          fetchedCommands.push_back(value);
-          command[vmId].store(0, std::memory_order::relaxed);
+          fetchedAtomics.push_back(&command);
        }
      }

@ -187,7 +198,12 @@ static void runBridge(int vmId) {
        }
      }

+      for (auto fetchedAtomic : fetchedAtomics) {
+        fetchedAtomic->store(0, std::memory_order::release);
+      }
+
      fetchedCommands.clear();
+      fetchedAtomics.clear();
    }
  }}.detach();
 }