gpu: reduce cpu usage on cache commands

This commit is contained in:
DH 2024-11-01 09:51:50 +03:00
parent 9558bb7335
commit 4bccf990f3
3 changed files with 42 additions and 11 deletions

View file

@ -210,7 +210,14 @@ Device::Device() : vkContext(createVkContext(this)) {
cacheUpdateThread = std::jthread([this](const std::stop_token &stopToken) {
auto &sched = graphicsPipes[0].scheduler;
std::uint32_t prevIdleValue = 0;
while (!stopToken.stop_requested()) {
if (gpuCacheCommandIdle.wait(prevIdleValue) != std::errc{}) {
continue;
}
prevIdleValue = gpuCacheCommandIdle.load(std::memory_order::acquire);
for (int vmId = 0; vmId < kMaxProcessCount; ++vmId) {
auto page = gpuCacheCommand[vmId].load(std::memory_order::relaxed);
if (page == 0) {
@ -996,11 +1003,16 @@ static void notifyPageChanges(Device *device, int vmId, std::uint32_t firstPage,
(static_cast<std::uint64_t>(pageCount - 1) << 32) | firstPage;
while (true) {
for (std::size_t i = 0; i < std::size(device->cacheCommands); ++i) {
for (std::size_t i = 0; i < std::size(device->cpuCacheCommands); ++i) {
std::uint64_t expCommand = 0;
if (device->cacheCommands[vmId][i].compare_exchange_strong(
expCommand, command, std::memory_order::acquire,
if (device->cpuCacheCommands[vmId][i].compare_exchange_strong(
expCommand, command, std::memory_order::release,
std::memory_order::relaxed)) {
device->cpuCacheCommandsIdle[vmId].fetch_add(
1, std::memory_order::release);
device->cpuCacheCommandsIdle[vmId].notify_one();
while (device->cpuCacheCommands[vmId][i].load(std::memory_order::acquire) != 0) {}
return;
}
}

View file

@ -1,5 +1,6 @@
#pragma once
#include "orbis/utils/SharedAtomic.hpp"
#include <atomic>
#include <cstdint>
@ -66,10 +67,12 @@ enum {
struct DeviceContext {
static constexpr auto kMaxProcessCount = 6;
PadState kbPadState;
std::atomic<std::uint64_t> cacheCommands[kMaxProcessCount][4];
std::atomic<std::uint32_t> gpuCacheCommand[kMaxProcessCount];
std::atomic<std::uint8_t> *cachePages[kMaxProcessCount];
PadState kbPadState{};
std::atomic<std::uint64_t> cpuCacheCommands[kMaxProcessCount][4]{};
orbis::shared_atomic32 cpuCacheCommandsIdle[kMaxProcessCount]{};
orbis::shared_atomic32 gpuCacheCommand[kMaxProcessCount]{};
orbis::shared_atomic32 gpuCacheCommandIdle{};
std::atomic<std::uint8_t> *cachePages[kMaxProcessCount]{};
volatile std::uint32_t flipBuffer[kMaxProcessCount];
volatile std::uint64_t flipArg[kMaxProcessCount];

View file

@ -137,15 +137,26 @@ static void runBridge(int vmId) {
auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
auto &gpuCtx = gpu.getContext();
std::vector<std::uint64_t> fetchedCommands;
fetchedCommands.reserve(std::size(gpuCtx.cacheCommands));
fetchedCommands.reserve(std::size(gpuCtx.cpuCacheCommands));
std::vector<std::atomic<std::uint64_t> *> fetchedAtomics;
std::uint32_t prevIdleValue = 0;
while (true) {
for (auto &command : gpuCtx.cacheCommands) {
std::uint64_t value = command[vmId].load(std::memory_order::relaxed);
if (gpuCtx.cpuCacheCommandsIdle[vmId].wait(prevIdleValue) !=
std::errc{}) {
continue;
}
prevIdleValue =
gpuCtx.cpuCacheCommandsIdle[vmId].load(std::memory_order::acquire);
for (auto &command : gpuCtx.cpuCacheCommands[vmId]) {
std::uint64_t value = command.load(std::memory_order::relaxed);
if (value != 0) {
fetchedCommands.push_back(value);
command[vmId].store(0, std::memory_order::relaxed);
fetchedAtomics.push_back(&command);
}
}
@ -187,7 +198,12 @@ static void runBridge(int vmId) {
}
}
for (auto fetchedAtomic : fetchedAtomics) {
fetchedAtomic->store(0, std::memory_order::release);
}
fetchedCommands.clear();
fetchedAtomics.clear();
}
}}.detach();
}