From 4fe857485c5508b49b0b9d600477c4c6f2240b71 Mon Sep 17 00:00:00 2001 From: DH Date: Tue, 15 Oct 2024 18:35:17 +0300 Subject: [PATCH] gpu: implement compute queue --- orbis-kernel/include/orbis/KernelContext.hpp | 1 + rpcsx/gpu/Device.cpp | 8 +- rpcsx/gpu/Device.hpp | 2 +- rpcsx/gpu/DeviceCtl.cpp | 44 + rpcsx/gpu/DeviceCtl.hpp | 9 + rpcsx/gpu/Pipe.cpp | 902 ++++++++++++------- rpcsx/gpu/Pipe.hpp | 161 ++-- rpcsx/gpu/Registers.hpp | 29 +- rpcsx/iodev/dce.cpp | 44 +- rpcsx/iodev/dce.hpp | 25 + rpcsx/iodev/gc.cpp | 89 +- rpcsx/main.cpp | 5 +- 12 files changed, 836 insertions(+), 483 deletions(-) create mode 100644 rpcsx/iodev/dce.hpp diff --git a/orbis-kernel/include/orbis/KernelContext.hpp b/orbis-kernel/include/orbis/KernelContext.hpp index b6b80689f..76d5cdc01 100644 --- a/orbis-kernel/include/orbis/KernelContext.hpp +++ b/orbis-kernel/include/orbis/KernelContext.hpp @@ -181,6 +181,7 @@ public: Ref blockpoolDevice; shared_mutex gpuDeviceMtx; Ref gpuDevice; + Ref dceDevice; uint sdkVersion{}; uint fwSdkVersion{}; uint safeMode{}; diff --git a/rpcsx/gpu/Device.cpp b/rpcsx/gpu/Device.cpp index 64af24036..8efee25c5 100644 --- a/rpcsx/gpu/Device.cpp +++ b/rpcsx/gpu/Device.cpp @@ -236,7 +236,7 @@ Device::Device() : vkContext(createVkContext(this)) { for (int i = 0; i < kGfxPipeCount; ++i) { graphicsPipes[i].setDeQueue( - Queue{ + Ring{ .base = mainGfxRings[i], .size = sizeof(mainGfxRings[i]) / sizeof(mainGfxRings[i][0]), .rptr = mainGfxRings[i], @@ -474,7 +474,7 @@ void Device::start() { } } -void Device::submitCommand(Queue &ring, +void Device::submitCommand(Ring &ring, std::span command) { std::scoped_lock lock(writeCommandMtx); if (ring.wptr + command.size() > ring.base + ring.size) { @@ -599,12 +599,12 @@ void Device::onCommandBuffer(std::uint32_t pid, int cmdHeader, auto op = rx::getBits(cmdHeader, 15, 8); if (op == gnm::IT_INDIRECT_BUFFER_CNST) { - graphicsPipes[0].setCeQueue(Queue::createFromRange( + graphicsPipes[0].setCeQueue(Ring::createFromRange( process.vmId, memory.getPointer(address), size / sizeof(std::uint32_t))); } else if (op == gnm::IT_INDIRECT_BUFFER) { graphicsPipes[0].setDeQueue( - Queue::createFromRange(process.vmId, + Ring::createFromRange(process.vmId, memory.getPointer(address), size / sizeof(std::uint32_t)), 1); diff --git a/rpcsx/gpu/Device.hpp b/rpcsx/gpu/Device.hpp index 89417c6ae..7bc4979d0 100644 --- a/rpcsx/gpu/Device.hpp +++ b/rpcsx/gpu/Device.hpp @@ -112,7 +112,7 @@ struct Device : orbis::RcBase, DeviceContext { return caches[vmId].createComputeTag(scheduler); } - void submitCommand(Queue &ring, std::span command); + void submitCommand(Ring &ring, std::span command); void submitGfxCommand(int gfxPipe, std::span command); void mapProcess(std::uint32_t pid, int vmId); diff --git a/rpcsx/gpu/DeviceCtl.cpp b/rpcsx/gpu/DeviceCtl.cpp index 26f527370..1c3c9de6a 100644 --- a/rpcsx/gpu/DeviceCtl.cpp +++ b/rpcsx/gpu/DeviceCtl.cpp @@ -117,5 +117,49 @@ void DeviceCtl::registerBufferAttribute(std::uint32_t pid, process.bufferAttributes[attr.attrId] = attr; } +void DeviceCtl::mapComputeQueue(int vmId, std::uint32_t meId, + std::uint32_t pipeId, std::uint32_t queueId, + std::uint32_t vqueueId, + orbis::uint64_t ringBaseAddress, + orbis::uint64_t readPtrAddress, + orbis::uint64_t doorbell, + orbis::uint64_t ringSize) { + if (meId != 1) { + rx::die("unexpected ME %d", meId); + } + + auto &pipe = mDevice->computePipes[pipeId]; + auto lock = pipe.lockQueue(queueId); + auto memory = RemoteMemory{vmId}; + auto base = memory.getPointer(ringBaseAddress); + pipe.mapQueue(queueId, + Ring{ + .vmId = vmId, + .indirectLevel = 0, + .doorbell = memory.getPointer(doorbell), + .base = base, + .size = ringSize, + .rptr = base, + .wptr = base, + .rptrReportLocation = + memory.getPointer(readPtrAddress), + }, + lock); + + auto config = std::bit_cast(doorbell); + config->state = 1; +} + +void DeviceCtl::submitComputeQueue(std::uint32_t meId, std::uint32_t pipeId, + std::uint32_t queueId, + std::uint64_t offset) { + if (meId != 1) { + rx::die("unexpected ME %d", meId); + } + + auto &pipe = mDevice->computePipes[pipeId]; + pipe.submit(queueId, offset); +} + void DeviceCtl::start() { mDevice->start(); } void DeviceCtl::waitForIdle() { mDevice->waitForIdle(); } diff --git a/rpcsx/gpu/DeviceCtl.hpp b/rpcsx/gpu/DeviceCtl.hpp index 7a370a4cf..d8ea308d6 100644 --- a/rpcsx/gpu/DeviceCtl.hpp +++ b/rpcsx/gpu/DeviceCtl.hpp @@ -1,6 +1,7 @@ #pragma once #include "DeviceContext.hpp" +#include "orbis-config.hpp" #include "orbis/utils/Rc.hpp" #include #include @@ -40,6 +41,14 @@ public: std::uint64_t address, std::uint64_t size, int prot); void registerBuffer(std::uint32_t pid, Buffer buffer); void registerBufferAttribute(std::uint32_t pid, BufferAttribute attr); + + void mapComputeQueue(int vmId, std::uint32_t meId, std::uint32_t pipeId, + std::uint32_t queueId, std::uint32_t vqueueId, + orbis::uint64_t ringBaseAddress, + orbis::uint64_t readPtrAddress, orbis::uint64_t doorbell, + orbis::uint64_t ringSize); + void submitComputeQueue(std::uint32_t meId, std::uint32_t pipeId, + std::uint32_t queueId, std::uint64_t offset); void start(); void waitForIdle(); diff --git a/rpcsx/gpu/Pipe.cpp b/rpcsx/gpu/Pipe.cpp index 5c625c9bf..fdf5b2f1d 100644 --- a/rpcsx/gpu/Pipe.cpp +++ b/rpcsx/gpu/Pipe.cpp @@ -8,6 +8,7 @@ #include "vk.hpp" #include #include +#include #include #include #include @@ -81,77 +82,336 @@ static bool compare(int cmpFn, std::uint32_t poll, std::uint32_t mask, return false; } -ComputePipe::ComputePipe(int index) : scheduler(createComputeScheduler(index)) { +ComputePipe::ComputePipe(int index) + : scheduler(createComputeScheduler(index)), index(index) { for (auto &handler : commandHandlers) { handler = &ComputePipe::unknownPacket; } commandHandlers[gnm::IT_NOP] = &ComputePipe::handleNop; + commandHandlers[gnm::IT_SET_SH_REG] = &ComputePipe::setShReg; + commandHandlers[gnm::IT_DISPATCH_DIRECT] = &ComputePipe::dispatchDirect; + commandHandlers[gnm::IT_DISPATCH_INDIRECT] = &ComputePipe::dispatchIndirect; + commandHandlers[gnm::IT_RELEASE_MEM] = &ComputePipe::releaseMem; + commandHandlers[gnm::IT_WAIT_REG_MEM] = &ComputePipe::waitRegMem; + commandHandlers[gnm::IT_WRITE_DATA] = &ComputePipe::writeData; } bool ComputePipe::processAllRings() { bool allProcessed = true; - for (auto &ring : queues) { - processRing(ring); + for (auto &queue : queues) { + std::lock_guard lock(queueMtx[&queue - queues]); - if (ring.rptr != ring.wptr) { - allProcessed = false; - break; + for (auto &ring : queue) { + if (!processRing(ring)) { + allProcessed = false; + } } } return allProcessed; } -void ComputePipe::processRing(Queue &queue) { - while (queue.rptr != queue.wptr) { - if (queue.rptr >= queue.base + queue.size) { - queue.rptr = queue.base; - } - - auto header = *queue.rptr; - auto type = rx::getBits(header, 31, 30); - - if (type == 3) { - auto op = rx::getBits(header, 15, 8); - auto len = rx::getBits(header, 29, 16) + 2; - - // std::fprintf(stderr, "queue %d: %s\n", queue.indirectLevel, - // gnm::pm4OpcodeToString(op)); - - if (op == gnm::IT_COND_EXEC) { - rx::die("unimplemented COND_EXEC"); - } - - auto handler = commandHandlers[op]; - if (!(this->*handler)(queue)) { - return; - } - - queue.rptr += len; - continue; - } - - if (type == 2) { - ++queue.rptr; - continue; - } - - rx::die("unexpected pm4 packet type %u", type); +bool ComputePipe::processRing(Ring &ring) { + if (ring.size == 0) { + return true; } -} -bool ComputePipe::unknownPacket(Queue &queue) { - auto op = rx::getBits(queue.rptr[0], 15, 8); + while (true) { + if (ring.rptrReportLocation != nullptr) { + // FIXME: verify + ring.rptr = ring.base + *ring.rptrReportLocation; + } - rx::die("unimplemented compute pm4 packet: %s, queue %u\n", - gnm::pm4OpcodeToString(op), queue.indirectLevel); + while (ring.rptr != ring.wptr) { + if (ring.rptr >= ring.base + ring.size) { + ring.rptr = ring.base; + continue; + } + + auto header = *ring.rptr; + auto type = rx::getBits(header, 31, 30); + + if (type == 3) { + auto op = rx::getBits(header, 15, 8); + auto len = rx::getBits(header, 29, 16) + 2; + + // std::fprintf(stderr, "queue %d: %s\n", ring.indirectLevel, + // gnm::pm4OpcodeToString(op)); + + if (op == gnm::IT_COND_EXEC) { + rx::die("unimplemented COND_EXEC"); + } + + auto handler = commandHandlers[op]; + if (!(this->*handler)(ring)) { + if (ring.rptrReportLocation != nullptr) { + *ring.rptrReportLocation = ring.rptr - ring.base; + } + return false; + } + + ring.rptr += len; + continue; + } + + if (type == 2) { + ++ring.rptr; + continue; + } + + rx::die("unexpected pm4 packet type %u", type); + } + + if (ring.rptrReportLocation != nullptr) { + *ring.rptrReportLocation = ring.rptr - ring.base; + } + } return true; } -bool ComputePipe::handleNop(Queue &queue) { return true; } +void ComputePipe::mapQueue(int queueId, Ring ring, + std::unique_lock &lock) { + if (ring.indirectLevel < 0 || ring.indirectLevel > 1) { + rx::die("unexpected compute ring indirect level %d", ring.indirectLevel); + } + + if (ring.indirectLevel == 0) { + waitForIdle(queueId, lock); + } + + std::println("mapQueue: {}, {}, {}", (void *)ring.base, (void *)ring.wptr, + ring.size); + + queues[1 - ring.indirectLevel][queueId] = ring; +} + +void ComputePipe::waitForIdle(int queueId, + std::unique_lock &lock) { + auto &ring = queues[1][queueId]; + + while (true) { + if (ring.size == 0) { + return; + } + + if (ring.rptr == ring.wptr) { + return; + } + + lock.unlock(); + std::this_thread::sleep_for(std::chrono::microseconds(10)); + lock.lock(); + } +} + +void ComputePipe::submit(int queueId, std::uint32_t offset) { + auto &ring = queues[1][queueId]; + ring.wptr = ring.base + offset; +} + +bool ComputePipe::setShReg(Ring &ring) { + auto len = rx::getBits(ring.rptr[0], 29, 16); + auto offset = ring.rptr[1] & 0xffff; + auto index = ring.rptr[1] >> 26; + auto data = ring.rptr + 2; + + if (Registers::ShaderConfig::kMmioOffset + offset < + Registers::ComputeConfig::kMmioOffset) { + rx::die( + "unexpected compute pipe offset %x %s", offset, + gnm::mmio::registerName(Registers::ShaderConfig::kMmioOffset + offset)); + } + + offset -= Registers::ComputeConfig::kMmioOffset - + Registers::ShaderConfig::kMmioOffset; + + rx::dieIf( + (offset + len) * sizeof(std::uint32_t) > sizeof(Registers::ComputeConfig), + "out of compute regs, offset: %x, count %u, %s\n", offset, len, + gnm::mmio::registerName(Registers::ShaderConfig::kMmioOffset + offset)); + + for (std::size_t i = 0; i < len; ++i) { + std::fprintf(stderr, "writing to %s value %x\n", + gnm::mmio::registerName(Registers::ShaderConfig::kMmioOffset + + offset + i), + data[i]); + } + + std::memcpy(ring.doorbell + offset, data, sizeof(std::uint32_t) * len); + + return true; +} + +bool ComputePipe::dispatchDirect(Ring &ring) { + auto config = std::bit_cast(ring.doorbell); + auto dimX = ring.rptr[1]; + auto dimY = ring.rptr[2]; + auto dimZ = ring.rptr[3]; + auto dispatchInitiator = ring.rptr[4]; + config->computeDispatchInitiator = dispatchInitiator; + + amdgpu::dispatch(device->caches[ring.vmId], scheduler, *config, dimX, dimY, + dimZ); + return true; +} + +bool ComputePipe::dispatchIndirect(Ring &ring) { + auto config = std::bit_cast(ring.doorbell); + auto offset = ring.rptr[1]; + auto dispatchInitiator = ring.rptr[2]; + + config->computeDispatchInitiator = dispatchInitiator; + auto buffer = RemoteMemory{ring.vmId}.getPointer( + drawIndexIndirPatchBase + offset); + + auto dimX = buffer[0]; + auto dimY = buffer[1]; + auto dimZ = buffer[2]; + + amdgpu::dispatch(device->caches[ring.vmId], scheduler, *config, dimX, dimY, + dimZ); + return true; +} + +bool ComputePipe::releaseMem(Ring &ring) { + auto eventCntl = ring.rptr[1]; + auto dataCntl = ring.rptr[2]; + auto addressLo = ring.rptr[3] & ~3; + auto addressHi = ring.rptr[4] & ((1 << 16) - 1); + auto dataLo = ring.rptr[5]; + auto dataHi = ring.rptr[6]; + + auto eventIndex = rx::getBits(eventCntl, 11, 8); + auto eventType = rx::getBits(eventCntl, 5, 0); + auto dataSel = rx::getBits(dataCntl, 31, 29); + auto intSel = rx::getBits(dataCntl, 25, 24); + + auto address = addressLo | (static_cast(addressHi) << 32); + auto pointer = RemoteMemory{ring.vmId}.getPointer(address); + + switch (dataSel) { + case 0: // none + break; + case 1: // 32 bit, low + *reinterpret_cast(pointer) = dataLo; + break; + case 2: // 64 bit + *pointer = dataLo | (static_cast(dataHi) << 32); + break; + case 3: // 64 bit, global GPU clock + *pointer = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + break; + case 4: // 64 bit, perf counter + *pointer = std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); + break; + + default: + rx::die("unimplemented event release mem data %#x", dataSel); + } + + if (intSel) { + orbis::g_context.deviceEventEmitter->emit(orbis::kEvFiltGraphicsCore, 0, + kGcEventCompute0RelMem + index); + } + + return true; +} + +bool ComputePipe::waitRegMem(Ring &ring) { + auto engine = rx::getBit(ring.rptr[1], 8); + auto memSpace = rx::getBit(ring.rptr[1], 4); + auto function = rx::getBits(ring.rptr[1], 2, 0); + auto pollAddressLo = ring.rptr[2]; + auto pollAddressHi = ring.rptr[3] & ((1 << 16) - 1); + auto reference = ring.rptr[4]; + auto mask = ring.rptr[5]; + auto pollInterval = ring.rptr[6]; + + std::uint32_t pollData; + + if (memSpace == 0) { + pollData = *getMmRegister(ring, pollAddressLo & ((1 << 16) - 1)); + } else { + auto pollAddress = (pollAddressLo & ~3) | + (static_cast(pollAddressHi) << 32); + pollData = *RemoteMemory{ring.vmId}.getPointer(pollAddress); + } + + return compare(function, pollData, mask, reference); +} + +bool ComputePipe::writeData(Ring &ring) { + auto len = rx::getBits(ring.rptr[0], 29, 16) - 1; + auto control = ring.rptr[1]; + auto dstAddressLo = ring.rptr[2]; + auto dstAddressHi = ring.rptr[3]; + auto data = ring.rptr + 4; + + auto engineSel = rx::getBits(control, 31, 30); + auto wrConfirm = rx::getBit(control, 20); + auto wrOneAddress = rx::getBit(control, 16); + auto dstSel = rx::getBits(control, 11, 8); + + std::uint32_t *dstPointer = nullptr; + + switch (dstSel) { + case 0: // memory mapped register + dstPointer = getMmRegister(ring, dstAddressLo & ((1 << 16) - 1)); + break; + + case 1: // memory sync + case 2: // TC L2 + case 5: { // memory async + auto address = + (dstAddressLo & ~3) | (static_cast(dstAddressHi) << 32); + dstPointer = RemoteMemory{ring.vmId}.getPointer(address); + break; + } + + default: + rx::die("unimplemented write data, dst sel = %#x", dstSel); + } + + if (wrOneAddress) { + for (std::uint32_t i = 0; i < len; ++i) { + *dstPointer = data[i]; + } + } else { + std::memcpy(dstPointer, data, len * sizeof(std::uint32_t)); + } + + return true; +} + +bool ComputePipe::unknownPacket(Ring &ring) { + auto op = rx::getBits(ring.rptr[0], 15, 8); + + rx::die("unimplemented compute pm4 packet: %s, indirect level %u\n", + gnm::pm4OpcodeToString(op), ring.indirectLevel); + + return true; +} + +bool ComputePipe::handleNop(Ring &ring) { return true; } + +std::uint32_t *ComputePipe::getMmRegister(Ring &ring, std::uint32_t dwAddress) { + if (dwAddress >= Registers::ComputeConfig::kMmioOffset && + dwAddress < + Registers::ComputeConfig::kMmioOffset + + sizeof(Registers::ComputeConfig) / sizeof(std::uint32_t)) { + return ring.doorbell + (dwAddress - Registers::ComputeConfig::kMmioOffset); + } + + rx::die("unexpected memory mapped compute register address %x, %s", dwAddress, + gnm::mmio::registerName(dwAddress)); +} GraphicsPipe::GraphicsPipe(int index) : scheduler(createGfxScheduler(index)) { for (auto &processorHandlers : commandHandlers) { @@ -263,15 +523,15 @@ GraphicsPipe::GraphicsPipe(int index) : scheduler(createGfxScheduler(index)) { mainHandlers[IT_FLIP] = &GraphicsPipe::flip; } -void GraphicsPipe::setCeQueue(Queue queue) { - queue.indirectLevel = -1; - ceQueue = queue; +void GraphicsPipe::setCeQueue(Ring ring) { + ring.indirectLevel = -1; + ceQueue = ring; } -void GraphicsPipe::setDeQueue(Queue queue, int ring) { - rx::dieIf(ring > 2, "out of indirect gfx rings, %u", ring); - queue.indirectLevel = ring; - deQueues[2 - ring] = queue; +void GraphicsPipe::setDeQueue(Ring ring, int indirectLevel) { + rx::dieIf(indirectLevel > 2, "out of indirect gfx rings, %u", indirectLevel); + ring.indirectLevel = indirectLevel; + deQueues[2 - indirectLevel] = ring; } std::uint32_t *GraphicsPipe::getMmRegister(std::uint32_t dwAddress) { @@ -318,14 +578,14 @@ bool GraphicsPipe::processAllRings() { } } - for (auto &queue : deQueues) { - if (queue.rptr == queue.wptr) { + for (auto &ring : deQueues) { + if (ring.rptr == ring.wptr) { continue; } - processRing(queue); + processRing(ring); - if (queue.rptr != queue.wptr) { + if (ring.rptr != ring.wptr) { allProcessed = false; break; } @@ -334,21 +594,21 @@ bool GraphicsPipe::processAllRings() { return allProcessed; } -void GraphicsPipe::processRing(Queue &queue) { +void GraphicsPipe::processRing(Ring &ring) { int cp; - if (queue.indirectLevel < 0) { + if (ring.indirectLevel < 0) { cp = 0; } else { - cp = queue.indirectLevel + 1; + cp = ring.indirectLevel + 1; } - while (queue.rptr != queue.wptr) { - if (queue.rptr >= queue.base + queue.size) { - queue.rptr = queue.base; + while (ring.rptr != ring.wptr) { + if (ring.rptr >= ring.base + ring.size) { + ring.rptr = ring.base; continue; } - auto header = *queue.rptr; + auto header = *ring.rptr; auto type = rx::getBits(header, 31, 30); if (type == 3) { @@ -356,9 +616,9 @@ void GraphicsPipe::processRing(Queue &queue) { auto len = rx::getBits(header, 29, 16) + 2; // if (auto str = gnm::pm4OpcodeToString(op)) { - // std::println(stderr, "queue {}: {}", queue.indirectLevel, str); + // std::println(stderr, "queue {}: {}", ring.indirectLevel, str); // } else { - // std::println(stderr, "queue {}: {:x}", queue.indirectLevel, op); + // std::println(stderr, "queue {}: {:x}", ring.indirectLevel, op); // } if (op == gnm::IT_COND_EXEC) { @@ -366,11 +626,11 @@ void GraphicsPipe::processRing(Queue &queue) { } auto handler = commandHandlers[cp][op]; - if (!(this->*handler)(queue)) { + if (!(this->*handler)(ring)) { return; } - queue.rptr += len; + ring.rptr += len; if (op == gnm::IT_INDIRECT_BUFFER || op == gnm::IT_INDIRECT_BUFFER_CNST) { break; @@ -380,34 +640,33 @@ void GraphicsPipe::processRing(Queue &queue) { } if (type == 2) { - ++queue.rptr; + ++ring.rptr; continue; } rx::die("unexpected pm4 packet type %u, ring %u, header %u, rptr %p, wptr " "%p, base %p", - type, queue.indirectLevel, header, queue.rptr, queue.wptr, - queue.base); + type, ring.indirectLevel, header, ring.rptr, ring.wptr, ring.base); } } -bool GraphicsPipe::handleNop(Queue &queue) { return true; } +bool GraphicsPipe::handleNop(Ring &ring) { return true; } -bool GraphicsPipe::setBase(Queue &queue) { - auto baseIndex = queue.rptr[1] & 0xf; +bool GraphicsPipe::setBase(Ring &ring) { + auto baseIndex = ring.rptr[1] & 0xf; switch (baseIndex) { case 0: { - auto address0 = queue.rptr[2] & ~3; - auto address1 = queue.rptr[3] & ((1 << 16) - 1); + auto address0 = ring.rptr[2] & ~3; + auto address1 = ring.rptr[3] & ((1 << 16) - 1); displayListPatchBase = address0 | (static_cast(address1) << 32); break; } case 1: { - auto address0 = queue.rptr[2] & ~3; - auto address1 = queue.rptr[3] & ((1 << 16) - 1); + auto address0 = ring.rptr[2] & ~3; + auto address1 = ring.rptr[3] & ((1 << 16) - 1); drawIndexIndirPatchBase = address0 | (static_cast(address1) << 32); @@ -415,16 +674,16 @@ bool GraphicsPipe::setBase(Queue &queue) { } case 2: { - auto cs1Index = queue.rptr[2] & ((1 << 16) - 1); - auto cs2Index = queue.rptr[3] & ((1 << 16) - 1); + auto cs1Index = ring.rptr[2] & ((1 << 16) - 1); + auto cs2Index = ring.rptr[3] & ((1 << 16) - 1); gdsPartitionBases[0] = cs1Index; gdsPartitionBases[1] = cs2Index; break; } case 3: { - auto cs1Index = queue.rptr[2] & ((1 << 16) - 1); - auto cs2Index = queue.rptr[3] & ((1 << 16) - 1); + auto cs1Index = ring.rptr[2] & ((1 << 16) - 1); + auto cs2Index = ring.rptr[3] & ((1 << 16) - 1); cePartitionBases[0] = cs1Index; cePartitionBases[1] = cs2Index; break; @@ -437,7 +696,7 @@ bool GraphicsPipe::setBase(Queue &queue) { return true; } -bool GraphicsPipe::clearState(Queue &queue) { +bool GraphicsPipe::clearState(Ring &ring) { auto paScClipRectRule = context.paScClipRectRule.value; auto cbTargetMask = context.cbTargetMask.raw; auto cbShaderMask = context.cbShaderMask.raw; @@ -460,15 +719,15 @@ bool GraphicsPipe::clearState(Queue &queue) { return true; } -bool GraphicsPipe::contextControl(Queue &queue) { return true; } -bool GraphicsPipe::acquireMem(Queue &queue) { return true; } -bool GraphicsPipe::releaseMem(Queue &queue) { - auto eventCntl = queue.rptr[1]; - auto dataCntl = queue.rptr[2]; - auto addressLo = queue.rptr[3] & ~3; - auto addressHi = queue.rptr[3] & ~3; - auto dataLo = queue.rptr[4]; - auto dataHi = queue.rptr[5]; +bool GraphicsPipe::contextControl(Ring &ring) { return true; } +bool GraphicsPipe::acquireMem(Ring &ring) { return true; } +bool GraphicsPipe::releaseMem(Ring &ring) { + auto eventCntl = ring.rptr[1]; + auto dataCntl = ring.rptr[2]; + auto addressLo = ring.rptr[3] & ~3; + auto addressHi = ring.rptr[4] & ((1 << 16) - 1); + auto dataLo = ring.rptr[5]; + auto dataHi = ring.rptr[6]; auto eventIndex = rx::getBits(eventCntl, 11, 8); auto eventType = rx::getBits(eventCntl, 5, 0); @@ -476,7 +735,7 @@ bool GraphicsPipe::releaseMem(Queue &queue) { auto intSel = rx::getBits(dataCntl, 25, 24); auto address = addressLo | (static_cast(addressHi) << 32); - auto pointer = RemoteMemory{queue.vmId}.getPointer(address); + auto pointer = RemoteMemory{ring.vmId}.getPointer(address); context.vgtEventInitiator = eventType; @@ -507,43 +766,43 @@ bool GraphicsPipe::releaseMem(Queue &queue) { return true; } -bool GraphicsPipe::drawPreamble(Queue &queue) { return true; } +bool GraphicsPipe::drawPreamble(Ring &ring) { return true; } -bool GraphicsPipe::indexBufferSize(Queue &queue) { - vgtIndexBufferSize = queue.rptr[1]; +bool GraphicsPipe::indexBufferSize(Ring &ring) { + vgtIndexBufferSize = ring.rptr[1]; return true; } -bool GraphicsPipe::dispatchDirect(Queue &queue) { - auto dimX = queue.rptr[1]; - auto dimY = queue.rptr[2]; - auto dimZ = queue.rptr[3]; - auto dispatchInitiator = queue.rptr[4]; +bool GraphicsPipe::dispatchDirect(Ring &ring) { + auto dimX = ring.rptr[1]; + auto dimY = ring.rptr[2]; + auto dimZ = ring.rptr[3]; + auto dispatchInitiator = ring.rptr[4]; sh.compute.computeDispatchInitiator = dispatchInitiator; - amdgpu::dispatch(device->caches[queue.vmId], scheduler, sh.compute, dimX, - dimY, dimZ); + amdgpu::dispatch(device->caches[ring.vmId], scheduler, sh.compute, dimX, dimY, + dimZ); return true; } -bool GraphicsPipe::dispatchIndirect(Queue &queue) { - auto offset = queue.rptr[1]; - auto dispatchInitiator = queue.rptr[2]; +bool GraphicsPipe::dispatchIndirect(Ring &ring) { + auto offset = ring.rptr[1]; + auto dispatchInitiator = ring.rptr[2]; sh.compute.computeDispatchInitiator = dispatchInitiator; - auto buffer = RemoteMemory{queue.vmId}.getPointer( + auto buffer = RemoteMemory{ring.vmId}.getPointer( drawIndexIndirPatchBase + offset); auto dimX = buffer[0]; auto dimY = buffer[1]; auto dimZ = buffer[2]; - amdgpu::dispatch(device->caches[queue.vmId], scheduler, sh.compute, dimX, - dimY, dimZ); + amdgpu::dispatch(device->caches[ring.vmId], scheduler, sh.compute, dimX, dimY, + dimZ); return true; } -bool GraphicsPipe::setPredication(Queue &queue) { - auto startAddressLo = queue.rptr[1] & ~0xf; - auto predProperties = queue.rptr[2]; +bool GraphicsPipe::setPredication(Ring &ring) { + auto startAddressLo = ring.rptr[1] & ~0xf; + auto predProperties = ring.rptr[2]; auto startAddressHi = rx::getBits(predProperties, 15, 0); auto predBool = rx::getBit(predProperties, 8); @@ -562,15 +821,15 @@ bool GraphicsPipe::setPredication(Queue &queue) { return true; } -bool GraphicsPipe::drawIndirect(Queue &queue) { - auto dataOffset = queue.rptr[1]; - auto baseVtxLoc = queue.rptr[2] & ((1 << 16) - 1); - auto startInstLoc = queue.rptr[3] & ((1 << 16) - 1); - auto drawInitiator = queue.rptr[4]; +bool GraphicsPipe::drawIndirect(Ring &ring) { + auto dataOffset = ring.rptr[1]; + auto baseVtxLoc = ring.rptr[2] & ((1 << 16) - 1); + auto startInstLoc = ring.rptr[3] & ((1 << 16) - 1); + auto drawInitiator = ring.rptr[4]; context.vgtDrawInitiator = drawInitiator; - auto buffer = RemoteMemory{queue.vmId}.getPointer( + auto buffer = RemoteMemory{ring.vmId}.getPointer( drawIndexIndirPatchBase + dataOffset); std::uint32_t vertexCountPerInstance = buffer[0]; @@ -578,16 +837,16 @@ bool GraphicsPipe::drawIndirect(Queue &queue) { std::uint32_t startVertexLocation = buffer[2]; std::uint32_t startInstanceLocation = buffer[3]; - draw(*this, queue.vmId, startVertexLocation, vertexCountPerInstance, + draw(*this, ring.vmId, startVertexLocation, vertexCountPerInstance, startInstanceLocation, instanceCount, 0, 0, 0); return true; } -bool GraphicsPipe::drawIndexIndirect(Queue &queue) { - auto dataOffset = queue.rptr[1]; - auto baseVtxLoc = queue.rptr[2] & ((1 << 16) - 1); - auto drawInitiator = queue.rptr[3]; +bool GraphicsPipe::drawIndexIndirect(Ring &ring) { + auto dataOffset = ring.rptr[1]; + auto baseVtxLoc = ring.rptr[2] & ((1 << 16) - 1); + auto drawInitiator = ring.rptr[3]; - auto buffer = RemoteMemory{queue.vmId}.getPointer( + auto buffer = RemoteMemory{ring.vmId}.getPointer( drawIndexIndirPatchBase + dataOffset); context.vgtDrawInitiator = drawInitiator; @@ -598,24 +857,24 @@ bool GraphicsPipe::drawIndexIndirect(Queue &queue) { std::uint32_t baseVertexLocation = buffer[3]; std::uint32_t startInstanceLocation = buffer[4]; - draw(*this, queue.vmId, baseVertexLocation, indexCountPerInstance, + draw(*this, ring.vmId, baseVertexLocation, indexCountPerInstance, startInstanceLocation, instanceCount, vgtIndexBase, startIndexLocation, indexCountPerInstance); return true; } -bool GraphicsPipe::indexBase(Queue &queue) { - auto addressLo = queue.rptr[1] & ~1; - auto addressHi = queue.rptr[2] & ((1 << 16) - 1); +bool GraphicsPipe::indexBase(Ring &ring) { + auto addressLo = ring.rptr[1] & ~1; + auto addressHi = ring.rptr[2] & ((1 << 16) - 1); auto address = addressLo | (static_cast(addressHi) << 32); vgtIndexBase = address; return true; } -bool GraphicsPipe::drawIndex2(Queue &queue) { - auto maxSize = queue.rptr[1]; - auto indexBaseLo = queue.rptr[2] & ~1; - auto indexBaseHi = queue.rptr[3] & ((1 << 16) - 1); - auto indexCount = queue.rptr[4]; - auto drawInitiator = queue.rptr[5]; +bool GraphicsPipe::drawIndex2(Ring &ring) { + auto maxSize = ring.rptr[1]; + auto indexBaseLo = ring.rptr[2] & ~1; + auto indexBaseHi = ring.rptr[3] & ((1 << 16) - 1); + auto indexCount = ring.rptr[4]; + auto drawInitiator = ring.rptr[5]; context.vgtDrawInitiator = drawInitiator; uConfig.vgtNumIndices = indexCount; @@ -623,32 +882,32 @@ bool GraphicsPipe::drawIndex2(Queue &queue) { auto indexBase = indexBaseLo | (static_cast(indexBaseHi) << 32); - draw(*this, queue.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, indexBase, + draw(*this, ring.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, indexBase, 0, maxSize); return true; } -bool GraphicsPipe::indexType(Queue &queue) { - uConfig.vgtIndexType = static_cast(queue.rptr[1] & 1); +bool GraphicsPipe::indexType(Ring &ring) { + uConfig.vgtIndexType = static_cast(ring.rptr[1] & 1); return true; } -bool GraphicsPipe::drawIndexAuto(Queue &queue) { - auto indexCount = queue.rptr[1]; - auto drawInitiator = queue.rptr[2]; +bool GraphicsPipe::drawIndexAuto(Ring &ring) { + auto indexCount = ring.rptr[1]; + auto drawInitiator = ring.rptr[2]; uConfig.vgtNumIndices = indexCount; context.vgtDrawInitiator = drawInitiator; - draw(*this, queue.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, 0, 0, 0); + draw(*this, ring.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, 0, 0, 0); return true; } -bool GraphicsPipe::numInstances(Queue &queue) { - uConfig.vgtNumInstances = std::max(queue.rptr[1], 1u); +bool GraphicsPipe::numInstances(Ring &ring) { + uConfig.vgtNumInstances = std::max(ring.rptr[1], 1u); return true; } -bool GraphicsPipe::drawIndexMultiAuto(Queue &queue) { - auto primCount = queue.rptr[1]; - auto drawInitiator = queue.rptr[2]; - auto control = queue.rptr[3]; +bool GraphicsPipe::drawIndexMultiAuto(Ring &ring) { + auto primCount = ring.rptr[1]; + auto drawInitiator = ring.rptr[2]; + auto control = ring.rptr[3]; auto indexOffset = rx::getBits(control, 15, 0); auto primType = rx::getBits(control, 20, 16); @@ -658,27 +917,27 @@ bool GraphicsPipe::drawIndexMultiAuto(Queue &queue) { uConfig.vgtPrimitiveType = static_cast(primType); uConfig.vgtNumIndices = indexCount; - draw(*this, queue.vmId, 0, primCount, 0, uConfig.vgtNumInstances, - vgtIndexBase, indexOffset, indexCount); + draw(*this, ring.vmId, 0, primCount, 0, uConfig.vgtNumInstances, vgtIndexBase, + indexOffset, indexCount); return true; } -bool GraphicsPipe::drawIndexOffset2(Queue &queue) { - auto maxSize = queue.rptr[1]; - auto indexOffset = queue.rptr[2]; - auto indexCount = queue.rptr[3]; - auto drawInitiator = queue.rptr[4]; +bool GraphicsPipe::drawIndexOffset2(Ring &ring) { + auto maxSize = ring.rptr[1]; + auto indexOffset = ring.rptr[2]; + auto indexCount = ring.rptr[3]; + auto drawInitiator = ring.rptr[4]; context.vgtDrawInitiator = drawInitiator; - draw(*this, queue.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, + draw(*this, ring.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, vgtIndexBase, indexOffset, maxSize); return true; } -bool GraphicsPipe::writeData(Queue &queue) { - auto len = rx::getBits(queue.rptr[0], 29, 16) - 1; - auto control = queue.rptr[1]; - auto dstAddressLo = queue.rptr[2]; - auto dstAddressHi = queue.rptr[3]; - auto data = queue.rptr + 4; +bool GraphicsPipe::writeData(Ring &ring) { + auto len = rx::getBits(ring.rptr[0], 29, 16) - 1; + auto control = ring.rptr[1]; + auto dstAddressLo = ring.rptr[2]; + auto dstAddressHi = ring.rptr[3]; + auto data = ring.rptr + 4; auto engineSel = rx::getBits(control, 31, 30); auto wrConfirm = rx::getBit(control, 20); @@ -697,7 +956,7 @@ bool GraphicsPipe::writeData(Queue &queue) { case 5: { // memory async auto address = (dstAddressLo & ~3) | (static_cast(dstAddressHi) << 32); - dstPointer = RemoteMemory{queue.vmId}.getPointer(address); + dstPointer = RemoteMemory{ring.vmId}.getPointer(address); break; } @@ -715,19 +974,19 @@ bool GraphicsPipe::writeData(Queue &queue) { return true; } -bool GraphicsPipe::memSemaphore(Queue &queue) { +bool GraphicsPipe::memSemaphore(Ring &ring) { // FIXME return true; } -bool GraphicsPipe::waitRegMem(Queue &queue) { - auto engine = rx::getBit(queue.rptr[1], 8); - auto memSpace = rx::getBit(queue.rptr[1], 4); - auto function = rx::getBits(queue.rptr[1], 2, 0); - auto pollAddressLo = queue.rptr[2]; - auto pollAddressHi = queue.rptr[3] & ((1 << 16) - 1); - auto reference = queue.rptr[4]; - auto mask = queue.rptr[5]; - auto pollInterval = queue.rptr[6]; +bool GraphicsPipe::waitRegMem(Ring &ring) { + auto engine = rx::getBit(ring.rptr[1], 8); + auto memSpace = rx::getBit(ring.rptr[1], 4); + auto function = rx::getBits(ring.rptr[1], 2, 0); + auto pollAddressLo = ring.rptr[2]; + auto pollAddressHi = ring.rptr[3] & ((1 << 16) - 1); + auto reference = ring.rptr[4]; + auto mask = ring.rptr[5]; + auto pollInterval = ring.rptr[6]; std::uint32_t pollData; @@ -736,61 +995,60 @@ bool GraphicsPipe::waitRegMem(Queue &queue) { } else { auto pollAddress = (pollAddressLo & ~3) | (static_cast(pollAddressHi) << 32); - pollData = *RemoteMemory{queue.vmId}.getPointer(pollAddress); + pollData = *RemoteMemory{ring.vmId}.getPointer(pollAddress); } return compare(function, pollData, mask, reference); } -bool GraphicsPipe::indirectBufferConst(Queue &queue) { - rx::dieIf(queue.indirectLevel < 0, "unexpected indirect buffer from CP"); +bool GraphicsPipe::indirectBufferConst(Ring &ring) { + rx::dieIf(ring.indirectLevel < 0, "unexpected indirect buffer from CP"); - auto addressLo = queue.rptr[1] & ~3; - auto addressHi = queue.rptr[2] & ((1 << 8) - 1); - int vmId = queue.rptr[3] >> 24; - auto ibSize = queue.rptr[3] & ((1 << 20) - 1); + auto addressLo = ring.rptr[1] & ~3; + auto addressHi = ring.rptr[2] & ((1 << 8) - 1); + int vmId = ring.rptr[3] >> 24; + auto ibSize = ring.rptr[3] & ((1 << 20) - 1); auto address = addressLo | (static_cast(addressHi) << 32); - if (queue.indirectLevel != 0) { - vmId = queue.vmId; + if (ring.indirectLevel != 0) { + vmId = ring.vmId; } auto rptr = RemoteMemory{vmId}.getPointer(address); - setCeQueue(Queue::createFromRange(vmId, rptr, ibSize)); + setCeQueue(Ring::createFromRange(vmId, rptr, ibSize)); return true; } -bool GraphicsPipe::indirectBuffer(Queue &queue) { - rx::dieIf(queue.indirectLevel < 0, "unexpected indirect buffer from CP"); +bool GraphicsPipe::indirectBuffer(Ring &ring) { + rx::dieIf(ring.indirectLevel < 0, "unexpected indirect buffer from CP"); - auto addressLo = queue.rptr[1] & ~3; - auto addressHi = queue.rptr[2] & ((1 << 8) - 1); - int vmId = queue.rptr[3] >> 24; - auto ibSize = queue.rptr[3] & ((1 << 20) - 1); + auto addressLo = ring.rptr[1] & ~3; + auto addressHi = ring.rptr[2] & ((1 << 8) - 1); + int vmId = ring.rptr[3] >> 24; + auto ibSize = ring.rptr[3] & ((1 << 20) - 1); auto address = addressLo | (static_cast(addressHi) << 32); - if (queue.indirectLevel != 0) { - vmId = queue.vmId; + if (ring.indirectLevel != 0) { + vmId = ring.vmId; } auto rptr = RemoteMemory{vmId}.getPointer(address); - setDeQueue(Queue::createFromRange(vmId, rptr, ibSize), - queue.indirectLevel + 1); + setDeQueue(Ring::createFromRange(vmId, rptr, ibSize), ring.indirectLevel + 1); return true; } -bool GraphicsPipe::pfpSyncMe(Queue &queue) { +bool GraphicsPipe::pfpSyncMe(Ring &ring) { // TODO return true; } -bool GraphicsPipe::condWrite(Queue &queue) { - auto writeSpace = rx::getBit(queue.rptr[1], 8); - auto pollSpace = rx::getBit(queue.rptr[1], 4); - auto function = rx::getBits(queue.rptr[1], 2, 0); - auto pollAddressLo = queue.rptr[2]; - auto pollAddressHi = queue.rptr[3] & ((1 << 16) - 1); - auto reference = queue.rptr[4]; - auto mask = queue.rptr[5]; - auto writeAddressLo = queue.rptr[6]; - auto writeAddressHi = queue.rptr[7] & ((1 << 16) - 1); - auto writeData = queue.rptr[8]; +bool GraphicsPipe::condWrite(Ring &ring) { + auto writeSpace = rx::getBit(ring.rptr[1], 8); + auto pollSpace = rx::getBit(ring.rptr[1], 4); + auto function = rx::getBits(ring.rptr[1], 2, 0); + auto pollAddressLo = ring.rptr[2]; + auto pollAddressHi = ring.rptr[3] & ((1 << 16) - 1); + auto reference = ring.rptr[4]; + auto mask = ring.rptr[5]; + auto writeAddressLo = ring.rptr[6]; + auto writeAddressHi = ring.rptr[7] & ((1 << 16) - 1); + auto writeData = ring.rptr[8]; std::uint32_t pollData; @@ -799,7 +1057,7 @@ bool GraphicsPipe::condWrite(Queue &queue) { } else { auto pollAddress = (pollAddressLo & ~3) | (static_cast(pollAddressHi) << 32); - pollData = *RemoteMemory{queue.vmId}.getPointer(pollAddress); + pollData = *RemoteMemory{ring.vmId}.getPointer(pollAddress); } if (compare(function, pollData, mask, reference)) { @@ -809,7 +1067,7 @@ bool GraphicsPipe::condWrite(Queue &queue) { auto writeAddress = (writeAddressLo & ~3) | (static_cast(writeAddressHi) << 32); - *RemoteMemory{queue.vmId}.getPointer(writeAddress) = + *RemoteMemory{ring.vmId}.getPointer(writeAddress) = writeData; } } @@ -817,7 +1075,7 @@ bool GraphicsPipe::condWrite(Queue &queue) { return true; } -bool GraphicsPipe::eventWrite(Queue &queue) { +bool GraphicsPipe::eventWrite(Ring &ring) { enum { kEventZPassDone = 1, kEventSamplePipelineStat = 2, @@ -825,7 +1083,7 @@ bool GraphicsPipe::eventWrite(Queue &queue) { kEventPartialFlush = 4, }; - auto eventCntl = queue.rptr[1]; + auto eventCntl = ring.rptr[1]; auto invL2 = rx::getBit(eventCntl, 20); auto eventIndex = rx::getBits(eventCntl, 11, 8); auto eventType = rx::getBits(eventCntl, 5, 0); @@ -834,8 +1092,8 @@ bool GraphicsPipe::eventWrite(Queue &queue) { if (eventIndex == kEventZPassDone || eventIndex == kEventSamplePipelineStat || eventIndex == kEventSampleStreamOutStat) { - auto addressLo = queue.rptr[2] & ~7; - auto addressHi = queue.rptr[3] & ((1 << 16) - 1); + auto addressLo = ring.rptr[2] & ~7; + auto addressHi = ring.rptr[3] & ((1 << 16) - 1); auto address = addressLo | (static_cast(addressHi) << 32); rx::die("unimplemented event write, event index %#x, address %lx", eventIndex, address); @@ -846,12 +1104,12 @@ bool GraphicsPipe::eventWrite(Queue &queue) { return true; } -bool GraphicsPipe::eventWriteEop(Queue &queue) { - auto eventCntl = queue.rptr[1]; - auto addressLo = queue.rptr[2] & ~3; - auto dataCntl = queue.rptr[3]; - auto dataLo = queue.rptr[4]; - auto dataHi = queue.rptr[5]; +bool GraphicsPipe::eventWriteEop(Ring &ring) { + auto eventCntl = ring.rptr[1]; + auto addressLo = ring.rptr[2] & ~3; + auto dataCntl = ring.rptr[3]; + auto dataLo = ring.rptr[4]; + auto dataHi = ring.rptr[5]; auto invL2 = rx::getBit(eventCntl, 20); auto eventIndex = rx::getBits(eventCntl, 11, 8); @@ -861,7 +1119,7 @@ bool GraphicsPipe::eventWriteEop(Queue &queue) { auto addressHi = rx::getBits(dataCntl, 15, 0); auto address = addressLo | (static_cast(addressHi) << 32); - auto pointer = RemoteMemory{queue.vmId}.getPointer(address); + auto pointer = RemoteMemory{ring.vmId}.getPointer(address); context.vgtEventInitiator = eventType; @@ -897,11 +1155,11 @@ bool GraphicsPipe::eventWriteEop(Queue &queue) { return true; } -bool GraphicsPipe::eventWriteEos(Queue &queue) { - auto eventCntl = queue.rptr[1]; - auto addressLo = queue.rptr[2] & ~3; - auto cmdInfo = queue.rptr[3]; - auto dataInfo = queue.rptr[4]; +bool GraphicsPipe::eventWriteEos(Ring &ring) { + auto eventCntl = ring.rptr[1]; + auto addressLo = ring.rptr[2] & ~3; + auto cmdInfo = ring.rptr[3]; + auto dataInfo = ring.rptr[4]; auto eventIndex = rx::getBits(eventCntl, 11, 8); auto eventType = rx::getBits(eventCntl, 5, 0); @@ -909,10 +1167,10 @@ bool GraphicsPipe::eventWriteEos(Queue &queue) { auto addressHi = rx::getBits(cmdInfo, 15, 0); auto address = addressLo | (static_cast(addressHi) << 32); - auto pointer = RemoteMemory{queue.vmId}.getPointer(address); + auto pointer = RemoteMemory{ring.vmId}.getPointer(address); context.vgtEventInitiator = eventType; - auto &cache = device->caches[queue.vmId]; + auto &cache = device->caches[ring.vmId]; switch (cmd) { case 1: { // store GDS data to memory @@ -940,14 +1198,14 @@ bool GraphicsPipe::eventWriteEos(Queue &queue) { return true; } -bool GraphicsPipe::dmaData(Queue &queue) { - auto control = queue.rptr[1]; - auto srcAddressLo = queue.rptr[2]; +bool GraphicsPipe::dmaData(Ring &ring) { + auto control = ring.rptr[1]; + auto srcAddressLo = ring.rptr[2]; auto data = srcAddressLo; - auto srcAddressHi = queue.rptr[3]; - auto dstAddressLo = queue.rptr[4]; - auto dstAddressHi = queue.rptr[5]; - auto cmdSize = queue.rptr[6]; + auto srcAddressHi = ring.rptr[3]; + auto dstAddressLo = ring.rptr[4]; + auto dstAddressHi = ring.rptr[5]; + auto cmdSize = ring.rptr[6]; auto size = rx::getBits(cmdSize, 20, 0); auto engine = rx::getBit(control, 0); @@ -1000,8 +1258,8 @@ bool GraphicsPipe::dmaData(Queue &queue) { if (dstSel == 3 || das == 0) { auto dstAddress = dstAddressLo | (static_cast(dstAddressHi) << 32); - dst = amdgpu::RemoteMemory{queue.vmId}.getPointer(dstAddress); - device->caches[queue.vmId].invalidate( + dst = amdgpu::RemoteMemory{ring.vmId}.getPointer(dstAddress); + device->caches[ring.vmId].invalidate( scheduler, rx::AddressRange::fromBeginSize(dstAddress, size)); } else { dst = getMmRegister(dstAddressLo / sizeof(std::uint32_t)); @@ -1009,7 +1267,7 @@ bool GraphicsPipe::dmaData(Queue &queue) { break; case 1: - dst = device->caches[queue.vmId].getGdsBuffer().getData() + dstAddressLo; + dst = device->caches[ring.vmId].getGdsBuffer().getData() + dstAddressLo; break; default: @@ -1024,8 +1282,8 @@ bool GraphicsPipe::dmaData(Queue &queue) { if (srcSel == 3 || sas == 0) { auto srcAddress = srcAddressLo | (static_cast(srcAddressHi) << 32); - src = amdgpu::RemoteMemory{queue.vmId}.getPointer(srcAddress); - device->caches[queue.vmId].flush( + src = amdgpu::RemoteMemory{ring.vmId}.getPointer(srcAddress); + device->caches[ring.vmId].flush( scheduler, rx::AddressRange::fromBeginSize(srcAddress, size)); } else { src = getMmRegister(srcAddressLo / sizeof(std::uint32_t)); @@ -1034,7 +1292,7 @@ bool GraphicsPipe::dmaData(Queue &queue) { srcSize = ~0; break; case 1: - src = device->caches[queue.vmId].getGdsBuffer().getData() + srcAddressLo; + src = device->caches[ring.vmId].getGdsBuffer().getData() + srcAddressLo; srcSize = ~0; break; @@ -1072,10 +1330,10 @@ bool GraphicsPipe::dmaData(Queue &queue) { return true; } -bool GraphicsPipe::setConfigReg(Queue &queue) { - auto len = rx::getBits(queue.rptr[0], 29, 16); - auto offset = queue.rptr[1] & 0xffff; - auto data = queue.rptr + 2; +bool GraphicsPipe::setConfigReg(Ring &ring) { + auto len = rx::getBits(ring.rptr[0], 29, 16); + auto offset = ring.rptr[1] & 0xffff; + auto data = ring.rptr + 2; rx::dieIf( (offset + len) * sizeof(std::uint32_t) > sizeof(device->config), @@ -1088,11 +1346,11 @@ bool GraphicsPipe::setConfigReg(Queue &queue) { return true; } -bool GraphicsPipe::setShReg(Queue &queue) { - auto len = rx::getBits(queue.rptr[0], 29, 16); - auto offset = queue.rptr[1] & 0xffff; - auto index = queue.rptr[1] >> 26; - auto data = queue.rptr + 2; +bool GraphicsPipe::setShReg(Ring &ring) { + auto len = rx::getBits(ring.rptr[0], 29, 16); + auto offset = ring.rptr[1] & 0xffff; + auto index = ring.rptr[1] >> 26; + auto data = ring.rptr + 2; rx::dieIf((offset + len) * sizeof(std::uint32_t) > sizeof(sh), "out of SH regs, offset: %x, count %u, %s\n", offset, len, @@ -1109,11 +1367,11 @@ bool GraphicsPipe::setShReg(Queue &queue) { return true; } -bool GraphicsPipe::setUConfigReg(Queue &queue) { - auto len = rx::getBits(queue.rptr[0], 29, 16); - auto offset = queue.rptr[1] & 0xffff; - auto index = queue.rptr[1] >> 26; - auto data = queue.rptr + 2; +bool GraphicsPipe::setUConfigReg(Ring &ring) { + auto len = rx::getBits(ring.rptr[0], 29, 16); + auto offset = ring.rptr[1] & 0xffff; + auto index = ring.rptr[1] >> 26; + auto data = ring.rptr + 2; if (index != 0) { { @@ -1150,11 +1408,11 @@ bool GraphicsPipe::setUConfigReg(Queue &queue) { return true; } -bool GraphicsPipe::setContextReg(Queue &queue) { - auto len = rx::getBits(queue.rptr[0], 29, 16); - auto offset = queue.rptr[1] & 0xffff; - auto index = queue.rptr[1] >> 26; - auto data = queue.rptr + 2; +bool GraphicsPipe::setContextReg(Ring &ring) { + auto len = rx::getBits(ring.rptr[0], 29, 16); + auto offset = ring.rptr[1] & 0xffff; + auto index = ring.rptr[1] >> 26; + auto data = ring.rptr + 2; if (index != 0) { { @@ -1192,114 +1450,114 @@ bool GraphicsPipe::setContextReg(Queue &queue) { return true; } -bool GraphicsPipe::setCeDeCounters(Queue &queue) { - auto counterLo = queue.rptr[1]; - auto counterHi = queue.rptr[2]; +bool GraphicsPipe::setCeDeCounters(Ring &ring) { + auto counterLo = ring.rptr[1]; + auto counterHi = ring.rptr[2]; auto counter = counterLo | (static_cast(counterHi) << 32); deCounter = counter; ceCounter = counter; return true; } -bool GraphicsPipe::waitOnCeCounter(Queue &queue) { - auto counterLo = queue.rptr[1]; - auto counterHi = queue.rptr[2]; +bool GraphicsPipe::waitOnCeCounter(Ring &ring) { + auto counterLo = ring.rptr[1]; + auto counterHi = ring.rptr[2]; auto counter = counterLo | (static_cast(counterHi) << 32); return deCounter >= counter; } -bool GraphicsPipe::waitOnDeCounterDiff(Queue &queue) { - auto waitDiff = queue.rptr[1]; +bool GraphicsPipe::waitOnDeCounterDiff(Ring &ring) { + auto waitDiff = ring.rptr[1]; auto diff = ceCounter - deCounter; return diff < waitDiff; } -bool GraphicsPipe::incrementCeCounter(Queue &) { +bool GraphicsPipe::incrementCeCounter(Ring &) { ceCounter++; return true; } -bool GraphicsPipe::incrementDeCounter(Queue &) { +bool GraphicsPipe::incrementDeCounter(Ring &) { deCounter++; return true; } -bool GraphicsPipe::loadConstRam(Queue &queue) { - std::uint32_t addressLo = queue.rptr[1]; - std::uint32_t addressHi = queue.rptr[2]; - std::uint32_t numDw = queue.rptr[3] & ((1 << 15) - 1); +bool GraphicsPipe::loadConstRam(Ring &ring) { + std::uint32_t addressLo = ring.rptr[1]; + std::uint32_t addressHi = ring.rptr[2]; + std::uint32_t numDw = ring.rptr[3] & ((1 << 15) - 1); std::uint32_t offset = - (queue.rptr[4] & ((1 << 16) - 1)) / sizeof(std::uint32_t); + (ring.rptr[4] & ((1 << 16) - 1)) / sizeof(std::uint32_t); auto address = addressLo | (static_cast(addressHi) << 32); std::memcpy(constantMemory + offset, - RemoteMemory{queue.vmId}.getPointer(address), + RemoteMemory{ring.vmId}.getPointer(address), numDw * sizeof(std::uint32_t)); return true; } -bool GraphicsPipe::writeConstRam(Queue &queue) { +bool GraphicsPipe::writeConstRam(Ring &ring) { std::uint32_t offset = - (queue.rptr[1] & ((1 << 16) - 1)) / sizeof(std::uint32_t); - std::uint32_t data = queue.rptr[2]; + (ring.rptr[1] & ((1 << 16) - 1)) / sizeof(std::uint32_t); + std::uint32_t data = ring.rptr[2]; std::memcpy(constantMemory + offset, &data, sizeof(std::uint32_t)); return true; } -bool GraphicsPipe::dumpConstRam(Queue &queue) { +bool GraphicsPipe::dumpConstRam(Ring &ring) { std::uint32_t offset = - (queue.rptr[1] & ((1 << 16) - 1)) / sizeof(std::uint32_t); - std::uint32_t numDw = queue.rptr[2] & ((1 << 15) - 1); - std::uint32_t addressLo = queue.rptr[3]; - std::uint32_t addressHi = queue.rptr[4]; + (ring.rptr[1] & ((1 << 16) - 1)) / sizeof(std::uint32_t); + std::uint32_t numDw = ring.rptr[2] & ((1 << 15) - 1); + std::uint32_t addressLo = ring.rptr[3]; + std::uint32_t addressHi = ring.rptr[4]; auto address = addressLo | (static_cast(addressHi) << 32); - std::memcpy(RemoteMemory{queue.vmId}.getPointer(address), + std::memcpy(RemoteMemory{ring.vmId}.getPointer(address), constantMemory + offset, numDw * sizeof(std::uint32_t)); return true; } -bool GraphicsPipe::unknownPacket(Queue &queue) { - auto op = rx::getBits(queue.rptr[0], 15, 8); +bool GraphicsPipe::unknownPacket(Ring &ring) { + auto op = rx::getBits(ring.rptr[0], 15, 8); rx::die("unimplemented gfx pm4 packet: %s, queue %u\n", - gnm::pm4OpcodeToString(op), queue.indirectLevel); + gnm::pm4OpcodeToString(op), ring.indirectLevel); } -bool GraphicsPipe::switchBuffer(Queue &queue) { +bool GraphicsPipe::switchBuffer(Ring &ring) { // FIXME: implement return true; } -bool GraphicsPipe::mapProcess(Queue &queue) { - auto pid = queue.rptr[1]; - int vmId = queue.rptr[2]; +bool GraphicsPipe::mapProcess(Ring &ring) { + auto pid = ring.rptr[1]; + int vmId = ring.rptr[2]; device->mapProcess(pid, vmId); return true; } -bool GraphicsPipe::mapQueues(Queue &queue) { +bool GraphicsPipe::mapQueues(Ring &ring) { // FIXME: implement return true; } -bool GraphicsPipe::unmapQueues(Queue &queue) { +bool GraphicsPipe::unmapQueues(Ring &ring) { // FIXME: implement return true; } -bool GraphicsPipe::mapMemory(Queue &queue) { - auto pid = queue.rptr[1]; - auto addressLo = queue.rptr[2]; - auto addressHi = queue.rptr[3]; - auto sizeLo = queue.rptr[4]; - auto sizeHi = queue.rptr[5]; - auto memoryType = queue.rptr[6]; - auto dmemIndex = queue.rptr[7]; - auto prot = queue.rptr[8]; - auto offsetLo = queue.rptr[9]; - auto offsetHi = queue.rptr[10]; +bool GraphicsPipe::mapMemory(Ring &ring) { + auto pid = ring.rptr[1]; + auto addressLo = ring.rptr[2]; + auto addressHi = ring.rptr[3]; + auto sizeLo = ring.rptr[4]; + auto sizeHi = ring.rptr[5]; + auto memoryType = ring.rptr[6]; + auto dmemIndex = ring.rptr[7]; + auto prot = ring.rptr[8]; + auto offsetLo = ring.rptr[9]; + auto offsetHi = ring.rptr[10]; auto address = addressLo | (static_cast(addressHi) << 32); auto size = sizeLo | (static_cast(sizeHi) << 32); @@ -1308,42 +1566,42 @@ bool GraphicsPipe::mapMemory(Queue &queue) { device->mapMemory(pid, address, size, memoryType, dmemIndex, prot, offset); return true; } -bool GraphicsPipe::unmapMemory(Queue &queue) { - auto pid = queue.rptr[1]; - auto addressLo = queue.rptr[2]; - auto addressHi = queue.rptr[3]; - auto sizeLo = queue.rptr[4]; - auto sizeHi = queue.rptr[5]; +bool GraphicsPipe::unmapMemory(Ring &ring) { + auto pid = ring.rptr[1]; + auto addressLo = ring.rptr[2]; + auto addressHi = ring.rptr[3]; + auto sizeLo = ring.rptr[4]; + auto sizeHi = ring.rptr[5]; auto address = addressLo | (static_cast(addressHi) << 32); auto size = sizeLo | (static_cast(sizeHi) << 32); device->unmapMemory(pid, address, size); return true; } -bool GraphicsPipe::protectMemory(Queue &queue) { - auto pid = queue.rptr[1]; - auto addressLo = queue.rptr[2]; - auto addressHi = queue.rptr[3]; - auto sizeLo = queue.rptr[4]; - auto sizeHi = queue.rptr[5]; - auto prot = queue.rptr[6]; +bool GraphicsPipe::protectMemory(Ring &ring) { + auto pid = ring.rptr[1]; + auto addressLo = ring.rptr[2]; + auto addressHi = ring.rptr[3]; + auto sizeLo = ring.rptr[4]; + auto sizeHi = ring.rptr[5]; + auto prot = ring.rptr[6]; auto address = addressLo | (static_cast(addressHi) << 32); auto size = sizeLo | (static_cast(sizeHi) << 32); device->protectMemory(pid, address, size, prot); return true; } -bool GraphicsPipe::unmapProcess(Queue &queue) { - auto pid = queue.rptr[1]; +bool GraphicsPipe::unmapProcess(Ring &ring) { + auto pid = ring.rptr[1]; device->unmapProcess(pid); return true; } -bool GraphicsPipe::flip(Queue &queue) { - auto buffer = queue.rptr[1]; - auto dataLo = queue.rptr[2]; - auto dataHi = queue.rptr[3]; - auto pid = queue.rptr[4]; +bool GraphicsPipe::flip(Ring &ring) { + auto buffer = ring.rptr[1]; + auto dataLo = ring.rptr[2]; + auto dataHi = ring.rptr[3]; + auto pid = ring.rptr[4]; auto data = dataLo | (static_cast(dataHi) << 32); device->flip(pid, buffer, data); diff --git a/rpcsx/gpu/Pipe.hpp b/rpcsx/gpu/Pipe.hpp index 1dc9e449f..18ab4ecdb 100644 --- a/rpcsx/gpu/Pipe.hpp +++ b/rpcsx/gpu/Pipe.hpp @@ -1,6 +1,7 @@ #pragma once #include "Registers.hpp" #include "Scheduler.hpp" +#include "orbis/utils/SharedMutex.hpp" #include #include @@ -8,7 +9,7 @@ namespace amdgpu { struct Device; -struct Queue { +struct Ring { int vmId = -1; int indirectLevel = -1; std::uint32_t *doorbell{}; @@ -16,11 +17,12 @@ struct Queue { std::uint64_t size{}; std::uint32_t *rptr{}; std::uint32_t *wptr{}; + std::uint32_t *rptrReportLocation{}; - static Queue createFromRange(int vmId, std::uint32_t *base, - std::uint64_t size, int indirectLevel = 0, - std::uint32_t *doorbell = nullptr) { - Queue result; + static Ring createFromRange(int vmId, std::uint32_t *base, std::uint64_t size, + int indirectLevel = 0, + std::uint32_t *doorbell = nullptr) { + Ring result; result.vmId = vmId; result.indirectLevel = indirectLevel; result.doorbell = doorbell; @@ -36,20 +38,35 @@ struct ComputePipe { Device *device; Scheduler scheduler; - using CommandHandler = bool (ComputePipe::*)(Queue &); + using CommandHandler = bool (ComputePipe::*)(Ring &); CommandHandler commandHandlers[255]; - Queue queues[8]; - Registers::ComputeConfig computeConfig; + orbis::shared_mutex queueMtx[8]; + int index; + Ring queues[2][8]; + std::uint64_t drawIndexIndirPatchBase = 0; ComputePipe(int index); bool processAllRings(); - void processRing(Queue &queue); - void mapQueue(int queueId, Queue queue); + bool processRing(Ring &ring); + void mapQueue(int queueId, Ring ring, std::unique_lock &lock); + void waitForIdle(int queueId, std::unique_lock &lock); + void submit(int queueId, std::uint32_t offset); - bool setShReg(Queue &queue); - bool unknownPacket(Queue &queue); - bool handleNop(Queue &queue); + std::unique_lock lockQueue(int queueId) { + return std::unique_lock(queueMtx[queueId]); + } + + bool setShReg(Ring &ring); + bool dispatchDirect(Ring &ring); + bool dispatchIndirect(Ring &ring); + bool releaseMem(Ring &ring); + bool waitRegMem(Ring &ring); + bool writeData(Ring &ring); + bool unknownPacket(Ring &ring); + bool handleNop(Ring &ring); + + std::uint32_t *getMmRegister(Ring &ring, std::uint32_t dwAddress); }; struct GraphicsPipe { @@ -71,75 +88,75 @@ struct GraphicsPipe { Registers::Context context; Registers::UConfig uConfig; - Queue deQueues[3]; - Queue ceQueue; + Ring deQueues[3]; + Ring ceQueue; - using CommandHandler = bool (GraphicsPipe::*)(Queue &); + using CommandHandler = bool (GraphicsPipe::*)(Ring &); CommandHandler commandHandlers[4][255]; GraphicsPipe(int index); - void setCeQueue(Queue queue); - void setDeQueue(Queue queue, int ring); + void setCeQueue(Ring ring); + void setDeQueue(Ring ring, int indirectLevel); bool processAllRings(); - void processRing(Queue &queue); + void processRing(Ring &ring); - bool drawPreamble(Queue &queue); - bool indexBufferSize(Queue &queue); - bool handleNop(Queue &queue); - bool contextControl(Queue &queue); - bool acquireMem(Queue &queue); - bool releaseMem(Queue &queue); - bool dispatchDirect(Queue &queue); - bool dispatchIndirect(Queue &queue); - bool writeData(Queue &queue); - bool memSemaphore(Queue &queue); - bool waitRegMem(Queue &queue); - bool indirectBufferConst(Queue &queue); - bool indirectBuffer(Queue &queue); - bool condWrite(Queue &queue); - bool eventWrite(Queue &queue); - bool eventWriteEop(Queue &queue); - bool eventWriteEos(Queue &queue); - bool dmaData(Queue &queue); - bool setBase(Queue &queue); - bool clearState(Queue &queue); - bool setPredication(Queue &queue); - bool drawIndirect(Queue &queue); - bool drawIndexIndirect(Queue &queue); - bool indexBase(Queue &queue); - bool drawIndex2(Queue &queue); - bool indexType(Queue &queue); - bool drawIndexAuto(Queue &queue); - bool numInstances(Queue &queue); - bool drawIndexMultiAuto(Queue &queue); - bool drawIndexOffset2(Queue &queue); - bool pfpSyncMe(Queue &queue); - bool setCeDeCounters(Queue &queue); - bool waitOnCeCounter(Queue &queue); - bool waitOnDeCounterDiff(Queue &queue); - bool incrementCeCounter(Queue &queue); - bool incrementDeCounter(Queue &queue); - bool loadConstRam(Queue &queue); - bool writeConstRam(Queue &queue); - bool dumpConstRam(Queue &queue); - bool setConfigReg(Queue &queue); - bool setShReg(Queue &queue); - bool setUConfigReg(Queue &queue); - bool setContextReg(Queue &queue); + bool drawPreamble(Ring &ring); + bool indexBufferSize(Ring &ring); + bool handleNop(Ring &ring); + bool contextControl(Ring &ring); + bool acquireMem(Ring &ring); + bool releaseMem(Ring &ring); + bool dispatchDirect(Ring &ring); + bool dispatchIndirect(Ring &ring); + bool writeData(Ring &ring); + bool memSemaphore(Ring &ring); + bool waitRegMem(Ring &ring); + bool indirectBufferConst(Ring &ring); + bool indirectBuffer(Ring &ring); + bool condWrite(Ring &ring); + bool eventWrite(Ring &ring); + bool eventWriteEop(Ring &ring); + bool eventWriteEos(Ring &ring); + bool dmaData(Ring &ring); + bool setBase(Ring &ring); + bool clearState(Ring &ring); + bool setPredication(Ring &ring); + bool drawIndirect(Ring &ring); + bool drawIndexIndirect(Ring &ring); + bool indexBase(Ring &ring); + bool drawIndex2(Ring &ring); + bool indexType(Ring &ring); + bool drawIndexAuto(Ring &ring); + bool numInstances(Ring &ring); + bool drawIndexMultiAuto(Ring &ring); + bool drawIndexOffset2(Ring &ring); + bool pfpSyncMe(Ring &ring); + bool setCeDeCounters(Ring &ring); + bool waitOnCeCounter(Ring &ring); + bool waitOnDeCounterDiff(Ring &ring); + bool incrementCeCounter(Ring &ring); + bool incrementDeCounter(Ring &ring); + bool loadConstRam(Ring &ring); + bool writeConstRam(Ring &ring); + bool dumpConstRam(Ring &ring); + bool setConfigReg(Ring &ring); + bool setShReg(Ring &ring); + bool setUConfigReg(Ring &ring); + bool setContextReg(Ring &ring); - bool unknownPacket(Queue &queue); + bool unknownPacket(Ring &ring); - bool switchBuffer(Queue &queue); - bool mapProcess(Queue &queue); - bool mapQueues(Queue &queue); - bool unmapQueues(Queue &queue); - bool mapMemory(Queue &queue); - bool unmapMemory(Queue &queue); - bool protectMemory(Queue &queue); - bool unmapProcess(Queue &queue); - bool flip(Queue &queue); + bool switchBuffer(Ring &ring); + bool mapProcess(Ring &ring); + bool mapQueues(Ring &ring); + bool unmapQueues(Ring &ring); + bool mapMemory(Ring &ring); + bool unmapMemory(Ring &ring); + bool protectMemory(Ring &ring); + bool unmapProcess(Ring &ring); + bool flip(Ring &ring); std::uint32_t *getMmRegister(std::uint32_t dwAddress); }; diff --git a/rpcsx/gpu/Registers.hpp b/rpcsx/gpu/Registers.hpp index 15e96d875..33461d49d 100644 --- a/rpcsx/gpu/Registers.hpp +++ b/rpcsx/gpu/Registers.hpp @@ -399,10 +399,10 @@ struct DbDepthSize { std::uint32_t raw; }; - std::uint32_t getPitch() const { + [[nodiscard]] std::uint32_t getPitch() const { return (pitchTileMax + 1) * 8; } - std::uint32_t getHeight() const { + [[nodiscard]] std::uint32_t getHeight() const { return (heightTileMax + 1) * 8; } }; @@ -591,8 +591,12 @@ struct Registers { }; }; - std::uint8_t getVGprCount() const { return (vgprs + 1) * 4; } - std::uint8_t getSGprCount() const { return (sgprs + 1) * 8; } + [[nodiscard]] std::uint8_t getVGprCount() const { + return (vgprs + 1) * 4; + } + [[nodiscard]] std::uint8_t getSGprCount() const { + return (sgprs + 1) * 8; + } } rsrc1; struct { union { @@ -613,7 +617,9 @@ struct Registers { }; }; - std::uint32_t getLdsDwordsCount() const { return ldsSize * 64; } + [[nodiscard]] std::uint32_t getLdsDwordsCount() const { + return ldsSize * 64; + } } rsrc2; std::uint32_t _pad3[1]; @@ -624,20 +630,25 @@ struct Registers { std::uint32_t wavesPerSh : 6; std::uint32_t : 6; std::uint32_t tgPerCu : 4; - std::uint32_t lockThreshold: 6; + std::uint32_t lockThreshold : 6; std::uint32_t simdDestCntl : 1; }; - }; - std::uint32_t getWavesPerSh() const { return wavesPerSh << 4; } + [[nodiscard]] std::uint32_t getWavesPerSh() const { + return wavesPerSh << 4; + } } resourceLimits; std::uint32_t staticThreadMgmtSe0; std::uint32_t staticThreadMgmtSe1; std::uint32_t tmpRingSize; - std::uint32_t _pad4[39]; + std::uint32_t _unk0[5]; + std::uint32_t state; + std::uint32_t _unk1[33]; std::array userData; }; + static_assert(sizeof(ComputeConfig) == 320); + struct ShaderConfig { static constexpr auto kMmioOffset = 0x2c00; diff --git a/rpcsx/iodev/dce.cpp b/rpcsx/iodev/dce.cpp index f620c0e29..3e91a463f 100644 --- a/rpcsx/iodev/dce.cpp +++ b/rpcsx/iodev/dce.cpp @@ -1,3 +1,4 @@ +#include "dce.hpp" #include "gpu/DeviceCtl.hpp" #include "io-device.hpp" #include "iodev/dmem.hpp" @@ -8,7 +9,6 @@ #include "orbis/thread/Process.hpp" #include "orbis/thread/Thread.hpp" #include "orbis/utils/Logs.hpp" -#include "orbis/utils/SharedMutex.hpp" #include "rx/mem.hpp" #include "rx/watchdog.hpp" #include "vm.hpp" @@ -192,32 +192,21 @@ static void runBridge(int vmId) { }}.detach(); } -static constexpr auto kVmIdCount = 6; struct DceFile : public orbis::File {}; -struct DceDevice : IoDevice { - orbis::shared_mutex mtx; - std::uint32_t freeVmIds = (1 << (kVmIdCount + 1)) - 1; - orbis::uint64_t dmemOffset = ~static_cast(0); +int DceDevice::allocateVmId() { + int id = std::countr_zero(freeVmIds); - orbis::ErrorCode open(orbis::Ref *file, const char *path, - std::uint32_t flags, std::uint32_t mode, - orbis::Thread *thread) override; + if (id >= kVmIdCount) { + std::println(stderr, "out of vm slots"); + std::abort(); + } - int allocateVmId() { - int id = std::countr_zero(freeVmIds); + freeVmIds &= ~(1 << id); + return id; +} - if (id >= kVmIdCount) { - std::println(stderr, "out of vm slots"); - std::abort(); - } - - freeVmIds &= ~(1 << id); - return id; - }; - - void deallocateVmId(int vmId) { freeVmIds |= (1 << vmId); }; -}; +void DceDevice::deallocateVmId(int vmId) { freeVmIds |= (1 << vmId); } static void initDceMemory(DceDevice *device) { if (device->dmemOffset + 1) { @@ -466,21 +455,24 @@ orbis::ErrorCode DceDevice::open(orbis::Ref *file, newFile->device = this; newFile->ops = &ops; *file = newFile; + initializeProcess(thread->tproc); + return {}; +} - if (thread->tproc->vmId == -1) { +void DceDevice::initializeProcess(orbis::Process *process) { + if (process->vmId == -1) { createGpu(); auto vmId = allocateVmId(); std::lock_guard lock(orbis::g_context.gpuDeviceMtx); { auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}; - gpu.submitMapProcess(thread->tproc->gfxRing, thread->tproc->pid, vmId); - thread->tproc->vmId = vmId; + gpu.submitMapProcess(process->gfxRing, process->pid, vmId); + process->vmId = vmId; } runBridge(vmId); } - return {}; } IoDevice *createDceCharacterDevice() { return orbis::knew(); } diff --git a/rpcsx/iodev/dce.hpp b/rpcsx/iodev/dce.hpp new file mode 100644 index 000000000..d3cfb81a9 --- /dev/null +++ b/rpcsx/iodev/dce.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include "io-device.hpp" +#include "orbis-config.hpp" +#include "orbis/error/ErrorCode.hpp" +#include "orbis/file.hpp" +#include "orbis/thread/Process.hpp" +#include "orbis/utils/Rc.hpp" +#include "orbis/utils/SharedMutex.hpp" + +static constexpr auto kVmIdCount = 6; + +struct DceDevice : IoDevice { + orbis::shared_mutex mtx; + std::uint32_t freeVmIds = (1 << (kVmIdCount + 1)) - 1; + orbis::uint64_t dmemOffset = ~static_cast(0); + + orbis::ErrorCode open(orbis::Ref *file, const char *path, + std::uint32_t flags, std::uint32_t mode, + orbis::Thread *thread) override; + + int allocateVmId(); + void deallocateVmId(int vmId); + void initializeProcess(orbis::Process *process); +}; diff --git a/rpcsx/iodev/gc.cpp b/rpcsx/iodev/gc.cpp index e2d5cc476..d898fee40 100644 --- a/rpcsx/iodev/gc.cpp +++ b/rpcsx/iodev/gc.cpp @@ -1,5 +1,6 @@ #include "gpu/DeviceCtl.hpp" #include "io-device.hpp" +#include "iodev/dce.hpp" #include "iodev/dmem.hpp" #include "orbis/KernelAllocator.hpp" #include "orbis/KernelContext.hpp" @@ -87,11 +88,11 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { for (unsigned i = 0; i < args->count; ++i) { gpu.submitGfxCommand(gcFile->gfxPipe, - orbis::g_currentThread->tproc->vmId, - {args->cmds + i * 4, 4}); + orbis::g_currentThread->tproc->vmId, + {args->cmds + i * 4, 4}); } } else { - return orbis::ErrorCode::INVAL; + return orbis::ErrorCode::BUSY; } break; } @@ -106,7 +107,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { gpu.submitSwitchBuffer(orbis::g_currentThread->tproc->vmId); } else { - return orbis::ErrorCode::INVAL; + return orbis::ErrorCode::BUSY; } // ORBIS_LOG_ERROR("gc ioctl 0xc0088101", args->arg0, args->arg1); @@ -127,11 +128,11 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { for (unsigned i = 0; i < args->count; ++i) { gpu.submitGfxCommand(gcFile->gfxPipe, - orbis::g_currentThread->tproc->vmId, - {args->cmds + i * 4, 4}); + orbis::g_currentThread->tproc->vmId, + {args->cmds + i * 4, 4}); } } else { - return orbis::ErrorCode::INVAL; + return orbis::ErrorCode::BUSY; } // orbis::bridge.sendDoFlip(); @@ -142,7 +143,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { gpu.waitForIdle(); } else { - return orbis::ErrorCode::INVAL; + return orbis::ErrorCode::BUSY; } break; } @@ -193,64 +194,53 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, case 0xc030810d: { // map compute queue struct Args { - std::uint32_t pipeHi; - std::uint32_t pipeLo; - std::uint32_t queueId; - std::uint32_t offset; - std::uint64_t ringBaseAddress; - std::uint64_t readPtrAddress; - std::uint64_t dingDongPtr; - std::uint32_t lenLog2; + orbis::uint32_t meId; + orbis::uint32_t pipeId; + orbis::uint32_t queueId; + orbis::uint32_t vqueueId; + orbis::uintptr_t ringBaseAddress; + orbis::uintptr_t readPtrAddress; + orbis::uintptr_t doorbell; + orbis::uint32_t ringSize; }; auto args = reinterpret_cast(argp); - ORBIS_LOG_ERROR("gc ioctl map compute queue", args->pipeHi, args->pipeLo, - args->queueId, args->offset, args->ringBaseAddress, - args->readPtrAddress, args->dingDongPtr, args->lenLog2); + ORBIS_LOG_ERROR("gc ioctl map compute queue", args->meId, args->pipeId, + args->queueId, args->vqueueId, args->ringBaseAddress, + args->readPtrAddress, args->doorbell, args->ringSize); - rx::die("gc ioctl map compute queue"); + if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { + gpu.mapComputeQueue(thread->tproc->vmId, args->meId, args->pipeId, + args->queueId, args->vqueueId, args->ringBaseAddress, + args->readPtrAddress, args->doorbell, + static_cast(1) << args->ringSize); - // auto id = ((args->pipeHi * 4) + args->pipeLo) * 8 + args->queueId; - // device->computeQueues[id] = { - // .ringBaseAddress = args->ringBaseAddress, - // .readPtrAddress = args->readPtrAddress, - // .dingDongPtr = args->dingDongPtr, - // .len = static_cast(1) << args->lenLog2, - // }; - // args->pipeHi = 0x769c766; - // args->pipeLo = 0x72e8e3c1; - // args->queueId = -0x248d50d8; - // args->offset = 0xd245ed58; - - // ((std::uint64_t *)args->dingDongPtr)[0xf0 / sizeof(std::uint64_t)] = 1; + } else { + return orbis::ErrorCode::BUSY; + } break; } case 0xc010811c: { // ding dong for workload struct Args { - std::uint32_t pipeHi; - std::uint32_t pipeLo; + std::uint32_t meId; + std::uint32_t pipeId; std::uint32_t queueId; std::uint32_t nextStartOffsetInDw; }; auto args = reinterpret_cast(argp); - ORBIS_LOG_ERROR("gc ioctl ding dong for workload", args->pipeHi, - args->pipeLo, args->queueId, args->nextStartOffsetInDw); - rx::die("gc ioctl ding dong for workload"); + ORBIS_LOG_ERROR("gc ioctl ding dong for workload", args->meId, args->pipeId, + args->queueId, args->nextStartOffsetInDw); - // auto id = ((args->pipeHi * 4) + args->pipeLo) * 8 + args->queueId; - - // auto queue = device->computeQueues.at(id); - // auto address = (queue.ringBaseAddress + queue.offset); - // auto endOffset = static_cast(args->nextStartOffsetInDw) << - // 2; auto size = endOffset - queue.offset; - - // rx::bridge.sendCommandBuffer(thread->tproc->pid, id, address, size); - - // queue.offset = endOffset; + if (auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice}) { + gpu.submitComputeQueue(args->meId, args->pipeId, args->queueId, + args->nextStartOffsetInDw); + } else { + return orbis::ErrorCode::BUSY; + } break; } @@ -336,6 +326,9 @@ orbis::ErrorCode GcDevice::open(orbis::Ref *file, const char *path, } void GcDevice::addClient(orbis::Process *process) { + auto dce = orbis::g_context.dceDevice.rawStaticCast(); + dce->initializeProcess(process); + std::lock_guard lock(mtx); auto &client = clients[process->pid]; ++client; diff --git a/rpcsx/main.cpp b/rpcsx/main.cpp index 6a980f5e1..7a83ec519 100644 --- a/rpcsx/main.cpp +++ b/rpcsx/main.cpp @@ -321,6 +321,9 @@ static void ps4InitDev() { auto dmem1 = createDmemCharacterDevice(1); orbis::g_context.dmemDevice = dmem1; + auto dce = createDceCharacterDevice(); + orbis::g_context.dceDevice = dce; + auto ttyFd = ::open("tty.txt", O_CREAT | O_TRUNC | O_WRONLY, 0666); auto consoleDev = createConsoleCharacterDevice(STDIN_FILENO, ttyFd); auto mbus = static_cast(createMBusCharacterDevice()); @@ -357,7 +360,7 @@ static void ps4InitDev() { vfs::addDevice("zero", createZeroCharacterDevice()); vfs::addDevice("null", createNullCharacterDevice()); vfs::addDevice("dipsw", createDipswCharacterDevice()); - vfs::addDevice("dce", createDceCharacterDevice()); + vfs::addDevice("dce", dce); vfs::addDevice("hmd_cmd", createHmdCmdCharacterDevice()); vfs::addDevice("hmd_snsr", createHmdSnsrCharacterDevice()); vfs::addDevice("hmd_3da", createHmd3daCharacterDevice());