diff --git a/hw/amdgpu/bridge/include/amdgpu/bridge/bridge.hpp b/hw/amdgpu/bridge/include/amdgpu/bridge/bridge.hpp index 9b23dd8bc..a05bf1e5a 100644 --- a/hw/amdgpu/bridge/include/amdgpu/bridge/bridge.hpp +++ b/hw/amdgpu/bridge/include/amdgpu/bridge/bridge.hpp @@ -1,14 +1,12 @@ #pragma once -#include #include #include #include #include +#include namespace amdgpu::bridge { -extern std::uint32_t expGpuPid; - struct PadState { std::uint64_t timestamp; std::uint32_t unk; @@ -47,7 +45,9 @@ enum class CommandId : std::uint32_t { ProtectMemory, CommandBuffer, Flip, - MapDmem, + MapMemory, + MapProcess, + UnmapProcess, }; struct CmdMemoryProt { @@ -79,15 +79,25 @@ struct CmdFlip { std::uint64_t arg; }; -struct CmdMapDmem { - std::uint64_t offset; +struct CmdMapMemory { + std::int64_t offset; std::uint64_t address; std::uint64_t size; std::uint32_t prot; std::uint32_t pid; + std::int32_t memoryType; std::uint32_t dmemIndex; }; +struct CmdMapProcess { + std::uint64_t pid; + int vmId; +}; + +struct CmdUnmapProcess { + std::uint64_t pid; +}; + enum { kPageWriteWatch = 1 << 0, kPageReadWriteLock = 1 << 1, @@ -112,17 +122,15 @@ struct BridgeHeader { volatile std::uint64_t flipArg; volatile std::uint64_t flipCount; volatile std::uint64_t bufferInUseAddress; - std::uint32_t memoryAreaCount; std::uint32_t commandBufferCount; std::uint32_t bufferCount; - CmdMemoryProt memoryAreas[512]; CmdCommandBuffer commandBuffers[32]; CmdBuffer buffers[10]; // orbis::shared_mutex cacheCommandMtx; // orbis::shared_cv cacheCommandCv; - std::atomic cacheCommands[4]; - std::atomic gpuCacheCommand; - std::atomic cachePages[0x100'0000'0000 / kHostPageSize]; + std::atomic cacheCommands[6][4]; + std::atomic gpuCacheCommand[6]; + std::atomic cachePages[6][0x100'0000'0000 / kHostPageSize]; volatile std::uint64_t pull; volatile std::uint64_t push; @@ -137,7 +145,9 @@ struct Command { CmdCommandBuffer commandBuffer; CmdBuffer buffer; CmdFlip flip; - CmdMapDmem mapDmem; + CmdMapMemory mapMemory; + CmdMapProcess mapProcess; + CmdUnmapProcess unmapProcess; }; }; @@ -160,29 +170,32 @@ struct BridgePusher { void sendMemoryProtect(std::uint32_t pid, std::uint64_t address, std::uint64_t size, std::uint32_t prot) { - if (pid == expGpuPid) { - sendCommand(CommandId::ProtectMemory, {pid, address, size, prot}); - } + sendCommand(CommandId::ProtectMemory, {pid, address, size, prot}); } - void sendMapDmem(std::uint32_t pid, std::uint32_t dmemIndex, std::uint64_t address, std::uint64_t size, std::uint32_t prot, std::uint64_t offset) { - // if (pid == expGpuPid) { - sendCommand(CommandId::MapDmem, {pid, dmemIndex, address, size, prot, offset}); - // } + void sendMapMemory(std::uint32_t pid, std::uint32_t memoryType, + std::uint32_t dmemIndex, std::uint64_t address, + std::uint64_t size, std::uint32_t prot, + std::uint64_t offset) { + sendCommand(CommandId::MapMemory, + {pid, memoryType, dmemIndex, address, size, prot, offset}); } void sendCommandBuffer(std::uint32_t pid, std::uint64_t queue, std::uint64_t address, std::uint64_t size) { - // if (pid == expGpuPid) { - sendCommand(CommandId::CommandBuffer, {pid, queue, address, size}); - // } + sendCommand(CommandId::CommandBuffer, {pid, queue, address, size}); } void sendFlip(std::uint32_t pid, std::uint32_t bufferIndex, std::uint64_t arg) { - // if (pid == expGpuPid) { - sendCommand(CommandId::Flip, {pid, bufferIndex, arg}); - // } + sendCommand(CommandId::Flip, {pid, bufferIndex, arg}); + } + + void sendMapProcess(std::uint32_t pid, unsigned vmId) { + sendCommand(CommandId::MapProcess, {pid, vmId}); + } + void sendUnmapProcess(std::uint32_t pid) { + sendCommand(CommandId::UnmapProcess, {pid}); } void wait() { @@ -198,7 +211,8 @@ private: void sendCommand(CommandId id, std::initializer_list args) { std::uint64_t exp = 0; - while (!header->lock.compare_exchange_weak(exp, 1, std::memory_order::acquire, std::memory_order::relaxed)) { + while (!header->lock.compare_exchange_weak( + exp, 1, std::memory_order::acquire, std::memory_order::relaxed)) { exp = 0; } @@ -303,13 +317,23 @@ private: result.flip.arg = args[2]; return result; - case CommandId::MapDmem: - result.mapDmem.pid = args[0]; - result.mapDmem.dmemIndex = args[1]; - result.mapDmem.address = args[2]; - result.mapDmem.size = args[3]; - result.mapDmem.prot = args[4]; - result.mapDmem.offset = args[5]; + case CommandId::MapMemory: + result.mapMemory.pid = args[0]; + result.mapMemory.memoryType = args[1]; + result.mapMemory.dmemIndex = args[2]; + result.mapMemory.address = args[3]; + result.mapMemory.size = args[4]; + result.mapMemory.prot = args[5]; + result.mapMemory.offset = args[6]; + return result; + + case CommandId::MapProcess: + result.mapProcess.pid = args[0]; + result.mapProcess.vmId = args[1]; + return result; + + case CommandId::UnmapProcess: + result.unmapProcess.pid = args[0]; return result; } diff --git a/hw/amdgpu/bridge/src/bridge.cpp b/hw/amdgpu/bridge/src/bridge.cpp index 8d159fa2b..51a190668 100644 --- a/hw/amdgpu/bridge/src/bridge.cpp +++ b/hw/amdgpu/bridge/src/bridge.cpp @@ -8,8 +8,6 @@ static int gShmFd = -1; static constexpr std::size_t kShmSize = sizeof(amdgpu::bridge::BridgeHeader) + (sizeof(std::uint64_t) * 256); -std::uint32_t amdgpu::bridge::expGpuPid = 0; - amdgpu::bridge::BridgeHeader * amdgpu::bridge::createShmCommandBuffer(const char *name) { if (gShmFd != -1) { diff --git a/hw/amdgpu/device/include/amdgpu/device/device.hpp b/hw/amdgpu/device/include/amdgpu/device/device.hpp index 773fc561c..2749cc86b 100644 --- a/hw/amdgpu/device/include/amdgpu/device/device.hpp +++ b/hw/amdgpu/device/include/amdgpu/device/device.hpp @@ -1,5 +1,6 @@ #pragma once +#include "amdgpu/RemoteMemory.hpp" #include "amdgpu/bridge/bridge.hpp" #include "amdgpu/shader/Instruction.hpp" #include "gpu-scheduler.hpp" @@ -1259,6 +1260,42 @@ struct GnmTBuffer { static_assert(sizeof(GnmTBuffer) == sizeof(std::uint64_t) * 4); +struct GnmSSampler { + int32_t clamp_x : 3; + int32_t clamp_y : 3; + int32_t clamp_z : 3; + int32_t max_aniso_ratio : 3; + int32_t depth_compare_func : 3; + int32_t force_unorm_coords : 1; + int32_t aniso_threshold : 3; + int32_t mc_coord_trunc : 1; + int32_t force_degamma : 1; + int32_t aniso_bias : 6; + int32_t trunc_coord : 1; + int32_t disable_cube_wrap : 1; + int32_t filter_mode : 2; + int32_t : 1; + int32_t min_lod : 12; + int32_t max_lod : 12; + int32_t perf_mip : 4; + int32_t perf_z : 4; + int32_t lod_bias : 14; + int32_t lod_bias_sec : 6; + int32_t xy_mag_filter : 2; + int32_t xy_min_filter : 2; + int32_t z_filter : 2; + int32_t mip_filter : 2; + int32_t : 4; + int32_t border_color_ptr : 12; + int32_t : 18; + int32_t border_color_type : 2; + + auto operator<=>(const GnmSSampler &) const = default; + bool operator==(const GnmSSampler &) const = default; +}; + +static_assert(sizeof(GnmSSampler) == sizeof(std::uint32_t) * 4); + constexpr auto kPageSize = 0x4000; void setVkDevice(VkDevice device, @@ -1266,11 +1303,11 @@ void setVkDevice(VkDevice device, VkPhysicalDeviceProperties devProperties); struct AmdgpuDevice { - void handleProtectMemory(std::uint64_t address, std::uint64_t size, - std::uint32_t prot); - void handleCommandBuffer(std::uint64_t queueId, std::uint64_t address, - std::uint64_t size); - bool handleFlip(VkQueue queue, VkCommandBuffer cmdBuffer, + void handleProtectMemory(RemoteMemory memory, std::uint64_t address, + std::uint64_t size, std::uint32_t prot); + void handleCommandBuffer(RemoteMemory memory, std::uint64_t queueId, + std::uint64_t address, std::uint64_t size); + bool handleFlip(RemoteMemory memory, VkQueue queue, VkCommandBuffer cmdBuffer, TaskChain &initTaskChain, std::uint32_t bufferIndex, std::uint64_t arg, VkImage targetImage, VkExtent2D targetExtent, VkSemaphore waitSemaphore, diff --git a/hw/amdgpu/device/src/device.cpp b/hw/amdgpu/device/src/device.cpp index 4334859cb..012e25146 100644 --- a/hw/amdgpu/device/src/device.cpp +++ b/hw/amdgpu/device/src/device.cpp @@ -46,11 +46,6 @@ using namespace amdgpu::device; static const bool kUseDirectMemory = false; static amdgpu::bridge::BridgeHeader *g_bridge; -// void *g_rwMemory; -std::size_t g_memorySize; -std::uint64_t g_memoryBase; -RemoteMemory g_hostMemory; - namespace amdgpu::device::vk { VkDevice g_vkDevice = VK_NULL_HANDLE; VkAllocationCallbacks *g_vkAllocator = nullptr; @@ -287,7 +282,7 @@ _vkCmdSetColorWriteMaskEXT(VkCommandBuffer commandBuffer, return fn(commandBuffer, firstAttachment, attachmentCount, pColorWriteMasks); } -static util::MemoryAreaTable memoryAreaTable; +static util::MemoryAreaTable memoryAreaTable[6]; void device::setVkDevice(VkDevice device, VkPhysicalDeviceMemoryProperties memProperties, @@ -2089,6 +2084,7 @@ enum class CacheMode { None, AsyncWrite, LazyWrite }; struct CacheOverlayBase { std::mutex mtx; + RemoteMemory memory; Ref writeBackTaskCtl; std::function unlockMutableTask; std::uint64_t lockTag = 0; @@ -2413,15 +2409,15 @@ struct CacheImageOverlay : CacheOverlayBase { taskChain.add( ProcessQueue::Graphics, transferBufferReadId, [=, self = Ref(this)](VkCommandBuffer commandBuffer) { - vk::ImageRef imageRef(self->image); - imageRef.transitionLayout(commandBuffer, VK_IMAGE_LAYOUT_GENERAL); - imageRef.readFromBuffer( + vk::ImageRef imageRef(self->image); + imageRef.transitionLayout(commandBuffer, VK_IMAGE_LAYOUT_GENERAL); + imageRef.readFromBuffer( commandBuffer, self->trasferBuffer.getHandle(), self->aspect); - auto tag = *srcBuffer->getSyncTag(address, size); - std::lock_guard lock(self->mtx); - self->syncState.map(address, address + size, tag.payload); - }); + auto tag = *srcBuffer->getSyncTag(address, size); + std::lock_guard lock(self->mtx); + self->syncState.map(address, address + size, tag.payload); + }); } void readBuffer(TaskChain &taskChain, Ref targetBuffer, @@ -2483,11 +2479,11 @@ struct MemoryOverlay : CacheOverlayBase { void readBuffer(TaskChain &taskChain, Ref targetBuffer, std::uint64_t address, std::uint64_t size, std::uint64_t waitTask = GpuTaskLayout::kInvalidId) override { - auto readTask = [=] { + auto readTask = [=, this] { auto offset = address - targetBuffer->bufferAddress; auto targetData = (char *)targetBuffer->buffer.getData() + offset; - std::memcpy(targetData, g_hostMemory.getPointer(address), size); + std::memcpy(targetData, memory.getPointer(address), size); }; if (size < bridge::kHostPageSize && waitTask == GpuTaskLayout::kInvalidId) { @@ -2499,13 +2495,13 @@ struct MemoryOverlay : CacheOverlayBase { void writeBuffer(TaskChain &taskChain, Ref sourceBuffer, - std::uint64_t address, std::uint64_t size, - std::uint64_t waitTask = GpuTaskLayout::kInvalidId) override { - auto writeTask = [=] { + std::uint64_t address, std::uint64_t size, + std::uint64_t waitTask = GpuTaskLayout::kInvalidId) override { + auto writeTask = [=, this] { auto offset = address - sourceBuffer->bufferAddress; auto sourceData = (char *)sourceBuffer->buffer.getData() + offset; - std::memcpy(g_hostMemory.getPointer(address), sourceData, size); + std::memcpy(memory.getPointer(address), sourceData, size); }; if (size < bridge::kHostPageSize && waitTask == GpuTaskLayout::kInvalidId) { @@ -2516,7 +2512,7 @@ struct MemoryOverlay : CacheOverlayBase { } }; -static void notifyPageChanges(std::uint32_t firstPage, +static void notifyPageChanges(int vmId, std::uint32_t firstPage, std::uint32_t pageCount) { std::uint64_t command = (static_cast(pageCount - 1) << 32) | firstPage; @@ -2524,7 +2520,7 @@ static void notifyPageChanges(std::uint32_t firstPage, while (true) { for (std::size_t i = 0; i < std::size(g_bridge->cacheCommands); ++i) { std::uint64_t expCommand = 0; - if (g_bridge->cacheCommands[i].compare_exchange_strong( + if (g_bridge->cacheCommands[vmId][i].compare_exchange_strong( expCommand, command, std::memory_order::acquire, std::memory_order::relaxed)) { return; @@ -2533,22 +2529,23 @@ static void notifyPageChanges(std::uint32_t firstPage, } } -static void modifyWatchFlags(std::uint64_t address, std::uint64_t size, - std::uint8_t addFlags, std::uint8_t removeFlags) { +static void modifyWatchFlags(int vmId, std::uint64_t address, + std::uint64_t size, std::uint8_t addFlags, + std::uint8_t removeFlags) { auto firstPage = address / bridge::kHostPageSize; auto lastPage = (address + size + bridge::kHostPageSize - 1) / bridge::kHostPageSize; bool hasChanges = false; for (auto page = firstPage; page < lastPage; ++page) { auto prevValue = - g_bridge->cachePages[page].load(std::memory_order::relaxed); + g_bridge->cachePages[vmId][page].load(std::memory_order::relaxed); auto newValue = (prevValue & ~removeFlags) | addFlags; if (newValue == prevValue) { continue; } - while (!g_bridge->cachePages[page].compare_exchange_weak( + while (!g_bridge->cachePages[vmId][page].compare_exchange_weak( prevValue, newValue, std::memory_order::relaxed)) { newValue = (prevValue & ~removeFlags) | addFlags; } @@ -2559,23 +2556,24 @@ static void modifyWatchFlags(std::uint64_t address, std::uint64_t size, } if (hasChanges) { - notifyPageChanges(firstPage, lastPage - firstPage); + notifyPageChanges(vmId, firstPage, lastPage - firstPage); } } -static void watchWrites(std::uint64_t address, std::uint64_t size) { - modifyWatchFlags(address, size, bridge::kPageWriteWatch, +static void watchWrites(int vmId, std::uint64_t address, std::uint64_t size) { + modifyWatchFlags(vmId, address, size, bridge::kPageWriteWatch, bridge::kPageInvalidated); } -static void lockReadWrite(std::uint64_t address, std::uint64_t size, +static void lockReadWrite(int vmId, std::uint64_t address, std::uint64_t size, bool isLazy) { - modifyWatchFlags(address, size, + modifyWatchFlags(vmId, address, size, bridge::kPageReadWriteLock | (isLazy ? bridge::kPageLazyLock : 0), bridge::kPageInvalidated); } -static void unlockReadWrite(std::uint64_t address, std::uint64_t size) { - modifyWatchFlags(address, size, bridge::kPageWriteWatch, +static void unlockReadWrite(int vmId, std::uint64_t address, + std::uint64_t size) { + modifyWatchFlags(vmId, address, size, bridge::kPageWriteWatch, bridge::kPageReadWriteLock | bridge::kPageLazyLock); } @@ -2600,6 +2598,7 @@ struct CacheLine { auto operator<=>(const ImageKey &other) const = default; }; + RemoteMemory memory; std::mutex hostSyncMtx; util::MemoryTableWithPayload hostSyncTable; @@ -2637,14 +2636,14 @@ struct CacheLine { bool hasInvalidations = false; for (auto page = firstPage; page < lastPage; ++page) { - auto prevValue = - g_bridge->cachePages[page].load(std::memory_order::relaxed); + auto prevValue = g_bridge->cachePages[memory.vmId][page].load( + std::memory_order::relaxed); if (~prevValue & bridge::kPageInvalidated) { continue; } - while (!g_bridge->cachePages[page].compare_exchange_weak( + while (!g_bridge->cachePages[memory.vmId][page].compare_exchange_weak( prevValue, prevValue & ~bridge::kPageInvalidated, std::memory_order::relaxed)) { } @@ -2658,7 +2657,7 @@ struct CacheLine { } void trackCacheRead(std::uint64_t address, std::uint64_t size) { - watchWrites(address, size); + watchWrites(memory.vmId, address, size); } void setWriteBackTask(std::uint64_t address, std::uint64_t size, @@ -2721,7 +2720,7 @@ struct CacheLine { area.beginAddress, areaSize); updateTaskChain->wait(); uploadBuffer->unlock(tag); - unlockReadWrite(area.beginAddress, areaSize); + unlockReadWrite(memory.vmId, area.beginAddress, areaSize); // std::printf("memory lazy update, %lx finish\n", address); } @@ -2730,7 +2729,8 @@ struct CacheLine { entry->unlockMutableTask = [=, this] { if (entry->cacheMode != CacheMode::None) { - lockReadWrite(address, size, entry->cacheMode == CacheMode::LazyWrite); + lockReadWrite(memory.vmId, address, size, + entry->cacheMode == CacheMode::LazyWrite); entry->syncState.map(address, address + size, tag); std::lock_guard lock(hostSyncMtx); @@ -2785,7 +2785,7 @@ struct CacheLine { taskChain->wait(); if (entry->cacheMode != CacheMode::None) { - unlockReadWrite(address, size); + unlockReadWrite(memory.vmId, address, size); } return TaskResult::Complete; }); @@ -3001,6 +3001,7 @@ private: } auto bufferOverlay = new CacheBufferOverlay(); + bufferOverlay->memory = memory; bufferOverlay->buffer = vk::Buffer::Allocate( getHostVisibleMemory(), size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | @@ -3085,6 +3086,7 @@ private: } auto newOverlay = new CacheImageOverlay(); + newOverlay->memory = memory; newOverlay->image = vk::Image2D::Allocate(getDeviceLocalMemory(), width, height, colorFormat, usage); @@ -3103,8 +3105,8 @@ private: } else */ if (isCompressed) { dataWidth = (width + 3) / 4; - dataPitch = (height + 3) / 4; - dataHeight = (pitch + 3) / 4; + dataPitch = (pitch + 3) / 4; + dataHeight = (height + 3) / 4; bpp = 16; } @@ -3196,8 +3198,8 @@ struct Cache { auto operator<=>(const DetachedImageKey &other) const = default; }; - // TODO: read S# - VkSampler sampler{}; + RemoteMemory memory; + std::map samplers; std::map> datachedImages; std::map> cacheLines; std::atomic nextTag{2}; @@ -3205,14 +3207,15 @@ struct Cache { std::mutex mtx; - Cache() { - getCpuScheduler().enqueue([this] { - auto page = g_bridge->gpuCacheCommand.load(std::memory_order::relaxed); + Cache(int vmId) : memory({vmId}) { + getCpuScheduler().enqueue([this, vmId] { + auto page = + g_bridge->gpuCacheCommand[vmId].load(std::memory_order::relaxed); if (page == 0) { return TaskResult::Reschedule; } - g_bridge->gpuCacheCommand.store(0, std::memory_order::relaxed); + g_bridge->gpuCacheCommand[vmId].store(0, std::memory_order::relaxed); auto address = static_cast(page) * bridge::kHostPageSize; auto &line = getLine(address, bridge::kHostPageSize); @@ -3226,10 +3229,12 @@ struct Cache { vk::g_vkAllocator); vkDestroyDescriptorPool(vk::g_vkDevice, computeDescriptorPool, vk::g_vkAllocator); - vkDestroySampler(vk::g_vkDevice, sampler, vk::g_vkAllocator); + for (auto &[s, handle] : samplers) { + vkDestroySampler(vk::g_vkDevice, handle, vk::g_vkAllocator); + } graphicsDescriptorPool = VK_NULL_HANDLE; computeDescriptorPool = VK_NULL_HANDLE; - sampler = VK_NULL_HANDLE; + samplers.clear(); graphicsDecsriptorSets.clear(); computeDecsriptorSets.clear(); @@ -3241,7 +3246,7 @@ struct Cache { void syncLines() { std::lock_guard lock(mtx); - auto areas = std::exchange(memoryAreaTable.invalidated, {}); + auto areas = std::exchange(memoryAreaTable[memory.vmId].invalidated, {}); auto it = cacheLines.begin(); if (it == cacheLines.end()) { @@ -3322,32 +3327,48 @@ struct Cache { std::uint64_t createTag() { return nextTag.fetch_add(2); } - VkSampler getSampler() { - if (sampler != VK_NULL_HANDLE) { - return sampler; + VkSampler getSampler(const GnmSSampler &ssampler) { + std::lock_guard lock(mtx); + auto [it, inserted] = samplers.try_emplace(ssampler, VK_NULL_HANDLE); + + if (!inserted) { + return it->second; } - std::lock_guard lock(mtx); + auto clampToVkAddressMode = [](int clamp) { + switch (clamp) { + case 0: + return VK_SAMPLER_ADDRESS_MODE_REPEAT; + case 1: + return VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT; + case 2: + return VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; + case 4: + return VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; + } + return VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT; + }; + VkSamplerCreateInfo samplerInfo{ .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, .magFilter = VK_FILTER_LINEAR, .minFilter = VK_FILTER_LINEAR, .mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR, - .addressModeU = VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT, - .addressModeV = VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT, - .addressModeW = VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT, + .addressModeU = clampToVkAddressMode(ssampler.clamp_x), + .addressModeV = clampToVkAddressMode(ssampler.clamp_y), + .addressModeW = clampToVkAddressMode(ssampler.clamp_z), .mipLodBias = 0.0f, .anisotropyEnable = VK_FALSE, .maxAnisotropy = 1.0, - .compareOp = VK_COMPARE_OP_NEVER, - .minLod = 0.0f, - .maxLod = 0.0f, + .compareOp = (VkCompareOp)ssampler.depth_compare_func, + .minLod = 0.f, + .maxLod = 1.f, .borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE, }; Verify() << vkCreateSampler(vk::g_vkDevice, &samplerInfo, nullptr, - &sampler); - return sampler; + &it->second); + return it->second; } VkDescriptorSet getGraphicsDescriptorSet() { @@ -3445,7 +3466,7 @@ struct Cache { for (auto &shader : emplacedIt->second) { bool isAllSame = true; for (auto &[startAddress, bytes] : shader.cachedData) { - if (std::memcmp(g_hostMemory.getPointer(startAddress), bytes.data(), + if (std::memcmp(memory.getPointer(startAddress), bytes.data(), bytes.size()) != 0) { isAllSame = false; break; @@ -3465,17 +3486,17 @@ struct Cache { } taskSet.append( - getCpuScheduler(), createCpuTask([=](const AsyncTaskCtl &) { + getCpuScheduler(), createCpuTask([=, this](const AsyncTaskCtl &) { util::MemoryAreaTable<> dependencies; flockfile(stdout); auto info = shader::convert( - g_hostMemory, stage, address, + memory, stage, address, std::span(userSgprs, userSgprsCount), dimX, dimY, dimZ, dependencies); if (!validateSpirv(info.spirv)) { printSpirv(info.spirv); - dumpShader(g_hostMemory.getPointer(address)); + dumpShader(memory.getPointer(address)); util::unreachable(); } @@ -3487,7 +3508,7 @@ struct Cache { funlockfile(stdout); for (auto [startAddress, endAddress] : dependencies) { - auto ptr = g_hostMemory.getPointer(startAddress); + auto ptr = memory.getPointer(startAddress); auto &target = entry->cachedData[startAddress]; target.resize(endAddress - startAddress); @@ -3550,7 +3571,7 @@ private: if (it == cacheLines.end() || address >= it->second.areaAddress + it->second.areaSize || it->second.areaAddress >= address + size) { - auto area = memoryAreaTable.queryArea(address / kPageSize); + auto area = memoryAreaTable[memory.vmId].queryArea(address / kPageSize); area.beginAddress *= kPageSize; area.endAddress *= kPageSize; @@ -3558,15 +3579,16 @@ private: it = cacheLines.emplace_hint( it, std::piecewise_construct, std::tuple{area.beginAddress}, std::tuple{area.beginAddress, area.endAddress}); + it->second.memory = memory; } return it->second; } }; -static Cache &getCache() { - static Cache result; - return result; +static Cache &getCache(RemoteMemory memory) { + static Cache caches[6]{0, 1, 2, 3, 4, 5}; + return caches[memory.vmId]; } static VkShaderEXT getPrimTypeRectGeomShader() { @@ -3596,12 +3618,15 @@ static VkShaderEXT getPrimTypeRectGeomShader() { struct GpuActionResources { std::atomic refs{0}; + RemoteMemory memory; // GpuTaskHandle taskHandle; // QueueRegisters ®s; - std::uint64_t tag = getCache().createTag(); + std::uint64_t tag = getCache(memory).createTag(); std::vector> usedImages; std::vector> usedBuffers; + GpuActionResources(RemoteMemory memory) : memory(memory) {} + void release() { for (auto image : usedImages) { image->unlock(tag); @@ -3627,7 +3652,7 @@ struct GpuActionResources { case shader::Shader::UniformKind::Buffer: { auto &vbuffer = *reinterpret_cast(uniform.buffer); - auto bufferRef = getCache().getBuffer( + auto bufferRef = getCache(memory).getBuffer( tag, initTaskChain, vbuffer.getAddress(), vbuffer.getNumRecords(), vbuffer.getStride(), vbuffer.getElementSize(), uniform.accessOp); @@ -3683,7 +3708,7 @@ struct GpuActionResources { // tbuffer.base_array, tbuffer.last_array, tbuffer.min_lod_warn, // tbuffer.counter_bank_id, tbuffer.LOD_hdw_cnt_en); - auto image = getCache().getImage( + auto image = getCache(memory).getImage( tag, initTaskChain, tbuffer.getAddress(), dataFormat, channelType, tileMode, width, height, depth, pitch, tbuffer.dst_sel_x, tbuffer.dst_sel_y, tbuffer.dst_sel_z, tbuffer.dst_sel_w, @@ -3714,8 +3739,8 @@ struct GpuActionResources { } case shader::Shader::UniformKind::Sampler: { - // TODO: load S# sampler - auto sampler = getCache().getSampler(); + auto &ssampler = *reinterpret_cast(uniform.buffer); + auto sampler = getCache(memory).getSampler(ssampler); VkDescriptorImageInfo imageInfo{ .sampler = sampler, @@ -3739,20 +3764,20 @@ struct GpuActionResources { } }; -static void eliminateFastClear() { +static void eliminateFastClear(RemoteMemory memory) { // TODO // util::unreachable(); } -static void resolve() { +static void resolve(RemoteMemory memory) { // TODO: when texture cache will be implemented it MSAA should be done by // GPU util::unreachable(); // auto srcBuffer = regs.colorBuffers[0]; // auto dstBuffer = regs.colorBuffers[1]; - // const auto src = g_hostMemory.getPointer(srcBuffer.base); - // auto dst = g_hostMemory.getPointer(dstBuffer.base); + // const auto src = memory.getPointer(srcBuffer.base); + // auto dst = memory.getPointer(dstBuffer.base); // if (src == nullptr || dst == nullptr) { // return; @@ -3761,19 +3786,20 @@ static void resolve() { // std::memcpy(dst, src, regs.screenScissorH * regs.screenScissorW * 4); } -static void draw(TaskChain &taskSet, QueueRegisters ®s, std::uint32_t count, - std::uint64_t indeciesAddress, std::uint32_t indexCount) { +static void draw(RemoteMemory memory, TaskChain &taskSet, QueueRegisters ®s, + std::uint32_t count, std::uint64_t indeciesAddress, + std::uint32_t indexCount) { if (regs.cbColorFormat == CbColorFormat::Disable) { return; } if (regs.cbColorFormat == CbColorFormat::EliminateFastClear) { - eliminateFastClear(); + eliminateFastClear(memory); return; } if (regs.cbColorFormat == CbColorFormat::Resolve) { - resolve(); + resolve(memory); return; } @@ -3793,17 +3819,18 @@ static void draw(TaskChain &taskSet, QueueRegisters ®s, std::uint32_t count, regs.depthClearEnable = true; - auto resources = Ref(new GpuActionResources()); + auto resources = Ref(new GpuActionResources(memory)); + auto &cache = getCache(memory); // std::printf("draw action, tag %lu\n", resources->tag); TaskSet shaderLoadTaskSet; auto [desriptorSetLayout, pipelineLayout] = getGraphicsLayout(); - auto &vertexShader = getCache().getShader( - shaderLoadTaskSet, desriptorSetLayout, shader::Stage::Vertex, - regs.pgmVsAddress, regs.userVsData, regs.vsUserSpgrs); + auto &vertexShader = cache.getShader(shaderLoadTaskSet, desriptorSetLayout, + shader::Stage::Vertex, regs.pgmVsAddress, + regs.userVsData, regs.vsUserSpgrs); - auto &fragmentShader = getCache().getShader( + auto &fragmentShader = cache.getShader( shaderLoadTaskSet, desriptorSetLayout, shader::Stage::Fragment, regs.pgmPsAddress, regs.userPsData, regs.psUserSpgrs); @@ -3834,7 +3861,7 @@ static void draw(TaskChain &taskSet, QueueRegisters ®s, std::uint32_t count, auto dataFormat = (SurfaceFormat)colorBuffer.format; auto channelType = kTextureChannelTypeSrgb; // TODO - auto colorImage = getCache().getImage( + auto colorImage = getCache(memory).getImage( resources->tag, taskSet, colorBuffer.base, dataFormat, channelType, (TileMode)colorBuffer.tileModeIndex, regs.screenScissorW + regs.screenScissorX, @@ -3866,7 +3893,10 @@ static void draw(TaskChain &taskSet, QueueRegisters ®s, std::uint32_t count, regs.blendSeparateAlpha ? blendMultiplierToVkBlendFactor(regs.blendAlphaDst) : blendMultiplierToVkBlendFactor(regs.blendColorDst), - .alphaBlendOp = blendFuncToVkBlendOp(regs.blendAlphaFn), + .alphaBlendOp = regs.blendSeparateAlpha + ? blendFuncToVkBlendOp(regs.blendAlphaFn) + : blendFuncToVkBlendOp(regs.blendColorFn), + }); colorWriteMask.push_back(((mask & 1) ? VK_COLOR_COMPONENT_R_BIT : 0) | @@ -3875,7 +3905,7 @@ static void draw(TaskChain &taskSet, QueueRegisters ®s, std::uint32_t count, ((mask & 8) ? VK_COLOR_COMPONENT_A_BIT : 0)); } - auto descSet = getCache().getGraphicsDescriptorSet(); + auto descSet = cache.getGraphicsDescriptorSet(); resources->loadShaderBindings(taskSet, descSet, vertexShader.info); resources->loadShaderBindings(taskSet, descSet, fragmentShader.info); @@ -3899,13 +3929,13 @@ static void draw(TaskChain &taskSet, QueueRegisters ®s, std::uint32_t count, VkRenderingAttachmentInfo depthAttachment; if (regs.depthEnable) { - depthImage = getCache().getImage( - resources->tag, taskSet, regs.zReadBase, kSurfaceFormat24_8, - kTextureChannelTypeUNorm, kTileModeDisplay_LinearAligned, - regs.screenScissorW + regs.screenScissorX, - regs.screenScissorH + regs.screenScissorY, 1, - regs.screenScissorW + regs.screenScissorX, 0, 0, 0, 0, depthAccess, - false); + depthImage = cache.getImage(resources->tag, taskSet, regs.zReadBase, + kSurfaceFormat24_8, kTextureChannelTypeUNorm, + kTileModeDisplay_LinearAligned, + regs.screenScissorW + regs.screenScissorX, + regs.screenScissorH + regs.screenScissorY, 1, + regs.screenScissorW + regs.screenScissorX, 0, 0, + 0, 0, depthAccess, false); depthAttachment = { .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO, @@ -3930,7 +3960,7 @@ static void draw(TaskChain &taskSet, QueueRegisters ®s, std::uint32_t count, : VK_INDEX_TYPE_UINT32; if (needConversion) { - auto indecies = g_hostMemory.getPointer(indeciesAddress); + auto indecies = memory.getPointer(indeciesAddress); if (indecies == nullptr) { indexCount = count; } @@ -3993,8 +4023,8 @@ static void draw(TaskChain &taskSet, QueueRegisters ®s, std::uint32_t count, unsigned indexSize = vkIndexType == VK_INDEX_TYPE_UINT16 ? 2 : 4; auto bufferRef = - getCache().getBuffer(resources->tag, taskSet, indeciesAddress, - indexCount, 0, indexSize, shader::AccessOp::Load); + cache.getBuffer(resources->tag, taskSet, indeciesAddress, indexCount, 0, + indexSize, shader::AccessOp::Load); indexBuffer = { .buffer = bufferRef->buffer.getHandle(), .offset = indeciesAddress - bufferRef->bufferAddress, @@ -4125,12 +4155,6 @@ static void draw(TaskChain &taskSet, QueueRegisters ®s, std::uint32_t count, } _vkCmdBindShadersEXT(drawCommandBuffer, std::size(stages), stages, shaders); - if (primType == kPrimitiveTypeRectList) { - VkShaderStageFlagBits stage = VK_SHADER_STAGE_GEOMETRY_BIT; - auto shader = getPrimTypeRectGeomShader(); - _vkCmdBindShadersEXT(drawCommandBuffer, 1, &stage, &shader); - } - if (indexBuffer.buffer == nullptr) { vkCmdDraw(drawCommandBuffer, count, 1, 0, 0); } else { @@ -4147,22 +4171,24 @@ static void draw(TaskChain &taskSet, QueueRegisters ®s, std::uint32_t count, taskSet.add(drawTaskId, [=] { // std::printf("releasing draw action, tag %lu\n", resources->tag); - getCache().releaseGraphicsDescriptorSet(descSet); + getCache(memory).releaseGraphicsDescriptorSet(descSet); resources->release(); }); taskSet.wait(); } -static void dispatch(TaskChain &taskSet, QueueRegisters ®s, std::size_t dimX, - std::size_t dimY, std::size_t dimZ) { +static void dispatch(RemoteMemory memory, TaskChain &taskSet, + QueueRegisters ®s, std::size_t dimX, std::size_t dimY, + std::size_t dimZ) { if (regs.pgmComputeAddress == 0) { std::fprintf(stderr, "attempt to invoke dispatch without compute shader\n"); return; } - auto resources = Ref(new GpuActionResources()); - auto descSet = getCache().getComputeDescriptorSet(); + auto resources = Ref(new GpuActionResources(memory)); + auto &cache = getCache(memory); + auto descSet = cache.getComputeDescriptorSet(); // std::printf("dispatch action, tag %lu\n", resources->tag); @@ -4170,7 +4196,7 @@ static void dispatch(TaskChain &taskSet, QueueRegisters ®s, std::size_t dimX, TaskSet loadShaderTaskSet; - auto &computeShader = getCache().getShader( + auto &computeShader = cache.getShader( loadShaderTaskSet, desriptorSetLayout, shader::Stage::Compute, regs.pgmComputeAddress, regs.userComputeData, regs.computeUserSpgrs, regs.computeNumThreadX, regs.computeNumThreadY, regs.computeNumThreadZ); @@ -4196,7 +4222,7 @@ static void dispatch(TaskChain &taskSet, QueueRegisters ®s, std::size_t dimX, taskSet.add(computeTaskId, [=] { // std::printf("releasing dispatch action, tag %lu\n", resources->tag); - getCache().releaseComputeDescriptorSet(descSet); + getCache(memory).releaseComputeDescriptorSet(descSet); resources->release(); }); } @@ -4228,7 +4254,7 @@ static std::uint64_t gpuCoreClock() { return 0x0; } -static void writeEop(EopData data) { +static void writeEop(RemoteMemory memory, EopData data) { // std::printf("write eop: dstSel=%x, intSel=%x,eventIndex=%x, address = // %#lx, // " @@ -4237,33 +4263,33 @@ static void writeEop(EopData data) { // data.value, (unsigned)data.eventSource); switch (data.eventSource) { case EventWriteSource::Immediate32: { - *g_hostMemory.getPointer(data.address) = data.value; + *memory.getPointer(data.address) = data.value; break; } case EventWriteSource::Immediate64: { - *g_hostMemory.getPointer(data.address) = data.value; + *memory.getPointer(data.address) = data.value; break; } case EventWriteSource::GlobalClockCounter: { - *g_hostMemory.getPointer(data.address) = globalClock(); + *memory.getPointer(data.address) = globalClock(); break; } case EventWriteSource::GpuCoreClockCounter: { - *g_hostMemory.getPointer(data.address) = gpuCoreClock(); + *memory.getPointer(data.address) = gpuCoreClock(); break; } } } -static void drawIndexAuto(TaskChain &waitTaskSet, QueueRegisters ®s, - std::uint32_t count) { - draw(waitTaskSet, regs, count, 0, 0); +static void drawIndexAuto(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::uint32_t count) { + draw(memory, waitTaskSet, regs, count, 0, 0); } -static void drawIndex2(TaskChain &waitTaskSet, QueueRegisters ®s, - std::uint32_t maxSize, std::uint64_t address, - std::uint32_t count) { - draw(waitTaskSet, regs, count, address, maxSize); +static void drawIndex2(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::uint32_t maxSize, + std::uint64_t address, std::uint32_t count) { + draw(memory, waitTaskSet, regs, count, address, maxSize); } struct Queue { @@ -4278,9 +4304,11 @@ struct Queue { std::deque commandBuffers; }; -static void handleCommandBuffer(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleCommandBuffer(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span &packets); -static void handleLoadConstRam(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleLoadConstRam(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { std::uint64_t addressLo = packet[1]; std::uint64_t addressHi = packet[2]; @@ -4289,7 +4317,8 @@ static void handleLoadConstRam(TaskChain &waitTaskSet, QueueRegisters ®s, auto address = addressLo | (addressHi << 32); } -static void handleSET_UCONFIG_REG(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleSET_UCONFIG_REG(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { std::uint32_t regId = 0xc000 + packet[1]; @@ -4299,7 +4328,8 @@ static void handleSET_UCONFIG_REG(TaskChain &waitTaskSet, QueueRegisters ®s, } } -static void handleSET_CONTEXT_REG(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleSET_CONTEXT_REG(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { std::uint32_t regId = 0xa000 + packet[1]; @@ -4308,7 +4338,8 @@ static void handleSET_CONTEXT_REG(TaskChain &waitTaskSet, QueueRegisters ®s, } } -static void handleSET_SH_REG(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleSET_SH_REG(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { std::uint32_t regId = 0x2c00 + packet[1]; @@ -4318,7 +4349,8 @@ static void handleSET_SH_REG(TaskChain &waitTaskSet, QueueRegisters ®s, } } -static void handleDMA_DATA(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleDMA_DATA(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { auto srcAddrLo = packet[2]; auto srcAddrHi = packet[3]; @@ -4332,12 +4364,14 @@ static void handleDMA_DATA(TaskChain &waitTaskSet, QueueRegisters ®s, // dstAddr); } -static void handleAQUIRE_MEM(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleAQUIRE_MEM(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { // std::printf("aquire mem\n"); } -static void handleWRITE_DATA(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleWRITE_DATA(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { auto control = packet[1]; auto destAddrLo = packet[2]; @@ -4371,19 +4405,21 @@ static void handleWRITE_DATA(TaskChain &waitTaskSet, QueueRegisters ®s, auto gdsOffset = getBits(destAddrLo, 15, 0); auto address = destAddrLo | (static_cast(destAddrHi) << 32); - auto dest = g_hostMemory.getPointer(address); + auto dest = memory.getPointer(address); // std::printf("write data: address=%lx\n", address); for (unsigned i = 0; i < size; ++i) { dest[i] = data[i]; } } -static void handleINDEX_TYPE(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleINDEX_TYPE(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { regs.indexType = packet[1]; } -static void handleINDEX_BASE(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleINDEX_BASE(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { // std::printf("INDEX_BASE:\n"); // for (auto cmd : packet) { @@ -4396,12 +4432,14 @@ static void handleINDEX_BASE(TaskChain &waitTaskSet, QueueRegisters ®s, regs.indexBase = (addressHi << 32) | addressLo; } -static void handleDRAW_INDEX_AUTO(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleDRAW_INDEX_AUTO(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { - drawIndexAuto(waitTaskSet, regs, packet[1]); + drawIndexAuto(memory, waitTaskSet, regs, packet[1]); } -static void handleDRAW_INDEX_OFFSET_2(TaskChain &waitTaskSet, +static void handleDRAW_INDEX_OFFSET_2(RemoteMemory memory, + TaskChain &waitTaskSet, QueueRegisters ®s, std::span packet) { auto maxSize = packet[1]; @@ -4409,38 +4447,44 @@ static void handleDRAW_INDEX_OFFSET_2(TaskChain &waitTaskSet, auto count = packet[3]; auto drawInitiator = packet[4]; - drawIndex2(waitTaskSet, regs, maxSize, regs.indexBase + offset, count); + drawIndex2(memory, waitTaskSet, regs, maxSize, regs.indexBase + offset, + count); } -static void handleDRAW_INDEX_2(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleDRAW_INDEX_2(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { auto maxSize = packet[1]; auto address = packet[2] | (static_cast(packet[3]) << 32); auto count = packet[4]; - drawIndex2(waitTaskSet, regs, maxSize, address, count); + drawIndex2(memory, waitTaskSet, regs, maxSize, address, count); } -static void handleDISPATCH_DIRECT(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleDISPATCH_DIRECT(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { auto dimX = packet[1]; auto dimY = packet[2]; auto dimZ = packet[3]; - dispatch(waitTaskSet, regs, dimX, dimY, dimZ); + dispatch(memory, waitTaskSet, regs, dimX, dimY, dimZ); } -static void handleCONTEXT_CONTROL(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleCONTEXT_CONTROL(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { // std::printf("context control\n"); } -static void handleCLEAR_STATE(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleCLEAR_STATE(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { // std::printf("clear state\n"); } -static void handleRELEASE_MEM(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleRELEASE_MEM(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { auto writeSource = static_cast(getBits(packet[2], 32, 29)); auto addressLo = packet[3]; @@ -4457,30 +4501,32 @@ static void handleRELEASE_MEM(TaskChain &waitTaskSet, QueueRegisters ®s, switch (writeSource) { case EventWriteSource::Immediate32: { - *g_hostMemory.getPointer(address) = data; + *memory.getPointer(address) = data; break; } case EventWriteSource::Immediate64: { - *g_hostMemory.getPointer(address) = data; + *memory.getPointer(address) = data; break; } case EventWriteSource::GlobalClockCounter: { - *g_hostMemory.getPointer(address) = globalClock(); + *memory.getPointer(address) = globalClock(); break; } case EventWriteSource::GpuCoreClockCounter: { - *g_hostMemory.getPointer(address) = gpuCoreClock(); + *memory.getPointer(address) = gpuCoreClock(); break; } } } -static void handleEVENT_WRITE(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleEVENT_WRITE(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { // std::printf("event write\n"); } -static void handleINDIRECT_BUFFER_3F(TaskChain &waitTaskSet, +static void handleINDIRECT_BUFFER_3F(RemoteMemory memory, + TaskChain &waitTaskSet, QueueRegisters ®s, std::span packet) { auto swapFn = getBits(packet[1], 1, 0); @@ -4492,12 +4538,11 @@ static void handleINDIRECT_BUFFER_3F(TaskChain &waitTaskSet, std::printf("indirect buffer: address=%lx, size = %x, vmid=%x\n", address, count, vmid); - auto commands = - std::span(g_hostMemory.getPointer(address), count); + auto commands = std::span(memory.getPointer(address), count); waitTaskSet.add([=, waitTaskSet = TaskChain::Create()] mutable { while (!commands.empty()) { - handleCommandBuffer(*waitTaskSet.get(), regs, commands); + handleCommandBuffer(memory, *waitTaskSet.get(), regs, commands); waitTaskSet->wait(); } std::printf("indirect buffer end\n"); @@ -4505,7 +4550,8 @@ static void handleINDIRECT_BUFFER_3F(TaskChain &waitTaskSet, }); } -static void handleEVENT_WRITE_EOP(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleEVENT_WRITE_EOP(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { EopData eopData{}; eopData.eventType = getBits(packet[1], 6, 0); @@ -4517,10 +4563,11 @@ static void handleEVENT_WRITE_EOP(TaskChain &waitTaskSet, QueueRegisters ®s, eopData.intSel = getBits(packet[3], 26, 24); eopData.eventSource = static_cast(getBits(packet[3], 32, 29)); - writeEop(eopData); + writeEop(memory, eopData); } -static void handleEVENT_WRITE_EOS(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleEVENT_WRITE_EOS(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { std::uint32_t eventType = getBits(packet[1], 6, 0); std::uint32_t eventIndex = getBits(packet[1], 12, 8); @@ -4531,13 +4578,14 @@ static void handleEVENT_WRITE_EOS(TaskChain &waitTaskSet, QueueRegisters ®s, // "address = %#lx, command = %#x\n", // eventType, eventIndex, address, command); if (command == 0x4000) { // store 32bit data - *g_hostMemory.getPointer(address) = packet[4]; + *memory.getPointer(address) = packet[4]; } else { util::unreachable(); } } -static void handleWAIT_REG_MEM(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleWAIT_REG_MEM(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { auto function = packet[1] & 7; auto pollAddressLo = packet[2]; @@ -4548,7 +4596,7 @@ static void handleWAIT_REG_MEM(TaskChain &waitTaskSet, QueueRegisters ®s, auto pollAddress = pollAddressLo | (static_cast(pollAddressHi) << 32); - auto pointer = g_hostMemory.getPointer(pollAddress); + auto pointer = memory.getPointer(pollAddress); auto compare = [&](std::uint32_t value, std::uint32_t reference, int function) { @@ -4591,10 +4639,11 @@ static void handleWAIT_REG_MEM(TaskChain &waitTaskSet, QueueRegisters ®s, }); } -static void handleNOP(TaskChain &waitTaskSet, QueueRegisters ®s, - std::span packet) {} +static void handleNOP(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) {} -static void handleUnknownCommand(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleUnknownCommand(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet) { auto op = getBits(packet[0], 15, 8); auto len = getBits(packet[0], 29, 16) + 1; @@ -4602,7 +4651,8 @@ static void handleUnknownCommand(TaskChain &waitTaskSet, QueueRegisters ®s, // opcodeToString(op).c_str(), len); } -using CommandHandler = void (*)(TaskChain &waitTaskSet, QueueRegisters ®s, +using CommandHandler = void (*)(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span packet); static auto g_commandHandlers = [] { std::array handlers; @@ -4637,11 +4687,12 @@ static auto g_commandHandlers = [] { return handlers; }(); -static void handleCommandBuffer(TaskChain &waitTaskSet, QueueRegisters ®s, +static void handleCommandBuffer(RemoteMemory memory, TaskChain &waitTaskSet, + QueueRegisters ®s, std::span &packets) { while (!packets.empty()) { // std::uint64_t address = - // (char *)packets.data() - g_hostMemory.shmPointer + 0x40000; + // (char *)packets.data() - memory.shmPointer + 0x40000; // std::fprintf(stderr, "address = %lx\n", address); auto cmd = packets[0]; auto type = getBits(cmd, 31, 30); @@ -4655,7 +4706,7 @@ static void handleCommandBuffer(TaskChain &waitTaskSet, QueueRegisters ®s, // std::printf("cmd: %s:%x, %x, %x\n", opcodeToString(op).c_str(), len, // predicate, shaderType); - g_commandHandlers[op](waitTaskSet, regs, packets.subspan(0, len)); + g_commandHandlers[op](memory, waitTaskSet, regs, packets.subspan(0, len)); packets = packets.subspan(len); if (!waitTaskSet.empty()) { @@ -4686,16 +4737,15 @@ static void handleCommandBuffer(TaskChain &waitTaskSet, QueueRegisters ®s, } } -void amdgpu::device::AmdgpuDevice::handleProtectMemory(std::uint64_t address, +void amdgpu::device::AmdgpuDevice::handleProtectMemory(RemoteMemory memory, + std::uint64_t address, std::uint64_t size, std::uint32_t prot) { auto beginPage = address / kPageSize; auto endPage = (address + size + kPageSize - 1) / kPageSize; - ::mprotect(g_hostMemory.getPointer(address), size, prot >> 4); - if (prot >> 4) { - memoryAreaTable.map(beginPage, endPage); + memoryAreaTable[memory.vmId].map(beginPage, endPage); const char *protStr; switch (prot >> 4) { case PROT_READ: @@ -4717,32 +4767,15 @@ void amdgpu::device::AmdgpuDevice::handleProtectMemory(std::uint64_t address, std::fprintf(stderr, "Allocated area at %zx, size %lx, prot %s\n", address, size, protStr); } else { - memoryAreaTable.unmap(beginPage, endPage); + memoryAreaTable[memory.vmId].unmap(beginPage, endPage); std::fprintf(stderr, "Unmapped area at %zx, size %lx\n", address, size); } - - std::size_t index = 0; - for (auto area : memoryAreaTable) { - // std::printf("area %lx-%lx\n", area.beginAddress * kPageSize, - // area.endAddress * kPageSize); - - if (index >= std::size(g_bridge->memoryAreas)) { - util::unreachable("too many memory areas"); - } - - g_bridge->memoryAreas[index++] = { - .address = area.beginAddress * kPageSize, - .size = (area.endAddress - area.beginAddress) * kPageSize, - .prot = (PROT_READ | PROT_WRITE) << 4 // TODO - }; - } - - g_bridge->memoryAreaCount = index; } static std::map queues; -void amdgpu::device::AmdgpuDevice::handleCommandBuffer(std::uint64_t queueId, +void amdgpu::device::AmdgpuDevice::handleCommandBuffer(RemoteMemory memory, + std::uint64_t queueId, std::uint64_t address, std::uint64_t size) { auto count = size / sizeof(std::uint32_t); @@ -4783,7 +4816,7 @@ void amdgpu::device::AmdgpuDevice::handleCommandBuffer(std::uint64_t queueId, } auto taskChain = TaskChain::Create(); - ::handleCommandBuffer(*taskChain.get(), queue->regs, + ::handleCommandBuffer(memory, *taskChain.get(), queue->regs, commandBuffer->commands); taskChain->wait(); return TaskResult::Reschedule; @@ -4795,15 +4828,14 @@ void amdgpu::device::AmdgpuDevice::handleCommandBuffer(std::uint64_t queueId, std::lock_guard lock(it->second.mtx); it->second.commandBuffers.push_back( {.commands = - std::span(g_hostMemory.getPointer(address), count)}); + std::span(memory.getPointer(address), count)}); } bool amdgpu::device::AmdgpuDevice::handleFlip( - VkQueue queue, VkCommandBuffer cmdBuffer, TaskChain &taskChain, - std::uint32_t bufferIndex, std::uint64_t arg, VkImage targetImage, - VkExtent2D targetExtent, VkSemaphore waitSemaphore, + RemoteMemory memory, VkQueue queue, VkCommandBuffer cmdBuffer, + TaskChain &taskChain, std::uint32_t bufferIndex, std::uint64_t arg, + VkImage targetImage, VkExtent2D targetExtent, VkSemaphore waitSemaphore, VkSemaphore signalSemaphore, VkFence fence) { - // std::printf("requested flip %d\n", bufferIndex); if (bufferIndex == ~static_cast(0)) { g_bridge->flipBuffer = bufferIndex; @@ -4864,14 +4896,15 @@ bool amdgpu::device::AmdgpuDevice::handleFlip( buffer.pixelFormat); } - auto tag = getCache().createTag(); + auto &cache = getCache(memory); + auto tag = cache.createTag(); - imageRef = getCache().getImage( - tag, taskChain, buffer.address, surfFormat, channelType, - buffer.tilingMode == 1 ? kTileModeDisplay_2dThin - : kTileModeDisplay_LinearAligned, - buffer.width, buffer.height, 1, buffer.pitch, 4, 5, 6, 7, - shader::AccessOp::Load); + imageRef = + cache.getImage(tag, taskChain, buffer.address, surfFormat, channelType, + buffer.tilingMode == 1 ? kTileModeDisplay_2dThin + : kTileModeDisplay_LinearAligned, + buffer.width, buffer.height, 1, buffer.pitch, 4, 5, 6, 7, + shader::AccessOp::Load); auto initTask = taskChain.getLastTaskId(); @@ -4982,7 +5015,7 @@ bool amdgpu::device::AmdgpuDevice::handleFlip( g_bridge->flipArg = arg; g_bridge->flipCount = g_bridge->flipCount + 1; auto bufferInUse = - g_hostMemory.getPointer(g_bridge->bufferInUseAddress); + memory.getPointer(g_bridge->bufferInUseAddress); if (bufferInUse != nullptr) { bufferInUse[bufferIndex] = 0; } @@ -4998,7 +5031,9 @@ AmdgpuDevice::AmdgpuDevice(amdgpu::bridge::BridgeHeader *bridge) { } AmdgpuDevice::~AmdgpuDevice() { - getCache().clear(); + for (int vmid = 0; vmid < 6; ++vmid) { + getCache(RemoteMemory{vmid}).clear(); + } auto [gSetLayout, gPipelineLayout] = getGraphicsLayout(); auto [cSetLayout, cPipelineLayout] = getComputeLayout(); diff --git a/hw/amdgpu/include/amdgpu/RemoteMemory.hpp b/hw/amdgpu/include/amdgpu/RemoteMemory.hpp index e292bc515..efcecfe6a 100644 --- a/hw/amdgpu/include/amdgpu/RemoteMemory.hpp +++ b/hw/amdgpu/include/amdgpu/RemoteMemory.hpp @@ -3,10 +3,11 @@ namespace amdgpu { struct RemoteMemory { - char *shmPointer; + int vmId; template T *getPointer(std::uint64_t address) const { - return address ? reinterpret_cast(shmPointer + address - 0x40000) + return address ? reinterpret_cast( + static_cast(vmId) << 40 | address) : nullptr; } }; diff --git a/orbis-kernel/include/orbis/AuthInfo.hpp b/orbis-kernel/include/orbis/AuthInfo.hpp index 93cc5a7ed..164db0033 100644 --- a/orbis-kernel/include/orbis/AuthInfo.hpp +++ b/orbis-kernel/include/orbis/AuthInfo.hpp @@ -7,7 +7,14 @@ struct AuthInfo { uint64_t unk0; uint64_t caps[4]; uint64_t attrs[4]; - uint64_t unk[8]; + uint64_t ucred[8]; + + bool hasUseHp3dPipeCapability() const { + return ucred[2] == 0x3800000000000009; + } + bool hasMmapSelfCapability() const { return ((ucred[4] >> 0x3a) & 1) != 1; } + bool hasSystemCapability() const { return ((ucred[3] >> 0x3e) & 1) != 0; } + bool hasSceProgramAttribute() const { return ((ucred[3] >> 0x1f) & 1) != 0; } }; static_assert(sizeof(AuthInfo) == 136); diff --git a/orbis-kernel/include/orbis/thread/Process.hpp b/orbis-kernel/include/orbis/thread/Process.hpp index 33416bdc7..d8b0a4052 100644 --- a/orbis-kernel/include/orbis/thread/Process.hpp +++ b/orbis-kernel/include/orbis/thread/Process.hpp @@ -51,6 +51,7 @@ struct Process final { ProcessState state = ProcessState::NEW; Process *parentProcess = nullptr; shared_mutex mtx; + int vmId = -1; void (*onSysEnter)(Thread *thread, int id, uint64_t *args, int argsCount) = nullptr; void (*onSysExit)(Thread *thread, int id, uint64_t *args, int argsCount, diff --git a/rpcsx-gpu/CMakeLists.txt b/rpcsx-gpu/CMakeLists.txt index b148c0dd2..a10d6d572 100644 --- a/rpcsx-gpu/CMakeLists.txt +++ b/rpcsx-gpu/CMakeLists.txt @@ -8,4 +8,5 @@ add_executable(rpcsx-gpu target_include_directories(rpcsx-gpu PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(rpcsx-gpu PUBLIC amdgpu::bridge amdgpu::device glfw Vulkan::Vulkan rx) set_target_properties(rpcsx-gpu PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +target_link_options(rpcsx-os PUBLIC "LINKER:-Ttext-segment,0x0000060000000000") install(TARGETS rpcsx-gpu RUNTIME DESTINATION bin) diff --git a/rpcsx-gpu/main.cpp b/rpcsx-gpu/main.cpp index 3ad1df3c7..be83a8c4d 100644 --- a/rpcsx-gpu/main.cpp +++ b/rpcsx-gpu/main.cpp @@ -1,7 +1,9 @@ #include "amdgpu/RemoteMemory.hpp" #include "amdgpu/device/gpu-scheduler.hpp" #include "amdgpu/device/vk.hpp" +#include "rx/MemoryTable.hpp" #include "rx/Version.hpp" +#include "rx/mem.hpp" #include "util/unreachable.hpp" #include #include @@ -16,18 +18,14 @@ #include #include #include +#include +#include #include #include #include #include // TODO: make in optional -// TODO -// extern void *g_rwMemory; -extern std::size_t g_memorySize; -extern std::uint64_t g_memoryBase; -extern amdgpu::RemoteMemory g_hostMemory; - static void usage(std::FILE *out, const char *argv0) { std::fprintf(out, "usage: %s [options...]\n", argv0); std::fprintf(out, " options:\n"); @@ -159,6 +157,11 @@ int main(int argc, const char *argv[]) { return 1; } + if (!rx::mem::reserve((void *)0x40000, 0x60000000000 - 0x40000)) { + std::fprintf(stderr, "failed to reserve virtual memory\n"); + return 1; + } + glfwInit(); glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); auto window = glfwCreateWindow(1280, 720, "RPCSX", nullptr, nullptr); @@ -725,20 +728,6 @@ int main(int argc, const char *argv[]) { amdgpu::bridge::BridgePuller bridgePuller{bridge}; amdgpu::bridge::Command commandsBuffer[1]; - if (!std::filesystem::exists(std::string("/dev/shm") + shmName)) { - std::printf("Waiting for OS\n"); - while (!std::filesystem::exists(std::string("/dev/shm") + shmName)) { - std::this_thread::sleep_for(std::chrono::milliseconds(300)); - } - } - - int memoryFd = ::shm_open(shmName, O_RDWR, S_IRUSR | S_IWUSR); - - if (memoryFd < 0) { - std::printf("failed to open shared memory\n"); - return 1; - } - int dmemFd[3]; for (std::size_t i = 0; i < std::size(dmemFd); ++i) { @@ -759,26 +748,80 @@ int main(int argc, const char *argv[]) { } } - struct stat memoryStat; - ::fstat(memoryFd, &memoryStat); - amdgpu::RemoteMemory memory{(char *)::mmap( - nullptr, memoryStat.st_size, PROT_NONE, MAP_SHARED, memoryFd, 0)}; - - // extern void *g_rwMemory; - g_memorySize = memoryStat.st_size; - g_memoryBase = 0x40000; - // g_rwMemory = ::mmap(nullptr, g_memorySize, PROT_READ | PROT_WRITE, MAP_SHARED, - // memoryFd, 0); - - g_hostMemory = memory; - { amdgpu::device::AmdgpuDevice device(bridgePuller.header); - for (std::uint32_t end = bridge->memoryAreaCount, i = 0; i < end; ++i) { - auto area = bridge->memoryAreas[i]; - device.handleProtectMemory(area.address, area.size, area.prot); - } + struct VmMapSlot { + int memoryType; + int prot; + std::int64_t offset; + std::uint64_t baseAddress; + + auto operator<=>(const VmMapSlot &) const = default; + }; + + struct ProcessInfo { + int vmId = -1; + int vmFd = -1; + rx::MemoryTableWithPayload vmTable; + }; + + auto mapProcess = [&](std::int64_t pid, int vmId, ProcessInfo &process) { + process.vmId = vmId; + + auto memory = amdgpu::RemoteMemory{vmId}; + + std::string pidVmName = shmName; + pidVmName += '-'; + pidVmName += std::to_string(pid); + int memoryFd = ::shm_open(pidVmName.c_str(), O_RDWR, S_IRUSR | S_IWUSR); + process.vmFd = memoryFd; + + if (memoryFd < 0) { + std::printf("failed to process %x shared memory\n", (int)pid); + std::abort(); + } + + for (auto [startAddress, endAddress, slot] : process.vmTable) { + auto gpuProt = slot.prot >> 4; + if (gpuProt == 0) { + continue; + } + + auto devOffset = slot.offset + startAddress - slot.baseAddress; + int mapFd = memoryFd; + + if (slot.memoryType >= 0) { + mapFd = dmemFd[slot.memoryType]; + } + + auto mmapResult = + ::mmap(memory.getPointer(startAddress), endAddress - startAddress, + gpuProt, MAP_FIXED | MAP_SHARED, mapFd, devOffset); + + if (mmapResult == MAP_FAILED) { + std::printf( + "failed to map process %x memory, address %lx-%lx, type %x\n", + (int)pid, startAddress, endAddress, slot.memoryType); + std::abort(); + } + + device.handleProtectMemory(memory, startAddress, + endAddress - startAddress, slot.prot); + } + }; + + auto unmapProcess = [&](ProcessInfo &process) { + auto startAddress = static_cast(process.vmId) << 40; + auto size = static_cast(1) << 40; + rx::mem::reserve(reinterpret_cast(startAddress), size); + + ::close(process.vmFd); + process.vmFd = -1; + process.vmId = -1; + }; + + std::unordered_map processInfo; std::vector presentCmdBuffers(swapchainImages.size()); @@ -966,66 +1009,141 @@ int main(int argc, const char *argv[]) { for (auto cmd : std::span(commandsBuffer, pulledCount)) { switch (cmd.id) { - case amdgpu::bridge::CommandId::ProtectMemory: - device.handleProtectMemory(cmd.memoryProt.address, - cmd.memoryProt.size, cmd.memoryProt.prot); - break; - case amdgpu::bridge::CommandId::CommandBuffer: - device.handleCommandBuffer(cmd.commandBuffer.queue, - cmd.commandBuffer.address, - cmd.commandBuffer.size); - break; - case amdgpu::bridge::CommandId::Flip: { - if (!isImageAcquired) { - Verify() << vkAcquireNextImageKHR(vkDevice, swapchain, UINT64_MAX, - presentCompleteSemaphore, nullptr, - &imageIndex); + case amdgpu::bridge::CommandId::ProtectMemory: { + auto &process = processInfo[cmd.memoryProt.pid]; - vkWaitForFences(vkDevice, 1, &inFlightFences[imageIndex], VK_TRUE, - UINT64_MAX); - vkResetFences(vkDevice, 1, &inFlightFences[imageIndex]); + auto vmSlotIt = process.vmTable.queryArea(cmd.memoryProt.address); + if (vmSlotIt == process.vmTable.end()) { + std::abort(); } - isImageAcquired = false; + auto vmSlot = (*vmSlotIt).payload; - vkResetCommandBuffer(presentCmdBuffers[imageIndex], 0); - VkCommandBufferBeginInfo beginInfo{}; - beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + process.vmTable.map(cmd.memoryProt.address, + cmd.memoryProt.address + cmd.memoryProt.size, + VmMapSlot{ + .memoryType = vmSlot.memoryType, + .prot = static_cast(cmd.memoryProt.prot), + .offset = vmSlot.offset, + .baseAddress = vmSlot.baseAddress, + }); - vkBeginCommandBuffer(presentCmdBuffers[imageIndex], &beginInfo); + if (process.vmId >= 0) { + auto memory = amdgpu::RemoteMemory{process.vmId}; + rx::mem::protect(memory.getPointer(cmd.memoryProt.address), + cmd.memoryProt.size, cmd.memoryProt.prot >> 4); + device.handleProtectMemory(memory, cmd.mapMemory.address, + cmd.mapMemory.size, cmd.mapMemory.prot); + } + break; + } + case amdgpu::bridge::CommandId::CommandBuffer: { + auto &process = processInfo[cmd.commandBuffer.pid]; + if (process.vmId >= 0) { + device.handleCommandBuffer( + amdgpu::RemoteMemory{process.vmId}, cmd.commandBuffer.queue, + cmd.commandBuffer.address, cmd.commandBuffer.size); + } + break; + } + case amdgpu::bridge::CommandId::Flip: { + auto &process = processInfo[cmd.flip.pid]; - if (device.handleFlip( - presentQueue, presentCmdBuffers[imageIndex], - *flipTaskChain[imageIndex].get(), cmd.flip.bufferIndex, - cmd.flip.arg, swapchainImages[imageIndex], swapchainExtent, - presentCompleteSemaphore, renderCompleteSemaphore, - inFlightFences[imageIndex])) { - VkPresentInfoKHR presentInfo{ - .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, - .waitSemaphoreCount = 1, - .pWaitSemaphores = &renderCompleteSemaphore, - .swapchainCount = 1, - .pSwapchains = &swapchain, - .pImageIndices = &imageIndex, - }; - if (vkQueuePresentKHR(presentQueue, &presentInfo) != VK_SUCCESS) { - std::printf("swapchain was invalidated\n"); - createSwapchain(); + if (process.vmId >= 0) { + if (!isImageAcquired) { + Verify() << vkAcquireNextImageKHR(vkDevice, swapchain, UINT64_MAX, + presentCompleteSemaphore, + nullptr, &imageIndex); + + vkWaitForFences(vkDevice, 1, &inFlightFences[imageIndex], VK_TRUE, + UINT64_MAX); + vkResetFences(vkDevice, 1, &inFlightFences[imageIndex]); + } + + isImageAcquired = false; + + vkResetCommandBuffer(presentCmdBuffers[imageIndex], 0); + VkCommandBufferBeginInfo beginInfo{}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + + vkBeginCommandBuffer(presentCmdBuffers[imageIndex], &beginInfo); + + if (device.handleFlip( + amdgpu::RemoteMemory{process.vmId}, presentQueue, + presentCmdBuffers[imageIndex], + *flipTaskChain[imageIndex].get(), cmd.flip.bufferIndex, + cmd.flip.arg, swapchainImages[imageIndex], swapchainExtent, + presentCompleteSemaphore, renderCompleteSemaphore, + inFlightFences[imageIndex])) { + VkPresentInfoKHR presentInfo{ + .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &renderCompleteSemaphore, + .swapchainCount = 1, + .pSwapchains = &swapchain, + .pImageIndices = &imageIndex, + }; + if (vkQueuePresentKHR(presentQueue, &presentInfo) != VK_SUCCESS) { + std::printf("swapchain was invalidated\n"); + createSwapchain(); + } + } else { + isImageAcquired = true; } - } else { - isImageAcquired = true; } break; } - case amdgpu::bridge::CommandId::MapDmem: { - auto addr = g_hostMemory.getPointer(cmd.mapDmem.address); - auto mapping = ::mmap(addr, cmd.mapDmem.size, - PROT_READ | PROT_WRITE /*TODO: cmd.mapDmem.prot >> 4*/, - MAP_FIXED | MAP_SHARED, dmemFd[cmd.mapDmem.dmemIndex], - cmd.mapDmem.offset); - device.handleProtectMemory(cmd.mapDmem.address, cmd.mapDmem.size, 0x33 /*TODO: cmd.mapDmem.prot*/); + case amdgpu::bridge::CommandId::MapProcess: { + mapProcess(cmd.mapProcess.pid, cmd.mapProcess.vmId, processInfo[cmd.mapProcess.pid]); + break; + } + case amdgpu::bridge::CommandId::UnmapProcess: { + unmapProcess(processInfo[cmd.mapProcess.pid]); + break; + } + + case amdgpu::bridge::CommandId::MapMemory: { + auto &process = processInfo[cmd.mapMemory.pid]; + + process.vmTable.map( + cmd.mapMemory.address, cmd.mapMemory.address + cmd.mapMemory.size, + VmMapSlot{ + .memoryType = static_cast(cmd.mapMemory.memoryType >= 0 + ? cmd.mapMemory.dmemIndex + : -1), + .prot = static_cast(cmd.mapMemory.prot), + .offset = cmd.mapMemory.offset, + .baseAddress = cmd.mapMemory.address, + }); + + if (process.vmId >= 0) { + auto memory = amdgpu::RemoteMemory{process.vmId}; + + int mapFd = process.vmFd; + + if (cmd.mapMemory.memoryType >= 0) { + mapFd = dmemFd[cmd.mapMemory.dmemIndex]; + } + + auto mmapResult = + ::mmap(memory.getPointer(cmd.mapMemory.address), + cmd.mapMemory.size, cmd.mapMemory.prot >> 4, + MAP_FIXED | MAP_SHARED, mapFd, cmd.mapMemory.offset); + + if (mmapResult == MAP_FAILED) { + std::printf( + "failed to map process %x memory, address %lx-%lx, type %x\n", + (int)cmd.mapMemory.pid, cmd.mapMemory.address, + cmd.mapMemory.address + cmd.mapMemory.size, + cmd.mapMemory.memoryType); + std::abort(); + } + + device.handleProtectMemory(memory, cmd.mapMemory.address, + cmd.mapMemory.size, cmd.mapMemory.prot); + } break; } diff --git a/rpcsx-os/iodev/dmem.cpp b/rpcsx-os/iodev/dmem.cpp index 6844037a1..d21c39cc6 100644 --- a/rpcsx-os/iodev/dmem.cpp +++ b/rpcsx-os/iodev/dmem.cpp @@ -44,6 +44,12 @@ orbis::ErrorCode DmemDevice::mmap(void **address, std::uint64_t len, rx::vm::kMapProtGpuAll; } + auto allocationInfoIt = allocations.queryArea(directMemoryStart); + if (allocationInfoIt == allocations.end()) { + std::abort(); + } + auto allocationInfo = *allocationInfoIt; + auto result = rx::vm::map(*address, len, prot, flags, rx::vm::kMapInternalReserveOnly, this, directMemoryStart); @@ -60,9 +66,10 @@ orbis::ErrorCode DmemDevice::mmap(void **address, std::uint64_t len, return orbis::ErrorCode::INVAL; } - rx::bridge.sendMapDmem(orbis::g_currentThread->tproc->pid, index, - reinterpret_cast(result), len, prot, - directMemoryStart); + rx::bridge.sendMapMemory(orbis::g_currentThread->tproc->pid, + allocationInfo.payload.memoryType, index, + reinterpret_cast(result), len, prot, + directMemoryStart); *address = result; diff --git a/rpcsx-os/iodev/gc.cpp b/rpcsx-os/iodev/gc.cpp index 7fa381691..ca918cd7b 100644 --- a/rpcsx-os/iodev/gc.cpp +++ b/rpcsx-os/iodev/gc.cpp @@ -10,6 +10,7 @@ #include #include #include +#include struct ComputeQueue { std::uint64_t ringBaseAddress{}; @@ -19,14 +20,104 @@ struct ComputeQueue { std::uint64_t len{}; }; +static void runBridge(int vmId) { + std::thread{[=] { + pthread_setname_np(pthread_self(), "Bridge"); + auto bridge = rx::bridge.header; + + std::vector fetchedCommands; + fetchedCommands.reserve(std::size(bridge->cacheCommands)); + + while (true) { + for (auto &command : bridge->cacheCommands) { + std::uint64_t value = command[vmId].load(std::memory_order::relaxed); + + if (value != 0) { + fetchedCommands.push_back(value); + command[vmId].store(0, std::memory_order::relaxed); + } + } + + if (fetchedCommands.empty()) { + continue; + } + + for (auto command : fetchedCommands) { + auto page = static_cast(command); + auto count = static_cast(command >> 32) + 1; + + auto pageFlags = + bridge->cachePages[vmId][page].load(std::memory_order::relaxed); + + auto address = + static_cast(page) * amdgpu::bridge::kHostPageSize; + auto origVmProt = rx::vm::getPageProtection(address); + int prot = 0; + + if (origVmProt & rx::vm::kMapProtCpuRead) { + prot |= PROT_READ; + } + if (origVmProt & rx::vm::kMapProtCpuWrite) { + prot |= PROT_WRITE; + } + if (origVmProt & rx::vm::kMapProtCpuExec) { + prot |= PROT_EXEC; + } + + if (pageFlags & amdgpu::bridge::kPageReadWriteLock) { + prot &= ~(PROT_READ | PROT_WRITE); + } else if (pageFlags & amdgpu::bridge::kPageWriteWatch) { + prot &= ~PROT_WRITE; + } + + // std::fprintf(stderr, "protection %lx-%lx\n", address, + // address + amdgpu::bridge::kHostPageSize * count); + if (::mprotect(reinterpret_cast(address), + amdgpu::bridge::kHostPageSize * count, prot)) { + perror("protection failed"); + std::abort(); + } + } + + fetchedCommands.clear(); + } + }}.detach(); +} + +static constexpr auto kVmIdCount = 6; + struct GcDevice : public IoDevice { + std::uint32_t freeVmIds = (1 << (kVmIdCount + 1)) - 1; orbis::shared_mutex mtx; + orbis::kmap clients; orbis::kmap computeQueues; orbis::ErrorCode open(orbis::Ref *file, const char *path, std::uint32_t flags, std::uint32_t mode, orbis::Thread *thread) override; + + void addClient(orbis::Process *process); + void removeClient(orbis::Process *process); + + int allocateVmId() { + int id = std::countr_zero(freeVmIds); + + if (id >= kVmIdCount) { + std::fprintf(stderr, "out of vm slots\n"); + std::abort(); + } + + freeVmIds &= ~(1 << id); + return id; + }; + + void deallocateVmId(int vmId) { freeVmIds |= (1 << vmId); }; }; -struct GcFile : public orbis::File {}; + +struct GcFile : public orbis::File { + orbis::Process *process = nullptr; + ~GcFile() { device.staticCast()->removeClient(process); } +}; + static std::uint64_t g_submitDoneFlag; static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, @@ -34,7 +125,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, // 0xc00c8110 // 0xc0848119 - auto device = static_cast(file->device.get()); + auto device = file->device.staticCast(); std::lock_guard lock(device->mtx); switch (request) { @@ -55,7 +146,7 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, // flockfile(stderr); // if (thread->tproc->pid != amdgpu::bridge::expGpuPid) { - // ORBIS_LOG_ERROR("gc ioctl submit", args->arg0, args->count, args->cmds); + // ORBIS_LOG_ERROR("gc ioctl submit", args->arg0, args->count, args->cmds); // } for (unsigned i = 0; i < args->count; ++i) { @@ -172,14 +263,20 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, break; } - case 0xc010810b: { // something like stats masks? + case 0xc010810b: { // get cu masks param struct Args { - std::uint64_t arg1; - std::uint64_t arg2; + std::uint32_t se0sh0; + std::uint32_t se0sh1; + std::uint32_t se1sh0; + std::uint32_t se1sh1; }; auto args = reinterpret_cast(argp); - ORBIS_LOG_ERROR("gc ioctl stats mask", args->arg1, args->arg2); + // ORBIS_LOG_ERROR("gc ioctl stats mask", args->arg1, args->arg2); + args->se0sh0 = ~0; + args->se0sh1 = ~0; + args->se1sh0 = ~0; + args->se1sh1 = ~0; break; } @@ -265,8 +362,14 @@ static orbis::ErrorCode gc_ioctl(orbis::File *file, std::uint64_t request, } case 0xc0048113: { - // get client number - *(std::uint32_t *)argp = 0; + // get num clients + + struct Args { + std::uint32_t numClients; + }; + + auto *args = reinterpret_cast(argp); + args->numClients = device->clients.size(); break; } @@ -312,8 +415,38 @@ orbis::ErrorCode GcDevice::open(orbis::Ref *file, const char *path, auto newFile = orbis::knew(); newFile->device = this; newFile->ops = &ops; + newFile->process = thread->tproc; + addClient(thread->tproc); *file = newFile; return {}; } +void GcDevice::addClient(orbis::Process *process) { + std::lock_guard lock(mtx); + auto &client = clients[process->pid]; + ++client; + + if (client == 1) { + auto vmId = allocateVmId(); + rx::bridge.sendMapProcess(process->pid, vmId); + process->vmId = vmId; + + runBridge(vmId); + } +} + +void GcDevice::removeClient(orbis::Process *process) { + std::lock_guard lock(mtx); + auto clientIt = clients.find(process->pid); + assert(clientIt != clients.end()); + assert(clientIt->second != 0); + --clientIt->second; + if (clientIt->second == 0) { + clients.erase(clientIt); + rx::bridge.sendUnmapProcess(process->pid); + deallocateVmId(process->vmId); + process->vmId = -1; + } +} + IoDevice *createGcCharacterDevice() { return orbis::knew(); } diff --git a/rpcsx-os/main.cpp b/rpcsx-os/main.cpp index 5baa9fc42..60d1f84d9 100644 --- a/rpcsx-os/main.cpp +++ b/rpcsx-os/main.cpp @@ -41,71 +41,6 @@ #include static int g_gpuPid; - -void runBridge() { - std::thread{[] { - pthread_setname_np(pthread_self(), "Bridge"); - auto bridge = rx::bridge.header; - - std::vector fetchedCommands; - fetchedCommands.reserve(std::size(bridge->cacheCommands)); - - while (true) { - for (auto &command : bridge->cacheCommands) { - std::uint64_t value = command.load(std::memory_order::relaxed); - - if (value != 0) { - fetchedCommands.push_back(value); - command.store(0, std::memory_order::relaxed); - } - } - - if (fetchedCommands.empty()) { - continue; - } - - for (auto command : fetchedCommands) { - auto page = static_cast(command); - auto count = static_cast(command >> 32) + 1; - - auto pageFlags = - bridge->cachePages[page].load(std::memory_order::relaxed); - - auto address = - static_cast(page) * amdgpu::bridge::kHostPageSize; - auto origVmProt = rx::vm::getPageProtection(address); - int prot = 0; - - if (origVmProt & rx::vm::kMapProtCpuRead) { - prot |= PROT_READ; - } - if (origVmProt & rx::vm::kMapProtCpuWrite) { - prot |= PROT_WRITE; - } - if (origVmProt & rx::vm::kMapProtCpuExec) { - prot |= PROT_EXEC; - } - - if (pageFlags & amdgpu::bridge::kPageReadWriteLock) { - prot &= ~(PROT_READ | PROT_WRITE); - } else if (pageFlags & amdgpu::bridge::kPageWriteWatch) { - prot &= ~PROT_WRITE; - } - - // std::fprintf(stderr, "protection %lx-%lx\n", address, - // address + amdgpu::bridge::kHostPageSize * count); - if (::mprotect(reinterpret_cast(address), - amdgpu::bridge::kHostPageSize * count, prot)) { - perror("protection failed"); - std::abort(); - } - } - - fetchedCommands.clear(); - } - }}.detach(); -} - extern bool allowMonoDebug; __attribute__((no_stack_protector)) static void @@ -116,8 +51,9 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) { auto signalAddress = reinterpret_cast(info->si_addr); - if (orbis::g_currentThread != nullptr && sig == SIGSEGV && + if (orbis::g_currentThread != nullptr && orbis::g_currentThread->tproc->vmId >= 0 && sig == SIGSEGV && signalAddress >= 0x40000 && signalAddress < 0x100'0000'0000) { + auto vmid = orbis::g_currentThread->tproc->vmId; auto ctx = reinterpret_cast(ucontext); bool isWrite = (ctx->uc_mcontext.gregs[REG_ERR] & 0x2) != 0; auto origVmProt = rx::vm::getPageProtection(signalAddress); @@ -138,17 +74,17 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) { auto bridge = rx::bridge.header; while (true) { - auto flags = bridge->cachePages[page].load(std::memory_order::relaxed); + auto flags = bridge->cachePages[vmid][page].load(std::memory_order::relaxed); if ((flags & amdgpu::bridge::kPageReadWriteLock) != 0) { if ((flags & amdgpu::bridge::kPageLazyLock) != 0) { if (std::uint32_t gpuCommand = 0; - !bridge->gpuCacheCommand.compare_exchange_weak(gpuCommand, + !bridge->gpuCacheCommand[vmid].compare_exchange_weak(gpuCommand, page)) { continue; } - while (!bridge->cachePages[page].compare_exchange_weak( + while (!bridge->cachePages[vmid][page].compare_exchange_weak( flags, flags & ~amdgpu::bridge::kPageLazyLock, std::memory_order::relaxed)) { } @@ -165,7 +101,7 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) { break; } - if (bridge->cachePages[page].compare_exchange_weak( + if (bridge->cachePages[vmid][page].compare_exchange_weak( flags, amdgpu::bridge::kPageInvalidated, std::memory_order::relaxed)) { break; @@ -188,6 +124,7 @@ handle_signal(int sig, siginfo_t *info, void *ucontext) { } if (orbis::g_currentThread != nullptr) { + orbis::g_currentThread->tproc->exitStatus = sig; orbis::g_currentThread->tproc->event.emit(orbis::kEvFiltProc, orbis::kNoteExit, sig); } @@ -1640,29 +1577,34 @@ int main(int argc, const char *argv[]) { }; if (isSystem) { - amdgpu::bridge::expGpuPid = isSafeMode ? 20001 : 60001; orbis::g_context.safeMode = isSafeMode ? 1 : 0; - initProcess->authInfo = { - .unk0 = 0x380000000000000f, - .caps = - { - -1ul, - -1ul, - -1ul, - -1ul, - }, - .attrs = - { - 0x4000400040000000, - 0x4000000000000000, - 0x0080000000000002, - 0xF0000000FFFF4000, - }, - }; + initProcess->authInfo = {.unk0 = 0x380000000000000f, + .caps = + { + -1ul, + -1ul, + -1ul, + -1ul, + }, + .attrs = + { + 0x4000400040000000, + 0x4000000000000000, + 0x0080000000000002, + 0xF0000000FFFF4000, + }, + .ucred = { + -1ul, + -1ul, + 0x3800000000000022, + -1ul, + (1ul << 0x3a), + -1ul, + -1ul, + }}; initProcess->budgetId = 0; initProcess->isInSandbox = false; } else { - amdgpu::bridge::expGpuPid = initProcess->pid; initProcess->authInfo = { .unk0 = 0x3100000000000001, .caps = @@ -1788,7 +1730,6 @@ int main(int argc, const char *argv[]) { launchDaemon(mainThread, "/system/sys/orbis_audiod.elf", {"/system/sys/orbis_audiod.elf"}, {}); - runBridge(); status = ps4Exec(mainThread, execEnv, std::move(executableModule), ps4Argv, {}); } diff --git a/rpcsx-os/ops.cpp b/rpcsx-os/ops.cpp index fa35f0773..c005ae6e4 100644 --- a/rpcsx-os/ops.cpp +++ b/rpcsx-os/ops.cpp @@ -43,7 +43,6 @@ using namespace orbis; extern bool allowMonoDebug; extern "C" void __register_frame(const void *); -void runBridge(); void setupSigHandlers(); int ps4Exec(orbis::Thread *mainThread, orbis::utils::Ref executableModule, @@ -828,9 +827,6 @@ SysResult fork(Thread *thread, slong flags) { dup2(logFd, 1); dup2(logFd, 2); - if (childPid == amdgpu::bridge::expGpuPid) { - runBridge(); - } return {}; } diff --git a/rpcsx-os/vm.cpp b/rpcsx-os/vm.cpp index ba5705633..24bd3f885 100644 --- a/rpcsx-os/vm.cpp +++ b/rpcsx-os/vm.cpp @@ -958,11 +958,8 @@ void *rx::vm::map(void *addr, std::uint64_t len, std::int32_t prot, } if (auto thr = orbis::g_currentThread) { - // std::fprintf(stderr, "sending mapping %lx-%lx, pid %lx\n", address, - // address + len, thr->tproc->pid); - // if (!noOverwrite) { - // rx::bridge.sendMemoryProtect(thr->tproc->pid, address, len, prot); - // } + rx::bridge.sendMapMemory(thr->tproc->pid, -1, -1, address, len, prot, + address - kMinAddress); } else { std::fprintf(stderr, "ignoring mapping %lx-%lx\n", address, address + len); }