[amdgpu] WIP cache implementation

DH 2023-07-24 06:03:55 +03:00
parent 306fecf2ab
commit 760aea0e49
13 changed files with 1823 additions and 1302 deletions

View file

@ -1,15 +1,11 @@
#pragma once
#include "amdgpu/RemoteMemory.hpp"
#include "amdgpu/bridge/bridge.hpp"
#include "amdgpu/shader/Converter.hpp"
#include "amdgpu/shader/Instruction.hpp"
#include "util/Verify.hpp"
#include "util/area.hpp"
#include <cassert>
#include <map>
#include <set>
#include <string>
#include <vector>
#include <vulkan/vulkan_core.h>
namespace amdgpu::device {
@ -1264,199 +1260,12 @@ struct GnmTBuffer {
static_assert(sizeof(GnmTBuffer) == sizeof(std::uint64_t) * 4);
struct ShaderModule {
VkPipeline pipeline;
VkPipelineLayout pipelineLayout;
VkDescriptorSetLayout descriptorSetLayout;
VkDescriptorPool descriptorPool;
void destroy() const;
};
constexpr auto kPageSize = 0x4000;
struct AreaInfo {
std::uint64_t beginAddress;
std::uint64_t endAddress;
};
struct NoInvalidationHandle {
void handleInvalidation(std::uint64_t) {}
};
struct StdSetInvalidationHandle {
std::set<std::uint64_t, std::greater<>> invalidated;
void handleInvalidation(std::uint64_t address) {
invalidated.insert(address);
}
};
template <typename InvalidationHandleT = NoInvalidationHandle>
class MemoryAreaTable : public InvalidationHandleT {
enum class Kind { O, X };
std::map<std::uint64_t, Kind> mAreas;
public:
class iterator {
using map_iterator = typename std::map<std::uint64_t, Kind>::iterator;
map_iterator it;
public:
iterator() = default;
iterator(map_iterator it) : it(it) {}
AreaInfo operator*() const { return {it->first, std::next(it)->first}; }
iterator &operator++() const {
++it;
++it;
return *this;
}
iterator &operator--() const {
--it;
--it;
return *this;
}
bool operator==(iterator other) const { return it == other.it; }
bool operator!=(iterator other) const { return it != other.it; }
};
iterator begin() { return iterator(mAreas.begin()); }
iterator end() { return iterator(mAreas.end()); }
void clear() { mAreas.clear(); }
AreaInfo queryArea(std::uint64_t address) const {
auto it = mAreas.lower_bound(address);
assert(it != mAreas.end());
std::uint64_t endAddress = 0;
if (it->first != address) {
assert(it->second == Kind::X);
endAddress = it->first;
--it;
} else {
assert(it->second == Kind::O);
endAddress = std::next(it)->first;
}
auto startAddress = std::uint64_t(it->first);
return {startAddress, endAddress};
}
void map(std::uint64_t beginAddress, std::uint64_t endAddress) {
auto [beginIt, beginInserted] = mAreas.emplace(beginAddress, Kind::O);
auto [endIt, endInserted] = mAreas.emplace(endAddress, Kind::X);
if (!beginInserted) {
if (beginIt->second == Kind::X) {
// it was close, extend to open
assert(beginIt != mAreas.begin());
--beginIt;
}
} else if (beginIt != mAreas.begin()) {
auto prevRangePointIt = std::prev(beginIt);
if (prevRangePointIt->second == Kind::O) {
// we found range start before inserted one, remove insertion and extend
// begin
this->handleInvalidation(beginIt->first);
mAreas.erase(beginIt);
beginIt = prevRangePointIt;
}
}
if (!endInserted) {
if (endIt->second == Kind::O) {
// it was open, extend to close
assert(endIt != mAreas.end());
++endIt;
}
} else {
auto nextRangePointIt = std::next(endIt);
if (nextRangePointIt != mAreas.end() &&
nextRangePointIt->second == Kind::X) {
// we found range end after inserted one, remove insertion and extend
// end
this->handleInvalidation(std::prev(endIt)->first);
mAreas.erase(endIt);
endIt = nextRangePointIt;
}
}
// eat everything in middle of the range
++beginIt;
while (beginIt != endIt) {
this->handleInvalidation(std::prev(endIt)->first);
beginIt = mAreas.erase(beginIt);
}
}
void unmap(std::uint64_t beginAddress, std::uint64_t endAddress) {
auto beginIt = mAreas.lower_bound(beginAddress);
if (beginIt == mAreas.end() || beginIt->first >= endAddress) {
return;
}
if (beginIt->first > beginAddress && beginIt->second == Kind::X) {
// we have found end after unmap begin, need to insert new end
this->handleInvalidation(std::prev(beginIt)->first);
auto newBeginIt = mAreas.emplace_hint(beginIt, beginAddress, Kind::X);
mAreas.erase(beginIt);
if (newBeginIt == mAreas.end()) {
return;
}
beginIt = std::next(newBeginIt);
} else if (beginIt->second == Kind::X) {
beginIt = ++beginIt;
}
Kind lastKind = Kind::X;
while (beginIt != mAreas.end() && beginIt->first <= endAddress) {
lastKind = beginIt->second;
if (lastKind == Kind::O) {
this->handleInvalidation(std::prev(beginIt)->first);
}
beginIt = mAreas.erase(beginIt);
}
if (lastKind != Kind::O) {
return;
}
// Last removed was range open, need to insert new one at unmap end
mAreas.emplace_hint(beginIt, endAddress, Kind::O);
}
std::size_t totalMemory() const {
std::size_t result = 0;
for (auto it = mAreas.begin(), end = mAreas.end(); it != end; ++it) {
auto rangeBegin = it;
auto rangeEnd = ++it;
result += rangeEnd->first - rangeBegin->first;
}
return result;
}
};
extern MemoryAreaTable<StdSetInvalidationHandle> memoryAreaTable;
struct DrawContext {
VkPipelineCache pipelineCache;
VkQueue queue;
VkCommandPool commandPool;
std::vector<VkShaderModule> loadedShaderModules;
~DrawContext();
};
void setVkDevice(VkDevice device,

View file

@ -0,0 +1,303 @@
#pragma once
#include <atomic>
#include <bit>
#include <condition_variable>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>
namespace amdgpu::device {
template <typename T> class Ref {
T *m_ref = nullptr;
public:
Ref() = default;
Ref(std::nullptr_t) {}
template <typename OT>
requires(std::is_base_of_v<T, OT>)
Ref(OT *ref) : m_ref(ref) {
if (m_ref != nullptr) {
ref->incRef();
}
}
template <typename OT>
requires(std::is_base_of_v<T, OT>)
Ref(const Ref<OT> &other) : m_ref(other.get()) {
if (m_ref != nullptr) {
m_ref->incRef();
}
}
template <typename OT>
requires(std::is_base_of_v<T, OT>)
Ref(Ref<OT> &&other) : m_ref(other.release()) {}
Ref(const Ref &other) : m_ref(other.get()) {
if (m_ref != nullptr) {
m_ref->incRef();
}
}
Ref(Ref &&other) : m_ref(other.release()) {}
template <typename OT>
requires(std::is_base_of_v<T, OT>)
Ref &operator=(Ref<OT> &&other) {
other.swap(*this);
return *this;
}
template <typename OT>
requires(std::is_base_of_v<T, OT>)
Ref &operator=(OT *other) {
*this = Ref(other);
return *this;
}
template <typename OT>
requires(std::is_base_of_v<T, OT>)
Ref &operator=(const Ref<OT> &other) {
*this = Ref(other);
return *this;
}
Ref &operator=(const Ref &other) {
*this = Ref(other);
return *this;
}
Ref &operator=(Ref &&other) {
other.swap(*this);
return *this;
}
~Ref() {
if (m_ref != nullptr) {
m_ref->decRef();
}
}
void swap(Ref<T> &other) { std::swap(m_ref, other.m_ref); }
T *get() const { return m_ref; }
T *release() { return std::exchange(m_ref, nullptr); }
T *operator->() const { return m_ref; }
explicit operator bool() const { return m_ref != nullptr; }
bool operator==(std::nullptr_t) const { return m_ref == nullptr; }
bool operator!=(std::nullptr_t) const { return m_ref != nullptr; }
auto operator<=>(const T *other) const { return m_ref <=> other; }
auto operator<=>(const Ref &other) const = default;
};
enum class TaskState { InProgress, Complete, Canceled };
struct AsyncTaskCtl {
std::atomic<unsigned> refs{0};
std::atomic<TaskState> stateStorage{TaskState::InProgress};
virtual ~AsyncTaskCtl() = default;
void incRef() { refs.fetch_add(1, std::memory_order::relaxed); }
void decRef() {
if (refs.fetch_sub(1, std::memory_order::relaxed) == 1) {
delete this;
}
}
bool isCanceled() const {
return stateStorage.load(std::memory_order::relaxed) == TaskState::Canceled;
}
bool isComplete() const {
return stateStorage.load(std::memory_order::relaxed) == TaskState::Complete;
}
bool isInProgress() const {
return stateStorage.load(std::memory_order::relaxed) ==
TaskState::InProgress;
}
void cancel() {
auto state = TaskState::InProgress;
while (state == TaskState::InProgress) {
if (stateStorage.compare_exchange_weak(state, TaskState::Canceled,
std::memory_order::relaxed)) {
break;
}
}
stateStorage.notify_all();
}
void complete() {
auto state = TaskState::InProgress;
while (state != TaskState::Complete) {
if (stateStorage.compare_exchange_weak(state, TaskState::Complete,
std::memory_order::relaxed)) {
break;
}
}
stateStorage.notify_all();
}
void wait() {
stateStorage.wait(TaskState::InProgress, std::memory_order::relaxed);
}
virtual void invoke() = 0;
};
namespace detail {
template <typename T>
concept LambdaWithoutClosure = requires(T t) { +t; };
}
template <typename T> struct AsyncTask;
template <typename T>
requires(std::is_invocable_r_v<bool, T, const AsyncTaskCtl &> &&
detail::LambdaWithoutClosure<T>)
struct AsyncTask<T> : AsyncTaskCtl {
static constexpr bool (*fn)(const AsyncTaskCtl &) = +std::declval<T>();
AsyncTask() = default;
AsyncTask(T &&) {}
void invoke() override {
auto &base = *static_cast<const AsyncTaskCtl *>(this);
if (fn(base)) {
complete();
}
}
};
template <typename T>
requires std::is_invocable_r_v<bool, T, const AsyncTaskCtl &>
Ref<AsyncTaskCtl> createTask(T &&task) {
return Ref<AsyncTaskCtl>(new AsyncTask<T>(std::forward<T>(task)));
}
template <typename T>
requires(std::is_invocable_r_v<bool, T, const AsyncTaskCtl &> &&
!detail::LambdaWithoutClosure<T>)
struct AsyncTask<T> : AsyncTaskCtl {
alignas(T) std::byte taskStorage[sizeof(T)];
AsyncTask() = default;
AsyncTask(T &&t) { new (taskStorage) T(std::forward<T>(t)); }
AsyncTask &operator=(T &&t) {
new (taskStorage) T(std::forward<T>(t));
return *this;
}
void invoke() override {
auto &lambda = *std::bit_cast<T *>(&taskStorage);
auto &base = *static_cast<const AsyncTaskCtl *>(this);
if (lambda(base)) {
complete();
}
std::bit_cast<T *>(&taskStorage)->~T();
}
};
class Scheduler;
class TaskSet {
std::vector<Ref<AsyncTaskCtl>> tasks;
public:
void append(Ref<AsyncTaskCtl> task) { tasks.push_back(std::move(task)); }
void wait() {
for (auto task : tasks) {
task->wait();
}
tasks.clear();
}
void enqueue(Scheduler &scheduler);
};
class Scheduler {
std::vector<std::thread> workThreads;
std::vector<Ref<AsyncTaskCtl>> tasks;
std::mutex taskMtx;
std::condition_variable taskCv;
std::atomic<bool> exit{false};
public:
explicit Scheduler(std::size_t threadCount) {
for (std::size_t i = 0; i < threadCount; ++i) {
workThreads.push_back(std::thread{[this] { entry(); }});
}
}
~Scheduler() {
exit = true;
taskCv.notify_all();
for (auto &thread : workThreads) {
thread.join();
}
}
template <typename T>
requires std::is_invocable_r_v<bool, T, const AsyncTaskCtl &>
Ref<AsyncTaskCtl> enqueue(T &&task) {
auto taskHandle = createTask(std::forward<T>(task));
enqueue(taskHandle);
return taskHandle;
}
void enqueue(Ref<AsyncTaskCtl> task) {
std::lock_guard lock(taskMtx);
tasks.push_back(std::move(task));
taskCv.notify_one();
}
template <typename T>
requires std::is_invocable_r_v<bool, T, const AsyncTaskCtl &>
void enqueue(TaskSet &set, T &&task) {
auto taskCtl = enqueue(std::forward<T>(task));
set.append(taskCtl);
}
private:
void entry() {
while (!exit.load(std::memory_order::relaxed)) {
Ref<AsyncTaskCtl> task;
if (task == nullptr) {
std::unique_lock lock(taskMtx);
if (tasks.empty()) {
taskCv.wait(lock);
}
if (tasks.empty()) {
continue;
}
task = std::move(tasks.back());
tasks.pop_back();
}
if (task != nullptr) {
task->invoke();
}
}
}
};
inline void TaskSet::enqueue(Scheduler &scheduler) {
for (auto task : tasks) {
scheduler.enqueue(std::move(task));
}
}
} // namespace amdgpu::device
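
The new header above (its file name is not shown in the diff) pairs an intrusive smart pointer with a small thread pool: Ref<T> expects the pointee to provide incRef()/decRef(), which AsyncTaskCtl does, and Scheduler hands queued tasks to worker threads that call invoke(). A usage sketch, assuming the header is reachable as amdgpu/device/Scheduler.hpp (a guessed path, since the diff does not name the file):

// Usage sketch for the scheduler above. The include path is a guess; the
// diff does not name the new header.
#include "amdgpu/device/Scheduler.hpp"
#include <cstdio>

int main() {
  using namespace amdgpu::device;

  Scheduler scheduler(4); // four worker threads run entry()

  TaskSet set;
  for (int i = 0; i < 8; ++i) {
    // A task returns true once it is finished; invoke() then marks the
    // AsyncTaskCtl as Complete and wakes anyone blocked in wait().
    scheduler.enqueue(set, [i](const AsyncTaskCtl &ctl) {
      if (ctl.isCanceled()) {
        return true; // canceled: nothing to do
      }
      std::printf("task %d done\n", i);
      return true;
    });
  }

  set.wait(); // blocks until every queued task has completed
}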

View file

@ -2,9 +2,12 @@
#include "tiler.hpp"
#include "util/VerifyVulkan.hpp"
#include "util/area.hpp"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <mutex>
#include <span>
#include <utility>
#include <vulkan/vulkan_core.h>
@ -150,78 +153,58 @@ struct DeviceMemoryRef {
VkDeviceSize offset = 0;
VkDeviceSize size = 0;
void *data = nullptr;
void *allocator = nullptr;
void (*release)(DeviceMemoryRef &memoryRef) = nullptr;
};
class MemoryResource {
DeviceMemory mMemory;
VkMemoryPropertyFlags mProperties = 0;
std::size_t mSize = 0;
std::size_t mAllocationOffset = 0;
char *mData = nullptr;
util::MemoryAreaTable<> table;
const char *debugName = "<unknown>";
std::mutex mMtx;
public:
MemoryResource(const MemoryResource &) = delete;
MemoryResource() = default;
MemoryResource(MemoryResource &&other) = default;
MemoryResource &operator=(MemoryResource &&other) = default;
~MemoryResource() {
if (mMemory.getHandle() != nullptr && mData != nullptr) {
vkUnmapMemory(g_vkDevice, mMemory.getHandle());
}
}
void clear() { mAllocationOffset = 0; }
static MemoryResource CreateFromFd(int fd, std::size_t size) {
void initFromHost(void *data, std::size_t size) {
assert(mMemory.getHandle() == nullptr);
auto properties = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
MemoryResource result;
result.mMemory = DeviceMemory::CreateExternalFd(
fd, size, findPhysicalMemoryTypeIndex(~0, properties));
result.mProperties = properties;
result.mSize = size;
return result;
mMemory = DeviceMemory::CreateExternalHostMemory(data, size, properties);
table.map(0, size);
debugName = "direct";
}
static MemoryResource CreateFromHost(void *data, std::size_t size) {
void initHostVisible(std::size_t size) {
assert(mMemory.getHandle() == nullptr);
auto properties = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
MemoryResource result;
result.mMemory =
DeviceMemory::CreateExternalHostMemory(data, size, properties);
result.mProperties = properties;
result.mSize = size;
return result;
}
static MemoryResource CreateHostVisible(std::size_t size) {
auto properties = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
MemoryResource result;
result.mMemory = DeviceMemory::Allocate(size, ~0, properties);
result.mProperties = properties;
result.mSize = size;
auto memory = DeviceMemory::Allocate(size, ~0, properties);
void *data = nullptr;
Verify() << vkMapMemory(g_vkDevice, result.mMemory.getHandle(), 0, size, 0,
&data);
result.mData = reinterpret_cast<char *>(data);
Verify() << vkMapMemory(g_vkDevice, memory.getHandle(), 0, size, 0, &data);
return result;
mMemory = std::move(memory);
table.map(0, size);
mData = reinterpret_cast<char *>(data);
debugName = "host";
}
static MemoryResource CreateDeviceLocal(std::size_t size) {
void initDeviceLocal(std::size_t size) {
assert(mMemory.getHandle() == nullptr);
auto properties = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
MemoryResource result;
result.mMemory = DeviceMemory::Allocate(size, ~0, properties);
result.mProperties = properties;
result.mSize = size;
return result;
mMemory = DeviceMemory::Allocate(size, ~0, properties);
table.map(0, size);
debugName = "local";
}
DeviceMemoryRef allocate(VkMemoryRequirements requirements) {
@ -230,22 +213,55 @@ public:
util::unreachable();
}
auto offset = (mAllocationOffset + requirements.alignment - 1) &
~(requirements.alignment - 1);
mAllocationOffset = offset + requirements.size;
if (mAllocationOffset > mSize) {
util::unreachable("out of memory resource");
std::lock_guard lock(mMtx);
for (auto elem : table) {
auto offset = (elem.beginAddress + requirements.alignment - 1) &
~(requirements.alignment - 1);
if (offset >= elem.endAddress) {
continue;
}
auto blockSize = elem.endAddress - offset;
if (blockSize < requirements.size) {
continue;
}
table.unmap(offset, offset + requirements.size);
return {mMemory.getHandle(),
offset,
requirements.size,
mData,
this,
[](DeviceMemoryRef &memoryRef) {
auto self =
reinterpret_cast<MemoryResource *>(memoryRef.allocator);
self->deallocate(memoryRef);
}};
}
return {mMemory.getHandle(), offset, requirements.size, mData};
util::unreachable("out of memory resource");
}
void deallocate(DeviceMemoryRef memory) {
std::lock_guard lock(mMtx);
table.map(memory.offset, memory.offset + memory.size);
}
void dump() {
std::lock_guard lock(mMtx);
for (auto elem : table) {
std::fprintf(stderr, "%zu - %zu\n", elem.beginAddress, elem.endAddress);
}
}
DeviceMemoryRef getFromOffset(std::uint64_t offset, std::size_t size) {
return {mMemory.getHandle(), offset, size, nullptr};
return {mMemory.getHandle(), offset, size, nullptr, nullptr, nullptr};
}
std::size_t getSize() const { return mSize; }
explicit operator bool() const { return mMemory.getHandle() != nullptr; }
};
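
With this change, allocate() no longer bumps a linear offset; it walks the free ranges recorded in the util::MemoryAreaTable, aligns each candidate, and reserves the winning block by unmapping it, while deallocate() simply maps the block back. A simplified, self-contained illustration of that first-fit-with-alignment search, using a plain std::map of free ranges instead of the real table:

// Simplified, self-contained illustration of the first-fit search that
// allocate() performs above: walk the free ranges in address order, align
// the candidate offset upwards, and carve the block out of the first range
// that still fits. Uses a plain std::map instead of util::MemoryAreaTable.
#include <cstdint>
#include <cstdio>
#include <map>
#include <optional>

using FreeRanges = std::map<std::uint64_t, std::uint64_t>; // begin -> end

std::optional<std::uint64_t> firstFit(FreeRanges &freeRanges,
                                      std::uint64_t size,
                                      std::uint64_t alignment) {
  for (auto it = freeRanges.begin(); it != freeRanges.end(); ++it) {
    auto [begin, end] = *it;
    // Alignment is a power of two, as VkMemoryRequirements guarantees.
    auto offset = (begin + alignment - 1) & ~(alignment - 1);
    if (offset >= end || end - offset < size) {
      continue; // range too small once aligned
    }
    // Shrink or split the free range around the reserved block.
    freeRanges.erase(it);
    if (offset > begin) {
      freeRanges.emplace(begin, offset);
    }
    if (offset + size < end) {
      freeRanges.emplace(offset + size, end);
    }
    return offset;
  }
  return std::nullopt; // out of memory resource
}

int main() {
  FreeRanges freeRanges{{0, 1 << 20}}; // one 1 MiB arena, entirely free
  auto a = firstFit(freeRanges, 0x300, 0x100);
  auto b = firstFit(freeRanges, 0x80, 0x40);
  std::printf("a=0x%llx b=0x%llx\n", (unsigned long long)*a,
              (unsigned long long)*b);
}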
@ -364,6 +380,10 @@ public:
~Buffer() {
if (mBuffer != nullptr) {
vkDestroyBuffer(g_vkDevice, mBuffer, g_vkAllocator);
if (mMemory.release != nullptr) {
mMemory.release(mMemory);
}
}
}
@ -589,12 +609,13 @@ public:
return requirements;
}
void readFromBuffer(VkCommandBuffer cmdBuffer, const Buffer &buffer,
VkImageAspectFlags destAspect) {
void readFromBuffer(VkCommandBuffer cmdBuffer, VkBuffer buffer,
VkImageAspectFlags destAspect,
VkDeviceSize bufferOffset = 0) {
transitionLayout(cmdBuffer, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
VkBufferImageCopy region{};
region.bufferOffset = 0;
region.bufferOffset = bufferOffset;
region.bufferRowLength = 0;
region.bufferImageHeight = 0;
region.imageSubresource.aspectMask = destAspect;
@ -604,11 +625,11 @@ public:
region.imageOffset = {0, 0, 0};
region.imageExtent = {mWidth, mHeight, 1};
vkCmdCopyBufferToImage(cmdBuffer, buffer.getHandle(), mImage,
vkCmdCopyBufferToImage(cmdBuffer, buffer, mImage,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);
}
void writeToBuffer(VkCommandBuffer cmdBuffer, const Buffer &buffer,
void writeToBuffer(VkCommandBuffer cmdBuffer, VkBuffer buffer,
VkImageAspectFlags sourceAspect) {
transitionLayout(cmdBuffer, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
@ -624,8 +645,8 @@ public:
region.imageExtent = {mWidth, mHeight, 1};
vkCmdCopyImageToBuffer(cmdBuffer, mImage,
VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
buffer.getHandle(), 1, &region);
VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, 1,
&region);
}
[[nodiscard]] Buffer writeToBuffer(VkCommandBuffer cmdBuffer,
@ -635,7 +656,7 @@ public:
pool, getMemoryRequirements().size,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
writeToBuffer(cmdBuffer, transferBuffer, sourceAspect);
writeToBuffer(cmdBuffer, transferBuffer.getHandle(), sourceAspect);
return transferBuffer;
}
@ -661,7 +682,7 @@ public:
transferBuffer.readFromImage(address, bpp, tileMode, width, height, 1,
pitch);
readFromBuffer(cmdBuffer, transferBuffer, destAspect);
readFromBuffer(cmdBuffer, transferBuffer.getHandle(), destAspect);
return transferBuffer;
}
@ -736,6 +757,7 @@ class Image2D {
VkImageLayout mLayout = {};
unsigned mWidth = 0;
unsigned mHeight = 0;
DeviceMemoryRef mMemory;
public:
Image2D(const Image2D &) = delete;
@ -746,6 +768,10 @@ public:
~Image2D() {
if (mImage != nullptr) {
vkDestroyImage(g_vkDevice, mImage, g_vkAllocator);
if (mMemory.release != nullptr) {
mMemory.release(mMemory);
}
}
}
@ -829,6 +855,7 @@ public:
void bindMemory(DeviceMemoryRef memory) {
Verify() << vkBindImageMemory(g_vkDevice, mImage, memory.deviceMemory,
memory.offset);
mMemory = memory;
}
friend ImageRef;
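
Buffer and Image2D now keep the DeviceMemoryRef they were bound to and, in their destructors, fire its release callback, which routes the block back to the MemoryResource that handed it out. A standalone sketch of that callback pattern; Pool and BlockRef are illustrative names, not types from this codebase:

// Standalone sketch of the release-callback pattern used by DeviceMemoryRef:
// the pool stores itself in `allocator` plus a function pointer that routes
// the block back to it, and the owning object fires it on destruction.
#include <cstdint>
#include <cstdio>

struct BlockRef {
  std::uint64_t offset = 0;
  std::uint64_t size = 0;
  void *allocator = nullptr;
  void (*release)(BlockRef &ref) = nullptr;
};

struct Pool {
  std::uint64_t nextOffset = 0;

  BlockRef allocate(std::uint64_t size) {
    BlockRef ref{nextOffset, size, this, [](BlockRef &r) {
                   static_cast<Pool *>(r.allocator)->deallocate(r);
                 }};
    nextOffset += size;
    return ref;
  }

  void deallocate(const BlockRef &ref) {
    std::printf("released [0x%llx, 0x%llx)\n", (unsigned long long)ref.offset,
                (unsigned long long)(ref.offset + ref.size));
  }
};

int main() {
  Pool pool;
  BlockRef block = pool.allocate(256);
  // Buffer::~Buffer and Image2D::~Image2D do the equivalent with mMemory:
  if (block.release != nullptr) {
    block.release(block);
  }
}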

File diff suppressed because it is too large.

View file

@ -0,0 +1,182 @@
#pragma once
#include <cassert>
#include <cstdint>
#include <map>
#include <set>
namespace util {
struct AreaInfo {
std::uint64_t beginAddress;
std::uint64_t endAddress;
};
struct NoInvalidationHandle {
void handleInvalidation(std::uint64_t) {}
};
struct StdSetInvalidationHandle {
std::set<std::uint64_t, std::greater<>> invalidated;
void handleInvalidation(std::uint64_t address) {
invalidated.insert(address);
}
};
template <typename InvalidationHandleT = NoInvalidationHandle>
class MemoryAreaTable : public InvalidationHandleT {
enum class Kind { O, X };
std::map<std::uint64_t, Kind> mAreas;
public:
class iterator {
using map_iterator = typename std::map<std::uint64_t, Kind>::iterator;
map_iterator it;
public:
iterator() = default;
iterator(map_iterator it) : it(it) {}
AreaInfo operator*() const { return {it->first, std::next(it)->first}; }
iterator &operator++() {
++it;
++it;
return *this;
}
iterator &operator--() {
--it;
--it;
return *this;
}
bool operator==(iterator other) const { return it == other.it; }
bool operator!=(iterator other) const { return it != other.it; }
};
iterator begin() { return iterator(mAreas.begin()); }
iterator end() { return iterator(mAreas.end()); }
void clear() { mAreas.clear(); }
AreaInfo queryArea(std::uint64_t address) const {
auto it = mAreas.lower_bound(address);
assert(it != mAreas.end());
std::uint64_t endAddress = 0;
if (it->first != address) {
assert(it->second == Kind::X);
endAddress = it->first;
--it;
} else {
assert(it->second == Kind::O);
endAddress = std::next(it)->first;
}
auto startAddress = std::uint64_t(it->first);
return {startAddress, endAddress};
}
void map(std::uint64_t beginAddress, std::uint64_t endAddress) {
auto [beginIt, beginInserted] = mAreas.emplace(beginAddress, Kind::O);
auto [endIt, endInserted] = mAreas.emplace(endAddress, Kind::X);
if (!beginInserted) {
if (beginIt->second == Kind::X) {
// it was close, extend to open
assert(beginIt != mAreas.begin());
--beginIt;
}
} else if (beginIt != mAreas.begin()) {
auto prevRangePointIt = std::prev(beginIt);
if (prevRangePointIt->second == Kind::O) {
// we found range start before inserted one, remove insertion and extend
// begin
this->handleInvalidation(beginIt->first);
mAreas.erase(beginIt);
beginIt = prevRangePointIt;
}
}
if (!endInserted) {
if (endIt->second == Kind::O) {
// it was open, extend to close
assert(endIt != mAreas.end());
++endIt;
}
} else {
auto nextRangePointIt = std::next(endIt);
if (nextRangePointIt != mAreas.end() &&
nextRangePointIt->second == Kind::X) {
// we found range end after inserted one, remove insertion and extend
// end
this->handleInvalidation(std::prev(endIt)->first);
mAreas.erase(endIt);
endIt = nextRangePointIt;
}
}
// eat everything in middle of the range
++beginIt;
while (beginIt != endIt) {
this->handleInvalidation(std::prev(endIt)->first);
beginIt = mAreas.erase(beginIt);
}
}
void unmap(std::uint64_t beginAddress, std::uint64_t endAddress) {
auto beginIt = mAreas.lower_bound(beginAddress);
if (beginIt == mAreas.end() || beginIt->first >= endAddress) {
return;
}
if (beginIt->first > beginAddress && beginIt->second == Kind::X) {
// we have found end after unmap begin, need to insert new end
this->handleInvalidation(std::prev(beginIt)->first);
auto newBeginIt = mAreas.emplace_hint(beginIt, beginAddress, Kind::X);
mAreas.erase(beginIt);
if (newBeginIt == mAreas.end()) {
return;
}
beginIt = std::next(newBeginIt);
} else if (beginIt->second == Kind::X) {
beginIt = ++beginIt;
}
Kind lastKind = Kind::X;
while (beginIt != mAreas.end() && beginIt->first <= endAddress) {
lastKind = beginIt->second;
if (lastKind == Kind::O) {
this->handleInvalidation(std::prev(beginIt)->first);
}
beginIt = mAreas.erase(beginIt);
}
if (lastKind != Kind::O) {
return;
}
// Last removed was range open, need to insert new one at unmap end
mAreas.emplace_hint(beginIt, endAddress, Kind::O);
}
std::size_t totalMemory() const {
std::size_t result = 0;
for (auto it = mAreas.begin(), end = mAreas.end(); it != end; ++it) {
auto rangeBegin = it;
auto rangeEnd = ++it;
result += rangeEnd->first - rangeBegin->first;
}
return result;
}
};
} // namespace util
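
MemoryAreaTable has moved here as util::MemoryAreaTable so both the device-side cache and the shader converter can use it. Areas are stored as alternating open (O) and close (X) points in one std::map; map() coalesces overlapping ranges and reports swallowed points to the invalidation handler. A small sketch, assuming the header lands at util/area.hpp as the includes elsewhere in this commit suggest:

// Sketch of map() coalescing and invalidation reporting, assuming this
// header is reachable as util/area.hpp.
#include "util/area.hpp"
#include <cstdio>

int main() {
  util::MemoryAreaTable<util::StdSetInvalidationHandle> table;

  table.map(0x1000, 0x2000);
  table.map(0x3000, 0x4000);
  table.map(0x1800, 0x3800); // bridges both areas into a single [0x1000, 0x4000)

  for (auto area : table) {
    std::printf("area [0x%llx, 0x%llx)\n",
                (unsigned long long)area.beginAddress,
                (unsigned long long)area.endAddress);
  }
  std::printf("total mapped: %zu bytes\n", table.totalMemory());

  // Points swallowed while coalescing were reported to the handler; a cache
  // can drop whatever it had built for those addresses.
  for (auto address : table.invalidated) {
    std::printf("invalidated 0x%llx\n", (unsigned long long)address);
  }
}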

View file

@ -4,6 +4,7 @@
#include "Stage.hpp"
#include <amdgpu/RemoteMemory.hpp>
#include <util/area.hpp>
#include <cstdint>
#include <span>
@ -25,7 +26,7 @@ struct Shader {
};
Shader convert(RemoteMemory memory, Stage stage, std::uint64_t entry,
std::span<const std::uint32_t> userSpgrs, int bindingOffset,
std::uint32_t dimX = 1, std::uint32_t dimY = 1,
std::uint32_t dimZ = 1);
std::span<const std::uint32_t> userSpgrs, std::uint32_t dimX,
std::uint32_t dimY, std::uint32_t dimZ,
util::MemoryAreaTable<> &dependencies);
} // namespace amdgpu::shader
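
convert() drops the bindingOffset parameter (bindings now come from the fixed UniformBindings layout added in this commit) and instead takes a util::MemoryAreaTable that it fills with every guest range the shader touched: the code range, SMRD constant loads, and inline literals. A sketch of how a caller could consume that table; the convert() call itself needs a live RemoteMemory, so it is only indicated in a comment:

// Sketch of consuming the dependency table convert() now fills; the call to
// convert() itself needs a live RemoteMemory, so it is only indicated here.
#include "util/area.hpp"
#include <cstdint>
#include <cstdio>

int main() {
  util::MemoryAreaTable<> dependencies;

  // In the real code path something like this runs first:
  //   auto shader = amdgpu::shader::convert(memory, stage, entryAddress,
  //                                         userSpgrs, dimX, dimY, dimZ,
  //                                         dependencies);
  // Here we stand in for it by recording two ranges the shader "read".
  dependencies.map(0x401000, 0x401200); // shader code
  dependencies.map(0x800000, 0x800040); // SMRD-loaded constants

  // A cache layer can now watch exactly these ranges for invalidation.
  for (auto range : dependencies) {
    std::printf("shader depends on [0x%llx, 0x%llx)\n",
                (unsigned long long)range.beginAddress,
                (unsigned long long)range.endAddress);
  }
}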

View file

@ -5,11 +5,11 @@
#include "Stage.hpp"
#include "TypeId.hpp"
#include "Uniform.hpp"
#include "util/area.hpp"
#include <amdgpu/RemoteMemory.hpp>
#include <forward_list>
#include <spirv/spirv-builder.hpp>
#include <unordered_map>
#include <util/unreachable.hpp>
#include <bit>
@ -96,8 +96,11 @@ class ConverterContext {
spirv::Function mDiscardFn;
public:
ConverterContext(RemoteMemory memory, Stage stage)
: mMemory(memory), mStage(stage) {
util::MemoryAreaTable<> *dependencies = nullptr;
ConverterContext(RemoteMemory memory, Stage stage,
util::MemoryAreaTable<> *dependencies)
: mStage(stage), mMemory(memory), dependencies(dependencies) {
mGlslStd450 = mBuilder.createExtInstImport("GLSL.std.450");
}

View file

@ -1,5 +1,5 @@
#pragma once
namespace amdgpu::shader {
enum class Stage { None, Vertex, Fragment, Geometry, Compute };
enum class Stage : unsigned char { None, Vertex, Fragment, Geometry, Compute };
}

View file

@ -0,0 +1,62 @@
#pragma once
#include "Stage.hpp"
#include "util/unreachable.hpp"
namespace amdgpu::shader {
struct UniformBindings {
static constexpr auto kBufferSlots = 16;
static constexpr auto kImageSlots = 16;
static constexpr auto kSamplerSlots = 16;
static constexpr auto kBufferOffset = 0;
static constexpr auto kImageOffset = kBufferOffset + kBufferSlots;
static constexpr auto kSamplerOffset = kImageOffset + kImageSlots;
static constexpr auto kStageSize = kSamplerOffset + kSamplerSlots;
static constexpr auto kVertexOffset = 0;
static constexpr auto kFragmentOffset = kStageSize;
static unsigned getBufferBinding(Stage stage, unsigned index) {
if (index >= kBufferSlots) {
util::unreachable();
}
return index + getStageOffset(stage) + kBufferOffset;
}
static unsigned getImageBinding(Stage stage, unsigned index) {
if (index >= kImageSlots) {
util::unreachable();
}
return index + getStageOffset(stage) + kImageOffset;
}
static unsigned getSamplerBinding(Stage stage, unsigned index) {
if (index >= kSamplerSlots) {
util::unreachable();
}
return index + getStageOffset(stage) + kSamplerOffset;
}
private:
static unsigned getStageOffset(Stage stage) {
switch (stage) {
case Stage::Fragment:
return kFragmentOffset;
case Stage::Vertex:
return kVertexOffset;
case Stage::Compute:
return kVertexOffset;
default:
util::unreachable();
}
}
};
} // namespace amdgpu::shader
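
UniformBindings replaces the running bindingOffset with a fixed layout: each stage gets 48 consecutive bindings (16 buffers, then 16 images, then 16 samplers), with the vertex/compute block starting at 0 and the fragment block at 48. A small sketch of the resulting numbers, including the header the same way Converter.cpp does below:

// Sketch of the flat binding layout defined above.
#include "UniformBindings.hpp"
#include <cstdio>

int main() {
  using amdgpu::shader::Stage;
  using amdgpu::shader::UniformBindings;

  // Vertex/compute occupy bindings [0, 48): buffers 0-15, images 16-31,
  // samplers 32-47.
  std::printf("vs buffer 0  -> %u\n",
              UniformBindings::getBufferBinding(Stage::Vertex, 0));
  std::printf("vs image 3   -> %u\n",
              UniformBindings::getImageBinding(Stage::Vertex, 3));
  // Fragment starts at kStageSize (48), so the same slots never collide.
  std::printf("fs buffer 0  -> %u\n",
              UniformBindings::getBufferBinding(Stage::Fragment, 0));
  std::printf("fs sampler 2 -> %u\n",
              UniformBindings::getSamplerBinding(Stage::Fragment, 2));
}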

View file

@ -12,7 +12,7 @@ struct CfgBuilder {
RemoteMemory memory;
std::size_t analyzeBb(cf::BasicBlock *bb, std::uint64_t *successors,
std::size_t *successorsCount, auto pushWork) {
std::size_t *successorsCount) {
auto address = bb->getAddress();
auto instBegin = memory.getPointer<std::uint32_t>(address);
auto instHex = instBegin;
@ -130,18 +130,10 @@ struct CfgBuilder {
std::uint64_t successors[2];
std::size_t successorsCount = 0;
std::size_t size = analyzeBb(bb, successors, &successorsCount,
[&](std::uint64_t address) {
if (processed.insert(address).second) {
workList.push_back(address);
}
});
std::size_t size = analyzeBb(bb, successors, &successorsCount);
bb->setSize(size);
if (successorsCount == 2) {
auto succ0Address = successors[0];
auto succ1Address = successors[1];
branches.push_back(
{address + size - 4, 2, {successors[0], successors[1]}});

View file

@ -2,21 +2,16 @@
#include "CfBuilder.hpp"
#include "ConverterContext.hpp"
#include "Fragment.hpp"
#include "FragmentTerminator.hpp"
#include "Instruction.hpp"
#include "RegisterId.hpp"
#include "RegisterState.hpp"
#include "UniformBindings.hpp"
#include "amdgpu/RemoteMemory.hpp"
#include "cf.hpp"
#include "scf.hpp"
#include "util/unreachable.hpp"
#include <compare>
#include <cstddef>
#include <forward_list>
#include <memory>
#include <spirv/spirv.hpp>
#include <unordered_map>
#include <utility>
#include <vector>
static void printInstructions(const scf::PrintOptions &options, unsigned depth,
@ -365,9 +360,10 @@ private:
amdgpu::shader::Shader
amdgpu::shader::convert(RemoteMemory memory, Stage stage, std::uint64_t entry,
std::span<const std::uint32_t> userSpgrs,
int bindingOffset, std::uint32_t dimX,
std::uint32_t dimY, std::uint32_t dimZ) {
ConverterContext ctxt(memory, stage);
std::uint32_t dimX, std::uint32_t dimY,
std::uint32_t dimZ,
util::MemoryAreaTable<> &dependencies) {
ConverterContext ctxt(memory, stage, &dependencies);
auto &builder = ctxt.getBuilder();
builder.createCapability(spv::Capability::Shader);
builder.createCapability(spv::Capability::ImageQuery);
@ -412,9 +408,12 @@ amdgpu::shader::convert(RemoteMemory memory, Stage stage, std::uint64_t entry,
std::fflush(stdout);
mainFunction->exitFragment.outputs.clear();
std::size_t samplerCount = 0;
std::size_t imageCount = 0;
std::size_t bufferCount = 0;
for (auto &uniform : ctxt.getUniforms()) {
auto &newUniform = result.uniforms.emplace_back();
newUniform.binding = bindingOffset++;
for (int i = 0; i < 8; ++i) {
newUniform.buffer[i] = uniform.buffer[i];
@ -422,23 +421,29 @@ amdgpu::shader::convert(RemoteMemory memory, Stage stage, std::uint64_t entry,
std::uint32_t descriptorSet = 0;
switch (uniform.typeId) {
case TypeId::Sampler:
newUniform.kind = Shader::UniformKind::Sampler;
newUniform.binding =
UniformBindings::getSamplerBinding(stage, samplerCount++);
break;
case TypeId::Image2D:
newUniform.kind = Shader::UniformKind::Image;
newUniform.binding =
UniformBindings::getImageBinding(stage, imageCount++);
break;
default:
newUniform.kind = Shader::UniformKind::Buffer;
newUniform.binding =
UniformBindings::getBufferBinding(stage, bufferCount++);
break;
}
ctxt.getBuilder().createDecorate(
uniform.variable, spv::Decoration::DescriptorSet, {{descriptorSet}});
ctxt.getBuilder().createDecorate(uniform.variable, spv::Decoration::Binding,
{{newUniform.binding}});
switch (uniform.typeId) {
case TypeId::Sampler:
newUniform.kind = Shader::UniformKind::Sampler;
break;
case TypeId::Image2D:
newUniform.kind = Shader::UniformKind::Image;
break;
default:
newUniform.kind = Shader::UniformKind::Buffer;
break;
}
newUniform.accessOp = uniform.accessOp;
}

View file

@ -1568,6 +1568,10 @@ void convertSmrd(Fragment &fragment, Smrd inst) {
auto address =
*optLoAddress | (static_cast<std::uint64_t>(*optHiAddress) << 32);
fragment.context->dependencies->map(address + (inst.offset << 2),
address + (inst.offset << 2) +
sizeof(std::uint32_t) * count);
auto data =
memory.getPointer<std::uint32_t>(address + (inst.offset << 2));
for (std::uint32_t i = 0; i < count; ++i) {
@ -5574,6 +5578,8 @@ void amdgpu::shader::Fragment::convert(std::uint64_t size) {
auto ptr = context->getMemory().getPointer<std::uint32_t>(registers->pc);
auto endptr = ptr + size / sizeof(std::uint32_t);
context->dependencies->map(registers->pc, registers->pc + size);
while (ptr < endptr) {
Instruction inst(ptr);
// auto startPoint = builder.bodyRegion.getCurrentPosition();
@ -5615,6 +5621,8 @@ Value amdgpu::shader::Fragment::getRegister(RegisterId id) {
case 247:
return {context->getFloat32Type(), context->getFloat32(-4.0f)};
case 255: {
context->dependencies->map(registers->pc,
registers->pc + sizeof(std::uint32_t));
auto ptr = context->getMemory().getPointer<std::uint32_t>(registers->pc);
registers->pc += sizeof(std::uint32_t);
return {context->getUInt32Type(), context->getUInt32(*ptr)};

View file

@ -1,3 +1,4 @@
#include "amdgpu/RemoteMemory.hpp"
#include <algorithm>
#include <amdgpu/bridge/bridge.hpp>
#include <amdgpu/device/device.hpp>
@ -33,6 +34,7 @@ static void usage(std::FILE *out, const char *argv0) {
" --gpu <index> - specify physical gpu index to use, default is 0\n");
std::fprintf(out,
" --presenter <presenter mode> - set flip engine target\n");
std::fprintf(out, " --no-validation - disable validation layers\n");
std::fprintf(out, " -h, --help - show this message\n");
std::fprintf(out, "\n");
std::fprintf(out, " presenter mode:\n");
@ -52,6 +54,7 @@ int main(int argc, const char *argv[]) {
const char *shmName = "/rpcsx-os-memory";
unsigned long gpuIndex = 0;
auto presenter = PresenterMode::Window;
bool noValidation = false;
for (int i = 1; i < argc; ++i) {
if (argv[i] == std::string_view("--cmd-bridge")) {
@ -106,6 +109,11 @@ int main(int argc, const char *argv[]) {
continue;
}
if (argv[i] == std::string_view("--no-validation")) {
noValidation = true;
continue;
}
usage(stderr, argv[0]);
return 1;
}
@ -122,7 +130,7 @@ int main(int argc, const char *argv[]) {
auto requiredInstanceExtensions = std::vector<const char *>(
glfwExtensions, glfwExtensions + glfwExtensionCount);
bool enableValidation = true;
bool enableValidation = !noValidation;
if (enableValidation) {
requiredInstanceExtensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
@ -248,6 +256,7 @@ int main(int argc, const char *argv[]) {
// VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
VK_EXT_SEPARATE_STENCIL_USAGE_EXTENSION_NAME,
VK_KHR_SWAPCHAIN_EXTENSION_NAME,
VK_EXT_SHADER_OBJECT_EXTENSION_NAME,
};
if (isDeviceExtensionSupported(VK_EXT_DEBUG_MARKER_EXTENSION_NAME)) {
@ -404,9 +413,16 @@ int main(int argc, const char *argv[]) {
}
}
VkPhysicalDeviceShaderObjectFeaturesEXT shaderObjectFeatures{
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_OBJECT_FEATURES_EXT,
.shaderObject = VK_TRUE};
VkPhysicalDeviceVulkan13Features phyDevFeatures13{
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES,
.maintenance4 = VK_TRUE};
.pNext = &shaderObjectFeatures,
.dynamicRendering = VK_TRUE,
.maintenance4 = VK_TRUE,
};
VkPhysicalDeviceVulkan12Features phyDevFeatures12{
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,