[amdgpu] WIP cache implementation

DH 2023-07-24 06:03:55 +03:00
parent 306fecf2ab
commit 760aea0e49
13 changed files with 1823 additions and 1302 deletions

View file

@ -1,15 +1,11 @@
#pragma once
#include "amdgpu/RemoteMemory.hpp"
#include "amdgpu/bridge/bridge.hpp"
#include "amdgpu/shader/Converter.hpp"
#include "amdgpu/shader/Instruction.hpp"
#include "util/Verify.hpp"
#include "util/area.hpp"
#include <cassert>
#include <map>
#include <set>
#include <string>
#include <vector>
#include <vulkan/vulkan_core.h>
namespace amdgpu::device {
@ -1264,199 +1260,12 @@ struct GnmTBuffer {
static_assert(sizeof(GnmTBuffer) == sizeof(std::uint64_t) * 4);
struct ShaderModule {
VkPipeline pipeline;
VkPipelineLayout pipelineLayout;
VkDescriptorSetLayout descriptorSetLayout;
VkDescriptorPool descriptorPool;
void destroy() const;
};
constexpr auto kPageSize = 0x4000;
struct AreaInfo {
std::uint64_t beginAddress;
std::uint64_t endAddress;
};
struct NoInvalidationHandle {
void handleInvalidation(std::uint64_t) {}
};
struct StdSetInvalidationHandle {
std::set<std::uint64_t, std::greater<>> invalidated;
void handleInvalidation(std::uint64_t address) {
invalidated.insert(address);
}
};
template <typename InvalidationHandleT = NoInvalidationHandle>
class MemoryAreaTable : public InvalidationHandleT {
enum class Kind { O, X };
std::map<std::uint64_t, Kind> mAreas;
public:
class iterator {
using map_iterator = typename std::map<std::uint64_t, Kind>::iterator;
map_iterator it;
public:
iterator() = default;
iterator(map_iterator it) : it(it) {}
AreaInfo operator*() const { return {it->first, std::next(it)->first}; }
iterator &operator++() const {
++it;
++it;
return *this;
}
iterator &operator--() const {
--it;
--it;
return *this;
}
bool operator==(iterator other) const { return it == other.it; }
bool operator!=(iterator other) const { return it != other.it; }
};
iterator begin() { return iterator(mAreas.begin()); }
iterator end() { return iterator(mAreas.end()); }
void clear() { mAreas.clear(); }
AreaInfo queryArea(std::uint64_t address) const {
auto it = mAreas.lower_bound(address);
assert(it != mAreas.end());
std::uint64_t endAddress = 0;
if (it->first != address) {
assert(it->second == Kind::X);
endAddress = it->first;
--it;
} else {
assert(it->second == Kind::O);
endAddress = std::next(it)->first;
}
auto startAddress = std::uint64_t(it->first);
return {startAddress, endAddress};
}
void map(std::uint64_t beginAddress, std::uint64_t endAddress) {
auto [beginIt, beginInserted] = mAreas.emplace(beginAddress, Kind::O);
auto [endIt, endInserted] = mAreas.emplace(endAddress, Kind::X);
if (!beginInserted) {
if (beginIt->second == Kind::X) {
// it was close, extend to open
assert(beginIt != mAreas.begin());
--beginIt;
}
} else if (beginIt != mAreas.begin()) {
auto prevRangePointIt = std::prev(beginIt);
if (prevRangePointIt->second == Kind::O) {
// we found range start before inserted one, remove insertion and extend
// begin
this->handleInvalidation(beginIt->first);
mAreas.erase(beginIt);
beginIt = prevRangePointIt;
}
}
if (!endInserted) {
if (endIt->second == Kind::O) {
// it was open, extend to close
assert(endIt != mAreas.end());
++endIt;
}
} else {
auto nextRangePointIt = std::next(endIt);
if (nextRangePointIt != mAreas.end() &&
nextRangePointIt->second == Kind::X) {
// we found range end after inserted one, remove insertion and extend
// end
this->handleInvalidation(std::prev(endIt)->first);
mAreas.erase(endIt);
endIt = nextRangePointIt;
}
}
// eat everything in middle of the range
++beginIt;
while (beginIt != endIt) {
this->handleInvalidation(std::prev(endIt)->first);
beginIt = mAreas.erase(beginIt);
}
}
void unmap(std::uint64_t beginAddress, std::uint64_t endAddress) {
auto beginIt = mAreas.lower_bound(beginAddress);
if (beginIt == mAreas.end() || beginIt->first >= endAddress) {
return;
}
if (beginIt->first > beginAddress && beginIt->second == Kind::X) {
// we have found end after unmap begin, need to insert new end
this->handleInvalidation(std::prev(beginIt)->first);
auto newBeginIt = mAreas.emplace_hint(beginIt, beginAddress, Kind::X);
mAreas.erase(beginIt);
if (newBeginIt == mAreas.end()) {
return;
}
beginIt = std::next(newBeginIt);
} else if (beginIt->second == Kind::X) {
beginIt = ++beginIt;
}
Kind lastKind = Kind::X;
while (beginIt != mAreas.end() && beginIt->first <= endAddress) {
lastKind = beginIt->second;
if (lastKind == Kind::O) {
this->handleInvalidation(std::prev(beginIt)->first);
}
beginIt = mAreas.erase(beginIt);
}
if (lastKind != Kind::O) {
return;
}
// Last removed was range open, need to insert new one at unmap end
mAreas.emplace_hint(beginIt, endAddress, Kind::O);
}
std::size_t totalMemory() const {
std::size_t result = 0;
for (auto it = mAreas.begin(), end = mAreas.end(); it != end; ++it) {
auto rangeBegin = it;
auto rangeEnd = ++it;
result += rangeEnd->first - rangeBegin->first;
}
return result;
}
};
extern MemoryAreaTable<StdSetInvalidationHandle> memoryAreaTable;
struct DrawContext {
VkPipelineCache pipelineCache;
VkQueue queue;
VkCommandPool commandPool;
std::vector<VkShaderModule> loadedShaderModules;
~DrawContext();
};
void setVkDevice(VkDevice device,

View file

@ -0,0 +1,303 @@
#pragma once
#include <atomic>
#include <bit>
#include <condition_variable>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>
namespace amdgpu::device {
template <typename T> class Ref {
T *m_ref = nullptr;
public:
Ref() = default;
Ref(std::nullptr_t) {}
template <typename OT>
requires(std::is_base_of_v<T, OT>)
Ref(OT *ref) : m_ref(ref) {
if (m_ref != nullptr) {
ref->incRef();
}
}
template <typename OT>
requires(std::is_base_of_v<T, OT>)
Ref(const Ref<OT> &other) : m_ref(other.get()) {
if (m_ref != nullptr) {
m_ref->incRef();
}
}
template <typename OT>
requires(std::is_base_of_v<T, OT>)
Ref(Ref<OT> &&other) : m_ref(other.release()) {}
Ref(const Ref &other) : m_ref(other.get()) {
if (m_ref != nullptr) {
m_ref->incRef();
}
}
Ref(Ref &&other) : m_ref(other.release()) {}
template <typename OT>
requires(std::is_base_of_v<T, OT>)
Ref &operator=(Ref<OT> &&other) {
other.swap(*this);
return *this;
}
template <typename OT>
requires(std::is_base_of_v<T, OT>)
Ref &operator=(OT *other) {
*this = Ref(other);
return *this;
}
template <typename OT>
requires(std::is_base_of_v<T, OT>)
Ref &operator=(const Ref<OT> &other) {
*this = Ref(other);
return *this;
}
Ref &operator=(const Ref &other) {
*this = Ref(other);
return *this;
}
Ref &operator=(Ref &&other) {
other.swap(*this);
return *this;
}
~Ref() {
if (m_ref != nullptr) {
m_ref->decRef();
}
}
void swap(Ref<T> &other) { std::swap(m_ref, other.m_ref); }
T *get() const { return m_ref; }
T *release() { return std::exchange(m_ref, nullptr); }
T *operator->() const { return m_ref; }
explicit operator bool() const { return m_ref != nullptr; }
bool operator==(std::nullptr_t) const { return m_ref == nullptr; }
bool operator!=(std::nullptr_t) const { return m_ref != nullptr; }
auto operator<=>(const T *other) const { return m_ref <=> other; }
auto operator<=>(const Ref &other) const = default;
};
enum class TaskState { InProgress, Complete, Canceled };
struct AsyncTaskCtl {
std::atomic<unsigned> refs{0};
std::atomic<TaskState> stateStorage{TaskState::InProgress};
virtual ~AsyncTaskCtl() = default;
void incRef() { refs.fetch_add(1, std::memory_order::relaxed); }
void decRef() {
if (refs.fetch_sub(1, std::memory_order::relaxed) == 1) {
delete this;
}
}
bool isCanceled() const {
return stateStorage.load(std::memory_order::relaxed) == TaskState::Canceled;
}
bool isComplete() const {
return stateStorage.load(std::memory_order::relaxed) == TaskState::Complete;
}
bool isInProgress() const {
return stateStorage.load(std::memory_order::relaxed) ==
TaskState::InProgress;
}
void cancel() {
auto state = TaskState::InProgress;
while (state == TaskState::InProgress) {
if (stateStorage.compare_exchange_weak(state, TaskState::Canceled,
std::memory_order::relaxed)) {
break;
}
}
stateStorage.notify_all();
}
void complete() {
auto state = TaskState::InProgress;
while (state != TaskState::Complete) {
if (stateStorage.compare_exchange_weak(state, TaskState::Complete,
std::memory_order::relaxed)) {
break;
}
}
stateStorage.notify_all();
}
void wait() {
stateStorage.wait(TaskState::InProgress, std::memory_order::relaxed);
}
virtual void invoke() = 0;
};
namespace detail {
template <typename T>
concept LambdaWithoutClosure = requires(T t) { +t; };
}
template <typename T> struct AsyncTask;
template <typename T>
requires(std::is_invocable_r_v<bool, T, const AsyncTaskCtl &> &&
detail::LambdaWithoutClosure<T>)
struct AsyncTask<T> : AsyncTaskCtl {
static constexpr bool (*fn)(const AsyncTaskCtl &) = +std::declval<T>();
AsyncTask() = default;
AsyncTask(T &&) {}
void invoke() override {
auto &base = *static_cast<const AsyncTaskCtl *>(this);
if (fn(base)) {
complete();
}
}
};
template <typename T>
requires std::is_invocable_r_v<bool, T, const AsyncTaskCtl &>
Ref<AsyncTaskCtl> createTask(T &&task) {
return Ref<AsyncTaskCtl>(new AsyncTask<T>(std::forward<T>(task)));
}
template <typename T>
requires(std::is_invocable_r_v<bool, T, const AsyncTaskCtl &> &&
!detail::LambdaWithoutClosure<T>)
struct AsyncTask<T> : AsyncTaskCtl {
alignas(T) std::byte taskStorage[sizeof(T)];
AsyncTask() = default;
AsyncTask(T &&t) { new (taskStorage) T(std::forward<T>(t)); }
AsyncTask &operator=(T &&t) {
new (taskStorage) T(std::forward<T>(t));
return *this;
}
void invoke() override {
auto &lambda = *std::bit_cast<T *>(&taskStorage);
auto &base = *static_cast<const AsyncTaskCtl *>(this);
if (lambda(base)) {
complete();
}
std::bit_cast<T *>(&taskStorage)->~T();
}
};
class Scheduler;
class TaskSet {
std::vector<Ref<AsyncTaskCtl>> tasks;
public:
void append(Ref<AsyncTaskCtl> task) { tasks.push_back(std::move(task)); }
void wait() {
for (auto task : tasks) {
task->wait();
}
tasks.clear();
}
void enqueue(Scheduler &scheduler);
};
class Scheduler {
std::vector<std::thread> workThreads;
std::vector<Ref<AsyncTaskCtl>> tasks;
std::mutex taskMtx;
std::condition_variable taskCv;
std::atomic<bool> exit{false};
public:
explicit Scheduler(std::size_t threadCount) {
for (std::size_t i = 0; i < threadCount; ++i) {
workThreads.push_back(std::thread{[this] { entry(); }});
}
}
~Scheduler() {
exit = true;
taskCv.notify_all();
for (auto &thread : workThreads) {
thread.join();
}
}
template <typename T>
requires std::is_invocable_r_v<bool, T, const AsyncTaskCtl &>
Ref<AsyncTaskCtl> enqueue(T &&task) {
auto taskHandle = createTask(std::forward<T>(task));
enqueue(taskHandle);
return taskHandle;
}
void enqueue(Ref<AsyncTaskCtl> task) {
std::lock_guard lock(taskMtx);
tasks.push_back(std::move(task));
taskCv.notify_one();
}
template <typename T>
requires std::is_invocable_r_v<bool, T, const AsyncTaskCtl &>
void enqueue(TaskSet &set, T &&task) {
auto taskCtl = enqueue(std::forward<T>(task));
set.append(taskCtl);
}
private:
void entry() {
while (!exit.load(std::memory_order::relaxed)) {
Ref<AsyncTaskCtl> task;
if (task == nullptr) {
std::unique_lock lock(taskMtx);
if (tasks.empty()) {
taskCv.wait(lock);
}
if (tasks.empty()) {
continue;
}
task = std::move(tasks.back());
tasks.pop_back();
}
if (task != nullptr) {
task->invoke();
}
}
}
};
inline void TaskSet::enqueue(Scheduler &scheduler) {
for (auto task : tasks) {
scheduler.enqueue(std::move(task));
}
}
} // namespace amdgpu::device
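
The new header above (its file name is not shown in the diff) pairs an intrusive smart pointer with a small thread pool: Ref<T> expects the pointee to provide incRef()/decRef(), which AsyncTaskCtl does, and Scheduler hands queued tasks to worker threads that call invoke(). A usage sketch, assuming the header is reachable as amdgpu/device/Scheduler.hpp (a guessed path, since the diff does not name the file):

// Usage sketch for the scheduler above. The include path is a guess; the
// diff does not name the new header.
#include "amdgpu/device/Scheduler.hpp"
#include <cstdio>

int main() {
  using namespace amdgpu::device;

  Scheduler scheduler(4); // four worker threads run entry()

  TaskSet set;
  for (int i = 0; i < 8; ++i) {
    // A task returns true once it is finished; invoke() then marks the
    // AsyncTaskCtl as Complete and wakes anyone blocked in wait().
    scheduler.enqueue(set, [i](const AsyncTaskCtl &ctl) {
      if (ctl.isCanceled()) {
        return true; // canceled: nothing to do
      }
      std::printf("task %d done\n", i);
      return true;
    });
  }

  set.wait(); // blocks until every queued task has completed
}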

View file

@ -2,9 +2,12 @@
#include "tiler.hpp"
#include "util/VerifyVulkan.hpp"
#include "util/area.hpp"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <mutex>
#include <span>
#include <utility>
#include <vulkan/vulkan_core.h>
@ -150,78 +153,58 @@ struct DeviceMemoryRef {
VkDeviceSize offset = 0;
VkDeviceSize size = 0;
void *data = nullptr;
void *allocator = nullptr;
void (*release)(DeviceMemoryRef &memoryRef) = nullptr;
};
class MemoryResource {
DeviceMemory mMemory;
VkMemoryPropertyFlags mProperties = 0;
std::size_t mSize = 0;
std::size_t mAllocationOffset = 0;
char *mData = nullptr;
util::MemoryAreaTable<> table;
const char *debugName = "<unknown>";
std::mutex mMtx;
public:
MemoryResource(const MemoryResource &) = delete;
MemoryResource() = default;
MemoryResource(MemoryResource &&other) = default;
MemoryResource &operator=(MemoryResource &&other) = default;
~MemoryResource() {
if (mMemory.getHandle() != nullptr && mData != nullptr) {
vkUnmapMemory(g_vkDevice, mMemory.getHandle());
}
}
void clear() { mAllocationOffset = 0; }
static MemoryResource CreateFromFd(int fd, std::size_t size) {
void initFromHost(void *data, std::size_t size) {
assert(mMemory.getHandle() == nullptr);
auto properties = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
MemoryResource result;
result.mMemory = DeviceMemory::CreateExternalFd(
fd, size, findPhysicalMemoryTypeIndex(~0, properties));
result.mProperties = properties;
result.mSize = size;
return result;
mMemory = DeviceMemory::CreateExternalHostMemory(data, size, properties);
table.map(0, size);
debugName = "direct";
}
static MemoryResource CreateFromHost(void *data, std::size_t size) {
void initHostVisible(std::size_t size) {
assert(mMemory.getHandle() == nullptr);
auto properties = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
MemoryResource result;
result.mMemory =
DeviceMemory::CreateExternalHostMemory(data, size, properties);
result.mProperties = properties;
result.mSize = size;
return result;
}
static MemoryResource CreateHostVisible(std::size_t size) {
auto properties = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
MemoryResource result;
result.mMemory = DeviceMemory::Allocate(size, ~0, properties);
result.mProperties = properties;
result.mSize = size;
auto memory = DeviceMemory::Allocate(size, ~0, properties);
void *data = nullptr;
Verify() << vkMapMemory(g_vkDevice, result.mMemory.getHandle(), 0, size, 0,
&data);
result.mData = reinterpret_cast<char *>(data);
Verify() << vkMapMemory(g_vkDevice, memory.getHandle(), 0, size, 0, &data);
return result;
mMemory = std::move(memory);
table.map(0, size);
mData = reinterpret_cast<char *>(data);
debugName = "host";
}
static MemoryResource CreateDeviceLocal(std::size_t size) {
void initDeviceLocal(std::size_t size) {
assert(mMemory.getHandle() == nullptr);
auto properties = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
MemoryResource result;
result.mMemory = DeviceMemory::Allocate(size, ~0, properties);
result.mProperties = properties;
result.mSize = size;
return result;
mMemory = DeviceMemory::Allocate(size, ~0, properties);
table.map(0, size);
debugName = "local";
}
DeviceMemoryRef allocate(VkMemoryRequirements requirements) {
@ -230,22 +213,55 @@ public:
util::unreachable();
}
auto offset = (mAllocationOffset + requirements.alignment - 1) &
~(requirements.alignment - 1);
mAllocationOffset = offset + requirements.size;
if (mAllocationOffset > mSize) {
util::unreachable("out of memory resource");
std::lock_guard lock(mMtx);
for (auto elem : table) {
auto offset = (elem.beginAddress + requirements.alignment - 1) &
~(requirements.alignment - 1);
if (offset >= elem.endAddress) {
continue;
}
auto blockSize = elem.endAddress - offset;
if (blockSize < requirements.size) {
continue;
}
table.unmap(offset, offset + requirements.size);
return {mMemory.getHandle(),
offset,
requirements.size,
mData,
this,
[](DeviceMemoryRef &memoryRef) {
auto self =
reinterpret_cast<MemoryResource *>(memoryRef.allocator);
self->deallocate(memoryRef);
}};
}
return {mMemory.getHandle(), offset, requirements.size, mData};
util::unreachable("out of memory resource");
}
void deallocate(DeviceMemoryRef memory) {
std::lock_guard lock(mMtx);
table.map(memory.offset, memory.offset + memory.size);
}
void dump() {
std::lock_guard lock(mMtx);
for (auto elem : table) {
std::fprintf(stderr, "%zu - %zu\n", elem.beginAddress, elem.endAddress);
}
}
DeviceMemoryRef getFromOffset(std::uint64_t offset, std::size_t size) {
return {mMemory.getHandle(), offset, size, nullptr};
return {mMemory.getHandle(), offset, size, nullptr, nullptr, nullptr};
}
std::size_t getSize() const { return mSize; }
explicit operator bool() const { return mMemory.getHandle() != nullptr; }
};
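
With this change, allocate() no longer bumps a linear offset; it walks the free ranges recorded in the util::MemoryAreaTable, aligns each candidate, and reserves the winning block by unmapping it, while deallocate() simply maps the block back. A simplified, self-contained illustration of that first-fit-with-alignment search, using a plain std::map of free ranges instead of the real table:

// Simplified, self-contained illustration of the first-fit search that
// allocate() performs above: walk the free ranges in address order, align
// the candidate offset upwards, and carve the block out of the first range
// that still fits. Uses a plain std::map instead of util::MemoryAreaTable.
#include <cstdint>
#include <cstdio>
#include <map>
#include <optional>

using FreeRanges = std::map<std::uint64_t, std::uint64_t>; // begin -> end

std::optional<std::uint64_t> firstFit(FreeRanges &freeRanges,
                                      std::uint64_t size,
                                      std::uint64_t alignment) {
  for (auto it = freeRanges.begin(); it != freeRanges.end(); ++it) {
    auto [begin, end] = *it;
    // Alignment is a power of two, as VkMemoryRequirements guarantees.
    auto offset = (begin + alignment - 1) & ~(alignment - 1);
    if (offset >= end || end - offset < size) {
      continue; // range too small once aligned
    }
    // Shrink or split the free range around the reserved block.
    freeRanges.erase(it);
    if (offset > begin) {
      freeRanges.emplace(begin, offset);
    }
    if (offset + size < end) {
      freeRanges.emplace(offset + size, end);
    }
    return offset;
  }
  return std::nullopt; // out of memory resource
}

int main() {
  FreeRanges freeRanges{{0, 1 << 20}}; // one 1 MiB arena, entirely free
  auto a = firstFit(freeRanges, 0x300, 0x100);
  auto b = firstFit(freeRanges, 0x80, 0x40);
  std::printf("a=0x%llx b=0x%llx\n", (unsigned long long)*a,
              (unsigned long long)*b);
}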
@ -364,6 +380,10 @@ public:
~Buffer() {
if (mBuffer != nullptr) {
vkDestroyBuffer(g_vkDevice, mBuffer, g_vkAllocator);
if (mMemory.release != nullptr) {
mMemory.release(mMemory);
}
}
}
@ -589,12 +609,13 @@ public:
return requirements;
}
void readFromBuffer(VkCommandBuffer cmdBuffer, const Buffer &buffer,
VkImageAspectFlags destAspect) {
void readFromBuffer(VkCommandBuffer cmdBuffer, VkBuffer buffer,
VkImageAspectFlags destAspect,
VkDeviceSize bufferOffset = 0) {
transitionLayout(cmdBuffer, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
VkBufferImageCopy region{};
region.bufferOffset = 0;
region.bufferOffset = bufferOffset;
region.bufferRowLength = 0;
region.bufferImageHeight = 0;
region.imageSubresource.aspectMask = destAspect;
@ -604,11 +625,11 @@ public:
region.imageOffset = {0, 0, 0};
region.imageExtent = {mWidth, mHeight, 1};
vkCmdCopyBufferToImage(cmdBuffer, buffer.getHandle(), mImage,
vkCmdCopyBufferToImage(cmdBuffer, buffer, mImage,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);
}
void writeToBuffer(VkCommandBuffer cmdBuffer, const Buffer &buffer,
void writeToBuffer(VkCommandBuffer cmdBuffer, VkBuffer buffer,
VkImageAspectFlags sourceAspect) {
transitionLayout(cmdBuffer, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
@ -624,8 +645,8 @@ public:
region.imageExtent = {mWidth, mHeight, 1};
vkCmdCopyImageToBuffer(cmdBuffer, mImage,
VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
buffer.getHandle(), 1, &region);
VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, 1,
&region);
}
[[nodiscard]] Buffer writeToBuffer(VkCommandBuffer cmdBuffer,
@ -635,7 +656,7 @@ public:
pool, getMemoryRequirements().size,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
writeToBuffer(cmdBuffer, transferBuffer, sourceAspect);
writeToBuffer(cmdBuffer, transferBuffer.getHandle(), sourceAspect);
return transferBuffer;
}
@ -661,7 +682,7 @@ public:
transferBuffer.readFromImage(address, bpp, tileMode, width, height, 1,
pitch);
readFromBuffer(cmdBuffer, transferBuffer, destAspect);
readFromBuffer(cmdBuffer, transferBuffer.getHandle(), destAspect);
return transferBuffer;
}
@ -736,6 +757,7 @@ class Image2D {
VkImageLayout mLayout = {};
unsigned mWidth = 0;
unsigned mHeight = 0;
DeviceMemoryRef mMemory;
public:
Image2D(const Image2D &) = delete;
@ -746,6 +768,10 @@ public:
~Image2D() {
if (mImage != nullptr) {
vkDestroyImage(g_vkDevice, mImage, g_vkAllocator);
if (mMemory.release != nullptr) {
mMemory.release(mMemory);
}
}
}
@ -829,6 +855,7 @@ public:
void bindMemory(DeviceMemoryRef memory) {
Verify() << vkBindImageMemory(g_vkDevice, mImage, memory.deviceMemory,
memory.offset);
mMemory = memory;
}
friend ImageRef;
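
Buffer and Image2D now keep the DeviceMemoryRef they were bound to and, in their destructors, fire its release callback, which routes the block back to the MemoryResource that handed it out. A standalone sketch of that callback pattern; Pool and BlockRef are illustrative names, not types from this codebase:

// Standalone sketch of the release-callback pattern used by DeviceMemoryRef:
// the pool stores itself in `allocator` plus a function pointer that routes
// the block back to it, and the owning object fires it on destruction.
#include <cstdint>
#include <cstdio>

struct BlockRef {
  std::uint64_t offset = 0;
  std::uint64_t size = 0;
  void *allocator = nullptr;
  void (*release)(BlockRef &ref) = nullptr;
};

struct Pool {
  std::uint64_t nextOffset = 0;

  BlockRef allocate(std::uint64_t size) {
    BlockRef ref{nextOffset, size, this, [](BlockRef &r) {
                   static_cast<Pool *>(r.allocator)->deallocate(r);
                 }};
    nextOffset += size;
    return ref;
  }

  void deallocate(const BlockRef &ref) {
    std::printf("released [0x%llx, 0x%llx)\n", (unsigned long long)ref.offset,
                (unsigned long long)(ref.offset + ref.size));
  }
};

int main() {
  Pool pool;
  BlockRef block = pool.allocate(256);
  // Buffer::~Buffer and Image2D::~Image2D do the equivalent with mMemory:
  if (block.release != nullptr) {
    block.release(block);
  }
}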

File diff suppressed because it is too large.

View file

@ -0,0 +1,182 @@
#pragma once
#include <cassert>
#include <cstdint>
#include <map>
#include <set>
namespace util {
struct AreaInfo {
std::uint64_t beginAddress;
std::uint64_t endAddress;
};
struct NoInvalidationHandle {
void handleInvalidation(std::uint64_t) {}
};
struct StdSetInvalidationHandle {
std::set<std::uint64_t, std::greater<>> invalidated;
void handleInvalidation(std::uint64_t address) {
invalidated.insert(address);
}
};
template <typename InvalidationHandleT = NoInvalidationHandle>
class MemoryAreaTable : public InvalidationHandleT {
enum class Kind { O, X };
std::map<std::uint64_t, Kind> mAreas;
public:
class iterator {
using map_iterator = typename std::map<std::uint64_t, Kind>::iterator;
map_iterator it;
public:
iterator() = default;
iterator(map_iterator it) : it(it) {}
AreaInfo operator*() const { return {it->first, std::next(it)->first}; }
iterator &operator++() {
++it;
++it;
return *this;
}
iterator &operator--() {
--it;
--it;
return *this;
}
bool operator==(iterator other) const { return it == other.it; }
bool operator!=(iterator other) const { return it != other.it; }
};
iterator begin() { return iterator(mAreas.begin()); }
iterator end() { return iterator(mAreas.end()); }
void clear() { mAreas.clear(); }
AreaInfo queryArea(std::uint64_t address) const {
auto it = mAreas.lower_bound(address);
assert(it != mAreas.end());
std::uint64_t endAddress = 0;
if (it->first != address) {
assert(it->second == Kind::X);
endAddress = it->first;
--it;
} else {
assert(it->second == Kind::O);
endAddress = std::next(it)->first;
}
auto startAddress = std::uint64_t(it->first);
return {startAddress, endAddress};
}
void map(std::uint64_t beginAddress, std::uint64_t endAddress) {
auto [beginIt, beginInserted] = mAreas.emplace(beginAddress, Kind::O);
auto [endIt, endInserted] = mAreas.emplace(endAddress, Kind::X);
if (!beginInserted) {
if (beginIt->second == Kind::X) {
// it was close, extend to open
assert(beginIt != mAreas.begin());
--beginIt;
}
} else if (beginIt != mAreas.begin()) {
auto prevRangePointIt = std::prev(beginIt);
if (prevRangePointIt->second == Kind::O) {
// we found range start before inserted one, remove insertion and extend
// begin
this->handleInvalidation(beginIt->first);
mAreas.erase(beginIt);
beginIt = prevRangePointIt;
}
}
if (!endInserted) {
if (endIt->second == Kind::O) {
// it was open, extend to close
assert(endIt != mAreas.end());
++endIt;
}
} else {
auto nextRangePointIt = std::next(endIt);
if (nextRangePointIt != mAreas.end() &&
nextRangePointIt->second == Kind::X) {
// we found range end after inserted one, remove insertion and extend
// end
this->handleInvalidation(std::prev(endIt)->first);
mAreas.erase(endIt);
endIt = nextRangePointIt;
}
}
// eat everything in middle of the range
++beginIt;
while (beginIt != endIt) {
this->handleInvalidation(std::prev(endIt)->first);
beginIt = mAreas.erase(beginIt);
}
}
void unmap(std::uint64_t beginAddress, std::uint64_t endAddress) {
auto beginIt = mAreas.lower_bound(beginAddress);
if (beginIt == mAreas.end() || beginIt->first >= endAddress) {
return;
}
if (beginIt->first > beginAddress && beginIt->second == Kind::X) {
// we have found end after unmap begin, need to insert new end
this->handleInvalidation(std::prev(beginIt)->first);
auto newBeginIt = mAreas.emplace_hint(beginIt, beginAddress, Kind::X);
mAreas.erase(beginIt);
if (newBeginIt == mAreas.end()) {
return;
}
beginIt = std::next(newBeginIt);
} else if (beginIt->second == Kind::X) {
beginIt = ++beginIt;
}
Kind lastKind = Kind::X;
while (beginIt != mAreas.end() && beginIt->first <= endAddress) {
lastKind = beginIt->second;
if (lastKind == Kind::O) {
this->handleInvalidation(std::prev(beginIt)->first);
}
beginIt = mAreas.erase(beginIt);
}
if (lastKind != Kind::O) {
return;
}
// Last removed was range open, need to insert new one at unmap end
mAreas.emplace_hint(beginIt, endAddress, Kind::O);
}
std::size_t totalMemory() const {
std::size_t result = 0;
for (auto it = mAreas.begin(), end = mAreas.end(); it != end; ++it) {
auto rangeBegin = it;
auto rangeEnd = ++it;
result += rangeEnd->first - rangeBegin->first;
}
return result;
}
};
} // namespace util
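
MemoryAreaTable has moved here as util::MemoryAreaTable so both the device-side cache and the shader converter can use it. Areas are stored as alternating open (O) and close (X) points in one std::map; map() coalesces overlapping ranges and reports swallowed points to the invalidation handler. A small sketch, assuming the header lands at util/area.hpp as the includes elsewhere in this commit suggest:

// Sketch of map() coalescing and invalidation reporting, assuming this
// header is reachable as util/area.hpp.
#include "util/area.hpp"
#include <cstdio>

int main() {
  util::MemoryAreaTable<util::StdSetInvalidationHandle> table;

  table.map(0x1000, 0x2000);
  table.map(0x3000, 0x4000);
  table.map(0x1800, 0x3800); // bridges both areas into a single [0x1000, 0x4000)

  for (auto area : table) {
    std::printf("area [0x%llx, 0x%llx)\n",
                (unsigned long long)area.beginAddress,
                (unsigned long long)area.endAddress);
  }
  std::printf("total mapped: %zu bytes\n", table.totalMemory());

  // Points swallowed while coalescing were reported to the handler; a cache
  // can drop whatever it had built for those addresses.
  for (auto address : table.invalidated) {
    std::printf("invalidated 0x%llx\n", (unsigned long long)address);
  }
}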

View file

@ -4,6 +4,7 @@
#include "Stage.hpp"
#include <amdgpu/RemoteMemory.hpp>
#include <util/area.hpp>
#include <cstdint>
#include <span>
@ -25,7 +26,7 @@ struct Shader {
};
Shader convert(RemoteMemory memory, Stage stage, std::uint64_t entry,
std::span<const std::uint32_t> userSpgrs, int bindingOffset,
std::uint32_t dimX = 1, std::uint32_t dimY = 1,
std::uint32_t dimZ = 1);
std::span<const std::uint32_t> userSpgrs, std::uint32_t dimX,
std::uint32_t dimY, std::uint32_t dimZ,
util::MemoryAreaTable<> &dependencies);
} // namespace amdgpu::shader
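
convert() drops the bindingOffset parameter (bindings now come from the fixed UniformBindings layout added in this commit) and instead takes a util::MemoryAreaTable that it fills with every guest range the shader touched: the code range, SMRD constant loads, and inline literals. A sketch of how a caller could consume that table; the convert() call itself needs a live RemoteMemory, so it is only indicated in a comment:

// Sketch of consuming the dependency table convert() now fills; the call to
// convert() itself needs a live RemoteMemory, so it is only indicated here.
#include "util/area.hpp"
#include <cstdint>
#include <cstdio>

int main() {
  util::MemoryAreaTable<> dependencies;

  // In the real code path something like this runs first:
  //   auto shader = amdgpu::shader::convert(memory, stage, entryAddress,
  //                                         userSpgrs, dimX, dimY, dimZ,
  //                                         dependencies);
  // Here we stand in for it by recording two ranges the shader "read".
  dependencies.map(0x401000, 0x401200); // shader code
  dependencies.map(0x800000, 0x800040); // SMRD-loaded constants

  // A cache layer can now watch exactly these ranges for invalidation.
  for (auto range : dependencies) {
    std::printf("shader depends on [0x%llx, 0x%llx)\n",
                (unsigned long long)range.beginAddress,
                (unsigned long long)range.endAddress);
  }
}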

View file

@ -5,11 +5,11 @@
#include "Stage.hpp"
#include "TypeId.hpp"
#include "Uniform.hpp"
#include "util/area.hpp"
#include <amdgpu/RemoteMemory.hpp>
#include <forward_list>
#include <spirv/spirv-builder.hpp>
#include <unordered_map>
#include <util/unreachable.hpp>
#include <bit>
@ -96,8 +96,11 @@ class ConverterContext {
spirv::Function mDiscardFn;
public:
ConverterContext(RemoteMemory memory, Stage stage)
: mMemory(memory), mStage(stage) {
util::MemoryAreaTable<> *dependencies = nullptr;
ConverterContext(RemoteMemory memory, Stage stage,
util::MemoryAreaTable<> *dependencies)
: mStage(stage), mMemory(memory), dependencies(dependencies) {
mGlslStd450 = mBuilder.createExtInstImport("GLSL.std.450");
}

View file

@ -1,5 +1,5 @@
#pragma once
namespace amdgpu::shader {
enum class Stage { None, Vertex, Fragment, Geometry, Compute };
enum class Stage : unsigned char { None, Vertex, Fragment, Geometry, Compute };
}

View file

@ -0,0 +1,62 @@
#pragma once
#include "Stage.hpp"
#include "util/unreachable.hpp"
namespace amdgpu::shader {
struct UniformBindings {
static constexpr auto kBufferSlots = 16;
static constexpr auto kImageSlots = 16;
static constexpr auto kSamplerSlots = 16;
static constexpr auto kBufferOffset = 0;
static constexpr auto kImageOffset = kBufferOffset + kBufferSlots;
static constexpr auto kSamplerOffset = kImageOffset + kImageSlots;
static constexpr auto kStageSize = kSamplerOffset + kSamplerSlots;
static constexpr auto kVertexOffset = 0;
static constexpr auto kFragmentOffset = kStageSize;
static unsigned getBufferBinding(Stage stage, unsigned index) {
if (index >= kBufferSlots) {
util::unreachable();
}
return index + getStageOffset(stage) + kBufferOffset;
}
static unsigned getImageBinding(Stage stage, unsigned index) {
if (index >= kImageSlots) {
util::unreachable();
}
return index + getStageOffset(stage) + kImageOffset;
}
static unsigned getSamplerBinding(Stage stage, unsigned index) {
if (index >= kSamplerSlots) {
util::unreachable();
}
return index + getStageOffset(stage) + kSamplerOffset;
}
private:
static unsigned getStageOffset(Stage stage) {
switch (stage) {
case Stage::Fragment:
return kFragmentOffset;
case Stage::Vertex:
return kVertexOffset;
case Stage::Compute:
return kVertexOffset;
default:
util::unreachable();
}
}
};
} // namespace amdgpu::shader
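
UniformBindings replaces the running bindingOffset with a fixed layout: each stage gets 48 consecutive bindings (16 buffers, then 16 images, then 16 samplers), with the vertex/compute block starting at 0 and the fragment block at 48. A small sketch of the resulting numbers, including the header the same way Converter.cpp does below:

// Sketch of the flat binding layout defined above.
#include "UniformBindings.hpp"
#include <cstdio>

int main() {
  using amdgpu::shader::Stage;
  using amdgpu::shader::UniformBindings;

  // Vertex/compute occupy bindings [0, 48): buffers 0-15, images 16-31,
  // samplers 32-47.
  std::printf("vs buffer 0  -> %u\n",
              UniformBindings::getBufferBinding(Stage::Vertex, 0));
  std::printf("vs image 3   -> %u\n",
              UniformBindings::getImageBinding(Stage::Vertex, 3));
  // Fragment starts at kStageSize (48), so the same slots never collide.
  std::printf("fs buffer 0  -> %u\n",
              UniformBindings::getBufferBinding(Stage::Fragment, 0));
  std::printf("fs sampler 2 -> %u\n",
              UniformBindings::getSamplerBinding(Stage::Fragment, 2));
}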

View file

@ -12,7 +12,7 @@ struct CfgBuilder {
RemoteMemory memory;
std::size_t analyzeBb(cf::BasicBlock *bb, std::uint64_t *successors,
std::size_t *successorsCount, auto pushWork) {
std::size_t *successorsCount) {
auto address = bb->getAddress();
auto instBegin = memory.getPointer<std::uint32_t>(address);
auto instHex = instBegin;
@ -130,18 +130,10 @@ struct CfgBuilder {
std::uint64_t successors[2];
std::size_t successorsCount = 0;
std::size_t size = analyzeBb(bb, successors, &successorsCount,
[&](std::uint64_t address) {
if (processed.insert(address).second) {
workList.push_back(address);
}
});
std::size_t size = analyzeBb(bb, successors, &successorsCount);
bb->setSize(size);
if (successorsCount == 2) {
auto succ0Address = successors[0];
auto succ1Address = successors[1];
branches.push_back(
{address + size - 4, 2, {successors[0], successors[1]}});

View file

@ -2,21 +2,16 @@
#include "CfBuilder.hpp"
#include "ConverterContext.hpp"
#include "Fragment.hpp"
#include "FragmentTerminator.hpp"
#include "Instruction.hpp"
#include "RegisterId.hpp"
#include "RegisterState.hpp"
#include "UniformBindings.hpp"
#include "amdgpu/RemoteMemory.hpp"
#include "cf.hpp"
#include "scf.hpp"
#include "util/unreachable.hpp"
#include <compare>
#include <cstddef>
#include <forward_list>
#include <memory>
#include <spirv/spirv.hpp>
#include <unordered_map>
#include <utility>
#include <vector>
static void printInstructions(const scf::PrintOptions &options, unsigned depth,
@ -365,9 +360,10 @@ private:
amdgpu::shader::Shader
amdgpu::shader::convert(RemoteMemory memory, Stage stage, std::uint64_t entry,
std::span<const std::uint32_t> userSpgrs,
int bindingOffset, std::uint32_t dimX,
std::uint32_t dimY, std::uint32_t dimZ) {
ConverterContext ctxt(memory, stage);
std::uint32_t dimX, std::uint32_t dimY,
std::uint32_t dimZ,
util::MemoryAreaTable<> &dependencies) {
ConverterContext ctxt(memory, stage, &dependencies);
auto &builder = ctxt.getBuilder();
builder.createCapability(spv::Capability::Shader);
builder.createCapability(spv::Capability::ImageQuery);
@ -412,9 +408,12 @@ amdgpu::shader::convert(RemoteMemory memory, Stage stage, std::uint64_t entry,
std::fflush(stdout);
mainFunction->exitFragment.outputs.clear();
std::size_t samplerCount = 0;
std::size_t imageCount = 0;
std::size_t bufferCount = 0;
for (auto &uniform : ctxt.getUniforms()) {
auto &newUniform = result.uniforms.emplace_back();
newUniform.binding = bindingOffset++;
for (int i = 0; i < 8; ++i) {
newUniform.buffer[i] = uniform.buffer[i];
@ -422,23 +421,29 @@ amdgpu::shader::convert(RemoteMemory memory, Stage stage, std::uint64_t entry,
std::uint32_t descriptorSet = 0;
switch (uniform.typeId) {
case TypeId::Sampler:
newUniform.kind = Shader::UniformKind::Sampler;
newUniform.binding =
UniformBindings::getSamplerBinding(stage, samplerCount++);
break;
case TypeId::Image2D:
newUniform.kind = Shader::UniformKind::Image;
newUniform.binding =
UniformBindings::getImageBinding(stage, imageCount++);
break;
default:
newUniform.kind = Shader::UniformKind::Buffer;
newUniform.binding =
UniformBindings::getBufferBinding(stage, bufferCount++);
break;
}
ctxt.getBuilder().createDecorate(
uniform.variable, spv::Decoration::DescriptorSet, {{descriptorSet}});
ctxt.getBuilder().createDecorate(uniform.variable, spv::Decoration::Binding,
{{newUniform.binding}});
switch (uniform.typeId) {
case TypeId::Sampler:
newUniform.kind = Shader::UniformKind::Sampler;
break;
case TypeId::Image2D:
newUniform.kind = Shader::UniformKind::Image;
break;
default:
newUniform.kind = Shader::UniformKind::Buffer;
break;
}
newUniform.accessOp = uniform.accessOp;
}

View file

@ -1568,6 +1568,10 @@ void convertSmrd(Fragment &fragment, Smrd inst) {
auto address =
*optLoAddress | (static_cast<std::uint64_t>(*optHiAddress) << 32);
fragment.context->dependencies->map(address + (inst.offset << 2),
address + (inst.offset << 2) +
sizeof(std::uint32_t) * count);
auto data =
memory.getPointer<std::uint32_t>(address + (inst.offset << 2));
for (std::uint32_t i = 0; i < count; ++i) {
@ -5574,6 +5578,8 @@ void amdgpu::shader::Fragment::convert(std::uint64_t size) {
auto ptr = context->getMemory().getPointer<std::uint32_t>(registers->pc);
auto endptr = ptr + size / sizeof(std::uint32_t);
context->dependencies->map(registers->pc, registers->pc + size);
while (ptr < endptr) {
Instruction inst(ptr);
// auto startPoint = builder.bodyRegion.getCurrentPosition();
@ -5615,6 +5621,8 @@ Value amdgpu::shader::Fragment::getRegister(RegisterId id) {
case 247:
return {context->getFloat32Type(), context->getFloat32(-4.0f)};
case 255: {
context->dependencies->map(registers->pc,
registers->pc + sizeof(std::uint32_t));
auto ptr = context->getMemory().getPointer<std::uint32_t>(registers->pc);
registers->pc += sizeof(std::uint32_t);
return {context->getUInt32Type(), context->getUInt32(*ptr)};

View file

@ -1,3 +1,4 @@
#include "amdgpu/RemoteMemory.hpp"
#include <algorithm>
#include <amdgpu/bridge/bridge.hpp>
#include <amdgpu/device/device.hpp>
@ -33,6 +34,7 @@ static void usage(std::FILE *out, const char *argv0) {
" --gpu <index> - specify physical gpu index to use, default is 0\n");
std::fprintf(out,
" --presenter <presenter mode> - set flip engine target\n");
std::fprintf(out, " --no-validation - disable validation layers\n");
std::fprintf(out, " -h, --help - show this message\n");
std::fprintf(out, "\n");
std::fprintf(out, " presenter mode:\n");
@ -52,6 +54,7 @@ int main(int argc, const char *argv[]) {
const char *shmName = "/rpcsx-os-memory";
unsigned long gpuIndex = 0;
auto presenter = PresenterMode::Window;
bool noValidation = false;
for (int i = 1; i < argc; ++i) {
if (argv[i] == std::string_view("--cmd-bridge")) {
@ -106,6 +109,11 @@ int main(int argc, const char *argv[]) {
continue;
}
if (argv[i] == std::string_view("--no-validation")) {
noValidation = true;
continue;
}
usage(stderr, argv[0]);
return 1;
}
@ -122,7 +130,7 @@ int main(int argc, const char *argv[]) {
auto requiredInstanceExtensions = std::vector<const char *>(
glfwExtensions, glfwExtensions + glfwExtensionCount);
bool enableValidation = true;
bool enableValidation = !noValidation;
if (enableValidation) {
requiredInstanceExtensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
@ -248,6 +256,7 @@ int main(int argc, const char *argv[]) {
// VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
VK_EXT_SEPARATE_STENCIL_USAGE_EXTENSION_NAME,
VK_KHR_SWAPCHAIN_EXTENSION_NAME,
VK_EXT_SHADER_OBJECT_EXTENSION_NAME,
};
if (isDeviceExtensionSupported(VK_EXT_DEBUG_MARKER_EXTENSION_NAME)) {
@ -404,9 +413,16 @@ int main(int argc, const char *argv[]) {
}
}
VkPhysicalDeviceShaderObjectFeaturesEXT shaderObjectFeatures{
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_OBJECT_FEATURES_EXT,
.shaderObject = VK_TRUE};
VkPhysicalDeviceVulkan13Features phyDevFeatures13{
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES,
.maintenance4 = VK_TRUE};
.pNext = &shaderObjectFeatures,
.dynamicRendering = VK_TRUE,
.maintenance4 = VK_TRUE,
};
VkPhysicalDeviceVulkan12Features phyDevFeatures12{
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES,