Mirror of https://github.com/RPCSX/rpcsx.git (synced 2026-01-07 17:20:14 +01:00)

[amdgpu] device: implement multi queue support

Implement IT_INDIRECT_BUFFER, IT_INDEX_BASE and IT_DRAW_INDEX_OFFSET_2; add a new GPU task scheduler.

This commit is contained in:
parent ade074721e
commit 9e109918fd
hw/amdgpu/device/include/amdgpu/device/device.hpp

@@ -2,6 +2,7 @@
 #include "amdgpu/bridge/bridge.hpp"
 #include "amdgpu/shader/Instruction.hpp"
+#include "gpu-scheduler.hpp"
 #include "util/area.hpp"

 #include <string>

@@ -760,7 +761,7 @@ enum Opcodes {
   kOpcodeDISPATCH_DIRECT = 0x15,
   kOpcodeDISPATCH_INDIRECT = 0x16,
   kOpcodeINDIRECT_BUFFER_END = 0x17,
-  MODE_CONTROL = 0x18,
+  kOpcodeMODE_CONTROL = 0x18,
   kOpcodeATOMIC_GDS = 0x1D,
   kOpcodeATOMIC_MEM = 0x1E,
   kOpcodeOCCLUSION_QUERY = 0x1F,
@@ -773,7 +774,7 @@ enum Opcodes {
   kOpcodeINDEX_BASE = 0x26,
   kOpcodeDRAW_INDEX_2 = 0x27,
   kOpcodeCONTEXT_CONTROL = 0x28,
-  DRAW_INDEX_OFFSET = 0x29,
+  kOpcodeDRAW_INDEX_OFFSET = 0x29,
   kOpcodeINDEX_TYPE = 0x2A,
   kOpcodeDRAW_INDEX = 0x2B,
   kOpcodeDRAW_INDIRECT_MULTI = 0x2C,
@@ -805,11 +806,11 @@ enum Opcodes {
   kOpcodeEVENT_WRITE_EOS = 0x48,
   kOpcodeRELEASE_MEM = 0x49,
   kOpcodePREAMBLE_CNTL = 0x4A,
-  RB_OFFSET = 0x4B,
-  ALU_PS_CONST_BUFFER_COPY = 0x4C,
-  ALU_VS_CONST_BUFFER_COPY = 0x4D,
-  ALU_PS_CONST_UPDATE = 0x4E,
-  ALU_VS_CONST_UPDATE = 0x4F,
+  kOpcodeRB_OFFSET = 0x4B,
+  kOpcodeALU_PS_CONST_BUFFER_COPY = 0x4C,
+  kOpcodeALU_VS_CONST_BUFFER_COPY = 0x4D,
+  kOpcodeALU_PS_CONST_UPDATE = 0x4E,
+  kOpcodeALU_VS_CONST_UPDATE = 0x4F,
   kOpcodeDMA_DATA = 0x50,
   kOpcodeONE_REG_WRITE = 0x57,
   kOpcodeAQUIRE_MEM = 0x58,
@@ -826,12 +827,12 @@ enum Opcodes {
   kOpcodeSET_RESOURCE = 0x6D,
   kOpcodeSET_SAMPLER = 0x6E,
   kOpcodeSET_CTL_CONST = 0x6F,
-  SET_RESOURCE_OFFSET = 0x70,
-  SET_ALU_CONST_VS = 0x71,
-  SET_ALU_CONST_DI = 0x72,
+  kOpcodeSET_RESOURCE_OFFSET = 0x70,
+  kOpcodeSET_ALU_CONST_VS = 0x71,
+  kOpcodeSET_ALU_CONST_DI = 0x72,
   kOpcodeSET_CONTEXT_REG_INDIRECT = 0x73,
-  SET_RESOURCE_INDIRECT = 0x74,
-  SET_APPEND_CNT = 0x75,
+  kOpcodeSET_RESOURCE_INDIRECT = 0x74,
+  kOpcodeSET_APPEND_CNT = 0x75,
   kOpcodeSET_SH_REG = 0x76,
   kOpcodeSET_SH_REG_OFFSET = 0x77,
   kOpcodeSET_QUEUE_REG = 0x78,
@@ -1018,8 +1019,6 @@ inline const std::string opcodeToString(int op) {
 }

 inline void dumpShader(const std::uint32_t *data) {
-  int hackExit = 0;
-
   flockfile(stdout);
   while (true) {
     auto instHex = *data;
@@ -1262,29 +1261,22 @@ static_assert(sizeof(GnmTBuffer) == sizeof(std::uint64_t) * 4);

 constexpr auto kPageSize = 0x4000;

-struct DrawContext {
-  VkQueue queue;
-  VkCommandPool commandPool;
-};
-
 void setVkDevice(VkDevice device,
                  VkPhysicalDeviceMemoryProperties memProperties,
                  VkPhysicalDeviceProperties devProperties);

 struct AmdgpuDevice {
-  amdgpu::device::DrawContext dc;
-
   void handleProtectMemory(std::uint64_t address, std::uint64_t size,
                            std::uint32_t prot);
   void handleCommandBuffer(std::uint64_t queueId, std::uint64_t address,
                            std::uint64_t size);
-  bool handleFlip(std::uint32_t bufferIndex, std::uint64_t arg,
-                  VkCommandBuffer cmd, VkImage targetImage,
-                  VkExtent2D targetExtent, std::vector<VkBuffer> &usedBuffers,
-                  std::vector<VkImage> &usedImages);
+  bool handleFlip(VkQueue queue, VkCommandBuffer cmdBuffer,
+                  TaskChain &initTaskChain, std::uint32_t bufferIndex,
+                  std::uint64_t arg, VkImage targetImage,
+                  VkExtent2D targetExtent, VkSemaphore waitSemaphore,
+                  VkSemaphore signalSemaphore, VkFence fence);

-  AmdgpuDevice(amdgpu::device::DrawContext dc,
-               amdgpu::bridge::BridgeHeader *bridge);
+  AmdgpuDevice(amdgpu::bridge::BridgeHeader *bridge);

   ~AmdgpuDevice();
 };
hw/amdgpu/device/include/amdgpu/device/gpu-scheduler.hpp — new file (321 lines)
@@ -0,0 +1,321 @@
#pragma once

#include "scheduler.hpp"
#include "vk.hpp"
#include <atomic>
#include <concepts>
#include <cstdint>
#include <list>
#include <source_location>
#include <thread>
#include <utility>
#include <vulkan/vulkan_core.h>

namespace amdgpu::device {
enum class ProcessQueue {
  Graphics = 1 << 1,
  Compute = 1 << 2,
  Transfer = 1 << 3,
  Any = Graphics | Compute | Transfer
};

inline ProcessQueue operator|(ProcessQueue lhs, ProcessQueue rhs) {
  return static_cast<ProcessQueue>(std::to_underlying(lhs) |
                                   std::to_underlying(rhs));
}

inline ProcessQueue operator&(ProcessQueue lhs, ProcessQueue rhs) {
  return static_cast<ProcessQueue>(std::to_underlying(lhs) &
                                   std::to_underlying(rhs));
}

struct TaskChain;
class GpuScheduler;

Scheduler &getCpuScheduler();
GpuScheduler &getGpuScheduler(ProcessQueue queue);

struct GpuTaskLayout {
  static constexpr auto kInvalidId = 0; //~static_cast<std::uint64_t>(0);

  Ref<TaskChain> chain;
  std::uint64_t id;
  std::uint64_t waitId = kInvalidId;
  VkPipelineStageFlags waitStage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;

  std::function<void(VkCommandBuffer)> invoke;
  std::function<void(VkQueue, VkCommandBuffer)> submit;
};

struct TaskChain {
  vk::Semaphore semaphore;
  std::uint64_t nextTaskId = 1;
  std::atomic<unsigned> refs{0};
  std::vector<std::source_location> taskLocations;

  void incRef() { refs.fetch_add(1, std::memory_order::relaxed); }
  void decRef() {
    if (refs.fetch_sub(1, std::memory_order::relaxed) == 1) {
      delete this;
    }
  }

  static Ref<TaskChain> Create() {
    auto result = new TaskChain();
    result->semaphore = vk::Semaphore::Create();
    return result;
  }

  std::uint64_t add(ProcessQueue queue, std::uint64_t waitId,
                    std::function<void(VkCommandBuffer)> invoke);

  std::uint64_t add(ProcessQueue queue,
                    std::function<void(VkCommandBuffer)> invoke) {
    return add(queue, GpuTaskLayout::kInvalidId, std::move(invoke));
  }

  template <typename T>
    requires requires(T &&t) {
      { t() } -> std::same_as<void>;
    }
  std::uint64_t add(std::uint64_t waitId, T &&task) {
    auto prevTaskId = getLastTaskId();
    auto id = nextTaskId++;
    auto cpuTask =
        createCpuTask([=, task = std::forward<T>(task),
                       self = Ref(this)](const AsyncTaskCtl &) mutable {
          if (waitId != GpuTaskLayout::kInvalidId) {
            if (self->semaphore.getCounterValue() < waitId) {
              return TaskResult::Reschedule;
            }
          }

          task();

          if (prevTaskId != GpuTaskLayout::kInvalidId && waitId != prevTaskId) {
            self->wait(prevTaskId);
          }

          self->semaphore.signal(id);
          return TaskResult::Complete;
        });
    getCpuScheduler().enqueue(std::move(cpuTask));
    return id;
  }

  template <typename T>
    requires requires(T &&t) {
      { t() } -> std::same_as<void>;
    }
  std::uint64_t add(T &&task) {
    return add(GpuTaskLayout::kInvalidId, std::forward<T>(task));
  }

  std::uint64_t getLastTaskId() const { return nextTaskId - 1; }

  std::uint64_t createExternalTask() { return nextTaskId++; }
  void notifyExternalTaskComplete(std::uint64_t id) { semaphore.signal(id); }

  bool isComplete() const { return isComplete(getLastTaskId()); }

  bool isComplete(std::uint64_t task) const {
    return semaphore.getCounterValue() >= task;
  }

  bool empty() const { return getLastTaskId() == GpuTaskLayout::kInvalidId; }

  void wait(std::uint64_t task = GpuTaskLayout::kInvalidId) const {
    if (empty()) {
      return;
    }

    if (task == GpuTaskLayout::kInvalidId) {
      task = getLastTaskId();
    }

    Verify() << semaphore.wait(task, UINT64_MAX);
  }
};

class GpuScheduler {
  std::list<std::thread> workThreads;
  std::vector<GpuTaskLayout> tasks;
  std::vector<GpuTaskLayout> delayedTasks;
  std::mutex taskMtx;
  std::condition_variable taskCv;
  std::atomic<bool> exit{false};
  std::string debugName;

public:
  explicit GpuScheduler(std::span<std::pair<VkQueue, std::uint32_t>> queues,
                        std::string debugName)
      : debugName(debugName) {
    for (std::size_t index = 0; auto [queue, queueFamilyIndex] : queues) {
      workThreads.push_back(std::thread{[=, this] {
        setThreadName(
            ("GPU " + std::to_string(index) + " " + debugName).c_str());
        entry(queue, queueFamilyIndex);
      }});

      ++index;
    }
  }

  ~GpuScheduler() {
    exit = true;
    taskCv.notify_all();

    for (auto &thread : workThreads) {
      thread.join();
    }
  }

  void enqueue(GpuTaskLayout &&task) {
    std::lock_guard lock(taskMtx);
    tasks.push_back(std::move(task));
    taskCv.notify_one();
  }

private:
  void submitTask(VkCommandPool pool, VkQueue queue, GpuTaskLayout &task) {
    VkCommandBuffer cmdBuffer;
    {
      VkCommandBufferAllocateInfo allocateInfo{
          .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
          .commandPool = pool,
          .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
          .commandBufferCount = 1,
      };

      Verify() << vkAllocateCommandBuffers(vk::g_vkDevice, &allocateInfo,
                                           &cmdBuffer);

      VkCommandBufferBeginInfo beginInfo{
          .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
          .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
      };

      vkBeginCommandBuffer(cmdBuffer, &beginInfo);
    }

    task.invoke(cmdBuffer);

    vkEndCommandBuffer(cmdBuffer);

    if (task.submit) {
      task.submit(queue, cmdBuffer);
      return;
    }

    VkSemaphoreSubmitInfo signalSemSubmitInfo = {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
        .semaphore = task.chain->semaphore.getHandle(),
        .value = task.id,
        .stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
    };

    VkSemaphoreSubmitInfo waitSemSubmitInfo = {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
        .semaphore = task.chain->semaphore.getHandle(),
        .value = task.waitId,
        .stageMask = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
    };

    VkCommandBufferSubmitInfo cmdBufferSubmitInfo{
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
        .commandBuffer = cmdBuffer,
    };

    VkSubmitInfo2 submitInfo{
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
        .waitSemaphoreInfoCount =
            static_cast<std::uint32_t>(task.waitId ? 1 : 0),
        .pWaitSemaphoreInfos = &waitSemSubmitInfo,
        .commandBufferInfoCount = 1,
        .pCommandBufferInfos = &cmdBufferSubmitInfo,
        .signalSemaphoreInfoCount = 1,
        .pSignalSemaphoreInfos = &signalSemSubmitInfo,
    };

    Verify() << vkQueueSubmit2(queue, 1, &submitInfo, VK_NULL_HANDLE);

    // if (task.signalChain->semaphore.wait(
    //         task.id, std::chrono::duration_cast<std::chrono::nanoseconds>(
    //                      std::chrono::seconds(10))
    //                      .count())) {
    //   util::unreachable("gpu operation takes too long time. wait id = %lu\n",
    //                     task.waitId);
    // }
  }

  void entry(VkQueue queue, std::uint32_t queueFamilyIndex) {
    VkCommandPool pool;
    {
      VkCommandPoolCreateInfo poolCreateInfo{
          .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
          .queueFamilyIndex = queueFamilyIndex};

      Verify() << vkCreateCommandPool(vk::g_vkDevice, &poolCreateInfo,
                                      vk::g_vkAllocator, &pool);
    }

    while (!exit.load(std::memory_order::relaxed)) {
      GpuTaskLayout task;

      {
        std::unique_lock lock(taskMtx);

        while (tasks.empty()) {
          if (tasks.empty() && delayedTasks.empty()) {
            taskCv.wait(lock);
          }

          if (tasks.empty()) {
            std::swap(delayedTasks, tasks);
          }
        }

        task = std::move(tasks.back());
        tasks.pop_back();
      }

      if (task.waitId != GpuTaskLayout::kInvalidId &&
          !task.chain->isComplete(task.waitId)) {
        std::unique_lock lock(taskMtx);
        delayedTasks.push_back(std::move(task));
        taskCv.notify_one();
        continue;
      }

      submitTask(pool, queue, task);
    }

    vkDestroyCommandPool(vk::g_vkDevice, pool, vk::g_vkAllocator);
  }
};

inline std::uint64_t
TaskChain::add(ProcessQueue queue, std::uint64_t waitId,
               std::function<void(VkCommandBuffer)> invoke) {
  VkPipelineStageFlags waitStage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
  if (waitId == GpuTaskLayout::kInvalidId) {
    waitId = getLastTaskId();
    waitStage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
  }
  auto id = nextTaskId++;

  getGpuScheduler(queue).enqueue({
      .chain = Ref(this),
      .id = id,
      .waitId = waitId,
      .waitStage = waitStage,
      .invoke = std::move(invoke),
  });

  return id;
}

GpuScheduler &getTransferQueueScheduler();
GpuScheduler &getComputeQueueScheduler();
GpuScheduler &getGraphicsQueueScheduler();
} // namespace amdgpu::device
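To make the intended flow of the new file concrete, here is a minimal usage sketch (illustrative only, not part of the commit; it assumes the schedulers returned by getCpuScheduler()/getGpuScheduler() have already been created elsewhere, and the lambda bodies are placeholders):

// Sketch: chain a CPU preparation step with a GPU transfer that waits on it.
using namespace amdgpu::device;

Ref<TaskChain> chain = TaskChain::Create();

// CPU task: runs on the CPU scheduler and signals the chain's timeline
// semaphore with its task id when it finishes.
auto cpuId = chain->add([] { /* fill staging memory */ });

// GPU task: polls the timeline until it reaches cpuId, then records into a
// command buffer that the GPU scheduler allocates and submits.
auto gpuId = chain->add(ProcessQueue::Transfer, cpuId,
                        [](VkCommandBuffer cmd) { /* vkCmdCopyBuffer(...) */ });

chain->wait(); // blocks until the last task id is signaled

An external producer can also participate via createExternalTask()/notifyExternalTaskComplete(), which reserve and signal a timeline value without going through either scheduler.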
hw/amdgpu/device/include/amdgpu/device/scheduler.hpp

@@ -1,14 +1,23 @@
 #pragma once

+#include "util/unreachable.hpp"
 #include <atomic>
+#include <bit>
+#include <cassert>
+#include <concepts>
 #include <condition_variable>
 #include <functional>
 #include <mutex>
+#include <pthread.h>
 #include <thread>
+#include <utility>
 #include <vector>

 namespace amdgpu::device {
+inline void setThreadName(const char *name) {
+  pthread_setname_np(pthread_self(), name);
+}
+
 template <typename T> class Ref {
   T *m_ref = nullptr;
@@ -95,11 +104,13 @@ public:
 template <typename T> Ref(T *) -> Ref<T>;
 template <typename T> Ref(Ref<T>) -> Ref<T>;

-enum class TaskState { InProgress, Complete, Canceled };
+enum class TaskState { Created, InProgress, Complete, Canceled };
+enum class TaskResult { Complete, Canceled, Reschedule };

 struct AsyncTaskCtl {
   std::atomic<unsigned> refs{0};
-  std::atomic<TaskState> stateStorage{TaskState::InProgress};
+  std::atomic<TaskState> stateStorage{TaskState::Created};
+  std::atomic<bool> cancelRequested{false};

   virtual ~AsyncTaskCtl() = default;
@@ -110,48 +121,29 @@ struct AsyncTaskCtl {
     }
   }

-  bool isCanceled() const {
-    return stateStorage.load(std::memory_order::relaxed) == TaskState::Canceled;
+  bool isCancelRequested() const {
+    return cancelRequested.load(std::memory_order::relaxed) == true;
   }
-  bool isComplete() const {
-    return stateStorage.load(std::memory_order::relaxed) == TaskState::Complete;
-  }
-  bool isInProgress() const {
-    return stateStorage.load(std::memory_order::relaxed) ==
-           TaskState::InProgress;
+
+  bool isCanceled() const { return getState() == TaskState::Canceled; }
+  bool isComplete() const { return getState() == TaskState::Complete; }
+  bool isInProgress() const { return getState() == TaskState::InProgress; }
+
+  TaskState getState() const {
+    return stateStorage.load(std::memory_order::relaxed);
   }

-  void cancel() {
-    auto state = TaskState::InProgress;
-
-    while (state == TaskState::InProgress) {
-      if (stateStorage.compare_exchange_weak(state, TaskState::Canceled,
-                                             std::memory_order::relaxed)) {
-        break;
-      }
-    }
-
-    stateStorage.notify_all();
-  }
-
-  void complete() {
-    auto state = TaskState::InProgress;
-
-    while (state != TaskState::Complete) {
-      if (stateStorage.compare_exchange_weak(state, TaskState::Complete,
-                                             std::memory_order::relaxed)) {
-        break;
-      }
-    }
-
-    stateStorage.notify_all();
-  }
+  void cancel() { cancelRequested.store(true, std::memory_order::relaxed); }

   void wait() {
+    if (stateStorage.load(std::memory_order::relaxed) == TaskState::Created) {
+      util::unreachable("attempt to wait task that wasn't scheduled\n");
+    }
     stateStorage.wait(TaskState::InProgress, std::memory_order::relaxed);
   }
+};

-  virtual void invoke() = 0;
+struct CpuTaskCtl : AsyncTaskCtl {
+  virtual TaskResult invoke() = 0;
 };

 namespace detail {
@@ -159,69 +151,100 @@ template <typename T>
 concept LambdaWithoutClosure = requires(T t) { +t; };
 }

-template <typename T> struct AsyncTask;
+template <typename T> struct AsyncCpuTask;

 template <typename T>
-  requires(std::is_invocable_r_v<bool, T, const AsyncTaskCtl &> &&
-           detail::LambdaWithoutClosure<T>)
-struct AsyncTask<T> : AsyncTaskCtl {
-  static constexpr bool (*fn)(const AsyncTaskCtl &) = +std::declval<T>();
+  requires requires(T t, const AsyncTaskCtl &ctl) {
+    { t(ctl) } -> std::same_as<TaskResult>;
+    requires detail::LambdaWithoutClosure<T>;
+  }
+struct AsyncCpuTask<T> : CpuTaskCtl {
+  static constexpr TaskResult (*fn)(const AsyncTaskCtl &) = +std::declval<T>();

-  AsyncTask() = default;
-  AsyncTask(T &&) {}
+  AsyncCpuTask() = default;
+  AsyncCpuTask(T &&) {}

-  void invoke() override {
+  TaskResult invoke() override {
     auto &base = *static_cast<const AsyncTaskCtl *>(this);
-
-    if (fn(base)) {
-      complete();
-    }
+    return fn(base);
   }
 };

 template <typename T>
-  requires std::is_invocable_r_v<bool, T, const AsyncTaskCtl &>
-Ref<AsyncTaskCtl> createTask(T &&task) {
-  return Ref<AsyncTaskCtl>(new AsyncTask<T>(std::forward<T>(task)));
-}
-
-template <typename T>
-  requires(std::is_invocable_r_v<bool, T, const AsyncTaskCtl &> &&
-           !detail::LambdaWithoutClosure<T>)
-struct AsyncTask<T> : AsyncTaskCtl {
+  requires requires(T t, const AsyncTaskCtl &ctl) {
+    { t(ctl) } -> std::same_as<TaskResult>;
+    requires !detail::LambdaWithoutClosure<T>;
+  }
+struct AsyncCpuTask<T> : CpuTaskCtl {
   alignas(T) std::byte taskStorage[sizeof(T)];

-  AsyncTask() = default;
-  AsyncTask(T &&t) { new (taskStorage) T(std::forward<T>(t)); }
-  AsyncTask &operator=(T &&t) {
-    new (taskStorage) T(std::forward<T>(t));
-    return *this;
-  }
-
-  ~AsyncTask() {
-    if (isInProgress()) {
-      std::bit_cast<T *>(&taskStorage)->~T();
-    }
-  }
+  AsyncCpuTask(T &&t) { new (taskStorage) T(std::forward<T>(t)); }
+  ~AsyncCpuTask() { std::bit_cast<T *>(&taskStorage)->~T(); }

-  void invoke() override {
+  TaskResult invoke() override {
     auto &lambda = *std::bit_cast<T *>(&taskStorage);
     auto &base = *static_cast<const AsyncTaskCtl *>(this);
-
-    if (lambda(base)) {
-      complete();
-    }
-
-    std::bit_cast<T *>(&taskStorage)->~T();
+    return lambda(base);
   }
 };

+template <typename T>
+  requires requires(T t, const AsyncTaskCtl &ctl) {
+    { t(ctl) } -> std::same_as<TaskResult>;
+  }
+Ref<CpuTaskCtl> createCpuTask(T &&task) {
+  return Ref<CpuTaskCtl>(new AsyncCpuTask<T>(std::forward<T>(task)));
+}
+
+template <typename T>
+  requires requires(T t) {
+    { t() } -> std::same_as<TaskResult>;
+  }
+Ref<CpuTaskCtl> createCpuTask(T &&task) {
+  return createCpuTask(
+      [task = std::forward<T>(task)](
+          const AsyncTaskCtl &) mutable -> TaskResult { return task(); });
+}
+
+template <typename T>
+  requires requires(T t) {
+    { t() } -> std::same_as<void>;
+  }
+Ref<CpuTaskCtl> createCpuTask(T &&task) {
+  return createCpuTask([task = std::forward<T>(task)](
+                           const AsyncTaskCtl &ctl) mutable -> TaskResult {
+    if (ctl.isCancelRequested()) {
+      return TaskResult::Canceled;
+    }
+
+    task();
+    return TaskResult::Complete;
+  });
+}
+
+template <typename T>
+  requires requires(T t, const AsyncTaskCtl &ctl) {
+    { t(ctl) } -> std::same_as<void>;
+  }
+Ref<CpuTaskCtl> createCpuTask(T &&task) {
+  return createCpuTask([task = std::forward<T>(task)](const AsyncTaskCtl &ctl) {
+    if (ctl.isCancelRequested()) {
+      return TaskResult::Canceled;
+    }
+
+    task(ctl);
+    return TaskResult::Complete;
+  });
+}
+
 class Scheduler;
-class TaskSet {
-  std::vector<Ref<AsyncTaskCtl>> tasks;
+
+class CpuTaskSet {
+  std::vector<Ref<CpuTaskCtl>> tasks;

 public:
-  void append(Ref<AsyncTaskCtl> task) { tasks.push_back(std::move(task)); }
+  void append(Ref<CpuTaskCtl> task) { tasks.push_back(std::move(task)); }

   void wait() {
     for (auto task : tasks) {
@@ -234,9 +257,91 @@ public:
   void enqueue(Scheduler &scheduler);
 };

+class TaskSet {
+  struct TaskEntry {
+    Ref<AsyncTaskCtl> ctl;
+    std::function<void()> schedule;
+  };
+
+  std::vector<TaskEntry> tasks;
+
+public:
+  template <typename Scheduler, typename Task>
+    requires requires(Scheduler &sched, Ref<Task> task) {
+      sched.enqueue(std::move(task));
+      task->wait();
+      static_cast<Ref<AsyncTaskCtl>>(task);
+    }
+  void append(Scheduler &sched, Ref<Task> task) {
+    Ref<AsyncTaskCtl> rawTask = task;
+    auto schedFn = [sched = &sched, task = std::move(task)] {
+      sched->enqueue(std::move(task));
+    };
+
+    tasks.push_back({
+        .ctl = std::move(rawTask),
+        .schedule = std::move(schedFn),
+    });
+  }
+
+  void schedule() {
+    for (auto &task : tasks) {
+      if (auto schedule = std::exchange(task.schedule, nullptr)) {
+        schedule();
+      }
+    }
+  }
+
+  bool isCanceled() const {
+    for (auto &task : tasks) {
+      if (task.ctl->isCanceled()) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  bool isComplete() const {
+    for (auto &task : tasks) {
+      if (!task.ctl->isComplete()) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  bool isInProgress() const {
+    for (auto &task : tasks) {
+      if (task.ctl->isInProgress()) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  void clear() { tasks.clear(); }
+
+  void wait() const {
+    for (auto &task : tasks) {
+      assert(task.schedule == nullptr);
+      task.ctl->wait();
+    }
+  }
+
+  void cancel() {
+    for (auto &task : tasks) {
+      task.ctl->cancel();
+    }
+  }
+};
+
 class Scheduler {
   std::vector<std::thread> workThreads;
-  std::vector<Ref<AsyncTaskCtl>> tasks;
+  std::vector<Ref<CpuTaskCtl>> tasks;
+  std::vector<Ref<CpuTaskCtl>> rescheduleTasks;
   std::mutex taskMtx;
   std::condition_variable taskCv;
   std::atomic<bool> exit{false};
@@ -244,7 +349,10 @@ class Scheduler {
 public:
   explicit Scheduler(std::size_t threadCount) {
     for (std::size_t i = 0; i < threadCount; ++i) {
-      workThreads.push_back(std::thread{[this] { entry(); }});
+      workThreads.push_back(std::thread{[this, i] {
+        setThreadName(("CPU " + std::to_string(i)).c_str());
+        entry();
+      }});
     }
   }
@@ -257,53 +365,88 @@ public:
     }
   }

-  template <typename T>
-    requires std::is_invocable_r_v<bool, T, const AsyncTaskCtl &>
-  Ref<AsyncTaskCtl> enqueue(T &&task) {
-    auto taskHandle = createTask(std::forward<T>(task));
-    enqueue(taskHandle);
-    return taskHandle;
-  }
-
-  void enqueue(Ref<AsyncTaskCtl> task) {
+  void enqueue(Ref<CpuTaskCtl> task) {
     std::lock_guard lock(taskMtx);
+    TaskState prevState = TaskState::Created;
+    if (!task->stateStorage.compare_exchange_strong(
+            prevState, TaskState::InProgress, std::memory_order::relaxed)) {
+      util::unreachable("attempt to schedule cpu task in wrong state %u",
+                        (unsigned)prevState);
+    }
     tasks.push_back(std::move(task));
     taskCv.notify_one();
   }

   template <typename T>
-    requires std::is_invocable_r_v<bool, T, const AsyncTaskCtl &>
-  void enqueue(TaskSet &set, T &&task) {
+    requires requires(T &&task) { createCpuTask(std::forward<T>(task)); }
+  Ref<AsyncTaskCtl> enqueue(T &&task) {
+    auto taskHandle = createCpuTask(std::forward<T>(task));
+    enqueue(taskHandle);
+    return taskHandle;
+  }
+
+  template <typename T>
+    requires requires(T &&task) { createCpuTask(std::forward<T>(task)); }
+  void enqueue(CpuTaskSet &set, T &&task) {
     auto taskCtl = enqueue(std::forward<T>(task));
     set.append(taskCtl);
   }

 private:
-  void entry() {
-    while (!exit.load(std::memory_order::relaxed)) {
-      Ref<AsyncTaskCtl> task;
-
-      {
-        std::unique_lock lock(taskMtx);
-
-        if (tasks.empty()) {
-          taskCv.wait(lock);
-        }
-
-        if (tasks.empty()) {
-          continue;
-        }
-
-        task = std::move(tasks.back());
-        tasks.pop_back();
-      }
-
-      task->invoke();
-    }
-  }
+  Ref<CpuTaskCtl> fetchTask() {
+    std::unique_lock lock(taskMtx);
+
+    while (tasks.empty()) {
+      if (rescheduleTasks.empty() && tasks.empty()) {
+        taskCv.wait(lock);
+      }
+
+      if (tasks.empty()) {
+        std::swap(rescheduleTasks, tasks);
+      }
+    }
+
+    auto result = std::move(tasks.back());
+    tasks.pop_back();
+    return result;
+  }
+
+  Ref<CpuTaskCtl> invokeTask(Ref<CpuTaskCtl> task) {
+    switch (task->invoke()) {
+    case TaskResult::Complete:
+      task->stateStorage.store(TaskState::Complete, std::memory_order::relaxed);
+      task->stateStorage.notify_all();
+      return {};
+
+    case TaskResult::Canceled:
+      task->stateStorage.store(TaskState::Canceled, std::memory_order::relaxed);
+      task->stateStorage.notify_all();
+      return {};
+
+    case TaskResult::Reschedule:
+      return task;
+    }
+
+    std::abort();
+  }
+
+  void entry() {
+    while (!exit.load(std::memory_order::relaxed)) {
+      Ref<CpuTaskCtl> task = fetchTask();
+
+      auto rescheduleTask = invokeTask(std::move(task));
+      if (rescheduleTask == nullptr) {
+        continue;
+      }
+
+      std::unique_lock lock(taskMtx);
+      rescheduleTasks.push_back(std::move(rescheduleTask));
+      taskCv.notify_one();
+    }
+  }
 };

-inline void TaskSet::enqueue(Scheduler &scheduler) {
+inline void CpuTaskSet::enqueue(Scheduler &scheduler) {
   for (auto task : tasks) {
     scheduler.enqueue(std::move(task));
   }
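For reference, a small sketch of how the reworked CPU-side API fits together (illustrative, not from the commit): a task returning TaskResult::Reschedule is parked on the reschedule list and retried later, which is exactly the mechanism TaskChain uses to poll its timeline semaphore.

// Sketch: a polling task that reschedules itself until a flag flips.
amdgpu::device::Scheduler sched(2); // two worker threads

std::atomic<bool> ready{false};

auto ctl = sched.enqueue(
    [&](const amdgpu::device::AsyncTaskCtl &) -> amdgpu::device::TaskResult {
      if (!ready.load(std::memory_order::relaxed)) {
        return amdgpu::device::TaskResult::Reschedule; // retry later
      }
      return amdgpu::device::TaskResult::Complete;
    });

ready = true;
ctl->wait(); // returns once a worker observes the flag and completes the task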
hw/amdgpu/device/include/amdgpu/device/vk.hpp

@@ -342,6 +342,89 @@ public:
   bool operator!=(std::nullptr_t) const { return mSemaphore != nullptr; }
 };

+struct BinSemaphore {
+  VkSemaphore mSemaphore = VK_NULL_HANDLE;
+
+public:
+  BinSemaphore(const BinSemaphore &) = delete;
+
+  BinSemaphore() = default;
+  BinSemaphore(BinSemaphore &&other) { *this = std::move(other); }
+
+  BinSemaphore &operator=(BinSemaphore &&other) {
+    std::swap(mSemaphore, other.mSemaphore);
+    return *this;
+  }
+
+  ~BinSemaphore() {
+    if (mSemaphore != VK_NULL_HANDLE) {
+      vkDestroySemaphore(g_vkDevice, mSemaphore, nullptr);
+    }
+  }
+
+  static BinSemaphore Create() {
+    VkSemaphoreTypeCreateInfo typeCreateInfo = {
+        VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO, nullptr,
+        VK_SEMAPHORE_TYPE_BINARY, 0};
+
+    VkSemaphoreCreateInfo createInfo = {VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+                                        &typeCreateInfo, 0};
+
+    BinSemaphore result;
+    Verify() << vkCreateSemaphore(g_vkDevice, &createInfo, nullptr,
+                                  &result.mSemaphore);
+    return result;
+  }
+
+  VkSemaphore getHandle() const { return mSemaphore; }
+
+  bool operator==(std::nullptr_t) const { return mSemaphore == nullptr; }
+};
+
+struct Fence {
+  VkFence mFence = VK_NULL_HANDLE;
+
+public:
+  Fence(const Fence &) = delete;
+
+  Fence() = default;
+  Fence(Fence &&other) { *this = std::move(other); }
+
+  Fence &operator=(Fence &&other) {
+    std::swap(mFence, other.mFence);
+    return *this;
+  }
+
+  ~Fence() {
+    if (mFence != VK_NULL_HANDLE) {
+      vkDestroyFence(g_vkDevice, mFence, nullptr);
+    }
+  }
+
+  static Fence Create() {
+    VkFenceCreateInfo fenceCreateInfo = {VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
+                                         nullptr, 0};
+    Fence result;
+    Verify() << vkCreateFence(g_vkDevice, &fenceCreateInfo, nullptr,
+                              &result.mFence);
+    return result;
+  }
+
+  void wait() const {
+    Verify() << vkWaitForFences(g_vkDevice, 1, &mFence, 1, UINT64_MAX);
+  }
+
+  bool isComplete() const {
+    return vkGetFenceStatus(g_vkDevice, mFence) == VK_SUCCESS;
+  }
+
+  void reset() { vkResetFences(g_vkDevice, 1, &mFence); }
+
+  VkFence getHandle() const { return mFence; }
+
+  bool operator==(std::nullptr_t) const { return mFence == nullptr; }
+};
+
 struct CommandBuffer {
   VkCommandBuffer mCmdBuffer = VK_NULL_HANDLE;
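As context for the new RAII wrappers above, a minimal usage sketch (illustrative only; it assumes vk::g_vkDevice is initialized and `queue` is a valid VkQueue obtained elsewhere):

// Sketch: fence-guarded submit; the destructor releases the VkFence.
auto fence = vk::Fence::Create();

// An empty submit is legal Vulkan and still signals the fence.
VkSubmitInfo submitInfo{.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO};
Verify() << vkQueueSubmit(queue, 1, &submitInfo, fence.getHandle());

fence.wait();  // vkWaitForFences under the hood
fence.reset(); // vkResetFences, ready for the next submit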
@@ -641,7 +724,7 @@ public:
   void readFromBuffer(VkCommandBuffer cmdBuffer, VkBuffer buffer,
                       VkImageAspectFlags destAspect,
                       VkDeviceSize bufferOffset = 0) {
-    transitionLayout(cmdBuffer, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+    transitionLayout(cmdBuffer, VK_IMAGE_LAYOUT_GENERAL);

     VkBufferImageCopy region{};
     region.bufferOffset = bufferOffset;
@@ -654,13 +737,13 @@ public:
     region.imageOffset = {0, 0, 0};
     region.imageExtent = {mWidth, mHeight, 1};

-    vkCmdCopyBufferToImage(cmdBuffer, buffer, mImage,
-                           VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &region);
+    vkCmdCopyBufferToImage(cmdBuffer, buffer, mImage, VK_IMAGE_LAYOUT_GENERAL,
+                           1, &region);
   }

   void writeToBuffer(VkCommandBuffer cmdBuffer, VkBuffer buffer,
                      VkImageAspectFlags sourceAspect) {
-    transitionLayout(cmdBuffer, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
+    transitionLayout(cmdBuffer, VK_IMAGE_LAYOUT_GENERAL);

     VkBufferImageCopy region{};
     region.bufferOffset = 0;
@@ -673,9 +756,8 @@ public:
     region.imageOffset = {0, 0, 0};
     region.imageExtent = {mWidth, mHeight, 1};

-    vkCmdCopyImageToBuffer(cmdBuffer, mImage,
-                           VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, 1,
-                           &region);
+    vkCmdCopyImageToBuffer(cmdBuffer, mImage, VK_IMAGE_LAYOUT_GENERAL, buffer,
+                           1, &region);
   }

   [[nodiscard]] Buffer writeToBuffer(VkCommandBuffer cmdBuffer,
@@ -738,6 +820,7 @@ public:
       -> std::pair<VkPipelineStageFlags, VkAccessFlags> {
     switch (layout) {
     case VK_IMAGE_LAYOUT_UNDEFINED:
+    case VK_IMAGE_LAYOUT_GENERAL:
     case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR:
       return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0};

(File diff suppressed because it is too large. Load diff.)
@ -1,4 +1,5 @@
|
|||
#include "amdgpu/RemoteMemory.hpp"
|
||||
#include "amdgpu/device/gpu-scheduler.hpp"
|
||||
#include "amdgpu/device/vk.hpp"
|
||||
#include <algorithm>
|
||||
#include <amdgpu/bridge/bridge.hpp>
|
||||
|
|
@@ -45,6 +46,33 @@ static void usage(std::FILE *out, const char *argv0) {

 enum class PresenterMode { Window };

+static VKAPI_ATTR VkBool32 VKAPI_CALL
+debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
+              VkDebugUtilsMessageTypeFlagsEXT messageType,
+              const VkDebugUtilsMessengerCallbackDataEXT *pCallbackData,
+              void *pUserData) {
+
+  std::fprintf(stderr, "validation layer: %s\n", pCallbackData->pMessage);
+
+  if (messageSeverity >= VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) {
+    std::abort();
+  }
+  return VK_FALSE;
+}
+
+static VkResult _vkCreateDebugUtilsMessengerEXT(
+    VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT *pCreateInfo,
+    const VkAllocationCallbacks *pAllocator,
+    VkDebugUtilsMessengerEXT *pDebugMessenger) {
+  static auto func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(
+      instance, "vkCreateDebugUtilsMessengerEXT");
+  if (func != nullptr) {
+    return func(instance, pCreateInfo, pAllocator, pDebugMessenger);
+  } else {
+    return VK_ERROR_EXTENSION_NOT_PRESENT;
+  }
+}
+
 int main(int argc, const char *argv[]) {
   if (argc == 2 && (argv[1] == std::string_view("-h") ||
                     argv[1] == std::string_view("--help"))) {
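Note that only the create wrapper is loaded above, so the messenger created later is never explicitly destroyed. A symmetric loader would follow the same vkGetInstanceProcAddr pattern (a sketch, not part of this commit):

// Sketch of the matching destroy wrapper; PFN_vkDestroyDebugUtilsMessengerEXT
// is the standard Vulkan function-pointer type for this extension entry point.
static void _vkDestroyDebugUtilsMessengerEXT(
    VkInstance instance, VkDebugUtilsMessengerEXT debugMessenger,
    const VkAllocationCallbacks *pAllocator) {
  static auto func =
      (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(
          instance, "vkDestroyDebugUtilsMessengerEXT");
  if (func != nullptr) {
    func(instance, debugMessenger, pAllocator);
  }
}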
@@ -172,19 +200,39 @@ int main(int argc, const char *argv[]) {
       .apiVersion = VK_API_VERSION_1_3,
   };

+  VkDebugUtilsMessengerCreateInfoEXT debugCreateInfo{};
+  debugCreateInfo.sType =
+      VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
+  debugCreateInfo.messageSeverity =
+      VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT |
+      VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT |
+      VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT |
+      VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT;
+  debugCreateInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT |
+                                VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT |
+                                0
+      // VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT
+      ;
+  debugCreateInfo.pfnUserCallback = debugCallback;
+
   VkInstanceCreateInfo instanceCreateInfo = {};
   instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
-  instanceCreateInfo.pNext = NULL;
+  instanceCreateInfo.pNext = &debugCreateInfo;
   instanceCreateInfo.pApplicationInfo = &appInfo;
   instanceCreateInfo.enabledExtensionCount = requiredInstanceExtensions.size();
   instanceCreateInfo.ppEnabledExtensionNames =
       requiredInstanceExtensions.data();

+  std::vector<const char *> enabledLayers;
+  // enabledLayers.push_back("VK_LAYER_KHRONOS_shader_object");
+
   if (enableValidation) {
-    instanceCreateInfo.ppEnabledLayerNames = &validationLayerName;
-    instanceCreateInfo.enabledLayerCount = 1;
+    enabledLayers.push_back(validationLayerName);
   }

+  instanceCreateInfo.ppEnabledLayerNames = enabledLayers.data();
+  instanceCreateInfo.enabledLayerCount = enabledLayers.size();
+
   VkInstance vkInstance;
   Verify() << vkCreateInstance(&instanceCreateInfo, nullptr, &vkInstance);
   auto getVkPhyDevice = [&](unsigned index) {
@@ -195,6 +243,10 @@ int main(int argc, const char *argv[]) {
     return devices[index];
   };

+  VkDebugUtilsMessengerEXT debugMessenger;
+  _vkCreateDebugUtilsMessengerEXT(vkInstance, &debugCreateInfo, nullptr,
+                                  &debugMessenger);
+
   auto vkPhysicalDevice = getVkPhyDevice(gpuIndex);

   VkPhysicalDeviceProperties vkPhyDeviceProperties;
@@ -342,7 +394,7 @@ int main(int argc, const char *argv[]) {
   std::vector<VkDeviceQueueCreateInfo> requestedQueues;

   std::vector<float> defaultQueuePriorities;
-  defaultQueuePriorities.resize(8);
+  defaultQueuePriorities.resize(32);

   for (uint32_t queueFamily = 0; queueFamily < queueFamiliesCount;
        ++queueFamily) {
@@ -350,7 +402,10 @@ int main(int argc, const char *argv[]) {
       requestedQueues.push_back(
           {.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
            .queueFamilyIndex = queueFamily,
-           .queueCount = 1,
+           .queueCount =
+               std::min<uint32_t>(queueFamilyProperties[queueFamily]
+                                      .queueFamilyProperties.queueCount,
+                                  defaultQueuePriorities.size()),
            .pQueuePriorities = defaultQueuePriorities.data()});
     } else if (queueFamiliesWithComputeSupport.contains(queueFamily) ||
                queueFamiliesWithTransferSupport.contains(queueFamily)) {
@@ -365,56 +420,6 @@ int main(int argc, const char *argv[]) {
     }
   }

-  // try to find queue that not graphics queue
-  bool requestedPresentQueue = false;
-  for (auto queueFamily : queueFamiliesWithPresentSupport) {
-    if (queueFamiliesWithGraphicsSupport.contains(queueFamily)) {
-      continue;
-    }
-
-    bool alreadyRequested = false;
-
-    for (auto &requested : requestedQueues) {
-      if (requested.queueFamilyIndex == queueFamily) {
-        alreadyRequested = true;
-        break;
-      }
-    }
-
-    if (!alreadyRequested) {
-      requestedQueues.push_back(
-          {.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
-           .queueFamilyIndex = queueFamily,
-           .queueCount = 1,
-           .pQueuePriorities = defaultQueuePriorities.data()});
-    }
-
-    requestedPresentQueue = true;
-  }
-
-  if (!requestedPresentQueue) {
-    for (auto queueFamily : queueFamiliesWithPresentSupport) {
-      bool alreadyRequested = false;
-
-      for (auto &requested : requestedQueues) {
-        if (requested.queueFamilyIndex == queueFamily) {
-          alreadyRequested = true;
-          break;
-        }
-      }
-
-      if (!alreadyRequested) {
-        requestedQueues.push_back(
-            {.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
-             .queueFamilyIndex = queueFamily,
-             .queueCount = 1,
-             .pQueuePriorities = defaultQueuePriorities.data()});
-      }
-
-      requestedPresentQueue = true;
-    }
-  }
-
   VkPhysicalDeviceShaderObjectFeaturesEXT shaderObjectFeatures{
       .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_OBJECT_FEATURES_EXT,
       .shaderObject = VK_TRUE};
@@ -422,6 +427,7 @@ int main(int argc, const char *argv[]) {
   VkPhysicalDeviceVulkan13Features phyDevFeatures13{
       .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_3_FEATURES,
       .pNext = &shaderObjectFeatures,
+      .synchronization2 = VK_TRUE,
       .dynamicRendering = VK_TRUE,
       .maintenance4 = VK_TRUE,
   };
@@ -601,26 +607,42 @@ int main(int argc, const char *argv[]) {
   std::vector<std::pair<VkQueue, unsigned>> transferQueues;
   std::vector<std::pair<VkQueue, unsigned>> graphicsQueues;
   VkQueue presentQueue = VK_NULL_HANDLE;
+  unsigned presentQueueFamily;

   for (auto &queueInfo : requestedQueues) {
-    if (queueFamiliesWithComputeSupport.contains(queueInfo.queueFamilyIndex)) {
-      for (uint32_t queueIndex = 0; queueIndex < queueInfo.queueCount;
-           ++queueIndex) {
-        auto &[queue, index] = computeQueues.emplace_back();
-        index = queueInfo.queueFamilyIndex;
-        vkGetDeviceQueue(vkDevice, queueInfo.queueFamilyIndex, queueIndex,
-                         &queue);
-      }
-    }
-
     if (queueFamiliesWithGraphicsSupport.contains(queueInfo.queueFamilyIndex)) {
       for (uint32_t queueIndex = 0; queueIndex < queueInfo.queueCount;
            ++queueIndex) {
+
+        if (presentQueue == VK_NULL_HANDLE &&
+            queueFamiliesWithPresentSupport.contains(
+                queueInfo.queueFamilyIndex)) {
+          presentQueueFamily = queueInfo.queueFamilyIndex;
+          vkGetDeviceQueue(vkDevice, queueInfo.queueFamilyIndex, 0,
+                           &presentQueue);
+
+          continue;
+        }
+
         auto &[queue, index] = graphicsQueues.emplace_back();
         index = queueInfo.queueFamilyIndex;
         vkGetDeviceQueue(vkDevice, queueInfo.queueFamilyIndex, queueIndex,
                          &queue);
       }
+
+      continue;
     }

+    if (queueFamiliesWithComputeSupport.contains(queueInfo.queueFamilyIndex)) {
+      uint32_t queueIndex = 0;
+      for (; queueIndex < queueInfo.queueCount; ++queueIndex) {
+        auto &[queue, index] = computeQueues.emplace_back();
+        index = queueInfo.queueFamilyIndex;
+        vkGetDeviceQueue(vkDevice, queueInfo.queueFamilyIndex, queueIndex,
+                         &queue);
+      }
+
+      continue;
+    }
+
     if (queueFamiliesWithTransferSupport.contains(queueInfo.queueFamilyIndex)) {
@@ -631,14 +653,15 @@ int main(int argc, const char *argv[]) {
       vkGetDeviceQueue(vkDevice, queueInfo.queueFamilyIndex, queueIndex,
                        &queue);
     }
   }
-
-  if (presentQueue == VK_NULL_HANDLE &&
-      queueFamiliesWithPresentSupport.contains(queueInfo.queueFamilyIndex)) {
-    vkGetDeviceQueue(vkDevice, queueInfo.queueFamilyIndex, 0, &presentQueue);
-    continue;
-  }

+  if (graphicsQueues.empty() && presentQueue != VK_NULL_HANDLE) {
+    graphicsQueues.push_back({presentQueue, presentQueueFamily});
+  }
+
   Verify() << (computeQueues.size() > 1);
   Verify() << (transferQueues.size() > 0);
   Verify() << (graphicsQueues.size() > 0);
@@ -651,19 +674,12 @@ int main(int argc, const char *argv[]) {
   VkCommandPoolCreateInfo commandPoolCreateInfo = {
       .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
      .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
-      .queueFamilyIndex = graphicsQueues.front().second,
+      .queueFamilyIndex = presentQueueFamily,
   };

   VkCommandPool commandPool;
   Verify() << vkCreateCommandPool(vkDevice, &commandPoolCreateInfo, nullptr,
                                   &commandPool);

-  amdgpu::device::DrawContext dc{
-      // TODO
-      .queue = graphicsQueues.front().first,
-      .commandPool = commandPool,
-  };
-
   std::vector<VkFence> inFlightFences(swapchainImages.size());

   for (auto &fence : inFlightFences) {
@@ -734,7 +750,7 @@ int main(int argc, const char *argv[]) {
   g_hostMemory = memory;

   {
-    amdgpu::device::AmdgpuDevice device(dc, bridgePuller.header);
+    amdgpu::device::AmdgpuDevice device(bridgePuller.header);

     for (std::uint32_t end = bridge->memoryAreaCount, i = 0; i < end; ++i) {
       auto area = bridge->memoryAreas[i];
@@ -747,22 +763,21 @@ int main(int argc, const char *argv[]) {
     VkCommandBufferAllocateInfo allocInfo{};
     allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
     allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
-    allocInfo.commandPool = dc.commandPool;
+    allocInfo.commandPool = commandPool;
     allocInfo.commandBufferCount = presentCmdBuffers.size();
     vkAllocateCommandBuffers(vkDevice, &allocInfo, presentCmdBuffers.data());
   }

+  std::vector<amdgpu::device::Ref<amdgpu::device::TaskChain>> flipTaskChain(
+      swapchainImages.size());
+
+  for (auto &chain : flipTaskChain) {
+    chain = amdgpu::device::TaskChain::Create();
+  }
   std::printf("Initialization complete\n");

   uint32_t imageIndex = 0;
   bool isImageAcquired = false;
-  std::vector<std::vector<VkBuffer>> swapchainBufferHandles;
-  swapchainBufferHandles.resize(swapchainImages.size());
-  std::vector<std::vector<VkImage>> swapchainImageHandles;
-  swapchainImageHandles.resize(swapchainImages.size());
-
-  VkPipelineStageFlags submitPipelineStages =
-      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;

   while (!glfwWindowShouldClose(window)) {
     glfwPollEvents();
@@ -808,54 +823,27 @@ int main(int argc, const char *argv[]) {

       vkBeginCommandBuffer(presentCmdBuffers[imageIndex], &beginInfo);

-      for (auto handle : swapchainBufferHandles[imageIndex]) {
-        vkDestroyBuffer(vkDevice, handle, nullptr);
-      }
-
-      for (auto handle : swapchainImageHandles[imageIndex]) {
-        vkDestroyImage(vkDevice, handle, nullptr);
-      }
-
-      swapchainBufferHandles[imageIndex].clear();
-      swapchainImageHandles[imageIndex].clear();
-
-      if (device.handleFlip(cmd.flip.bufferIndex, cmd.flip.arg,
-                            presentCmdBuffers[imageIndex],
-                            swapchainImages[imageIndex], swapchainExtent,
-                            swapchainBufferHandles[imageIndex],
-                            swapchainImageHandles[imageIndex])) {
-        vkEndCommandBuffer(presentCmdBuffers[imageIndex]);
-
-        VkSubmitInfo submitInfo{};
-        submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
-        submitInfo.commandBufferCount = 1;
-        submitInfo.pCommandBuffers = &presentCmdBuffers[imageIndex];
-        submitInfo.waitSemaphoreCount = 1;
-        submitInfo.signalSemaphoreCount = 1;
-        submitInfo.pSignalSemaphores = &renderCompleteSemaphore;
-        submitInfo.pWaitSemaphores = &presentCompleteSemaphore;
-        submitInfo.pWaitDstStageMask = &submitPipelineStages;
-
-        Verify() << vkQueueSubmit(dc.queue, 1, &submitInfo,
-                                  inFlightFences[imageIndex]);
-
-        VkPresentInfoKHR presentInfo{};
-        presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
-        presentInfo.waitSemaphoreCount = 1;
-        presentInfo.pWaitSemaphores = &renderCompleteSemaphore;
-        presentInfo.swapchainCount = 1;
-        presentInfo.pSwapchains = &swapchain;
-        presentInfo.pImageIndices = &imageIndex;
-
+      if (device.handleFlip(
+              presentQueue, presentCmdBuffers[imageIndex],
+              *flipTaskChain[imageIndex].get(), cmd.flip.bufferIndex,
+              cmd.flip.arg, swapchainImages[imageIndex], swapchainExtent,
+              presentCompleteSemaphore, renderCompleteSemaphore,
+              inFlightFences[imageIndex])) {
+        VkPresentInfoKHR presentInfo{
+            .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
+            .waitSemaphoreCount = 1,
+            .pWaitSemaphores = &renderCompleteSemaphore,
+            .swapchainCount = 1,
+            .pSwapchains = &swapchain,
+            .pImageIndices = &imageIndex,
+        };
         if (vkQueuePresentKHR(presentQueue, &presentInfo) != VK_SUCCESS) {
           std::printf("swapchain was invalidated\n");
           createSwapchain();
         }
+        // std::this_thread::sleep_for(std::chrono::seconds(3));
       } else {
         isImageAcquired = true;
       }

       break;
     }
@@ -876,17 +864,6 @@ int main(int argc, const char *argv[]) {
     vkDestroySemaphore(vkDevice, presentCompleteSemaphore, nullptr);
     vkDestroySemaphore(vkDevice, renderCompleteSemaphore, nullptr);
     vkDestroyCommandPool(vkDevice, commandPool, nullptr);
-
-    for (auto &handles : swapchainImageHandles) {
-      for (auto handle : handles) {
-        vkDestroyImage(vkDevice, handle, nullptr);
-      }
-    }
-    for (auto &handles : swapchainBufferHandles) {
-      for (auto handle : handles) {
-        vkDestroyBuffer(vkDevice, handle, nullptr);
-      }
-    }
   }

   vkDestroySwapchainKHR(vkDevice, swapchain, nullptr);