gpu rewrite initial commit

This commit is contained in:
DH 2024-09-25 16:00:55 +03:00
parent 0d4ed51cd9
commit 4cf808facd
133 changed files with 35491 additions and 4 deletions

View file

@ -3,7 +3,8 @@ project(rpcsx)
set(CMAKE_CXX_EXTENSIONS off)
set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD_REQUIRED on)
set(CMAKE_BUILD_RPATH_USE_ORIGIN on)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
@ -41,7 +42,7 @@ function(add_precompiled_vulkan_spirv target)
add_custom_command(
OUTPUT ${outputpath}
COMMAND $<TARGET_FILE:glslang::glslang-standalone> -V --target-env vulkan1.3 --vn "${varname}" -o "${outputpath}" "${CMAKE_CURRENT_SOURCE_DIR}/${input}"
COMMAND $<TARGET_FILE:glslang::glslang-standalone> -V --target-env vulkan1.2 --vn "${varname}" -o "${outputpath}" "${CMAKE_CURRENT_SOURCE_DIR}/${input}"
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${input}" glslang::glslang-standalone
COMMENT "Generating ${outputname}..."
)
@ -65,6 +66,7 @@ add_subdirectory(tools)
add_subdirectory(orbis-kernel)
add_subdirectory(rpcsx-os)
add_subdirectory(rpcsx-gpu)
add_subdirectory(rpcsx-gpu2)
add_subdirectory(hw/amdgpu)
add_subdirectory(rx)

View file

@ -1,6 +1,6 @@
#version 450
layout (triangles) in;
layout (triangles, invocations = 1) in;
layout (triangle_strip, max_vertices = 4) out;
void main(void)

36
rpcsx-gpu2/CMakeLists.txt Normal file
View file

@ -0,0 +1,36 @@
# Build configuration for the rewritten GPU emulator (rpcsx-gpu2).

# GLFW provides the presentation window / Vulkan surface.
find_package(glfw3 3.3 REQUIRED)

# Compile the GLSL sources to embedded SPIR-V headers at build time.
# NOTE(review): the target is named rpcsx-gpu-shaders (without the "2") —
# confirm this does not collide with a same-named target in rpcsx-gpu.
add_precompiled_vulkan_spirv(rpcsx-gpu-shaders
  shaders/fill_red.frag.glsl
  shaders/flip.frag.glsl
  shaders/flip.vert.glsl
  shaders/rect_list.geom.glsl
)

add_executable(rpcsx-gpu2
  Cache.cpp
  main.cpp
  Device.cpp
  Pipe.cpp
  Registers.cpp
  Renderer.cpp
)

# PUBLIC so that anything linking rpcsx-gpu2 inherits the GPU stack
# (bridge, tiler, GCN shader translator, gnm helpers).
target_link_libraries(rpcsx-gpu2
  PUBLIC
  rpcsx-gpu-shaders
  amdgpu::bridge
  rx
  gcn-shader
  glfw
  amdgpu::tiler::cpu
  amdgpu::tiler::vulkan
  rdna-semantic-spirv
  gnm::vulkan
  gnm
)

install(TARGETS rpcsx-gpu2 RUNTIME DESTINATION bin)
# Keep the binary in a common bin/ directory for local runs.
set_target_properties(rpcsx-gpu2 PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
add_subdirectory(lib)

1109
rpcsx-gpu2/Cache.cpp Normal file

File diff suppressed because it is too large Load diff

333
rpcsx-gpu2/Cache.hpp Normal file
View file

@ -0,0 +1,333 @@
#pragma once

#include "Pipe.hpp"
#include "amdgpu/tiler.hpp"
#include "gnm/constants.hpp"
#include "rx/die.hpp"
#include "shader/Access.hpp"
#include "shader/GcnConverter.hpp"

#include <algorithm>
#include <array>
#include <cstdint>
#include <map>
#include <memory>
#include <mutex>
#include <rx/MemoryTable.hpp>
#include <shader/gcn.hpp>
#include <vector>
#include <vulkan/vulkan_core.h>
namespace amdgpu {
using Access = shader::Access;
// Cache key for a translated shader: guest code address plus the GCN stage
// and the conversion environment it was translated under.
struct ShaderKey {
  std::uint64_t address;            // guest address of the shader code
  shader::gcn::Stage stage;         // vertex/pixel/compute/... stage
  shader::gcn::Environment env;     // translation-time environment (user SGPRs etc.)
};
// Cache key describing a guest image resource and the sub-range of it that
// is being accessed. createFrom() builds a key from a gnm::TBuffer
// (texture descriptor).
struct ImageKey {
  std::uint64_t address;               // guest base address of the image data
  gnm::TextureType type;               // 1D/2D/3D/cube/array dimensionality
  gnm::DataFormat dfmt;                // data format (channel layout/bits)
  gnm::NumericFormat nfmt;             // numeric interpretation (unorm/float/...)
  TileMode tileMode = {};              // GCN tiling mode of the surface
  VkOffset3D offset = {};              // origin of the accessed region
  VkExtent3D extent = {1, 1, 1};       // size of the accessed region
  std::uint32_t pitch = 1;             // row pitch in texels
  unsigned baseMipLevel = 0;
  unsigned mipCount = 1;
  unsigned baseArrayLayer = 0;
  unsigned arrayLayerCount = 1;
  bool pow2pad = false;                // surface padded to power-of-two dims

  static ImageKey createFrom(const gnm::TBuffer &tbuffer);
};
// Cache key for an image view: the underlying image key plus the component
// swizzle applied when sampling.
struct ImageViewKey : ImageKey {
  gnm::Swizzle R = gnm::Swizzle::R;
  gnm::Swizzle G = gnm::Swizzle::G;
  gnm::Swizzle B = gnm::Swizzle::B;
  gnm::Swizzle A = gnm::Swizzle::A;

  static ImageViewKey createFrom(const gnm::TBuffer &tbuffer);
};
// Cache key for a Vulkan sampler, mirroring the VkSamplerCreateInfo fields
// derived from a gnm S# sampler descriptor. The defaulted three-way
// comparison makes the key usable in the ordered std::map sampler cache.
struct SamplerKey {
  VkFilter magFilter;
  VkFilter minFilter;
  VkSamplerMipmapMode mipmapMode;
  VkSamplerAddressMode addressModeU;
  VkSamplerAddressMode addressModeV;
  VkSamplerAddressMode addressModeW;
  float mipLodBias;
  float maxAnisotropy;
  VkCompareOp compareOp;
  float minLod;
  float maxLod;
  VkBorderColor borderColor;
  bool anisotropyEnable;
  bool compareEnable;
  bool unnormalizedCoordinates;

  static SamplerKey createFrom(const gnm::SSampler &sampler);

  auto operator<=>(const SamplerKey &other) const = default;
};
// Per-VM GPU resource cache. Owns the Vulkan descriptor-set layouts,
// pipeline layouts and descriptor pools, and maps guest memory ranges to
// cached buffers, images, shaders and samplers. All access to cached
// resources goes through short-lived Tag objects, which record which
// entries a submission acquired so their lifetime spans GPU execution.
struct Cache {
  // Graphics stages in descriptor-set order: the index of a stage in this
  // array is also its descriptor-set index (see getStageIndex).
  static constexpr std::array kGraphicsStages = {
      VK_SHADER_STAGE_VERTEX_BIT,
      VK_SHADER_STAGE_GEOMETRY_BIT,
      VK_SHADER_STAGE_FRAGMENT_BIT,
      VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT,
      VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
  };

  // Binding slots within each set. Sampled images are encoded as
  // VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE + dim * 1000 so 1D/2D/3D images get
  // distinct bindings (decoded by getDescriptorBinding).
  static constexpr std::array kDescriptorBindings = {
      VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
      VK_DESCRIPTOR_TYPE_SAMPLER,
      VkDescriptorType(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE + 1 * 1000),
      VkDescriptorType(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE + 2 * 1000),
      VkDescriptorType(VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE + 3 * 1000),
      VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
  };

  // Descriptor-set index for a graphics stage, or -1 if the stage is not
  // part of the graphics pipeline.
  static constexpr int getStageIndex(VkShaderStageFlagBits stage) {
    auto it = std::find(kGraphicsStages.begin(), kGraphicsStages.end(), stage);
    if (it == kGraphicsStages.end()) {
      return -1;
    }
    return it - kGraphicsStages.begin();
  }

  // Binding index for a descriptor type (with the dim * 1000 encoding for
  // sampled images), or -1 if the type has no binding.
  static constexpr int getDescriptorBinding(VkDescriptorType type, int dim = 0) {
    auto it =
        std::find(kDescriptorBindings.begin(), kDescriptorBindings.end(), type + dim * 1000);
    if (it == kDescriptorBindings.end()) {
      return -1;
    }
    return it - kDescriptorBindings.begin();
  }

  // Monotonically increasing synchronization token; see createTag.
  enum class TagId : std::uint64_t {};
  struct Entry; // cached resource entry, defined in Cache.cpp

  // NOTE(review): there is also a private mVmIm below that Tag::getVmId
  // reads; Device assigns this public vmId after construction. Confirm the
  // two fields are kept in sync (likely a typo for mVmId).
  int vmId = -1;

  // Lightweight handles returned to the renderer. They do not own the
  // underlying Vulkan objects; ownership stays with the cache Entry.
  struct Shader {
    VkShaderEXT handle;
    shader::gcn::ShaderInfo *info;
    VkShaderStageFlagBits stage;
  };

  struct Sampler {
    VkSampler handle;
  };

  struct Buffer {
    VkBuffer handle;
    std::uint64_t offset;        // offset of the range inside handle
    std::uint64_t deviceAddress; // GPU device address of the range
    TagId tagId;
    std::byte *data;             // host-visible pointer to the range
  };

  struct IndexBuffer {
    VkBuffer handle;
    std::uint64_t offset;
    std::uint32_t indexCount;
    gnm::PrimitiveType primType;
    gnm::IndexType indexType;
  };

  struct Image {
    VkImage handle;
  };

  struct ImageView {
    VkImageView handle;
    VkImage imageHandle;
  };

  // Move-only accessor for one unit of work against the cache. Keeps the
  // acquired entries and any descriptor sets it allocated alive until
  // release()/destruction.
  class Tag {
    Cache *mParent = nullptr;
    Scheduler *mScheduler = nullptr;
    TagId mTagId{};
    std::vector<std::shared_ptr<Entry>> mAcquiredResources;
    std::vector<std::array<VkDescriptorSet, kGraphicsStages.size()>>
        mGraphicsDescriptorSets;
    std::vector<VkDescriptorSet> mComputeDescriptorSets;

  public:
    Tag() = default;
    Tag(Cache *parent, Scheduler &scheduler, TagId id)
        : mParent(parent), mScheduler(&scheduler), mTagId(id) {}
    Tag(const Tag &) = delete;
    // Moves are implemented via swap; the moved-from Tag releases our old
    // state when it is destroyed.
    Tag(Tag &&other) { other.swap(*this); }
    Tag &operator=(Tag &&other) {
      other.swap(*this);
      return *this;
    }

    // Submits the scheduler's pending work and blocks until it completes.
    // Precondition: this Tag was created via Cache::createTag (mScheduler
    // is non-null); a default-constructed Tag must not call this.
    void submitAndWait() {
      mScheduler->submit();
      mScheduler->wait();
    }

    ~Tag() { release(); }

    // Read operations synchronize against the previous tag; writes use the
    // tag itself (tags are allocated two apart, see mNextTagId).
    TagId getReadId() const { return TagId{std::uint64_t(mTagId) - 1}; }
    TagId getWriteId() const { return mTagId; }

    void swap(Tag &other) {
      std::swap(mParent, other.mParent);
      std::swap(mScheduler, other.mScheduler);
      std::swap(mTagId, other.mTagId);
      std::swap(mAcquiredResources, other.mAcquiredResources);
      std::swap(mGraphicsDescriptorSets, other.mGraphicsDescriptorSets);
      std::swap(mComputeDescriptorSets, other.mComputeDescriptorSets);
    }

    Cache *getCache() const { return mParent; }
    Device *getDevice() const { return mParent->mDevice; }
    int getVmId() const { return mParent->mVmIm; }

    // Resource lookup/creation; results stay valid for the Tag's lifetime.
    Shader getShader(const ShaderKey &key,
                     const ShaderKey *dependedKey = nullptr);
    Sampler getSampler(const SamplerKey &key);
    Buffer getBuffer(std::uint64_t address, std::uint64_t size, Access access);
    Buffer getInternalBuffer(std::uint64_t size);
    IndexBuffer getIndexBuffer(std::uint64_t address, std::uint32_t indexCount,
                               gnm::PrimitiveType primType,
                               gnm::IndexType indexType);
    Image getImage(const ImageKey &key, Access access);
    ImageView getImageView(const ImageViewKey &key, Access access);

    // Guest-memory access through the cache (keeps caches coherent).
    void readMemory(void *target, std::uint64_t address, std::uint64_t size);
    void writeMemory(const void *source, std::uint64_t address,
                     std::uint64_t size);
    int compareMemory(const void *source, std::uint64_t address,
                      std::uint64_t size);

    // Returns acquired entries and descriptor sets to the cache.
    void release();

    VkPipelineLayout getGraphicsPipelineLayout() const {
      return getCache()->getGraphicsPipelineLayout();
    }

    VkPipelineLayout getComputePipelineLayout() const {
      return getCache()->getComputePipelineLayout();
    }

    // Allocates one descriptor set per graphics stage; recycled on release.
    std::array<VkDescriptorSet, kGraphicsStages.size()>
    createGraphicsDescriptorSets() {
      auto result = getCache()->createGraphicsDescriptorSets();
      mGraphicsDescriptorSets.push_back(result);
      return result;
    }

    VkDescriptorSet createComputeDescriptorSet() {
      auto result = getCache()->createComputeDescriptorSet();
      mComputeDescriptorSets.push_back(result);
      return result;
    }

    std::shared_ptr<Entry> findShader(const ShaderKey &key,
                                      const ShaderKey *dependedKey = nullptr);
  };

  Cache(Device *device, int vmId);
  ~Cache();

  // Creates a new Tag bound to the given scheduler with a fresh TagId.
  Tag createTag(Scheduler &scheduler);

  vk::Buffer &getMemoryTableBuffer() { return mMemoryTableBuffer; }
  vk::Buffer &getGdsBuffer() { return mGdsBuffer; }

  // Display frame-buffer registration (up to 10 slots, see mFrameBuffers).
  void addFrameBuffer(Scheduler &scheduler, int index, std::uint64_t address,
                      std::uint32_t width, std::uint32_t height, int format,
                      TileMode tileMode);
  void removeFrameBuffer(Scheduler &scheduler, int index);
  VkImage getFrameBuffer(Scheduler &scheduler, int index);

  // Drops cached data for a guest range (or everything).
  void invalidate(Scheduler &scheduler, std::uint64_t address,
                  std::uint64_t size);
  void invalidate(Scheduler &scheduler) {
    invalidate(scheduler, 0, ~static_cast<std::uint64_t>(0));
  }

  // Writes cached data back to guest memory for a range (or everything).
  void flush(Scheduler &scheduler, std::uint64_t address, std::uint64_t size);
  void flush(Scheduler &scheduler) {
    flush(scheduler, 0, ~static_cast<std::uint64_t>(0));
  }

  const std::array<VkDescriptorSetLayout, kGraphicsStages.size()> &
  getGraphicsDescriptorSetLayouts() const {
    return mGraphicsDescriptorSetLayouts;
  }

  VkDescriptorSetLayout
  getGraphicsDescriptorSetLayout(VkShaderStageFlagBits stage) const {
    int index = getStageIndex(stage);
    rx::dieIf(index < 0, "getGraphicsDescriptorSetLayout: unexpected stage");
    return mGraphicsDescriptorSetLayouts[index];
  }

  VkDescriptorSetLayout getComputeDescriptorSetLayout() const {
    return mComputeDescriptorSetLayout;
  }

  VkPipelineLayout getGraphicsPipelineLayout() const {
    return mGraphicsPipelineLayout;
  }

  VkPipelineLayout getComputePipelineLayout() const {
    return mComputePipelineLayout;
  }

  std::array<VkDescriptorSet, kGraphicsStages.size()>
  createGraphicsDescriptorSets();
  VkDescriptorSet createComputeDescriptorSet();

  // Despite the name, sets are recycled onto a free list, not destroyed.
  void destroyGraphicsDescriptorSets(
      const std::array<VkDescriptorSet, kGraphicsStages.size()> &set) {
    std::lock_guard lock(mDescriptorMtx);
    mGraphicsDescriptorSets.push_back(set);
  }

  void destroyComputeDescriptorSet(VkDescriptorSet set) {
    std::lock_guard lock(mDescriptorMtx);
    mComputeDescriptorSets.push_back(set);
  }

private:
  TagId getSyncTag(std::uint64_t address, std::uint64_t size, TagId currentTag);

  Device *mDevice;
  int mVmIm; // NOTE(review): read by Tag::getVmId; likely a typo for mVmId
  // Starts at 2 so that getReadId() of the first tag is a valid id.
  TagId mNextTagId{2};
  vk::Buffer mMemoryTableBuffer;
  vk::Buffer mGdsBuffer;

  std::mutex mDescriptorMtx; // guards the descriptor free lists below
  std::array<VkDescriptorSetLayout, kGraphicsStages.size()>
      mGraphicsDescriptorSetLayouts{};
  VkDescriptorSetLayout mComputeDescriptorSetLayout{};
  VkPipelineLayout mGraphicsPipelineLayout{};
  VkPipelineLayout mComputePipelineLayout{};
  VkDescriptorPool mGraphicsDescriptorPool{};
  VkDescriptorPool mComputeDescriptorPool{};

  // Free lists of recycled descriptor sets.
  std::vector<std::array<VkDescriptorSet, kGraphicsStages.size()>>
      mGraphicsDescriptorSets;
  std::vector<VkDescriptorSet> mComputeDescriptorSets;

  std::map<SamplerKey, VkSampler> mSamplers;
  std::shared_ptr<Entry> mFrameBuffers[10];

  // Guest-address-range indexed caches.
  rx::MemoryTableWithPayload<std::shared_ptr<Entry>> mBuffers;
  rx::MemoryTableWithPayload<std::shared_ptr<Entry>> mIndexBuffers;
  rx::MemoryTableWithPayload<std::shared_ptr<Entry>> mImages;
  rx::MemoryTableWithPayload<std::shared_ptr<Entry>> mShaders;
  rx::MemoryTableWithPayload<std::shared_ptr<Entry>> mSyncTable;
};
} // namespace amdgpu

508
rpcsx-gpu2/Device.cpp Normal file
View file

@ -0,0 +1,508 @@
#include "Device.hpp"
#include "Renderer.hpp"
#include "amdgpu/tiler.hpp"
#include "gnm/constants.hpp"
#include "gnm/pm4.hpp"
#include "rx/bits.hpp"
#include "rx/die.hpp"
#include "rx/mem.hpp"
#include "shader/spv.hpp"
#include "shaders/rdna-semantic-spirv.hpp"
#include "vk.hpp"
#include <fcntl.h>
#include <sys/mman.h>
using namespace amdgpu;
// Constructs the GPU device: validates and loads the built-in RDNA semantic
// SPIR-V module used by the GCN shader translator, then wires up the
// per-VM caches and graphics pipes.
Device::Device() {
  // The semantics module is compiled into the binary; a validation failure
  // means the build itself is broken, so dump and die.
  if (!shader::spv::validate(g_rdna_semantic_spirv)) {
    shader::spv::dump(g_rdna_semantic_spirv, true);
    rx::die("builtin semantic validation failed");
  }

  if (auto sem = shader::spv::deserialize(
          shaderSemanticContext, g_rdna_semantic_spirv,
          shaderSemanticContext.getUnknownLocation())) {
    auto shaderSemantic = *sem;
    shader::gcn::canonicalizeSemantic(shaderSemanticContext, shaderSemantic);
    shader::gcn::collectSemanticModuleInfo(gcnSemanticModuleInfo,
                                           shaderSemantic);
    gcnSemantic = shader::gcn::collectSemanticInfo(gcnSemanticModuleInfo);
  } else {
    rx::die("failed to deserialize builtin semantics\n");
  }

  // Each cache is statically bound to one VM slot.
  for (int index = 0; auto &cache : caches) {
    cache.vmId = index++;
  }

  for (auto &pipe : graphicsPipes) {
    pipe.device = this;
  }

  // Compute pipes are not wired up yet.
  // for (auto &pipe : computePipes) {
  //   pipe.device = this;
  // }
}
// Tears down the device: releases the direct-memory descriptors and every
// per-process VM shared-memory descriptor that is still open.
Device::~Device() {
  for (int fd : dmemFd) {
    if (fd < 0) {
      continue; // slot was never opened
    }
    ::close(fd);
  }

  for (auto &entry : processInfo) {
    auto &info = entry.second;
    if (info.vmFd < 0) {
      continue;
    }
    ::close(info.vmFd);
  }
}
void Device::mapProcess(std::int64_t pid, int vmId, const char *shmName) {
auto &process = processInfo[pid];
process.vmId = vmId;
auto memory = amdgpu::RemoteMemory{vmId};
std::string pidVmName = shmName;
pidVmName += '-';
pidVmName += std::to_string(pid);
int memoryFd = ::shm_open(pidVmName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
process.vmFd = memoryFd;
if (memoryFd < 0) {
std::printf("failed to process %x shared memory\n", (int)pid);
std::abort();
}
for (auto [startAddress, endAddress, slot] : process.vmTable) {
auto gpuProt = slot.prot >> 4;
if (gpuProt == 0) {
continue;
}
auto devOffset = slot.offset + startAddress - slot.baseAddress;
int mapFd = memoryFd;
if (slot.memoryType >= 0) {
mapFd = dmemFd[slot.memoryType];
}
auto mmapResult =
::mmap(memory.getPointer(startAddress), endAddress - startAddress,
gpuProt, MAP_FIXED | MAP_SHARED, mapFd, devOffset);
if (mmapResult == MAP_FAILED) {
std::printf("failed to map process %x memory, address %lx-%lx, type %x\n",
(int)pid, startAddress, endAddress, slot.memoryType);
std::abort();
}
handleProtectChange(vmId, startAddress, endAddress - startAddress,
slot.prot);
}
}
// Detaches a guest process from its GPU VM slot: re-reserves the VM's
// 1 TiB address window and closes the shared-memory descriptor.
//
// Fixes: previously this unconditionally closed process.vmFd (EBADF on -1)
// and, for a process that was never mapped (vmId == -1), computed a bogus
// reserve window from the sign-extended vmId. Both are now guarded.
void Device::unmapProcess(std::int64_t pid) {
  auto &process = processInfo[pid];

  if (process.vmId >= 0) {
    auto startAddress = static_cast<std::uint64_t>(process.vmId) << 40;
    auto size = static_cast<std::uint64_t>(1) << 40;
    rx::mem::reserve(reinterpret_cast<void *>(startAddress), size);
  }

  if (process.vmFd >= 0) {
    ::close(process.vmFd);
  }

  process.vmFd = -1;
  process.vmId = -1;
}
// Changes the protection of a guest memory range. Splits the containing
// VM-table entry at the range boundaries (keeping its backing info) and,
// if the process is currently attached to a VM, applies the new GPU
// protection to the host mapping as well.
//
// Fixes: a bare std::abort() on an unmapped address gave no diagnostic;
// use rx::die with context, consistent with the rest of this file.
void Device::protectMemory(int pid, std::uint64_t address, std::uint64_t size,
                           int prot) {
  auto &process = processInfo[pid];

  auto vmSlotIt = process.vmTable.queryArea(address);
  if (vmSlotIt == process.vmTable.end()) {
    rx::die("protectMemory: no mapping at address %lx for process %x", address,
            pid);
  }

  auto vmSlot = (*vmSlotIt).payload;

  // Re-map the sub-range with the new protection, preserving backing.
  process.vmTable.map(address, address + size,
                      VmMapSlot{
                          .memoryType = vmSlot.memoryType,
                          .prot = static_cast<int>(prot),
                          .offset = vmSlot.offset,
                          .baseAddress = vmSlot.baseAddress,
                      });

  if (process.vmId >= 0) {
    auto memory = amdgpu::RemoteMemory{process.vmId};
    // Upper bits of prot carry the GPU protection.
    rx::mem::protect(memory.getPointer(address), size, prot >> 4);
    handleProtectChange(process.vmId, address, size, prot);
  }
}
// Entry point for guest command-buffer submissions. Decodes the PM4 opcode
// from the header and feeds the buffer to graphics pipe 0 as either the
// constant-engine (CE) queue or a draw-engine (DE) indirect ring.
void Device::onCommandBuffer(std::int64_t pid, int cmdHeader,
                             std::uint64_t address, std::uint64_t size) {
  auto &process = processInfo[pid];
  if (process.vmId < 0) {
    // Process is not attached to a VM; nothing to execute against.
    return;
  }

  auto memory = RemoteMemory{process.vmId};

  // PM4 opcode lives in bits 15:8 of the header.
  auto op = rx::getBits(cmdHeader, 15, 8);

  if (op == gnm::IT_INDIRECT_BUFFER_CNST) {
    graphicsPipes[0].setCeQueue(Queue::createFromRange(
        process.vmId, memory.getPointer<std::uint32_t>(address),
        size / sizeof(std::uint32_t)));
  } else if (op == gnm::IT_INDIRECT_BUFFER) {
    graphicsPipes[0].setDeQueue(
        Queue::createFromRange(process.vmId,
                               memory.getPointer<std::uint32_t>(address),
                               size / sizeof(std::uint32_t)),
        1);
  } else {
    rx::die("unimplemented command buffer %x", cmdHeader);
  }
}
// Runs one processing pass over all pipes. Returns true when every ring on
// every pipe has been fully drained, false if any work remains.
bool Device::processPipes() {
  bool everythingDrained = true;

  // Compute pipes are not implemented yet.
  // for (auto &pipe : computePipes) {
  //   if (!pipe.processAllRings()) {
  //     everythingDrained = false;
  //   }
  // }

  for (auto &gfxPipe : graphicsPipes) {
    // Note: &= still invokes processAllRings() for every pipe.
    everythingDrained &= gfxPipe.processAllRings();
  }

  return everythingDrained;
}
// Records an image layout transition into the command buffer, deriving the
// pipeline stage and access masks from the source and destination layouts.
// Only the layouts used by this file are handled; anything else aborts.
static void
transitionImageLayout(VkCommandBuffer commandBuffer, VkImage image,
                      VkImageLayout oldLayout, VkImageLayout newLayout,
                      const VkImageSubresourceRange &subresourceRange) {
  VkImageMemoryBarrier barrier{};
  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
  barrier.oldLayout = oldLayout;
  barrier.newLayout = newLayout;
  // No queue-family ownership transfer.
  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.image = image;
  barrier.subresourceRange = subresourceRange;

  // Maps a layout to the (stage, access) pair to synchronize against.
  auto layoutToStageAccess = [](VkImageLayout layout)
      -> std::pair<VkPipelineStageFlags, VkAccessFlags> {
    switch (layout) {
    case VK_IMAGE_LAYOUT_UNDEFINED:
    case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR:
    case VK_IMAGE_LAYOUT_GENERAL:
      // Nothing to wait for / no prior access to make visible.
      return {VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0};

    case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL:
      return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT};

    case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL:
      return {VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT};

    case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL:
      return {VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_ACCESS_SHADER_READ_BIT};

    case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL:
      return {VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT,
              VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
                  VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT};

    case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL:
      return {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
              VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
                  VK_ACCESS_COLOR_ATTACHMENT_READ_BIT};

    default:
      // Unsupported layout for this helper.
      std::abort();
    }
  };

  auto [sourceStage, sourceAccess] = layoutToStageAccess(oldLayout);
  auto [destinationStage, destinationAccess] = layoutToStageAccess(newLayout);

  barrier.srcAccessMask = sourceAccess;
  barrier.dstAccessMask = destinationAccess;

  vkCmdPipelineBarrier(commandBuffer, sourceStage, destinationStage, 0, 0,
                       nullptr, 0, nullptr, 1, &barrier);
}
// Presents one guest display buffer: records a blit/flip of the buffer
// into the swapchain image, submits it chained to the pipe scheduler's
// timeline, and schedules the bridge flip bookkeeping to run on completion.
// Returns false if the process is not attached to a VM.
bool Device::flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
                  VkCommandBuffer commandBuffer, VkImage swapchainImage,
                  VkImageView swapchainImageView, VkFence fence) {
  auto &pipe = graphicsPipes[0];
  auto &scheduler = pipe.scheduler;
  auto &process = processInfo[pid];
  if (process.vmId < 0) {
    return false;
  }

  auto &buffer = process.buffers[bufferIndex];
  auto &bufferAttr = process.bufferAttributes[buffer.attrId];

  // Translate the guest pixel format into gnm formats and component swap.
  // NOTE(review): SNormNoZero for display surfaces looks suspicious — one
  // would expect an unorm/srgb interpretation; confirm against the flip
  // shader's expectations.
  gnm::DataFormat dfmt;
  gnm::NumericFormat nfmt;
  CbCompSwap compSwap;
  switch (bufferAttr.pixelFormat) {
  case 0x80000000:
    // bgra
    dfmt = gnm::kDataFormat8_8_8_8;
    nfmt = gnm::kNumericFormatSNormNoZero;
    compSwap = CbCompSwap::Alt;
    break;

  case 0x80002200:
    // rgba
    dfmt = gnm::kDataFormat8_8_8_8;
    nfmt = gnm::kNumericFormatSNormNoZero;
    compSwap = CbCompSwap::Std;
    break;

  case 0x88060000:
    // bgra
    dfmt = gnm::kDataFormat2_10_10_10;
    nfmt = gnm::kNumericFormatSNormNoZero;
    compSwap = CbCompSwap::Alt;
    break;

  default:
    rx::die("unimplemented color buffer format %x", bufferAttr.pixelFormat);
  }

  // std::printf("displaying buffer %lx\n", buffer.address);

  VkCommandBufferBeginInfo beginInfo{};
  beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
  beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
  vkBeginCommandBuffer(commandBuffer, &beginInfo);

  auto cacheTag = getCacheTag(process.vmId, scheduler);

  // Primary path: render the guest buffer to the swapchain via the flip
  // shader. The else branch (blit through a cached image) is kept disabled
  // as a debugging fallback.
  if (true) {
    transitionImageLayout(commandBuffer, swapchainImage,
                          VK_IMAGE_LAYOUT_UNDEFINED,
                          VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
                          {
                              .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                              .levelCount = 1,
                              .layerCount = 1,
                          });

    amdgpu::flip(cacheTag, commandBuffer, vk::context->swapchainExtent,
                 buffer.address, swapchainImageView,
                 {bufferAttr.width, bufferAttr.height}, compSwap,
                 getDefaultTileModes()[13], dfmt, nfmt);

    transitionImageLayout(commandBuffer, swapchainImage,
                          VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
                          VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
                          {
                              .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                              .levelCount = 1,
                              .layerCount = 1,
                          });
  } else {
    ImageKey frameKey{
        .address = buffer.address,
        .type = gnm::TextureType::Dim2D,
        .dfmt = dfmt,
        .nfmt = nfmt,
        .tileMode = getDefaultTileModes()[13],
        .extent =
            {
                .width = bufferAttr.width,
                .height = bufferAttr.height,
                .depth = 1,
            },
        .pitch = bufferAttr.width,
        .mipCount = 1,
        .arrayLayerCount = 1,
    };

    auto image = cacheTag.getImage(frameKey, Access::Read);

    // Ensure the cached image upload completed before blitting from it.
    scheduler.submit();
    scheduler.wait();

    transitionImageLayout(commandBuffer, swapchainImage,
                          VK_IMAGE_LAYOUT_UNDEFINED,
                          VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                          {
                              .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                              .levelCount = 1,
                              .layerCount = 1,
                          });

    // Scale the guest buffer to the swapchain size.
    VkImageBlit region{
        .srcSubresource = {.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                           .mipLevel = 0,
                           .baseArrayLayer = 0,
                           .layerCount = 1},
        .srcOffsets = {{},
                       {static_cast<int32_t>(bufferAttr.width),
                        static_cast<int32_t>(bufferAttr.height), 1}},
        .dstSubresource = {.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                           .mipLevel = 0,
                           .baseArrayLayer = 0,
                           .layerCount = 1},
        .dstOffsets =
            {{},
             {static_cast<int32_t>(vk::context->swapchainExtent.width),
              static_cast<int32_t>(vk::context->swapchainExtent.height), 1}},
    };

    vkCmdBlitImage(commandBuffer, image.handle, VK_IMAGE_LAYOUT_GENERAL,
                   swapchainImage, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1,
                   &region, VK_FILTER_LINEAR);

    transitionImageLayout(commandBuffer, swapchainImage,
                          VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                          VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
                          {
                              .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                              .levelCount = 1,
                              .layerCount = 1,
                          });
  }

  // Submit the present command buffer chained to the pipe's timeline:
  // wait for the previous scheduler submit and the acquire semaphore,
  // signal render-complete plus the next timeline value.
  auto submitCompleteTask = scheduler.createExternalSubmit();

  {
    vkEndCommandBuffer(commandBuffer);

    VkSemaphoreSubmitInfo signalSemSubmitInfos[] = {
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = vk::context->renderCompleteSemaphore,
            .value = 1,
            .stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
        },
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = scheduler.getSemaphoreHandle(),
            .value = submitCompleteTask,
            .stageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
        },
    };

    VkSemaphoreSubmitInfo waitSemSubmitInfos[] = {
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = vk::context->presentCompleteSemaphore,
            .value = 1,
            .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
        },
        {
            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
            .semaphore = scheduler.getSemaphoreHandle(),
            // Previous timeline value: all prior scheduler work.
            .value = submitCompleteTask - 1,
            .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
        },
    };

    VkCommandBufferSubmitInfo cmdBufferSubmitInfo{
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
        .commandBuffer = commandBuffer,
    };

    // NOTE(review): waitSemaphoreInfoCount is 1 although two wait infos are
    // declared — the scheduler-timeline wait is never registered; confirm
    // whether this should be 2.
    VkSubmitInfo2 submitInfo{
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
        .waitSemaphoreInfoCount = 1,
        .pWaitSemaphoreInfos = waitSemSubmitInfos,
        .commandBufferInfoCount = 1,
        .pCommandBufferInfos = &cmdBufferSubmitInfo,
        .signalSemaphoreInfoCount = 2,
        .pSignalSemaphoreInfos = signalSemSubmitInfos,
    };

    vkQueueSubmit2(vk::context->presentQueue, 1, &submitInfo, fence);
    // vkQueueWaitIdle(queue);
  }

  // After the GPU work completes: publish the flip to the bridge and mark
  // the guest buffer as no longer in use. cacheTag is kept alive inside
  // the callback so cached resources survive until then.
  scheduler.then([=, this, cacheTag = std::move(cacheTag)] {
    bridge->flipBuffer[process.vmId] = bufferIndex;
    bridge->flipArg[process.vmId] = arg;
    bridge->flipCount[process.vmId] = bridge->flipCount[process.vmId] + 1;

    auto mem = RemoteMemory{process.vmId};
    auto bufferInUse =
        mem.getPointer<std::uint64_t>(bridge->bufferInUseAddress[process.vmId]);
    if (bufferInUse != nullptr) {
      bufferInUse[bufferIndex] = 0;
    }
  });

  return true;
}
// Records a guest mapping in the process VM table and, if the process is
// attached to a VM, mirrors it as a host mmap inside the VM window.
// memoryType >= 0 selects direct memory backed by dmemFd[dmemIndex];
// otherwise the mapping is backed by the process shared memory.
void Device::mapMemory(std::int64_t pid, std::uint64_t address,
                       std::uint64_t size, int memoryType, int dmemIndex,
                       int prot, std::int64_t offset) {
  auto &process = processInfo[pid];

  process.vmTable.map(address, address + size,
                      VmMapSlot{
                          .memoryType = memoryType >= 0 ? dmemIndex : -1,
                          .prot = prot,
                          .offset = offset,
                          .baseAddress = address,
                      });

  if (process.vmId < 0) {
    // Not attached yet; the mapping is replayed later by mapProcess().
    return;
  }

  auto memory = amdgpu::RemoteMemory{process.vmId};

  int mapFd = process.vmFd;

  if (memoryType >= 0) {
    mapFd = dmemFd[dmemIndex];
  }

  // Upper bits of prot carry the GPU protection.
  auto mmapResult = ::mmap(memory.getPointer(address), size, prot >> 4,
                           MAP_FIXED | MAP_SHARED, mapFd, offset);

  if (mmapResult == MAP_FAILED) {
    rx::die("failed to map process %x memory, address %lx-%lx, type %x",
            (int)pid, address, address + size, memoryType);
  }

  handleProtectChange(process.vmId, address, size, prot);
}
void Device::registerBuffer(std::int64_t pid, bridge::CmdBuffer buffer) {
auto &process = processInfo[pid];
if (buffer.attrId >= 10 || buffer.index >= 10) {
rx::die("out of buffers %u, %u", buffer.attrId, buffer.index);
}
process.buffers[buffer.index] = buffer;
}
void Device::registerBufferAttribute(std::int64_t pid,
bridge::CmdBufferAttribute attr) {
auto &process = processInfo[pid];
if (attr.attrId >= 10) {
rx::die("out of buffer attributes %u", attr.attrId);
}
process.bufferAttributes[attr.attrId] = attr;
}
// Hook invoked whenever a guest mapping's GPU protection changes (called
// from mapProcess/mapMemory/protectMemory). Currently a no-op stub kept as
// an extension point, e.g. for cache invalidation on protection changes.
void Device::handleProtectChange(int vmId, std::uint64_t address,
                                 std::uint64_t size, int prot) {}

91
rpcsx-gpu2/Device.hpp Normal file
View file

@ -0,0 +1,91 @@
#pragma once
#include "Cache.hpp"
#include "Pipe.hpp"
#include "amdgpu/bridge/bridge.hpp"
#include "amdgpu/tiler_vulkan.hpp"
#include "gnm/descriptors.hpp"
#include "rx/MemoryTable.hpp"
#include "shader/SemanticInfo.hpp"
#include "shader/SpvConverter.hpp"
#include "shader/gcn.hpp"
#include <unordered_map>
#include <vulkan/vulkan_core.h>
namespace amdgpu {
// One entry of a process VM table: how a guest address range is backed.
struct VmMapSlot {
  int memoryType;            // dmem index for direct memory, -1 for process shm
  int prot;                  // protection; bits 4+ hold the GPU protection (prot >> 4)
  std::int64_t offset;       // backing-file offset of baseAddress
  std::uint64_t baseAddress; // guest address where this mapping started

  auto operator<=>(const VmMapSlot &) const = default;
};
// Per-guest-process state tracked by the Device.
struct ProcessInfo {
  int vmId = -1; // GPU VM slot the process is attached to, -1 if detached
  int vmFd = -1; // shm fd backing the process memory, -1 if not open
  // Fixed tables of display buffers and their attributes (10 slots each,
  // bounds-checked in Device::registerBuffer*).
  amdgpu::bridge::CmdBufferAttribute bufferAttributes[10];
  amdgpu::bridge::CmdBuffer buffers[10];
  rx::MemoryTableWithPayload<VmMapSlot> vmTable;
};
// Non-owning view of one guest address space. Each VM occupies its own
// 1 TiB (1 << 40) window of the host address space selected by vmId, so a
// guest address translates to host by OR-ing it into that window.
struct RemoteMemory {
  int vmId;

  // Translates a guest address to a host pointer inside this VM's window.
  // A zero guest address translates to nullptr.
  template <typename T = void> T *getPointer(std::uint64_t address) const {
    if (address == 0) {
      return nullptr;
    }

    auto window = static_cast<std::uint64_t>(vmId) << 40;
    return reinterpret_cast<T *>(window | address);
  }
};
// Top-level GPU emulator state: the shader-translation semantics, the
// graphics pipes, per-VM resource caches and per-guest-process bookkeeping.
struct Device {
  // kComputePipeCount is currently unused — compute pipes are still
  // commented out below.
  static constexpr auto kComputePipeCount = 8;
  static constexpr auto kGfxPipeCount = 2;

  // Built-in RDNA semantic module data, loaded once in the constructor and
  // shared by all shader translations.
  shader::SemanticInfo gcnSemantic;
  shader::spv::Context shaderSemanticContext;
  shader::gcn::SemanticModuleInfo gcnSemanticModuleInfo;

  // Shared-memory bridge to the OS-side process (flip state, buffers).
  amdgpu::bridge::BridgeHeader *bridge;
  Registers::Config config;
  GpuTiler tiler;
  GraphicsPipe graphicsPipes[kGfxPipeCount]{0, 1};
  // ComputePipe computePipes[kComputePipeCount]{0, 1, 2, 3, 4, 5, 6, 7};

  // Direct-memory backing descriptors, one per dmem area; -1 = not open.
  int dmemFd[3] = {-1, -1, -1};
  std::unordered_map<std::int64_t, ProcessInfo> processInfo;

  // One resource cache per GPU VM slot.
  Cache caches[6]{
      {this, 0}, {this, 1}, {this, 2}, {this, 3}, {this, 4}, {this, 5},
  };

  Device();
  ~Device();

  // Creates a cache access tag for the given VM bound to a scheduler.
  Cache::Tag getCacheTag(int vmId, Scheduler &scheduler) {
    return caches[vmId].createTag(scheduler);
  }

  void mapProcess(std::int64_t pid, int vmId, const char *shmName);
  void unmapProcess(std::int64_t pid);
  void protectMemory(int pid, std::uint64_t address, std::uint64_t size,
                     int prot);
  void onCommandBuffer(std::int64_t pid, int cmdHeader, std::uint64_t address,
                       std::uint64_t size);
  // Returns true when all pipe rings are fully drained.
  bool processPipes();
  // Presents a guest display buffer; returns false if pid is not mapped.
  bool flip(std::int64_t pid, int bufferIndex, std::uint64_t arg,
            VkCommandBuffer commandBuffer, VkImage swapchainImage,
            VkImageView swapchainImageView, VkFence fence);
  void mapMemory(std::int64_t pid, std::uint64_t address, std::uint64_t size,
                 int memoryType, int dmemIndex, int prot, std::int64_t offset);
  void registerBuffer(std::int64_t pid, bridge::CmdBuffer buffer);
  void registerBufferAttribute(std::int64_t pid,
                               bridge::CmdBufferAttribute attr);
  void handleProtectChange(int vmId, std::uint64_t address, std::uint64_t size,
                           int prot);
};
} // namespace amdgpu

987
rpcsx-gpu2/Pipe.cpp Normal file
View file

@ -0,0 +1,987 @@
#include "Pipe.hpp"
#include "Device.hpp"
#include "Registers.hpp"
#include "Renderer.hpp"
#include "gnm/mmio.hpp"
#include "gnm/pm4.hpp"
#include "vk.hpp"
#include <cstdio>
#include <rx/bits.hpp>
#include <rx/die.hpp>
#include <vulkan/vulkan_core.h>
using namespace amdgpu;
// Builds the Vulkan scheduler for a graphics pipe. Pipe 0 uses the present
// queue; other pipes prefer a graphics queue from a different queue family
// when one exists (the loop keeps the last such queue).
static Scheduler createGfxScheduler(int index) {
  auto queue = vk::context->presentQueue;
  auto family = vk::context->presentQueueFamily;

  if (index != 0) {
    for (auto [otherQueue, otherFamily] : vk::context->graphicsQueues) {
      if (family != otherFamily) {
        queue = otherQueue;
        family = otherFamily;
      }
    }
  }

  return Scheduler{queue, family};
}
// Builds the Vulkan scheduler for a compute pipe, round-robining pipes
// over the available compute queues.
static Scheduler createComputeScheduler(int index) {
  auto &available = vk::context->computeQueues;
  auto [selectedQueue, selectedFamily] = available[index % available.size()];
  return Scheduler{selectedQueue, selectedFamily};
}
// Evaluates a PM4 WAIT_REG_MEM-style comparison between a polled value and
// a reference, both masked first. Functions: 0 = always, 1 = <, 2 = <=,
// 3 = ==, 4 = !=, 5 = >=, 6 = >; anything else compares false.
static bool compare(int cmpFn, std::uint32_t poll, std::uint32_t mask,
                    std::uint32_t ref) {
  const std::uint32_t lhs = poll & mask;
  const std::uint32_t rhs = ref & mask;

  if (cmpFn == 0) {
    return true;
  }
  if (cmpFn == 1) {
    return lhs < rhs;
  }
  if (cmpFn == 2) {
    return lhs <= rhs;
  }
  if (cmpFn == 3) {
    return lhs == rhs;
  }
  if (cmpFn == 4) {
    return lhs != rhs;
  }
  if (cmpFn == 5) {
    return lhs >= rhs;
  }
  if (cmpFn == 6) {
    return lhs > rhs;
  }

  return false;
}
// Initializes a compute pipe: every PM4 opcode defaults to the
// unknownPacket trap; only IT_NOP has a real handler so far.
ComputePipe::ComputePipe(int index) : scheduler(createComputeScheduler(index)) {
  for (auto &handler : commandHandlers) {
    handler = &ComputePipe::unknownPacket;
  }

  commandHandlers[gnm::IT_NOP] = &ComputePipe::handleNop;
}
bool ComputePipe::processAllRings() {
bool allProcessed = true;
for (auto &ring : queues) {
processRing(ring);
if (ring.rptr != ring.wptr) {
allProcessed = false;
break;
}
}
return allProcessed;
}
// Drains one compute ring: decodes PM4 packets between rptr and wptr and
// dispatches type-3 packets through commandHandlers. A handler returning
// false pauses processing without consuming the packet (it is retried on
// the next pass).
void ComputePipe::processRing(Queue &queue) {
  while (queue.rptr != queue.wptr) {
    // Wrap the read pointer at the end of the ring buffer.
    if (queue.rptr >= queue.base + queue.size) {
      queue.rptr = queue.base;
    }

    auto header = *queue.rptr;
    auto type = rx::getBits(header, 31, 30);

    if (type == 3) {
      auto op = rx::getBits(header, 15, 8);
      // Packet length in dwords, including the header dword.
      auto len = rx::getBits(header, 29, 16) + 2;

      // std::fprintf(stderr, "queue %d: %s\n", queue.indirectLevel,
      // gnm::pm4OpcodeToString(op));

      if (op == gnm::IT_COND_EXEC) {
        rx::die("unimplemented COND_EXEC");
      }

      auto handler = commandHandlers[op];

      if (!(this->*handler)(queue)) {
        // Handler stalled; leave rptr so the packet is retried.
        return;
      }

      queue.rptr += len;
      continue;
    }

    if (type == 2) {
      // Type-2 packets are single-dword filler; just skip them.
      ++queue.rptr;
      continue;
    }

    rx::die("unexpected pm4 packet type %u", type);
  }
}
// Trap handler for PM4 opcodes without an implementation: aborts with the
// opcode name and the queue's indirection level.
bool ComputePipe::unknownPacket(Queue &queue) {
  auto op = rx::getBits(queue.rptr[0], 15, 8);

  rx::die("unimplemented compute pm4 packet: %s, queue %u\n",
          gnm::pm4OpcodeToString(op), queue.indirectLevel);
  return true;
}
bool ComputePipe::handleNop(Queue &queue) { return true; }
// Initializes a graphics pipe: fills every handler table slot with the
// unknownPacket trap, then installs the implemented PM4 handlers. The
// commented opcode names document packets that are known but not yet
// implemented. Handler tables are indexed by processor:
//   [0] = constant engine (CE), [1] = draw engine (DE), [2] = data.
GraphicsPipe::GraphicsPipe(int index) : scheduler(createGfxScheduler(index)) {
  for (auto &processorHandlers : commandHandlers) {
    for (auto &handler : processorHandlers) {
      handler = &GraphicsPipe::unknownPacket;
    }

    processorHandlers[gnm::IT_NOP] = &GraphicsPipe::handleNop;
  }

  // NOTE(review): dataHandlers is currently unused — no data-processor
  // packets are installed yet.
  auto &dataHandlers = commandHandlers[2];
  auto &deHandlers = commandHandlers[1];
  auto &ceHandlers = commandHandlers[0];

  // Draw-engine packets.
  deHandlers[gnm::IT_SET_BASE] = &GraphicsPipe::setBase;
  deHandlers[gnm::IT_CLEAR_STATE] = &GraphicsPipe::clearState;
  deHandlers[gnm::IT_INDEX_BUFFER_SIZE] = &GraphicsPipe::indexBufferSize;
  deHandlers[gnm::IT_DISPATCH_DIRECT] = &GraphicsPipe::dispatchDirect;
  deHandlers[gnm::IT_DISPATCH_INDIRECT] = &GraphicsPipe::dispatchIndirect;

  // IT_ATOMIC_GDS
  // IT_OCCLUSION_QUERY

  deHandlers[gnm::IT_SET_PREDICATION] = &GraphicsPipe::setPredication;

  // IT_REG_RMW
  // IT_COND_EXEC
  // IT_PRED_EXEC

  deHandlers[gnm::IT_DRAW_INDIRECT] = &GraphicsPipe::drawIndirect;
  deHandlers[gnm::IT_DRAW_INDEX_INDIRECT] = &GraphicsPipe::drawIndexIndirect;
  deHandlers[gnm::IT_INDEX_BASE] = &GraphicsPipe::indexBase;
  deHandlers[gnm::IT_DRAW_INDEX_2] = &GraphicsPipe::drawIndex2;
  deHandlers[gnm::IT_CONTEXT_CONTROL] = &GraphicsPipe::contextControl;
  deHandlers[gnm::IT_INDEX_TYPE] = &GraphicsPipe::indexType;

  // IT_DRAW_INDIRECT_MULTI

  deHandlers[gnm::IT_DRAW_INDEX_AUTO] = &GraphicsPipe::drawIndexAuto;
  deHandlers[gnm::IT_NUM_INSTANCES] = &GraphicsPipe::numInstances;
  deHandlers[gnm::IT_DRAW_INDEX_MULTI_AUTO] = &GraphicsPipe::drawIndexMultiAuto;

  // IT_INDIRECT_BUFFER_CNST
  // IT_STRMOUT_BUFFER_UPDATE

  deHandlers[gnm::IT_DRAW_INDEX_OFFSET_2] = &GraphicsPipe::drawIndexOffset2;
  deHandlers[gnm::IT_DRAW_PREAMBLE] = &GraphicsPipe::drawPreamble;
  deHandlers[gnm::IT_WRITE_DATA] = &GraphicsPipe::writeData;
  deHandlers[gnm::IT_MEM_SEMAPHORE] = &GraphicsPipe::memSemaphore;

  // IT_COPY_DW

  deHandlers[gnm::IT_WAIT_REG_MEM] = &GraphicsPipe::waitRegMem;
  deHandlers[gnm::IT_INDIRECT_BUFFER] = &GraphicsPipe::indirectBuffer;

  // IT_COPY_DATA

  deHandlers[gnm::IT_PFP_SYNC_ME] = &GraphicsPipe::pfpSyncMe;

  // IT_SURFACE_SYNC

  deHandlers[gnm::IT_COND_WRITE] = &GraphicsPipe::condWrite;
  deHandlers[gnm::IT_EVENT_WRITE] = &GraphicsPipe::eventWrite;
  deHandlers[gnm::IT_EVENT_WRITE_EOP] = &GraphicsPipe::eventWriteEop;
  deHandlers[gnm::IT_EVENT_WRITE_EOS] = &GraphicsPipe::eventWriteEos;
  deHandlers[gnm::IT_RELEASE_MEM] = &GraphicsPipe::releaseMem;

  // IT_PREAMBLE_CNTL

  deHandlers[gnm::IT_DMA_DATA] = &GraphicsPipe::dmaData;
  deHandlers[gnm::IT_ACQUIRE_MEM] = &GraphicsPipe::acquireMem;

  // IT_REWIND
  // IT_LOAD_UCONFIG_REG
  // IT_LOAD_SH_REG
  // IT_LOAD_CONFIG_REG
  // IT_LOAD_CONTEXT_REG

  // Register write packets.
  deHandlers[gnm::IT_SET_CONFIG_REG] = &GraphicsPipe::setConfigReg;
  deHandlers[gnm::IT_SET_CONTEXT_REG] = &GraphicsPipe::setContextReg;

  // IT_SET_CONTEXT_REG_INDIRECT

  deHandlers[gnm::IT_SET_SH_REG] = &GraphicsPipe::setShReg;

  // IT_SET_SH_REG_OFFSET
  // IT_SET_QUEUE_REG

  deHandlers[gnm::IT_SET_UCONFIG_REG] = &GraphicsPipe::setUConfigReg;

  // IT_SCRATCH_RAM_WRITE
  // IT_SCRATCH_RAM_READ

  // CE/DE synchronization counters.
  deHandlers[gnm::IT_INCREMENT_DE_COUNTER] = &GraphicsPipe::incrementDeCounter;
  deHandlers[gnm::IT_WAIT_ON_CE_COUNTER] = &GraphicsPipe::waitOnCeCounter;
  deHandlers[gnm::IT_SET_CE_DE_COUNTERS] = &GraphicsPipe::setCeDeCounters;

  // IT_WAIT_ON_AVAIL_BUFFER
  // IT_SWITCH_BUFFER
  // IT_SET_RESOURCES
  // IT_MAP_PROCESS
  // IT_MAP_QUEUES
  // IT_UNMAP_QUEUES
  // IT_QUERY_STATUS
  // IT_RUN_LIST
  // IT_DISPATCH_DRAW_PREAMBLE
  // IT_DISPATCH_DRAW

  // Constant-engine packets.
  ceHandlers[gnm::IT_WAIT_ON_DE_COUNTER_DIFF] =
      &GraphicsPipe::waitOnDeCounterDiff;
  ceHandlers[gnm::IT_INCREMENT_CE_COUNTER] = &GraphicsPipe::incrementCeCounter;
  ceHandlers[gnm::IT_LOAD_CONST_RAM] = &GraphicsPipe::loadConstRam;
  ceHandlers[gnm::IT_WRITE_CONST_RAM] = &GraphicsPipe::writeConstRam;
  ceHandlers[gnm::IT_DUMP_CONST_RAM] = &GraphicsPipe::dumpConstRam;
}
void GraphicsPipe::setCeQueue(Queue queue) {
  // CE rings are tagged with indirect level -1; processRing uses this to
  // select the constant-engine handler table (cp = 0).
  queue.indirectLevel = -1;
  ceQueue = queue;
}
void GraphicsPipe::setDeQueue(Queue queue, int ring) {
  // Only three DE rings exist: the primary ring and two indirect levels.
  // NOTE(review): indirectLevel is stored as 2 - ring, so ring 0 gets the
  // deepest level — confirm this inversion is intended (see processRing's
  // cp selection and indirectBuffer's ring computation).
  rx::dieIf(ring > 2, "out of indirect gfx rings, %u", ring);
  queue.indirectLevel = 2 - ring;
  deQueues[ring] = queue;
}
// Translates a memory-mapped register dword address into a pointer inside
// the matching register block (SH / UConfig / Context). Dies on addresses
// outside all known blocks.
std::uint32_t *GraphicsPipe::getMmRegister(std::uint32_t dwAddress) {
  // if (dwAddress >= Registers::Config::kMmioOffset &&
  //     dwAddress < Registers::Config::kMmioOffset +
  //     sizeof(Registers::Config) / sizeof(std::uint32_t)) {
  //   return reinterpret_cast<std::uint32_t *>(&config) + (dwAddress -
  //   Registers::Config::kMmioOffset);
  // }
  if (dwAddress >= Registers::ShaderConfig::kMmioOffset &&
      dwAddress < Registers::ShaderConfig::kMmioOffset +
                      sizeof(Registers::ShaderConfig) / sizeof(std::uint32_t)) {
    return reinterpret_cast<std::uint32_t *>(&sh) +
           (dwAddress - Registers::ShaderConfig::kMmioOffset);
  }
  if (dwAddress >= Registers::UConfig::kMmioOffset &&
      dwAddress < Registers::UConfig::kMmioOffset +
                      sizeof(Registers::UConfig) / sizeof(std::uint32_t)) {
    return reinterpret_cast<std::uint32_t *>(&uConfig) +
           (dwAddress - Registers::UConfig::kMmioOffset);
  }
  if (dwAddress >= Registers::Context::kMmioOffset &&
      dwAddress < Registers::Context::kMmioOffset +
                      sizeof(Registers::Context) / sizeof(std::uint32_t)) {
    return reinterpret_cast<std::uint32_t *>(&context) +
           (dwAddress - Registers::Context::kMmioOffset);
  }
  rx::die("unexpected memory mapped register address %x, %s", dwAddress,
          gnm::mmio::registerName(dwAddress));
}
// Pumps the CE ring, then the DE rings in index order. Returns true only
// when every ring has been fully drained (rptr caught up with wptr).
bool GraphicsPipe::processAllRings() {
  bool allProcessed = true;
  if (ceQueue.rptr != ceQueue.wptr) {
    processRing(ceQueue);
    if (ceQueue.rptr != ceQueue.wptr) {
      allProcessed = false;
    }
  }
  for (int i = 0; i < 3; ++i) {
    auto &queue = deQueues[i];
    processRing(queue);
    if (queue.rptr != queue.wptr) {
      // A stalled DE ring blocks the later ones; they are not touched until
      // this ring can make progress again.
      allProcessed = false;
      break;
    }
  }
  return allProcessed;
}
// Decodes and dispatches PM4 packets from one ring until it drains, a
// handler stalls, or an indirect buffer is launched.
void GraphicsPipe::processRing(Queue &queue) {
  // Handler table selector: 0 = constant engine, 2 = deepest indirect DE
  // level, 1 = every other DE ring.
  auto cp = 1;
  if (queue.indirectLevel < 0) {
    cp = 0;
  } else if (queue.indirectLevel == 2) {
    cp = 2;
  }
  while (queue.rptr != queue.wptr) {
    // Wrap the read pointer at the end of the ring buffer.
    if (queue.rptr >= queue.base + queue.size) {
      queue.rptr = queue.base;
    }
    auto header = *queue.rptr;
    auto type = rx::getBits(header, 31, 30);
    if (type == 3) {
      auto op = rx::getBits(header, 15, 8);
      // Type-3 COUNT field encodes (packet size - 2); len is the full
      // packet size in dwords, header included.
      auto len = rx::getBits(header, 29, 16) + 2;
      // std::fprintf(stderr, "queue %d: %s\n", queue.indirectLevel,
      // gnm::pm4OpcodeToString(op));
      if (op == gnm::IT_COND_EXEC) {
        rx::die("unimplemented COND_EXEC");
      }
      auto handler = commandHandlers[cp][op];
      // A handler returning false means the packet cannot complete yet
      // (e.g. an unsatisfied wait); keep rptr in place and retry later.
      if (!(this->*handler)(queue)) {
        return;
      }
      queue.rptr += len;
      // After launching an indirect buffer, yield so the nested ring is
      // processed before the remainder of this one.
      if (op == gnm::IT_INDIRECT_BUFFER || op == gnm::IT_INDIRECT_BUFFER_CNST) {
        break;
      }
      continue;
    }
    if (type == 2) {
      // Type-2 packets are single-dword filler.
      ++queue.rptr;
      continue;
    }
    rx::die("unexpected pm4 packet type %u", type);
  }
}
bool GraphicsPipe::handleNop(Queue &queue) { return true; }
// PM4 SET_BASE: latches one of the patch/partition base addresses selected
// by the index in dw1.
bool GraphicsPipe::setBase(Queue &queue) {
  auto baseIndex = queue.rptr[1] & 0xf;
  switch (baseIndex) {
  case 0: { // display-list patch table base
    auto address0 = queue.rptr[2] & ~3;
    auto address1 = queue.rptr[3] & ((1 << 16) - 1);
    displayListPatchBase =
        address0 | (static_cast<std::uint64_t>(address1) << 32);
    break;
  }
  case 1: { // draw-index-indirect patch table base
    auto address0 = queue.rptr[2] & ~3;
    auto address1 = queue.rptr[3] & ((1 << 16) - 1);
    drawIndexIndirPatchBase =
        address0 | (static_cast<std::uint64_t>(address1) << 32);
    break;
  }
  case 2: { // GDS partition bases for the two CS pipes
    auto cs1Index = queue.rptr[2] & ((1 << 16) - 1);
    auto cs2Index = queue.rptr[3] & ((1 << 16) - 1);
    gdsPartitionBases[0] = cs1Index;
    gdsPartitionBases[1] = cs2Index;
    break;
  }
  case 3: { // CE partition bases
    auto cs1Index = queue.rptr[2] & ((1 << 16) - 1);
    auto cs2Index = queue.rptr[3] & ((1 << 16) - 1);
    cePartitionBases[0] = cs1Index;
    cePartitionBases[1] = cs2Index;
    break;
  }
  default:
    rx::die("pm4: unknown SET_BASE index %u", baseIndex);
  }
  return true;
}
// PM4 CLEAR_STATE: reset all context registers to their hardware defaults.
bool GraphicsPipe::clearState(Queue &queue) {
  context = Registers::Context::Default;
  return true;
}
bool GraphicsPipe::contextControl(Queue &queue) { return true; }
bool GraphicsPipe::acquireMem(Queue &queue) { return true; }
// PM4 RELEASE_MEM: end-of-pipe event that optionally writes a fence value
// or timestamp to memory. Packet layout: dw1 = event control, dw2 = data
// control, dw3 = address low, dw4 = address high, dw5/dw6 = payload lo/hi.
bool GraphicsPipe::releaseMem(Queue &queue) {
  auto eventCntl = queue.rptr[1];
  auto dataCntl = queue.rptr[2];
  auto addressLo = queue.rptr[3] & ~3;
  // FIX: the address high half lives in dw4 (the original re-read dw3, so
  // the upper address bits were garbage and the payload dwords were read
  // one slot too early).
  auto addressHi = queue.rptr[4] & ((1 << 16) - 1);
  auto dataLo = queue.rptr[5];
  auto dataHi = queue.rptr[6];
  auto eventIndex = rx::getBits(eventCntl, 11, 8);
  auto eventType = rx::getBits(eventCntl, 5, 0);
  auto dataSel = rx::getBits(dataCntl, 31, 29);
  auto intSel = rx::getBits(dataCntl, 25, 24);
  auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
  auto pointer = RemoteMemory{queue.vmId}.getPointer<std::uint64_t>(address);
  context.vgtEventInitiator = eventType;
  switch (dataSel) {
  case 0: // none
    break;
  case 1: // 32 bit, low
    *reinterpret_cast<std::uint32_t *>(pointer) = dataLo;
    break;
  case 2: // 64 bit
    *pointer = dataLo | (static_cast<std::uint64_t>(dataHi) << 32);
    break;
  case 3: // 64 bit, global GPU clock
    *pointer = std::chrono::duration_cast<std::chrono::nanoseconds>(
                   std::chrono::system_clock::now().time_since_epoch())
                   .count();
    break;
  case 4: // 64 bit, perf counter
    *pointer = std::chrono::duration_cast<std::chrono::nanoseconds>(
                   std::chrono::steady_clock::now().time_since_epoch())
                   .count();
    break;
  default:
    rx::die("unimplemented event release mem data %#x", dataSel);
  }
  return true;
}
bool GraphicsPipe::drawPreamble(Queue &queue) { return true; }
// INDEX_BUFFER_SIZE: latch the index buffer dword count for later draws.
bool GraphicsPipe::indexBufferSize(Queue &queue) {
  vgtIndexBufferSize = queue.rptr[1];
  return true;
}
// DISPATCH_DIRECT: compute dispatch with inline grid dimensions.
// The dispatch itself is not yet implemented; only the initiator register
// is latched.
bool GraphicsPipe::dispatchDirect(Queue &queue) {
  auto dimX = queue.rptr[1];
  auto dimY = queue.rptr[2];
  auto dimZ = queue.rptr[3];
  auto dispatchInitiator = queue.rptr[4];
  sh.compute.computeDispatchInitiator = dispatchInitiator;
  // FIXME
  return true;
}
// DISPATCH_INDIRECT: compute dispatch with grid dimensions fetched from
// guest memory at (drawIndexIndirPatchBase + offset). Dispatch itself is
// not yet implemented.
bool GraphicsPipe::dispatchIndirect(Queue &queue) {
  auto offset = queue.rptr[1];
  auto dispatchInitiator = queue.rptr[2];
  sh.compute.computeDispatchInitiator = dispatchInitiator;
  auto buffer = RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(
      drawIndexIndirPatchBase + offset);
  auto dimX = buffer[0];
  auto dimY = buffer[1];
  auto dimZ = buffer[2];
  // FIXME
  return true;
}
// SET_PREDICATION: decode only; predicated rendering is not implemented.
// NOTE(review): predBool (bit 8) and hint (bit 12) overlap the
// startAddressHi field (bits 15:0) — confirm the bit layout against the
// PM4 spec before relying on these decodes.
bool GraphicsPipe::setPredication(Queue &queue) {
  auto startAddressLo = queue.rptr[1] & ~0xf;
  auto predProperties = queue.rptr[2];
  auto startAddressHi = rx::getBits(predProperties, 15, 0);
  auto predBool = rx::getBit(predProperties, 8);
  auto hint = rx::getBit(predProperties, 12);
  auto predOp = rx::getBits(predProperties, 18, 16);
  auto cont = rx::getBit(predProperties, 31);
  switch (predOp) {
  case 0: // clear predicate
  case 1: // set ZPass predicate
  case 2: // set PrimCount predicate
    break;
  }
  // TODO
  return true;
}
// DRAW_INDIRECT: non-indexed draw whose arguments live in guest memory at
// (drawIndexIndirPatchBase + dataOffset). Currently unimplemented — dies.
bool GraphicsPipe::drawIndirect(Queue &queue) {
  auto dataOffset = queue.rptr[1];
  auto baseVtxLoc = queue.rptr[2] & ((1 << 16) - 1);
  auto startInstLoc = queue.rptr[3] & ((1 << 16) - 1);
  auto drawInitiator = queue.rptr[4];
  context.vgtDrawInitiator = drawInitiator;
  auto buffer = RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(
      drawIndexIndirPatchBase + dataOffset);
  std::uint32_t vertexCountPerInstance = buffer[0];
  std::uint32_t instanceCount = buffer[1];
  std::uint32_t startVertexLocation = buffer[2];
  std::uint32_t startInstanceLocation = buffer[3];
  // FIXME
  rx::die("drawIndirect");
  return true;
}
// DRAW_INDEX_INDIRECT: indexed draw whose arguments live in guest memory
// at (drawIndexIndirPatchBase + dataOffset). Currently unimplemented — dies.
bool GraphicsPipe::drawIndexIndirect(Queue &queue) {
  auto dataOffset = queue.rptr[1];
  auto baseVtxLoc = queue.rptr[2] & ((1 << 16) - 1);
  auto drawInitiator = queue.rptr[3];
  auto buffer = RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(
      drawIndexIndirPatchBase + dataOffset);
  context.vgtDrawInitiator = drawInitiator;
  std::uint32_t indexCountPerInstance = buffer[0];
  std::uint32_t instanceCount = buffer[1];
  std::uint32_t startIndexLocation = buffer[2];
  std::uint32_t baseVertexLocation = buffer[3];
  std::uint32_t startInstanceLocation = buffer[4];
  // FIXME
  rx::die("drawIndexIndirect");
  return true;
}
// INDEX_BASE: latch the index buffer base address for DRAW_INDEX_2.
// NOTE(review): dw1 is shifted left by one here, which treats the field as
// address >> 1; the PM4 spec describes it as the address with bit 0
// reserved (i.e. `& ~1`) — confirm against a captured command stream.
bool GraphicsPipe::indexBase(Queue &queue) {
  auto addressLo = queue.rptr[1] << 1;
  auto addressHi = queue.rptr[2] & ((1 << 16) - 1);
  auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
  vgtIndexBase = address;
  return true;
}
// DRAW_INDEX_2: indexed draw from the buffer latched by INDEX_BASE.
// maxSize bounds the fetchable index count; indexOffset is added to the
// latched base.
bool GraphicsPipe::drawIndex2(Queue &queue) {
  auto maxSize = queue.rptr[1];
  auto indexOffset = queue.rptr[2];
  auto indexCount = queue.rptr[3];
  auto drawInitiator = queue.rptr[4];
  context.vgtDrawInitiator = drawInitiator;
  uConfig.vgtNumIndices = indexCount;
  draw(*this, queue.vmId, 0, indexCount, 0, uConfig.vgtNumInstances,
       vgtIndexBase + indexOffset, maxSize);
  return true;
}
// INDEX_TYPE: select 16- vs 32-bit indices (only bit 0 is honored).
bool GraphicsPipe::indexType(Queue &queue) {
  uConfig.vgtIndexType = static_cast<gnm::IndexType>(queue.rptr[1] & 1);
  return true;
}
// DRAW_INDEX_AUTO: non-indexed draw; indices are auto-generated 0..N-1
// (index base/size of 0 signals auto-generation to draw()).
bool GraphicsPipe::drawIndexAuto(Queue &queue) {
  auto indexCount = queue.rptr[1];
  auto drawInitiator = queue.rptr[2];
  uConfig.vgtNumIndices = indexCount;
  context.vgtDrawInitiator = drawInitiator;
  draw(*this, queue.vmId, 0, indexCount, 0, uConfig.vgtNumInstances, 0, 0);
  return true;
}
// NUM_INSTANCES: latch the instance count; hardware treats 0 as 1.
bool GraphicsPipe::numInstances(Queue &queue) {
  uConfig.vgtNumInstances = std::max(queue.rptr[1], 1u);
  return true;
}
// DRAW_INDEX_MULTI_AUTO: decode only; the actual multi-draw is not yet
// implemented.
bool GraphicsPipe::drawIndexMultiAuto(Queue &queue) {
  auto primCount = queue.rptr[1];
  auto drawInitiator = queue.rptr[2];
  auto control = queue.rptr[3];
  auto indexOffset = rx::getBits(control, 15, 0);
  auto primType = rx::getBits(control, 20, 16);
  auto indexCount = rx::getBits(control, 31, 21);
  context.vgtDrawInitiator = drawInitiator;
  uConfig.vgtPrimitiveType = static_cast<gnm::PrimitiveType>(primType);
  uConfig.vgtNumIndices = indexCount;
  // FIXME
  return true;
}
// DRAW_INDEX_OFFSET_2: decode only; the draw is not yet implemented.
bool GraphicsPipe::drawIndexOffset2(Queue &queue) {
  auto maxSize = queue.rptr[1];
  auto indexOffset = queue.rptr[2];
  auto indexCount = queue.rptr[3];
  auto drawInitiator = queue.rptr[4];
  context.vgtDrawInitiator = drawInitiator;
  // FIXME
  return true;
}
// PM4 WRITE_DATA: write a run of dwords either into a memory-mapped
// register or into guest memory. Layout: dw1 = control, dw2/dw3 =
// destination address lo/hi, dw4.. = payload.
bool GraphicsPipe::writeData(Queue &queue) {
  // The type-3 COUNT field N means the packet body is N + 1 dwords
  // (processRing advances by N + 2 including the header). Three body dwords
  // are control/address, so the payload is N - 2 dwords.
  // FIX: the original used N - 1, writing one dword past the payload.
  auto len = rx::getBits(queue.rptr[0], 29, 16) - 2;
  auto control = queue.rptr[1];
  auto dstAddressLo = queue.rptr[2];
  auto dstAddressHi = queue.rptr[3];
  auto data = queue.rptr + 4;
  // Decoded but not yet honored: target engine and write confirmation.
  auto engineSel = rx::getBits(control, 31, 30);
  auto wrConfirm = rx::getBit(control, 20);
  auto wrOneAddress = rx::getBit(control, 16);
  auto dstSel = rx::getBits(control, 11, 8);
  std::uint32_t *dstPointer = nullptr;
  switch (dstSel) {
  case 0: // memory mapped register
    dstPointer = getMmRegister(dstAddressLo & ((1 << 16) - 1));
    break;
  case 1:   // memory sync
  case 5: { // memory async
    auto address =
        (dstAddressLo & ~3) | (static_cast<std::uint64_t>(dstAddressHi) << 32);
    dstPointer = RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(address);
    break;
  }
  default:
    rx::die("unimplemented write data, dst sel = %#x", dstSel);
  }
  if (wrOneAddress) {
    // WR_ONE_ADDR: all payload dwords go to the same destination dword.
    for (std::uint32_t i = 0; i < len; ++i) {
      *dstPointer = data[i];
    }
  } else {
    std::memcpy(dstPointer, data, len * sizeof(std::uint32_t));
  }
  return true;
}
// MEM_SEMAPHORE: semaphore signal/wait is not yet implemented.
bool GraphicsPipe::memSemaphore(Queue &queue) {
  // FIXME
  return true;
}
// WAIT_REG_MEM: poll a register or memory dword until (value & mask)
// satisfies the compare function. Returning false stalls the ring so the
// packet is retried on the next processRing pass.
bool GraphicsPipe::waitRegMem(Queue &queue) {
  auto engine = rx::getBit(queue.rptr[1], 8);
  auto memSpace = rx::getBit(queue.rptr[1], 4);
  auto function = rx::getBits(queue.rptr[1], 2, 0);
  auto pollAddressLo = queue.rptr[2];
  auto pollAddressHi = queue.rptr[3] & ((1 << 16) - 1);
  auto reference = queue.rptr[4];
  auto mask = queue.rptr[5];
  auto pollInterval = queue.rptr[6];
  std::uint32_t pollData;
  if (memSpace == 0) {
    // Register space: address is a dword register offset.
    pollData = *getMmRegister(pollAddressLo & ((1 << 16) - 1));
  } else {
    auto pollAddress = (pollAddressLo & ~3) |
                       (static_cast<std::uint64_t>(pollAddressHi) << 32);
    pollData = *RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(pollAddress);
  }
  return compare(function, pollData, mask, reference);
}
// INDIRECT_BUFFER: queue a nested command buffer one indirection level
// deeper. NOTE(review): the vmid is decoded from dw3 but the size from
// dw4 — the standard PM4 packet packs both into dw3 (size bits 19:0,
// vmid bits 31:24); confirm the packet layout used here. The decoded vmId
// is also unused; the parent queue's vmId is reused instead.
bool GraphicsPipe::indirectBuffer(Queue &queue) {
  rx::dieIf(queue.indirectLevel < 0, "unexpected indirect buffer from CP");
  auto addressLo = queue.rptr[1] & ~3;
  auto addressHi = queue.rptr[2] & ((1 << 16) - 1);
  auto vmId = queue.rptr[3] >> 24;
  auto ibSize = queue.rptr[4] & ((1 << 20) - 1);
  auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
  auto rptr = RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(address);
  setDeQueue(Queue::createFromRange(queue.vmId, rptr, ibSize),
             queue.indirectLevel + 1);
  return true;
}
// PFP_SYNC_ME: PFP/ME synchronization is implicit in this sequential
// implementation; nothing to do.
bool GraphicsPipe::pfpSyncMe(Queue &queue) {
  // TODO
  return true;
}
// COND_WRITE: poll a register/memory dword; if the masked compare passes,
// write a dword to a register or memory destination.
bool GraphicsPipe::condWrite(Queue &queue) {
  auto writeSpace = rx::getBit(queue.rptr[1], 8);
  auto pollSpace = rx::getBit(queue.rptr[1], 4);
  auto function = rx::getBits(queue.rptr[1], 2, 0);
  auto pollAddressLo = queue.rptr[2];
  auto pollAddressHi = queue.rptr[3] & ((1 << 16) - 1);
  auto reference = queue.rptr[4];
  auto mask = queue.rptr[5];
  auto writeAddressLo = queue.rptr[6];
  auto writeAddressHi = queue.rptr[7] & ((1 << 16) - 1);
  auto writeData = queue.rptr[8];
  std::uint32_t pollData;
  if (pollSpace == 0) {
    // Register space: address is a dword register offset.
    pollData = *getMmRegister(pollAddressLo & ((1 << 16) - 1));
  } else {
    auto pollAddress = (pollAddressLo & ~3) |
                       (static_cast<std::uint64_t>(pollAddressHi) << 32);
    pollData = *RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(pollAddress);
  }
  if (compare(function, pollData, mask, reference)) {
    if (writeSpace == 0) {
      *getMmRegister(writeAddressLo & ((1 << 16) - 1)) = writeData;
    } else {
      auto writeAddress = (writeAddressLo & ~3) |
                          (static_cast<std::uint64_t>(writeAddressHi) << 32);
      *RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(writeAddress) =
          writeData;
    }
  }
  return true;
}
// EVENT_WRITE: latch the event initiator; events that carry a destination
// address (occlusion/pipeline/streamout samples) are not implemented.
bool GraphicsPipe::eventWrite(Queue &queue) {
  enum {
    kEventZPassDone = 1,
    kEventSamplePipelineStat = 2,
    kEventSampleStreamOutStat = 3,
    kEventPartialFlush = 4,
  };
  auto eventCntl = queue.rptr[1];
  auto invL2 = rx::getBit(eventCntl, 20);
  auto eventIndex = rx::getBits(eventCntl, 11, 8);
  auto eventType = rx::getBits(eventCntl, 5, 0);
  context.vgtEventInitiator = eventType;
  if (eventIndex == kEventZPassDone || eventIndex == kEventSamplePipelineStat ||
      eventIndex == kEventSampleStreamOutStat) {
    // These event kinds append an address payload (dw2/dw3).
    auto addressLo = queue.rptr[2] & ~7;
    auto addressHi = queue.rptr[3] & ((1 << 16) - 1);
    auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
    rx::die("unimplemented event write, event index %#x, address %lx",
            eventIndex, address);
    return true;
  }
  // FIXME
  return true;
}
// EVENT_WRITE_EOP: end-of-pipe event that writes a fence value or
// timestamp. Unlike RELEASE_MEM, the address high half is packed into
// dw3 bits 15:0 alongside the data-select fields.
bool GraphicsPipe::eventWriteEop(Queue &queue) {
  auto eventCntl = queue.rptr[1];
  auto addressLo = queue.rptr[2] & ~3;
  auto dataCntl = queue.rptr[3];
  auto dataLo = queue.rptr[4];
  auto dataHi = queue.rptr[5];
  auto invL2 = rx::getBit(eventCntl, 20);
  auto eventIndex = rx::getBits(eventCntl, 11, 8);
  auto eventType = rx::getBits(eventCntl, 5, 0);
  auto dataSel = rx::getBits(dataCntl, 31, 29);
  auto intSel = rx::getBits(dataCntl, 25, 24);
  auto addressHi = rx::getBits(dataCntl, 15, 0);
  auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
  auto pointer = RemoteMemory{queue.vmId}.getPointer<std::uint64_t>(address);
  context.vgtEventInitiator = eventType;
  switch (dataSel) {
  case 0: // none
    break;
  case 1: // 32 bit, low
    *reinterpret_cast<std::uint32_t *>(pointer) = dataLo;
    break;
  case 2: // 64 bit
    *pointer = dataLo | (static_cast<std::uint64_t>(dataHi) << 32);
    break;
  case 3: // 64 bit, global GPU clock
    *pointer = std::chrono::duration_cast<std::chrono::nanoseconds>(
                   std::chrono::system_clock::now().time_since_epoch())
                   .count();
    break;
  case 4: // 64 bit, perf counter
    *pointer = std::chrono::duration_cast<std::chrono::nanoseconds>(
                   std::chrono::steady_clock::now().time_since_epoch())
                   .count();
    break;
  default:
    rx::die("unimplemented event write eop data %#x", dataSel);
  }
  return true;
}
// EVENT_WRITE_EOS: end-of-shader event; supports the 32-bit fence write,
// GDS dump is not implemented.
bool GraphicsPipe::eventWriteEos(Queue &queue) {
  auto eventCntl = queue.rptr[1];
  auto addressLo = queue.rptr[2] & ~3;
  auto cmdInfo = queue.rptr[3];
  auto dataInfo = queue.rptr[4];
  auto eventIndex = rx::getBits(eventCntl, 11, 8);
  auto eventType = rx::getBits(eventCntl, 5, 0);
  auto cmd = rx::getBits(cmdInfo, 31, 29);
  auto addressHi = rx::getBits(cmdInfo, 15, 0);
  auto address = addressLo | (static_cast<std::uint64_t>(addressHi) << 32);
  auto pointer = RemoteMemory{queue.vmId}.getPointer<std::uint32_t>(address);
  context.vgtEventInitiator = eventType;
  switch (cmd) {
  case 1: { // store GDS data to memory
    auto sizeDw = rx::getBits(dataInfo, 31, 16);
    auto gdsIndexDw = rx::getBits(dataInfo, 15, 0);
    rx::die("unimplemented event write eos gds data");
    break;
  }
  case 2: // after GDS writes confirm, store 32 bit DATA to memory as fence
    *pointer = dataInfo;
    break;
  default:
    rx::die("unexpected event write eos command: %#x", cmd);
  }
  return true;
}
// DMA_DATA: CP DMA transfers are not yet implemented; the packet is
// silently consumed.
bool GraphicsPipe::dmaData(Queue &queue) {
  // FIXME
  return true;
}
// SET_CONFIG_REG: burst write into the device-global config register
// block. dw1 carries the dword offset, dw2.. the values. Only the primary
// (non-indirect) DE ring may touch these registers.
bool GraphicsPipe::setConfigReg(Queue &queue) {
  rx::dieIf(queue.indirectLevel != 0, "setConfigReg from queue %d",
            queue.indirectLevel);
  auto count = rx::getBits(queue.rptr[0], 29, 16);
  auto regOffset = queue.rptr[1];
  auto values = queue.rptr + 2;
  rx::dieIf(
      (regOffset + count) * sizeof(std::uint32_t) > sizeof(device->config),
      "out of Config regs, offset: %u, count %u, %s\n", regOffset, count,
      gnm::mmio::registerName(decltype(device->config)::kMmioOffset +
                              regOffset));
  auto dst = reinterpret_cast<std::uint32_t *>(&device->config) + regOffset;
  std::memcpy(dst, values, sizeof(std::uint32_t) * count);
  return true;
}
// SET_SH_REG: burst write into the persistent-state (SH) register block.
// dw1 carries the dword offset relative to the block, dw2.. the values.
bool GraphicsPipe::setShReg(Queue &queue) {
  auto count = rx::getBits(queue.rptr[0], 29, 16);
  auto regOffset = queue.rptr[1];
  auto values = queue.rptr + 2;
  rx::dieIf((regOffset + count) * sizeof(std::uint32_t) > sizeof(sh),
            "out of SH regs, offset: %u, count %u, %s\n", regOffset, count,
            gnm::mmio::registerName(decltype(sh)::kMmioOffset + regOffset));
  auto dst = reinterpret_cast<std::uint32_t *>(&sh) + regOffset;
  std::memcpy(dst, values, sizeof(std::uint32_t) * count);
  return true;
}
// SET_UCONFIG_REG: burst write into the user-config register block.
// dw1 carries the dword offset relative to the block, dw2.. the values.
bool GraphicsPipe::setUConfigReg(Queue &queue) {
  auto count = rx::getBits(queue.rptr[0], 29, 16);
  auto regOffset = queue.rptr[1];
  auto values = queue.rptr + 2;
  rx::dieIf((regOffset + count) * sizeof(std::uint32_t) > sizeof(uConfig),
            "out of UConfig regs, offset: %u, count %u, %s\n", regOffset,
            count,
            gnm::mmio::registerName(decltype(uConfig)::kMmioOffset +
                                    regOffset));
  auto dst = reinterpret_cast<std::uint32_t *>(&uConfig) + regOffset;
  std::memcpy(dst, values, sizeof(std::uint32_t) * count);
  return true;
}
// SET_CONTEXT_REG: burst write into the render-context register block.
// dw1 carries the dword offset relative to the block, dw2.. the values.
bool GraphicsPipe::setContextReg(Queue &queue) {
  auto count = rx::getBits(queue.rptr[0], 29, 16);
  auto regOffset = queue.rptr[1];
  auto values = queue.rptr + 2;
  rx::dieIf((regOffset + count) * sizeof(std::uint32_t) > sizeof(context),
            "out of Context regs, offset: %u, count %u, %s\n", regOffset,
            count,
            gnm::mmio::registerName(decltype(context)::kMmioOffset +
                                    regOffset));
  auto dst = reinterpret_cast<std::uint32_t *>(&context) + regOffset;
  std::memcpy(dst, values, sizeof(std::uint32_t) * count);
  return true;
}
// SET_CE_DE_COUNTERS: initialize both CE and DE counters to one value.
bool GraphicsPipe::setCeDeCounters(Queue &queue) {
  auto counterLo = queue.rptr[1];
  auto counterHi = queue.rptr[2];
  auto counter = counterLo | (static_cast<std::uint64_t>(counterHi) << 32);
  deCounter = counter;
  ceCounter = counter;
  return true;
}
// WAIT_ON_CE_COUNTER (DE side): stall the DE ring until the DE counter
// reaches the requested value; false stalls and retries later.
bool GraphicsPipe::waitOnCeCounter(Queue &queue) {
  auto counterLo = queue.rptr[1];
  auto counterHi = queue.rptr[2];
  auto counter = counterLo | (static_cast<std::uint64_t>(counterHi) << 32);
  return deCounter >= counter;
}
// WAIT_ON_DE_COUNTER_DIFF (CE side): stall the CE ring while it is more
// than waitDiff packets ahead of the DE.
bool GraphicsPipe::waitOnDeCounterDiff(Queue &queue) {
  auto waitDiff = queue.rptr[1];
  auto diff = ceCounter - deCounter;
  return diff < waitDiff;
}
// INCREMENT_CE_COUNTER: bump the constant-engine progress counter.
bool GraphicsPipe::incrementCeCounter(Queue &queue) {
  ceCounter++;
  return true;
}
// INCREMENT_DE_COUNTER: bump the draw-engine progress counter.
bool GraphicsPipe::incrementDeCounter(Queue &queue) {
  deCounter++;
  return true;
}
// LOAD_CONST_RAM: copy numDw dwords from guest memory into CE constant
// RAM at the byte offset given in dw4.
bool GraphicsPipe::loadConstRam(Queue &queue) {
  std::uint32_t srcLo = queue.rptr[1];
  std::uint32_t srcHi = queue.rptr[2];
  std::uint32_t numDw = queue.rptr[3] & ((1 << 15) - 1);
  std::uint32_t dstDw =
      (queue.rptr[4] & ((1 << 16) - 1)) / sizeof(std::uint32_t);
  auto srcAddress = srcLo | (static_cast<std::uint64_t>(srcHi) << 32);
  auto src = RemoteMemory{queue.vmId}.getPointer(srcAddress);
  std::memcpy(constantMemory + dstDw, src, numDw * sizeof(std::uint32_t));
  return true;
}
// WRITE_CONST_RAM: store one immediate dword into CE constant RAM at the
// byte offset given in dw1.
bool GraphicsPipe::writeConstRam(Queue &queue) {
  std::uint32_t dstDw =
      (queue.rptr[1] & ((1 << 16) - 1)) / sizeof(std::uint32_t);
  std::uint32_t value = queue.rptr[2];
  std::memcpy(constantMemory + dstDw, &value, sizeof(std::uint32_t));
  return true;
}
// DUMP_CONST_RAM: copy numDw dwords out of CE constant RAM into guest
// memory at the address given in dw3/dw4.
bool GraphicsPipe::dumpConstRam(Queue &queue) {
  std::uint32_t srcDw =
      (queue.rptr[1] & ((1 << 16) - 1)) / sizeof(std::uint32_t);
  std::uint32_t numDw = queue.rptr[2] & ((1 << 15) - 1);
  std::uint32_t dstLo = queue.rptr[3];
  std::uint32_t dstHi = queue.rptr[4];
  auto dstAddress = dstLo | (static_cast<std::uint64_t>(dstHi) << 32);
  auto dst = RemoteMemory{queue.vmId}.getPointer(dstAddress);
  std::memcpy(dst, constantMemory + srcDw, numDw * sizeof(std::uint32_t));
  return true;
}
// Fallback handler for opcodes without an implementation; always fatal.
bool GraphicsPipe::unknownPacket(Queue &queue) {
  auto op = rx::getBits(queue.rptr[0], 15, 8);
  rx::die("unimplemented gfx pm4 packet: %s, queue %u\n",
          gnm::pm4OpcodeToString(op), queue.indirectLevel);
}

135
rpcsx-gpu2/Pipe.hpp Normal file
View file

@ -0,0 +1,135 @@
#pragma once
#include "Registers.hpp"
#include "Scheduler.hpp"
#include <cstdint>
#include <vulkan/vulkan_core.h>
namespace amdgpu {
class Device;
// One command ring: a dword buffer plus read/write cursors.
struct Queue {
  // Guest address-space id this queue's packets refer to; -1 = unset.
  int vmId = -1;
  // -1 = constant engine, 0.. = draw-engine indirection depth.
  int indirectLevel = -1;
  // Optional doorbell dword used by user-level queues.
  std::uint32_t *doorbell{};
  std::uint32_t *base{};
  // Ring size in dwords.
  std::uint64_t size{};
  std::uint32_t *rptr{};
  std::uint32_t *wptr{};

  // Builds a queue over [base, base + size); wptr starts at the end, so
  // the whole range is treated as pending work (used for indirect buffers).
  static Queue createFromRange(int vmId, std::uint32_t *base,
                               std::uint64_t size, int indirectLevel = 0,
                               std::uint32_t *doorbell = nullptr) {
    Queue result;
    result.vmId = vmId;
    result.indirectLevel = indirectLevel;
    result.doorbell = doorbell;
    result.base = base;
    result.size = size;
    result.rptr = base;
    result.wptr = base + size;
    return result;
  }
};
// One asynchronous compute pipe: 8 user queues dispatched through a
// per-opcode handler table.
struct ComputePipe {
  Device *device;
  Scheduler scheduler;

  // Member-function handler per PM4 opcode; returns false to stall.
  using CommandHandler = bool (ComputePipe::*)(Queue &);
  CommandHandler commandHandlers[255];
  Queue queues[8];
  Registers::ComputeConfig computeConfig;

  ComputePipe(int index);

  bool processAllRings();
  void processRing(Queue &queue);
  void mapQueue(int queueId, Queue queue);
  bool setShReg(Queue &queue);
  bool unknownPacket(Queue &queue);
  bool handleNop(Queue &queue);
};
// The graphics pipe: one constant-engine ring plus three draw-engine rings
// (primary + two indirect levels), with PM4 handlers for each opcode.
struct GraphicsPipe {
  Device *device;
  Scheduler scheduler;

  // CE/DE progress counters used by the CE/DE synchronization packets.
  std::uint64_t ceCounter = 0;
  std::uint64_t deCounter = 0;
  // Base addresses latched by SET_BASE.
  std::uint64_t displayListPatchBase = 0;
  std::uint64_t drawIndexIndirPatchBase = 0;
  std::uint64_t gdsPartitionBases[2]{};
  std::uint64_t cePartitionBases[2]{};
  // Index buffer state latched by INDEX_BASE / INDEX_BUFFER_SIZE.
  std::uint64_t vgtIndexBase = 0;
  std::uint32_t vgtIndexBufferSize = 0;
  // 48 KiB of constant-engine RAM.
  std::uint32_t constantMemory[(48 * 1024) / sizeof(std::uint32_t)]{};

  // Register blocks addressed by getMmRegister / SET_*_REG packets.
  Registers::ShaderConfig sh;
  Registers::Context context;
  Registers::UConfig uConfig;

  Queue deQueues[3];
  Queue ceQueue;

  // Member-function handler per PM4 opcode; returns false to stall.
  // Indexed [cp][op] where cp selects the CE/DE handler table.
  using CommandHandler = bool (GraphicsPipe::*)(Queue &);
  CommandHandler commandHandlers[3][255];

  GraphicsPipe(int index);

  void setCeQueue(Queue queue);
  void setDeQueue(Queue queue, int ring);

  bool processAllRings();
  void processRing(Queue &queue);

  bool drawPreamble(Queue &queue);
  bool indexBufferSize(Queue &queue);
  bool handleNop(Queue &queue);
  bool contextControl(Queue &queue);
  bool acquireMem(Queue &queue);
  bool releaseMem(Queue &queue);
  bool dispatchDirect(Queue &queue);
  bool dispatchIndirect(Queue &queue);
  bool writeData(Queue &queue);
  bool memSemaphore(Queue &queue);
  bool waitRegMem(Queue &queue);
  bool indirectBuffer(Queue &queue);
  bool condWrite(Queue &queue);
  bool eventWrite(Queue &queue);
  bool eventWriteEop(Queue &queue);
  bool eventWriteEos(Queue &queue);
  bool dmaData(Queue &queue);
  bool setBase(Queue &queue);
  bool clearState(Queue &queue);
  bool setPredication(Queue &queue);
  bool drawIndirect(Queue &queue);
  bool drawIndexIndirect(Queue &queue);
  bool indexBase(Queue &queue);
  bool drawIndex2(Queue &queue);
  bool indexType(Queue &queue);
  bool drawIndexAuto(Queue &queue);
  bool numInstances(Queue &queue);
  bool drawIndexMultiAuto(Queue &queue);
  bool drawIndexOffset2(Queue &queue);
  bool pfpSyncMe(Queue &queue);
  bool setCeDeCounters(Queue &queue);
  bool waitOnCeCounter(Queue &queue);
  bool waitOnDeCounterDiff(Queue &queue);
  bool incrementCeCounter(Queue &queue);
  bool incrementDeCounter(Queue &queue);
  bool loadConstRam(Queue &queue);
  bool writeConstRam(Queue &queue);
  bool dumpConstRam(Queue &queue);
  bool setConfigReg(Queue &queue);
  bool setShReg(Queue &queue);
  bool setUConfigReg(Queue &queue);
  bool setContextReg(Queue &queue);
  bool unknownPacket(Queue &queue);

  std::uint32_t *getMmRegister(std::uint32_t dwAddress);
};
} // namespace amdgpu

52
rpcsx-gpu2/Registers.cpp Normal file
View file

@ -0,0 +1,52 @@
#include "Registers.hpp"
// Hardware reset values for the context register block, applied by
// CLEAR_STATE. NOTE(review): the magic numbers (incl. the unk_* fields)
// look like values captured from real hardware — confirm against a
// register dump before changing any of them.
amdgpu::Registers::Context amdgpu::Registers::Context::Default = [] {
  amdgpu::Registers::Context result{};
  // Scissors default to a fully open 16384x16384 window.
  result.paScScreenScissor.bottom = 0x4000;
  result.paScScreenScissor.right = 0x4000;
  result.paScWindowScissor.top = 0x8000;
  result.paScWindowScissor.bottom = 0x4000;
  result.paScWindowScissor.right = 0x4000;
  for (auto &clipRect : result.paScClipRect) {
    clipRect.bottom = 0x4000;
    clipRect.right = 0x4000;
  }
  result.unk_8c = 0xaa99aaaa;
  result.paScGenericScissor.top = 0x8000;
  result.paScGenericScissor.bottom = 0x4000;
  result.paScGenericScissor.right = 0x4000;
  for (auto &vportScissor : result.paScVportScissor) {
    vportScissor.top = 0x8000;
    vportScissor.bottom = 0x4000;
    vportScissor.right = 0x4000;
  }
  // Viewport depth range defaults to [0, 1].
  for (auto &vportZ : result.paScVportZ) {
    vportZ.min = 0.0f;
    vportZ.max = 1.0f;
  }
  result.unk_d4 = 0x2a00161a;
  result.spiPsInControl = 2;
  result.paClClipCntl = 0x0009'0000;
  result.paSuScModeCntl.polyMode = gnm::PolyMode::Dual;
  result.vgtGsPerEs = 256;
  result.vgtEsPerGs = 128;
  result.vgtGsPerVs = 2;
  result.iaMultiVgtParam = 0xff;
  result.unk_2f7 = 0x00001000;
  result.paSuVtxCntl.pixCenterHalf = true;
  result.paSuVtxCntl.roundMode = gnm::RoundMode::RoundToEven;
  // Guard-band clip/discard adjust default to 1.0 (no guard band).
  result.paClGbVertClipAdj = 1.0f;
  result.paClGbVertDiscAdj = 1.0f;
  result.paClGbHorzClipAdj = 1.0f;
  result.paClGbHorzDiscAdj = 1.0f;
  result.unk_316 = 0xe;
  result.vgtOutDeallocCntl = 0x10;
  return result;
}();

931
rpcsx-gpu2/Registers.hpp Normal file
View file

@ -0,0 +1,931 @@
#pragma once
#include "amdgpu/tiler.hpp"
#include "gnm/constants.hpp"
#include <array>
#include <cstdint>
#include <type_traits>
namespace amdgpu {
// Command-processor micro-engines: micro engine, prefetch parser,
// constant engine.
enum class Engine {
  ME,
  PFP,
  CE,
};
// EVENT_WRITE event-index values.
// NOTE(review): SAMAPE_PIPELINE_STAT is a typo for SAMPLE_PIPELINE_STAT;
// kept as-is because other translation units may already reference it.
enum class EventIndex {
  OTHER,
  ZPASS_DONE,
  SAMAPE_PIPELINE_STAT,
  SAMPLE_STREAM_OUT_STATS,
  CS_VS_PS_PARTIAL_FLUSH,
  ANY_EOP_TIMESTAMP,
  CS_PS_EOS,
};
// Access direction reported in a VM protection fault status register.
enum class ProtectionFaultAccess : std::uint32_t {
  Read = 0,
  Write = 1,
};
namespace detail {
// Everything below is packed so register structs mirror the MMIO layout
// dword-for-dword. NOTE(review): the matching #pragma pack(pop) must
// appear after the last register struct — verify it exists further down.
#pragma pack(push, 1)
// Occupies exactly Count dwords; used to place a Register at its offset.
template <std::size_t Count> struct Padding {
private:
  std::uint32_t _[Count];
};
} // namespace detail
// A register placed Offset dwords from the start of its block; for struct
// payloads it inherits ImplT so fields are accessed directly.
template <std::size_t Offset, typename ImplT = std::uint32_t>
struct Register : detail::Padding<Offset>, ImplT {
  Register() = default;
  Register(const Register &) = default;
  Register &operator=(const Register &) = default;
  Register &operator=(const ImplT &newValue) {
    *static_cast<ImplT *>(this) = newValue;
    return *this;
  }
};
// Specialization for scalar payloads (ints, floats, enums): stores the
// value directly and converts implicitly on read.
template <std::size_t Offset, typename ImplT>
  requires(std::is_integral_v<ImplT> || std::is_floating_point_v<ImplT> ||
           std::is_enum_v<ImplT>)
struct Register<Offset, ImplT> : detail::Padding<Offset> {
  ImplT value;

  Register() = default;
  Register(const Register &) = default;
  Register &operator=(const Register &) = default;
  Register &operator=(ImplT newValue) {
    value = newValue;
    return *this;
  }
  // FIX: const-qualified so the register can be read through a const
  // register block (the non-const operator made const reads ill-formed).
  operator ImplT() const { return value; }
};
// CB_COLOR*_ATTRIB: render-target tiling and multisample attributes.
struct CbColorAttrib {
  union {
    struct {
      std::uint32_t tileModeIndex : 5;
      std::uint32_t fmaskTileModeIndex : 4;
      std::uint32_t : 3;
      std::uint32_t numSamples : 3;   // log2 of the sample count
      std::uint32_t numFragments : 2; // log2 of the fragment count
      std::uint32_t forceDstAlpha1 : 1;
    };
    std::uint32_t raw;
  };
};
// CB_COLOR*_VIEW: slice range of the render target that is rendered to.
struct CbColorView {
  union {
    struct {
      std::uint32_t sliceStart : 11;
      std::uint32_t : 2;
      std::uint32_t sliceMax : 11;
    };
    std::uint32_t raw;
  };
};
// CB_COLOR_CONTROL: global color-buffer mode and raster logic op.
struct CbColorControl {
  union {
    struct {
      std::uint32_t : 3;
      std::uint32_t degammaEnable : 1;
      gnm::CbMode mode : 3;
      std::uint32_t : 9;
      std::uint32_t rop3 : 8; // raster operation code
    };
    std::uint32_t raw;
  };
};
// CB_SHADER_MASK: per-MRT 4-bit component enables for shader exports.
struct CbShaderMask {
  union {
    struct {
      std::uint32_t output0Enable : 4;
      std::uint32_t output1Enable : 4;
      std::uint32_t output2Enable : 4;
      std::uint32_t output3Enable : 4;
      std::uint32_t output4Enable : 4;
      std::uint32_t output5Enable : 4;
      std::uint32_t output6Enable : 4;
      std::uint32_t output7Enable : 4;
    };
    std::uint32_t raw;
  };
};
// CB_TARGET_MASK: per-render-target 4-bit component write masks.
struct CbTargetMask {
  union {
    struct {
      std::uint32_t target0Enable : 4;
      std::uint32_t target1Enable : 4;
      std::uint32_t target2Enable : 4;
      std::uint32_t target3Enable : 4;
      std::uint32_t target4Enable : 4;
      std::uint32_t target5Enable : 4;
      std::uint32_t target6Enable : 4;
      std::uint32_t target7Enable : 4;
    };
    std::uint32_t raw;
  };
};
// COMP_SWAP: component swizzle applied to color-buffer stores.
enum class CbCompSwap : std::uint32_t {
  Std,
  Alt,
  StdRev,
  AltRev,
};
// CB_COLOR*_INFO: render-target format, swizzle and compression controls.
struct CbColorInfo {
  union {
    struct {
      std::uint32_t endian : 2;
      gnm::DataFormat dfmt : 5;
      std::uint32_t linearGeneral : 1;
      gnm::NumericFormat nfmt : 3;
      CbCompSwap compSwap : 2;
      std::uint32_t fastClear : 1;
      std::uint32_t compression : 1;
      std::uint32_t blendClamp : 1;
      std::uint32_t blendBypass : 1;
      std::uint32_t simpleFloat : 1;
      std::uint32_t roundMode : 1;
      std::uint32_t cmaskIsLinear : 1;
      std::uint32_t blendOptDontRdDst : 3;
      std::uint32_t blendOptDiscardPixel : 3;
    };
    std::uint32_t raw;
  };
};
// One render target's CB_COLOR* register group, in MMIO order.
// NOTE(review): cmaskSlice is declared as a lone 14-bit bitfield; it still
// occupies a full dword, but confirm this matches the register layout.
struct CbColor {
  std::uint32_t base; // address >> 8
  std::uint32_t pitch;
  std::uint32_t slice;
  CbColorView view;
  CbColorInfo info;
  CbColorAttrib attrib;
  std::uint32_t dccBase;
  std::uint32_t cmask;
  std::uint32_t cmaskSlice : 14;
  std::uint32_t fmask;
  std::uint32_t fmaskSlice;
  std::uint32_t clearWord0;
  std::uint32_t clearWord1;
  std::uint32_t clearWord2;
  std::uint32_t clearWord3;
};
// PA_CL_VPORT_*: viewport transform scale/offset per axis.
struct PaClVport {
  float xScale;
  float xOffset;
  float yScale;
  float yOffset;
  float zScale;
  float zOffset;
};
// PA_SC_VPORT_Z*: viewport depth range.
struct PaScVportZ {
  float min;
  float max;
};
// PA_SC_* rectangle: two packed 16-bit corner pairs (TL then BR dword).
struct PaScRect {
  std::uint16_t left;
  std::uint16_t top;
  std::uint16_t right;
  std::uint16_t bottom;
};
// SPI_SHADER_PGM_* group for one shader stage: program address, the two
// resource dwords (with per-stage overlay fields) and 16 user SGPR slots.
struct SpiShaderPgm {
  std::uint32_t rsrc3;
  std::uint64_t address; // shader code address >> 8
  union {
    struct {
      std::uint32_t vgprs : 6; // allocated VGPRs / 4 - 1
      std::uint32_t sgprs : 4; // allocated SGPRs / 8 - 1
      std::uint32_t priority : 2;
      std::uint32_t floatMode : 8;
      std::uint32_t priv : 1;
      std::uint32_t dx10Clamp : 1;
      std::uint32_t debugMode : 1;
      std::uint32_t ieeeMode : 1;
    };
    // Per-stage overlays for the bits above 24.
    struct {
      std::uint32_t : 24;
      std::uint32_t cuGroupEnable : 1;
    } es;
    struct {
      std::uint32_t : 24;
      std::uint32_t cuGroupEnable : 1;
    } gs;
    struct {
      std::uint32_t : 24;
      std::uint32_t vgprCompCnt : 2;
    } ls;
    struct {
      std::uint32_t : 24;
      std::uint32_t cuGroupDisable : 1;
    } ps;
    struct {
      std::uint32_t : 24;
      std::uint32_t vgprCompCnt : 2;
      std::uint32_t cuGroupEnable : 1;
    } vs;

    // Decoded allocation sizes.
    std::uint8_t getVGprCount() const { return (vgprs + 1) * 4; }
    std::uint8_t getSGprCount() const { return (sgprs + 1) * 8; }

    std::uint32_t raw;
  } rsrc1;
  union {
    struct {
      std::uint32_t scratchEn : 1;
      std::uint32_t userSgpr : 5; // number of user SGPRs loaded by SPI
      std::uint32_t trapPresent : 1;
    };
    // Per-stage overlays for the bits above 7.
    struct {
      std::uint32_t : 7;
      std::uint32_t ocLdsEn : 1;
      std::uint32_t soBase0En : 1;
      std::uint32_t soBase1En : 1;
      std::uint32_t soBase2En : 1;
      std::uint32_t soBase3En : 1;
      std::uint32_t soEn : 1;
      std::uint32_t excpEn : 7;
    } vs;
    struct {
      std::uint32_t : 7;
      std::uint32_t ocLdsEn : 1;
      std::uint32_t excpEn : 7;
    } es;
    struct {
      std::uint32_t : 7;
      std::uint32_t excpEn : 7;
    } gs;
    struct {
      std::uint32_t : 7;
      std::uint32_t ocLdsEn : 1;
      std::uint32_t tgSizeEn : 1;
      std::uint32_t excpEn : 7;
    } hs;
    struct {
      std::uint32_t : 7;
      std::uint32_t ldsSize : 9;
      std::uint32_t excpEn : 7;
    } ls;
    std::uint32_t raw;
  } rsrc2;
  std::array<std::uint32_t, 16> userData;
};
// VM_CONTEXT*_PROTECTION_FAULT_STATUS decode.
struct VmProtectionFault {
  std::uint32_t protection : 8;
  std::uint32_t : 4;
  std::uint32_t client : 8; // memory client that faulted
  std::uint32_t : 4;
  ProtectionFaultAccess rw : 1;
  std::uint32_t vmid : 4;
  std::uint32_t : 3;
};
// LS_EN field of VGT_SHADER_STAGES_EN: role of the LS hardware stage.
enum class LsStage : std::uint32_t {
  LsOff,
  LsOn,
  CsOn,
};
// ES_EN field of VGT_SHADER_STAGES_EN: role of the ES hardware stage.
enum class EsStage : std::uint32_t {
  EsOff,
  EsDs,
  EsReal,
};
// VS_EN field of VGT_SHADER_STAGES_EN: role of the VS hardware stage.
enum class VsStage : std::uint32_t {
  VsReal,
  VsDs,
  VsCopy,
};
// VGT_SHADER_STAGES_EN: selects which hardware shader stages are active
// and what role each one plays in the current pipeline configuration.
struct VgtShaderStagesEn {
  union {
    struct {
      LsStage lsEn : 2;
      bool hsEn : 1;
      EsStage esEn : 2;
      bool gsEn : 1;
      VsStage vsEn : 2;
      bool dynamicHs : 1;
    };
    std::uint32_t raw;
  };
};
// Framebuffer location info; `base` stores the address shifted right by 24.
struct FbInfo {
  std::uint16_t base; // address >> 24
  std::uint16_t unk;  // NOTE(review): meaning unknown — not yet decoded
};
// DB_DEPTH_CONTROL: depth/stencil test enables and compare functions.
struct DbDepthControl {
  union {
    struct {
      bool stencilEnable : 1;
      bool depthEnable : 1;
      bool depthWriteEnable : 1;
      bool depthBoundsEnable : 1;
      gnm::CompareFunc zFunc : 3;
      bool backFaceEnable : 1; // use separate back-face stencil state
      gnm::CompareFunc stencilFunc : 3;
      std::uint32_t : 9;
      gnm::CompareFunc stencilFuncBackFace : 3;
      std::uint32_t : 7;
      bool enableColorWritesOnDepthFail : 1;
      bool disableColorWritesOnDepthPass : 1;
    };
    std::uint32_t raw;
  };
};
// DB_Z_INFO: depth-surface format, sampling and tiling parameters.
struct DbZInfo {
  union {
    struct {
      gnm::ZFormat format : 2;
      std::uint32_t numSamples : 2;
      std::uint32_t : 16;
      std::uint32_t tileModeIndex : 3;
      std::uint32_t : 4;
      bool allowExpClear : 1;
      std::uint32_t readSize : 1; // 0 - 256 bit, 1 - 512 bit
      bool tileSurfaceEnable : 1;
      std::uint32_t : 1;
      bool zRangePrecision : 1;
    };
    std::uint32_t raw;
  };
};
// DB_RENDER_CONTROL: fast-clear, copy and compression controls for the
// depth/stencil block.
struct DbRenderControl {
  union {
    struct {
      bool depthClearEnable : 1;
      bool stencilClearEnable : 1;
      bool depthCopy : 1;
      bool stencilCopy : 1;
      bool resummarizeEnable : 1;
      bool stencilCompressDisable : 1;
      bool depthCompressDisable : 1;
      bool copyCentroid : 1;
      std::uint32_t copySample : 4;
    };
    std::uint32_t raw;
  };
};
// CB_BLEND*_CONTROL: per-render-target blend equation (separate color and
// alpha source/destination multipliers plus combine functions).
struct CbBlendControl {
  union {
    struct {
      gnm::BlendMultiplier colorSrcBlend : 5;
      gnm::BlendFunc colorCombFcn : 3;
      gnm::BlendMultiplier colorDstBlend : 5;
      std::uint32_t : 3;
      gnm::BlendMultiplier alphaSrcBlend : 5;
      gnm::BlendFunc alphaCombFcn : 3;
      gnm::BlendMultiplier alphaDstBlend : 5;
      bool separateAlphaBlend : 1; // use the alpha* fields instead of color*
      bool enable : 1;
      bool disableRop3 : 1;
    };
    std::uint32_t raw;
  };
};
// PA_SU_SC_MODE_CNTL: rasterizer state — culling, polygon mode and
// polygon-offset enables.
struct PaSuScModeCntl {
  union {
    struct {
      bool cullFront : 1;
      bool cullBack : 1;
      gnm::Face face : 1; // front-face winding
      gnm::PolyMode polyMode : 2;
      gnm::PolyModePtype polyModeFrontPtype : 3;
      gnm::PolyModePtype polyModeBackPtype : 3;
      bool polyOffsetFrontEnable : 1;
      bool polyOffsetBackEnable : 1;
      bool polyOffsetParaEnable : 1;
      std::uint32_t : 2;
      bool vtxWindowOffsetEnable : 1;
      std::uint32_t : 2;
      bool provokingVtxLast : 1;
      bool perspCorrDis : 1;
      bool multiPrimIbEna : 1;
    };
    std::uint32_t raw;
  };
};
// PA_SU_VTX_CNTL: vertex position quantization and rounding behavior.
struct PaSuVtxCntl {
  union {
    struct {
      bool pixCenterHalf : 1; // pixel center at .5 rather than .0
      gnm::RoundMode roundMode : 2;
      gnm::QuantMode quantMode : 3;
    };
    std::uint32_t raw;
  };
};
// SPI_PS_INPUT_ENA / SPI_PS_INPUT_ADDR: which interpolants and system
// values the pixel shader receives in its input VGPRs.
struct SpiPsInput {
  union {
    struct {
      bool perspSampleEna : 1;
      bool perspCenterEna : 1;
      bool perspCentroidEna : 1;
      bool perspPullModelEna : 1;
      bool linearSampleEna : 1;
      bool linearCenterEna : 1;
      bool linearCentroidEna : 1;
      bool lineStippleTexEna : 1;
      bool posXFloatEna : 1;
      bool posYFloatEna : 1;
      bool posZFloatEna : 1;
      bool posWFloatEna : 1;
      bool frontFaceEna : 1;
      bool ancillaryEna : 1;
      bool sampleCoverageEna : 1;
      bool posFixedPtEna : 1;
    };
    std::uint32_t raw;
  };
};
// Default value substituted for a PS input when useDefaultVal is set
// (encodes the constant written to x/y/z/w).
enum class SpiPsDefaultVal : std::uint8_t {
  X0_Y0_Z0_W0,
  X0_Y0_Z0_W1,
  X1_Y1_Z1_W0,
  X1_Y1_Z1_W1,
};
// SPI_PS_INPUT_CNTL_n: per-attribute interpolation control for PS inputs.
struct SpiPsInputCntl {
  union {
    struct {
      std::uint32_t offset : 4; // parameter-cache offset of this attribute
      bool useDefaultVal : 1;
      std::uint32_t : 3;
      SpiPsDefaultVal defaultVal : 2;
      bool flatShade : 1;
      std::uint32_t : 2;
      std::uint32_t cylWrap : 4;
      bool ptSpriteTex : 1; // replace attribute with point-sprite coords
    };
    std::uint32_t raw;
  };
};
// Full MMIO register file of the emulated GPU. Each Register<offset, T>
// member overlays `raw` at the given dword offset; the nested structs group
// registers by hardware block (each with its own kMmioOffset base).
// Offsets are dword indices into `raw`, not byte addresses.
struct Registers {
  static constexpr auto kRegisterCount = 0xf000; // total dwords in the file
  // Global configuration block (tile modes, ring sizes, resource words).
  struct Config {
    static constexpr auto kMmioOffset = 0x2000;
    Register<0xad, std::array<std::uint32_t, 3>> cpPrtLodStatsCntls;
    Register<0x1c0> cpRbRptr;
    Register<0x1bf> cpRb1Rptr;
    Register<0x1be> cpRb2Rptr;
    Register<0x232> vgtEsGsRingSize;
    Register<0x233> vgtGsVsRingSize;
    Register<0x262> vgtTfRingSize;
    Register<0x26e> vgtTfMemoryBase;
    Register<0x3c0, std::array<std::uint32_t, 4>> sqBufRsrcWords;
    Register<0x3c4, std::array<std::uint32_t, 7>> sqImgRsrcWords;
    Register<0x3cc, std::array<std::uint32_t, 4>> sqImgSampWords;
    Register<0x644, std::array<TileMode, 32>> gbTileModes;
    Register<0x664, std::array<MacroTileMode, 16>> gbMacroTileModes;
  };
  // Compute-dispatch state; laid out as a flat struct (with explicit
  // padding) because the registers are contiguous in MMIO space.
  struct ComputeConfig {
    static constexpr auto kMmioOffset = 0x2e00;
    std::uint32_t computeDispatchInitiator;
    std::uint32_t _pad0[6];
    std::uint32_t computeNumThreadX;
    std::uint32_t computeNumThreadY;
    std::uint32_t computeNumThreadZ;
    std::uint32_t _pad1[2];
    std::uint32_t computePgmLo;
    std::uint32_t computePgmHi;
    std::uint32_t _pad2[4];
    std::uint32_t computePgmRsrc1;
    std::uint32_t computePgmRsrc2;
    std::uint32_t _pad3[1];
    std::uint32_t computeResourceLimits;
    std::uint32_t computeStaticThreadMgmtSe0;
    std::uint32_t computeStaticThreadMgmtSe1;
    std::uint32_t computeTmpRingSize;
    std::uint32_t _pad4[39];
    std::array<std::uint32_t, 16> userData;
  };
  // Per-stage shader program registers (SPI_SHADER_PGM_*) plus compute.
  struct ShaderConfig {
    static constexpr auto kMmioOffset = 0x2c00;
    union {
      Register<0x7, SpiShaderPgm> spiShaderPgmPs;
      Register<0x47, SpiShaderPgm> spiShaderPgmVs;
      Register<0x87, SpiShaderPgm> spiShaderPgmGs;
      Register<0xc7, SpiShaderPgm> spiShaderPgmEs;
      Register<0x107, SpiShaderPgm> spiShaderPgmHs;
      Register<0x147, SpiShaderPgm> spiShaderPgmLs;
      Register<0x200, ComputeConfig> compute;
    };
  };
  // Draw-context registers (depth, blend, scissor, VGT draw state, ...).
  struct Context {
    static constexpr auto kMmioOffset = 0xa000;
    static Context Default; // hardware reset values
    union {
      Register<0x0, DbRenderControl> dbRenderControl;
      Register<0x1> dbCountControl;
      Register<0x2> dbDepthView;
      Register<0x3> dbRenderOverride;
      Register<0x4> dbRenderOverride2;
      Register<0x5> dbHTileDataBase;
      Register<0x8, float> dbDepthBoundsMin;
      Register<0x9, float> dbDepthBoundsMax;
      Register<0xa> dbStencilClear;
      Register<0xb, float> dbDepthClear;
      Register<0xc, PaScRect> paScScreenScissor;
      Register<0xf> dbDepthInfo;
      Register<0x10, DbZInfo> dbZInfo;
      Register<0x11> dbStencilInfo;
      Register<0x12> dbZReadBase;
      Register<0x13> dbStencilReadBase;
      Register<0x14> dbZWriteBase;
      Register<0x15> dbStencilWriteBase;
      Register<0x16> dbDepthSize;
      Register<0x17> dbDepthSlice;
      Register<0x20> taBcBaseAddr;
      Register<0x80> paScWindowOffset;
      Register<0x81, PaScRect> paScWindowScissor;
      Register<0x83> paScClipRectRule;
      Register<0x84, std::array<PaScRect, 4>> paScClipRect;
      Register<0x8c> unk_8c;
      Register<0x8d> paSuHardwareScreenOffset;
      Register<0x8e, CbTargetMask> cbTargetMask;
      Register<0x8f, CbShaderMask> cbShaderMask;
      Register<0x90, PaScRect> paScGenericScissor;
      Register<0x94, std::array<PaScRect, 16>> paScVportScissor;
      Register<0xb4, std::array<PaScVportZ, 16>> paScVportZ;
      Register<0xd4> unk_d4;
      Register<0xd8> cpPerfMonCntxCntl;
      Register<0x100> vgtMaxVtxIndx;
      Register<0x101> vgtMinVtxIndx;
      Register<0x102> vgtIndxOffset;
      Register<0x103> vgtMultiPrimIbResetIndx;
      Register<0x105, float> cbBlendRed;
      Register<0x106, float> cbBlendGreen;
      Register<0x107, float> cbBlendBlue;
      Register<0x108, float> cbBlendAlpha;
      Register<0x10b> dbStencilControl;
      Register<0x10c> dbStencilRefMask;
      Register<0x10d> dbStencilRefMaskBf;
      Register<0x10f, std::array<PaClVport, 16>> paClVports;
      Register<0x16f> paClUcp0X;
      Register<0x170> paClUcp0Y;
      Register<0x171> paClUcp0Z;
      Register<0x172> paClUcp0W;
      Register<0x191, std::array<SpiPsInputCntl, 32>> spiPsInputCntl;
      Register<0x1b1> spiVsOutConfig;
      Register<0x1b3, SpiPsInput> spiPsInputEna;
      Register<0x1b4, SpiPsInput> spiPsInputAddr;
      Register<0x1b6> spiPsInControl;
      Register<0x1b8> spiBarycCntl;
      Register<0x1ba> spiTmpRingSize;
      Register<0x1c3> spiShaderPosFormat;
      Register<0x1c4> spiShaderZFormat;
      Register<0x1c5> spiShaderColFormat;
      Register<0x1e0, std::array<CbBlendControl, 8>> cbBlendControl;
      Register<0x1f9> vgtDmaBaseHi;
      Register<0x1fa> vgtDmaBase;
      Register<0x1fc> vgtDrawInitiator;
      Register<0x1fd> vgtImmedData;
      Register<0x200, DbDepthControl> dbDepthControl;
      Register<0x201> dbEqaa;
      Register<0x202, CbColorControl> cbColorControl;
      Register<0x203> dbShaderControl;
      Register<0x204> paClClipCntl;
      Register<0x205, PaSuScModeCntl> paSuScModeCntl;
      Register<0x206> paClVteCntl;
      Register<0x207> paClVsOutCntl;
      Register<0x280> paSuPointSize;
      Register<0x281> paSuPointMinmax;
      Register<0x282> paSuLineCntl;
      Register<0x284> vgtOutputPathCntl;
      Register<0x286> vgtHosMaxTessLevel;
      Register<0x287> vgtHosMinTessLevel;
      Register<0x290> vgtGsMode;
      Register<0x291> vgtGsOnChipCntl;
      Register<0x292> paScModeCntl0;
      Register<0x293> paScModeCntl1;
      Register<0x295> vgtGsPerEs;
      Register<0x296> vgtEsPerGs;
      Register<0x297> vgtGsPerVs;
      Register<0x298, std::array<std::uint32_t, 3>> vgtGsVsRingOffsets;
      Register<0x29b> vgtGsOutPrimType;
      Register<0x29d> vgtDmaSize;
      Register<0x29e> vgtDmaMaxSize;
      Register<0x29f> vgtDmaIndexType;
      Register<0x2a1> vgtPrimitiveIdEn;
      Register<0x2a2> vgtDmaNumInstances;
      Register<0x2a4> vgtEventInitiator;
      Register<0x2a5> vgtMultiPrimIbResetEn;
      Register<0x2a8> vgtInstanceStepRate0;
      Register<0x2a9> vgtInstanceStepRate1;
      Register<0x2aa> iaMultiVgtParam;
      Register<0x2ab> vgtEsGsRingItemSize;
      Register<0x2ac> vgtGsVsRingItemSize;
      Register<0x2ad> vgtReuseOff;
      Register<0x2ae> vgtVtxCntEn;
      Register<0x2af> dbHTileSurface;
      Register<0x2b0> dbSResultsCompareState0;
      Register<0x2b1> dbSResultsCompareState1;
      Register<0x2b4> vgtStrmOutBufferSize0;
      Register<0x2b5> vgtStrmOutVtxStride0;
      Register<0x2b8> vgtStrmOutBufferSize1;
      Register<0x2b9> vgtStrmOutVtxStride1;
      Register<0x2bc> vgtStrmOutBufferSize2;
      Register<0x2bd> vgtStrmOutVtxStride2;
      Register<0x2c0> vgtStrmOutBufferSize3;
      Register<0x2c1> vgtStrmOutVtxStride3;
      Register<0x2ca> vgtStrmOutDrawOpaqueOffset;
      Register<0x2cb> vgtStrmOutDrawOpaqueBufferFilledSize;
      Register<0x2cc> vgtStrmOutDrawOpaqueVertexStride;
      Register<0x2ce> vgtGsMaxVertOut;
      Register<0x2d5, VgtShaderStagesEn> vgtShaderStagesEn;
      Register<0x2d6> vgtLsHsConfig;
      Register<0x2d7, std::array<std::uint32_t, 4>> vgtGsVertItemSizes;
      Register<0x2db> vgtTfParam;
      Register<0x2dc> dbAlphaToMask;
      Register<0x2dd> vgtDispatchDrawIndex;
      Register<0x2de> paSuPolyOffsetDbFmtCntl;
      Register<0x2df> paSuPolyOffsetClamp;
      Register<0x2e0> paSuPolyOffsetFrontScale;
      Register<0x2e1> paSuPolyOffsetFrontOffset;
      Register<0x2e2> paSuPolyOffsetBackScale;
      Register<0x2e3> paSuPolyOffsetBackOffset;
      Register<0x2e4> vgtGsInstanceCnt;
      Register<0x2e5> vgtStrmOutConfig;
      Register<0x2e6> vgtStrmOutBufferConfig;
      Register<0x2f5> paScCentroidPriority0;
      Register<0x2f6> paScCentroidPriority1;
      Register<0x2f7> unk_2f7;
      Register<0x2f8> paScAaConfig;
      Register<0x2f9, PaSuVtxCntl> paSuVtxCntl;
      Register<0x2fa, float> paClGbVertClipAdj;
      Register<0x2fb, float> paClGbVertDiscAdj;
      Register<0x2fc, float> paClGbHorzClipAdj;
      Register<0x2fd, float> paClGbHorzDiscAdj;
      Register<0x2fe, std::array<std::uint32_t, 4>> paScAaSampleLocsPixelX0Y0;
      Register<0x302, std::array<std::uint32_t, 4>> paScAaSampleLocsPixelX1Y0;
      Register<0x306, std::array<std::uint32_t, 4>> paScAaSampleLocsPixelX0Y1;
      Register<0x30a, std::array<std::uint32_t, 4>> paScAaSampleLocsPixelX1Y1;
      Register<0x30e> paScAaMaskX0Y0_X1Y0;
      Register<0x30f> paScAaMaskX0Y1_X1Y1;
      Register<0x316> unk_316;
      Register<0x317> vgtOutDeallocCntl;
      Register<0x318, std::array<CbColor, 8>> cbColor;
    };
  };
  // User-config registers (primitive/index type, coherency, GDS OA).
  struct UConfig {
    static constexpr auto kMmioOffset = 0xc000;
    union {
      Register<0x3f> cpStrmOutCntl;
      Register<0x79> cpCoherBaseHi;
      Register<0x7d> cpCoherSize;
      Register<0x7e> cpCoherBase;
      Register<0x8b> cpDmaReadTags;
      Register<0x8c> cpCoherSizeHi;
      Register<0x200> grbmGfxIndex;
      Register<0x242, gnm::PrimitiveType> vgtPrimitiveType;
      Register<0x243, gnm::IndexType> vgtIndexType;
      Register<0x24c> vgtNumIndices;
      Register<0x24d> vgtNumInstances;
      Register<0x340, std::array<std::uint32_t, 4>> sqThreadTraceUserdata;
      Register<0x41d> gdsOaCntl;
      Register<0x41e> gdsOaCounter;
      Register<0x41f> gdsOaAddress;
    };
  };
  // Performance-counter registers and their select controls.
  struct Counters {
    static constexpr auto kMmioOffset = 0xd000;
    union {
      Register<0x0, std::uint64_t> cpgPerfCounter1;
      Register<0x2, std::uint64_t> cpgPerfCounter0;
      Register<0x4, std::uint64_t> cpcPerfCounter1;
      Register<0x6, std::uint64_t> cpcPerfCounter0;
      Register<0x8, std::uint64_t> cpfPerfCounter1;
      Register<0xa, std::uint64_t> cpfPerfCounter0;
      Register<0x80, std::array<std::uint64_t, 4>> wdPerfCounters;
      Register<0x88, std::array<std::uint64_t, 4>> iaPerfCounters;
      Register<0x90, std::array<std::uint64_t, 4>> vgtPerfCounters;
      Register<0x100, std::array<std::uint64_t, 4>> paSuPerfCounters;
      Register<0x140, std::array<std::uint64_t, 8>> paScPerfCounters;
      Register<0x180> spiPerfCounter0Hi;
      Register<0x181> spiPerfCounter0Lo;
      Register<0x182> spiPerfCounter1Hi;
      Register<0x183> spiPerfCounter1Lo;
      Register<0x184> spiPerfCounter2Hi;
      Register<0x185> spiPerfCounter2Lo;
      Register<0x186> spiPerfCounter3Hi;
      Register<0x187> spiPerfCounter3Lo;
      Register<0x188> spiPerfCounter4Hi;
      Register<0x189> spiPerfCounter4Lo;
      Register<0x18a> spiPerfCounter5Hi;
      Register<0x18b> spiPerfCounter5Lo;
      Register<0x1c0, std::array<std::uint64_t, 16>> sqPerfCounters;
      Register<0x240, std::array<std::uint64_t, 4>> sxPerfCounters;
      Register<0x280, std::array<std::uint64_t, 4>> gdsPerfCounters;
      Register<0x2c0, std::array<std::uint64_t, 2>> taPerfCounters;
      Register<0x300, std::array<std::uint64_t, 2>> tdPerfCounters;
      Register<0x340, std::array<std::uint64_t, 4>> tcpPerfCounters;
      Register<0x380, std::array<std::uint64_t, 4>> tccPerfCounters;
      Register<0x390, std::array<std::uint64_t, 4>> tcaPerfCounters;
      Register<0x3a0, std::array<std::uint64_t, 4>> tcsPerfCounters;
      Register<0x406, std::array<std::uint64_t, 4>> cbPerfCounters;
      Register<0x440, std::array<std::uint64_t, 4>> dbPerfCounters;
      Register<0x800> cpgPerfCounter1Select;
      Register<0x801> cpgPerfCounter0Select1;
      Register<0x802> cpgPerfCounter0Select;
      Register<0x803> cpcPerfCounter1Select;
      Register<0x804> cpcPerfCounter0Select1;
      Register<0x805> cpfPerfCounter1Select;
      Register<0x806> cpfPerfCounter0Select1;
      Register<0x807> cpfPerfCounter0Select;
      Register<0x808> cpPerfMonCntl;
      Register<0x809> cpcPerfCounter0Select;
      Register<0x880> wdPerfCounter0Select;
      Register<0x881> wdPerfCounter1Select;
      Register<0x882> wdPerfCounter2Select;
      Register<0x883> wdPerfCounter3Select;
      Register<0x884> iaPerfCounter0Select;
      Register<0x885> iaPerfCounter1Select;
      Register<0x886> iaPerfCounter2Select;
      Register<0x887> iaPerfCounter3Select;
      Register<0x888> iaPerfCounter0Select1;
      Register<0x88c> vgtPerfCounter0Select;
      Register<0x88d> vgtPerfCounter1Select;
      Register<0x88e> vgtPerfCounter2Select;
      Register<0x88f> vgtPerfCounter3Select;
      Register<0x890> vgtPerfCounter0Select1;
      Register<0x891> vgtPerfCounter1Select1;
      Register<0x900> paSuPerfCounter0Select;
      Register<0x901> paSuPerfCounter0Select1;
      Register<0x902> paSuPerfCounter1Select;
      Register<0x903> paSuPerfCounter1Select1;
      Register<0x904> paSuPerfCounter2Select;
      Register<0x905> paSuPerfCounter3Select;
      Register<0x940> paScPerfCounter0Select;
      Register<0x941> paScPerfCounter0Select1;
      Register<0x942> paScPerfCounter1Select;
      Register<0x943> paScPerfCounter2Select;
      Register<0x944> paScPerfCounter3Select;
      Register<0x945> paScPerfCounter4Select;
      Register<0x946> paScPerfCounter5Select;
      Register<0x947> paScPerfCounter6Select;
      Register<0x948> paScPerfCounter7Select;
      Register<0x980> spiPerfCounter0Select;
      Register<0x981> spiPerfCounter1Select;
      Register<0x982> spiPerfCounter2Select;
      Register<0x983> spiPerfCounter3Select;
      Register<0x984> spiPerfCounter0Select1;
      Register<0x985> spiPerfCounter1Select1;
      Register<0x986> spiPerfCounter2Select1;
      Register<0x987> spiPerfCounter3Select1;
      Register<0x988> spiPerfCounter4Select;
      Register<0x989> spiPerfCounter5Select;
      Register<0x98a> spiPerfCounterBins;
      Register<0x9c0, std::array<std::uint32_t, 16>> sqPerfCountersSelect;
      Register<0x9e0> sqPerfCounterCtrl;
      Register<0xa40> sxPerfCounter0Select;
      Register<0xa41> sxPerfCounter1Select;
      Register<0xa42> sxPerfCounter2Select;
      Register<0xa43> sxPerfCounter3Select;
      Register<0xa44> sxPerfCounter0Select1;
      Register<0xa45> sxPerfCounter1Select1;
      Register<0xa80> gdsPerfCounter0Select;
      Register<0xa81> gdsPerfCounter1Select;
      Register<0xa82> gdsPerfCounter2Select;
      Register<0xa83> gdsPerfCounter3Select;
      Register<0xa84> gdsPerfCounter0Select1;
      Register<0xac0> taPerfCounter0Select;
      Register<0xac1> taPerfCounter0Select1;
      Register<0xac2> taPerfCounter1Select;
      Register<0xb00> tdPerfCounter0Select;
      Register<0xb01> tdPerfCounter0Select1;
      Register<0xb02> tdPerfCounter1Select;
      Register<0xb40> tcpPerfCounter0Select;
      Register<0xb41> tcpPerfCounter0Select1;
      Register<0xb42> tcpPerfCounter1Select;
      Register<0xb43> tcpPerfCounter1Select1;
      Register<0xb44> tcpPerfCounter2Select;
      Register<0xb45> tcpPerfCounter3Select;
      Register<0xb80> tccPerfCounter0Select;
      Register<0xb81> tccPerfCounter0Select1;
      Register<0xb82> tccPerfCounter1Select;
      Register<0xb83> tccPerfCounter1Select1;
      Register<0xb84> tccPerfCounter2Select;
      Register<0xb85> tccPerfCounter3Select;
      Register<0xb90> tcaPerfCounter0Select;
      Register<0xb91> tcaPerfCounter0Select1;
      Register<0xb92> tcaPerfCounter1Select;
      Register<0xb93> tcaPerfCounter1Select1;
      Register<0xb94> tcaPerfCounter2Select;
      Register<0xb95> tcaPerfCounter3Select;
      Register<0xba0> tcsPerfCounter0Select;
      Register<0xba1> tcsPerfCounter0Select1;
      Register<0xba2> tcsPerfCounter1Select;
      Register<0xba3> tcsPerfCounter2Select;
      Register<0xba4> tcsPerfCounter3Select;
      Register<0xc00> cbPerfCounterFilter;
      Register<0xc01> cbPerfCounter0Select;
      Register<0xc02> cbPerfCounter0Select1;
      Register<0xc03> cbPerfCounter1Select;
      Register<0xc04> cbPerfCounter2Select;
      Register<0xc05> cbPerfCounter3Select;
      Register<0xc40> dbPerfCounter0Select;
      Register<0xc41> dbPerfCounter0Select1;
      Register<0xc42> dbPerfCounter1Select;
      Register<0xc43> dbPerfCounter1Select1;
      Register<0xc44> dbPerfCounter2Select;
      Register<0xc46> dbPerfCounter3Select;
    };
  };
  // Top-level view: all blocks overlaid on the flat `raw` dword array.
  union {
    Register<0x50c, std::uint32_t> vmContext0ProtectionIntrCtl;
    Register<0x50d, std::uint32_t> vmContext1ProtectionIntrCtl;
    Register<0x536, VmProtectionFault> vmContext0ProtectionFault;
    Register<0x537, VmProtectionFault> vmContext1ProtectionFault;
    Register<0x53e, std::uint32_t>
        vmContext0ProtectionFaultPage; // address >> 12
    Register<0x53f, std::uint32_t>
        vmContext1ProtectionFaultPage; // address >> 12
    Register<0x809, FbInfo> fbInfo;
    Register<0xf82, std::uint32_t> ihRptr;
    Register<0xf83, std::uint32_t> ihWptr;
    Register<Config::kMmioOffset, Config> config;
    Register<ShaderConfig::kMmioOffset, ShaderConfig> sh;
    Register<0x3045> cpRbWptr;
    Register<0x3064> cpRb1Wptr;
    Register<0x3069> cpRb2Wptr;
    Register<0x3049> cpIntCntl;
    Register<0x304a> cpIntStatus;
    Register<0x306a, std::array<std::uint32_t, 3>> cpIntCntlRings;
    Register<0x306d, std::array<std::uint32_t, 3>> cpIntStatusRings;
    Register<0x324b> cpHqdQueuePriority;
    Register<0x324c> cpHqdQuantum;
    Register<Context::kMmioOffset, Context> context;
    Register<UConfig::kMmioOffset, UConfig> uconfig;
    Register<Counters::kMmioOffset, Counters> counters;
    std::uint32_t raw[kRegisterCount];
  };
};
#pragma pack(pop)
} // namespace amdgpu

1273
rpcsx-gpu2/Renderer.cpp Normal file

File diff suppressed because it is too large Load diff

17
rpcsx-gpu2/Renderer.hpp Normal file
View file

@ -0,0 +1,17 @@
#pragma once
#include "Cache.hpp"
#include "Pipe.hpp"
#include <cstdint>
#include <vulkan/vulkan_core.h>
namespace amdgpu {
void draw(GraphicsPipe &pipe, int vmId, std::uint32_t firstVertex,
std::uint32_t vertexCount, std::uint32_t firstInstance,
std::uint32_t instanceCount, std::uint64_t indiciesAddress,
std::uint32_t indexCount);
void flip(Cache::Tag &cacheTag, VkCommandBuffer commandBuffer,
VkExtent2D targetExtent, std::uint64_t address, VkImageView target,
VkExtent2D imageExtent, CbCompSwap compSwap, TileMode tileMode,
gnm::DataFormat dfmt, gnm::NumericFormat nfmt);
} // namespace amdgpu

View file

@ -0,0 +1,4 @@
# GPU support libraries: surface tiler, GCN shader translator,
# Vulkan helpers and GNM type definitions.
add_subdirectory(amdgpu-tiler)
add_subdirectory(gcn-shader)
add_subdirectory(vk)
add_subdirectory(gnm)

View file

@ -0,0 +1,22 @@
# Compile the (de)tiling compute shaders to SPIR-V headers.
add_precompiled_vulkan_spirv(amdgpu_tiler_vulkan_shaders
  shaders/tiler1d.comp.glsl
  shaders/tiler2d.comp.glsl
  shaders/tilerLinear.comp.glsl
  shaders/detiler1d.comp.glsl
  shaders/detiler2d.comp.glsl
  shaders/detilerLinear.comp.glsl
)
# Core tiler math plus CPU and Vulkan (GPU compute) backends.
add_library(amdgpu_tiler STATIC src/tiler.cpp)
target_include_directories(amdgpu_tiler PUBLIC include)
add_library(amdgpu_tiler_cpu STATIC src/tiler_cpu.cpp)
add_library(amdgpu_tiler_vulkan STATIC src/tiler_vulkan.cpp)
target_link_libraries(amdgpu_tiler PUBLIC gnm)
target_link_libraries(amdgpu_tiler_cpu PUBLIC amdgpu_tiler)
target_link_libraries(amdgpu_tiler_vulkan PUBLIC amdgpu_tiler amdgpu_tiler_vulkan_shaders vk)
# Namespaced aliases used by consumers (e.g. rpcsx-gpu2).
add_library(amdgpu::tiler ALIAS amdgpu_tiler)
add_library(amdgpu::tiler::cpu ALIAS amdgpu_tiler_cpu)
add_library(amdgpu::tiler::vulkan ALIAS amdgpu_tiler_vulkan)

View file

@ -0,0 +1,505 @@
#pragma once
#include <array>
#include <bit> // std::countr_zero (used by getBankIndex)
#include <cstdint>
#include <cstdlib>
#include <gnm/constants.hpp>
#include <gnm/descriptors.hpp>
namespace amdgpu {
inline constexpr uint32_t kMicroTileWidth = 8;
inline constexpr uint32_t kMicroTileHeight = 8;
inline constexpr uint32_t kDramRowSize = 0x400;
inline constexpr uint32_t kPipeInterleaveBytes = 256;
// Surface array (macro-level tiling) modes; values match the hardware
// ARRAY_MODE encoding in the tile-mode registers.
enum ArrayMode {
  kArrayModeLinearGeneral = 0x00000000,
  kArrayModeLinearAligned = 0x00000001,
  kArrayMode1dTiledThin = 0x00000002,
  kArrayMode1dTiledThick = 0x00000003,
  kArrayMode2dTiledThin = 0x00000004,
  kArrayModeTiledThinPrt = 0x00000005,
  kArrayMode2dTiledThinPrt = 0x00000006,
  kArrayMode2dTiledThick = 0x00000007,
  kArrayMode2dTiledXThick = 0x00000008,
  kArrayModeTiledThickPrt = 0x00000009,
  kArrayMode2dTiledThickPrt = 0x0000000a,
  kArrayMode3dTiledThinPrt = 0x0000000b,
  kArrayMode3dTiledThin = 0x0000000c,
  kArrayMode3dTiledThick = 0x0000000d,
  kArrayMode3dTiledXThick = 0x0000000e,
  kArrayMode3dTiledThickPrt = 0x0000000f,
};
// Element ordering inside an 8x8 micro tile (MICRO_TILE_MODE encoding).
enum MicroTileMode {
  kMicroTileModeDisplay = 0x00000000,
  kMicroTileModeThin = 0x00000001,
  kMicroTileModeDepth = 0x00000002,
  kMicroTileModeRotated = 0x00000003,
  kMicroTileModeThick = 0x00000004,
};
// Pipe interleave configurations supported by the tiler
// (only the subset that appears in the default tile-mode tables).
enum PipeConfig {
  kPipeConfigP8_32x32_8x16 = 0x0000000a,
  kPipeConfigP8_32x32_16x16 = 0x0000000c,
  kPipeConfigP16 = 0x00000012,
};
// TILE_SPLIT encoding: tile-split size in bytes (64B << value).
enum TileSplit {
  kTileSplit64B = 0x00000000,
  kTileSplit128B = 0x00000001,
  kTileSplit256B = 0x00000002,
  kTileSplit512B = 0x00000003,
  kTileSplit1KB = 0x00000004,
  kTileSplit2KB = 0x00000005,
  kTileSplit4KB = 0x00000006,
};
// SAMPLE_SPLIT encoding: samples per split (1 << value).
enum SampleSplit {
  kSampleSplit1 = 0x00000000,
  kSampleSplit2 = 0x00000001,
  kSampleSplit4 = 0x00000002,
  kSampleSplit8 = 0x00000003,
};
// NUM_BANKS encoding: bank count = 2 << value.
enum NumBanks {
  kNumBanks2 = 0x00000000,
  kNumBanks4 = 0x00000001,
  kNumBanks8 = 0x00000002,
  kNumBanks16 = 0x00000003,
};
// BANK_WIDTH encoding: width = 1 << value (in macro-tile units).
enum BankWidth {
  kBankWidth1 = 0x00000000,
  kBankWidth2 = 0x00000001,
  kBankWidth4 = 0x00000002,
  kBankWidth8 = 0x00000003,
};
// BANK_HEIGHT encoding: height = 1 << value (in macro-tile units).
enum BankHeight {
  kBankHeight1 = 0x00000000,
  kBankHeight2 = 0x00000001,
  kBankHeight4 = 0x00000002,
  kBankHeight8 = 0x00000003,
};
// MACRO_TILE_ASPECT encoding: aspect ratio = 1 << value.
enum MacroTileAspect {
  kMacroTileAspect1 = 0x00000000,
  kMacroTileAspect2 = 0x00000001,
  kMacroTileAspect4 = 0x00000002,
  kMacroTileAspect8 = 0x00000003,
};
// Packed GB_TILE_MODEn register value with typed accessors for each
// bit field. Setters return *this to allow chaining. The only change
// from the original is explicit parentheses around the shifted-and-masked
// terms: `a | b & c` parses as `a | (b & c)` so behavior is identical,
// but the intent is now unambiguous (and -Wparentheses clean).
struct TileMode {
  std::uint32_t raw;

  constexpr ArrayMode arrayMode() const {
    return ArrayMode((raw & 0x0000003c) >> 2);
  }

  constexpr PipeConfig pipeConfig() const {
    return PipeConfig((raw & 0x000007c0) >> 6);
  }

  constexpr TileSplit tileSplit() const {
    return TileSplit((raw & 0x00003800) >> 11);
  }

  constexpr MicroTileMode microTileMode() const {
    return MicroTileMode((raw & 0x01c00000) >> 22);
  }

  constexpr SampleSplit sampleSplit() const {
    return SampleSplit((raw & 0x06000000) >> 25);
  }

  constexpr std::uint32_t altPipeConfig() const {
    return (raw & 0xf8000000) >> 27;
  }

  constexpr TileMode &arrayMode(ArrayMode mode) {
    raw = (raw & ~0x0000003c) |
          ((static_cast<std::uint32_t>(mode) << 2) & 0x0000003c);
    return *this;
  }

  constexpr TileMode &pipeConfig(PipeConfig mode) {
    raw = (raw & ~0x000007c0) |
          ((static_cast<std::uint32_t>(mode) << 6) & 0x000007c0);
    return *this;
  }

  constexpr TileMode &tileSplit(TileSplit mode) {
    raw = (raw & ~0x00003800) |
          ((static_cast<std::uint32_t>(mode) << 11) & 0x00003800);
    return *this;
  }

  constexpr TileMode &microTileMode(MicroTileMode mode) {
    raw = (raw & ~0x01c00000) |
          ((static_cast<std::uint32_t>(mode) << 22) & 0x01c00000);
    return *this;
  }

  constexpr TileMode &sampleSplit(SampleSplit mode) {
    raw = (raw & ~0x06000000) |
          ((static_cast<std::uint32_t>(mode) << 25) & 0x06000000);
    return *this;
  }
};
// Packed GB_MACROTILE_MODEn register value with typed accessors.
// Width/height/bank getters return the raw 2-bit encodings (interpret
// via the BankWidth/BankHeight/NumBanks enums: actual = 1 or 2 << value).
struct MacroTileMode {
  std::uint32_t raw;

  constexpr std::uint32_t bankWidth() const { return (raw & 0x00000003) >> 0; }

  constexpr std::uint32_t bankHeight() const { return (raw & 0x0000000c) >> 2; }

  constexpr MacroTileAspect macroTileAspect() const {
    return MacroTileAspect((raw & 0x00000030) >> 4);
  }

  constexpr std::uint32_t numBanks() const { return (raw & 0x000000c0) >> 6; }

  // "alt" fields: alternate encodings stored in the upper bits.
  constexpr std::uint32_t altBankHeight() const {
    return (raw & 0x00000300) >> 8;
  }

  constexpr std::uint32_t altMacroTileAspect() const {
    return (raw & 0x00000c00) >> 10;
  }

  constexpr std::uint32_t altNumBanks() const {
    return (raw & 0x00003000) >> 12;
  }
};
// Computed layout of a (possibly mipmapped, arrayed) surface:
// overall dimensions plus per-mip offsets and sizes.
struct SurfaceInfo {
  std::uint32_t width;
  std::uint32_t height;
  std::uint32_t depth;
  std::uint32_t pitch; // row pitch in elements
  int arrayLayerCount;
  int numFragments;
  int bitsPerElement;
  std::uint64_t totalSize; // total byte size of the tiled surface

  struct SubresourceInfo {
    std::uint32_t dataWidth;  // padded width used for layout
    std::uint32_t dataHeight;
    std::uint32_t dataDepth;
    std::uint64_t offset;     // byte offset of this mip within the surface
    std::uint64_t tiledSize;  // byte size in tiled layout
    std::uint64_t linearSize; // byte size when detiled to linear
  };

  // Indexed by mip level; no bounds check — callers must pass 0..15.
  SubresourceInfo subresources[16];

  void setSubresourceInfo(int mipLevel, const SubresourceInfo &subresource) {
    subresources[mipLevel] = subresource;
  }

  const SubresourceInfo &getSubresourceInfo(int mipLevel) const {
    return subresources[mipLevel];
  }
};
// Number of depth slices stored interleaved in one micro tile:
// 1 for linear/thin modes, 4 for "thick", 8 for "xthick".
// Aborts on an out-of-range value.
constexpr uint32_t getMicroTileThickness(ArrayMode arrayMode) {
  switch (arrayMode) {
  case kArrayModeLinearGeneral:
  case kArrayModeLinearAligned:
  case kArrayMode1dTiledThin:
  case kArrayMode2dTiledThin:
  case kArrayModeTiledThinPrt:
  case kArrayMode2dTiledThinPrt:
  case kArrayMode3dTiledThinPrt:
  case kArrayMode3dTiledThin:
    return 1;

  case kArrayMode1dTiledThick:
  case kArrayMode2dTiledThick:
  case kArrayMode3dTiledThick:
  case kArrayModeTiledThickPrt:
  case kArrayMode2dTiledThickPrt:
  case kArrayMode3dTiledThickPrt:
    return 4;

  case kArrayMode2dTiledXThick:
  case kArrayMode3dTiledXThick:
    return 8;
  }

  std::abort();
}
// True for 2D/3D (macro) tiled modes, false for linear and 1D tiled.
// Aborts on an out-of-range value.
constexpr bool isMacroTiled(ArrayMode arrayMode) {
  switch (arrayMode) {
  case kArrayMode2dTiledThin:
  case kArrayModeTiledThinPrt:
  case kArrayMode2dTiledThinPrt:
  case kArrayMode2dTiledThick:
  case kArrayMode2dTiledXThick:
  case kArrayModeTiledThickPrt:
  case kArrayMode2dTiledThickPrt:
  case kArrayMode3dTiledThinPrt:
  case kArrayMode3dTiledThin:
  case kArrayMode3dTiledThick:
  case kArrayMode3dTiledXThick:
  case kArrayMode3dTiledThickPrt:
    return true;

  case kArrayModeLinearGeneral:
  case kArrayModeLinearAligned:
  case kArrayMode1dTiledThin:
  case kArrayMode1dTiledThick:
    return false;
  }

  std::abort();
}
// True for partially-resident-texture (PRT) array modes.
// Aborts on an out-of-range value.
constexpr bool isPrt(ArrayMode arrayMode) {
  switch (arrayMode) {
  case kArrayModeTiledThinPrt:
  case kArrayMode2dTiledThinPrt:
  case kArrayModeTiledThickPrt:
  case kArrayMode2dTiledThickPrt:
  case kArrayMode3dTiledThinPrt:
  case kArrayMode3dTiledThickPrt:
    return true;

  case kArrayModeLinearGeneral:
  case kArrayModeLinearAligned:
  case kArrayMode1dTiledThin:
  case kArrayMode1dTiledThick:
  case kArrayMode2dTiledThin:
  case kArrayMode2dTiledThick:
  case kArrayMode2dTiledXThick:
  case kArrayMode3dTiledThin:
  case kArrayMode3dTiledThick:
  case kArrayMode3dTiledXThick:
    return false;
  }

  std::abort();
}
// Default GB_MACROTILE_MODE table (16 entries of raw register values),
// used when the guest has not programmed the registers itself.
constexpr std::array<MacroTileMode, 16> getDefaultMacroTileModes() {
  return {{
      {.raw = 0x26e8},
      {.raw = 0x26d4},
      {.raw = 0x21d0},
      {.raw = 0x21d0},
      {.raw = 0x2080},
      {.raw = 0x2040},
      {.raw = 0x1000},
      {.raw = 0x0000},
      {.raw = 0x36ec},
      {.raw = 0x26e8},
      {.raw = 0x21d4},
      {.raw = 0x20d0},
      {.raw = 0x1080},
      {.raw = 0x1040},
      {.raw = 0x0000},
      {.raw = 0x0000},
  }};
}
// Default GB_TILE_MODE table (32 entries of raw register values),
// used when the guest has not programmed the registers itself.
constexpr std::array<TileMode, 32> getDefaultTileModes() {
  return {{
      {.raw = 0x90800310}, {.raw = 0x90800b10}, {.raw = 0x90801310},
      {.raw = 0x90801b10}, {.raw = 0x90802310}, {.raw = 0x90800308},
      {.raw = 0x90801318}, {.raw = 0x90802318}, {.raw = 0x90000304},
      {.raw = 0x90000308}, {.raw = 0x92000310}, {.raw = 0x92000294},
      {.raw = 0x92000318}, {.raw = 0x90400308}, {.raw = 0x92400310},
      {.raw = 0x924002b0}, {.raw = 0x92400294}, {.raw = 0x92400318},
      {.raw = 0x9240032c}, {.raw = 0x9100030c}, {.raw = 0x9100031c},
      {.raw = 0x910002b4}, {.raw = 0x910002a4}, {.raw = 0x91000328},
      {.raw = 0x910002bc}, {.raw = 0x91000320}, {.raw = 0x910002b8},
      {.raw = 0x90c00308}, {.raw = 0x92c00310}, {.raw = 0x92c00294},
      {.raw = 0x92c00318}, {.raw = 0x00000000},
  }};
}
// Computes the bit-interleaved index of element (x, y, z) within one
// micro tile, per the hardware swizzle for the given micro-tile mode,
// element size and array mode. The case fallthroughs in the thin/thick
// switches are intentional (the xthick modes add a z>>2 bit and then
// share the thick handling); they are now annotated with [[fallthrough]]
// to document that and silence -Wimplicit-fallthrough. Logic unchanged.
constexpr std::uint32_t getElementIndex(std::uint32_t x, std::uint32_t y,
                                        std::uint32_t z,
                                        std::uint32_t bitsPerElement,
                                        MicroTileMode microTileMode,
                                        ArrayMode arrayMode) {
  std::uint32_t elem = 0;

  if (microTileMode == kMicroTileModeDisplay) {
    // Display swizzle depends on the element size.
    switch (bitsPerElement) {
    case 8:
      elem |= ((x >> 0) & 0x1) << 0;
      elem |= ((x >> 1) & 0x1) << 1;
      elem |= ((x >> 2) & 0x1) << 2;
      elem |= ((y >> 1) & 0x1) << 3;
      elem |= ((y >> 0) & 0x1) << 4;
      elem |= ((y >> 2) & 0x1) << 5;
      break;
    case 16:
      elem |= ((x >> 0) & 0x1) << 0;
      elem |= ((x >> 1) & 0x1) << 1;
      elem |= ((x >> 2) & 0x1) << 2;
      elem |= ((y >> 0) & 0x1) << 3;
      elem |= ((y >> 1) & 0x1) << 4;
      elem |= ((y >> 2) & 0x1) << 5;
      break;
    case 32:
      elem |= ((x >> 0) & 0x1) << 0;
      elem |= ((x >> 1) & 0x1) << 1;
      elem |= ((y >> 0) & 0x1) << 2;
      elem |= ((x >> 2) & 0x1) << 3;
      elem |= ((y >> 1) & 0x1) << 4;
      elem |= ((y >> 2) & 0x1) << 5;
      break;
    case 64:
      elem |= ((x >> 0) & 0x1) << 0;
      elem |= ((y >> 0) & 0x1) << 1;
      elem |= ((x >> 1) & 0x1) << 2;
      elem |= ((x >> 2) & 0x1) << 3;
      elem |= ((y >> 1) & 0x1) << 4;
      elem |= ((y >> 2) & 0x1) << 5;
      break;
    default:
      std::abort();
    }
  } else if (microTileMode == kMicroTileModeThin ||
             microTileMode == kMicroTileModeDepth) {
    // Thin/depth: fixed x/y interleave, plus z bits for thick modes.
    elem |= ((x >> 0) & 0x1) << 0;
    elem |= ((y >> 0) & 0x1) << 1;
    elem |= ((x >> 1) & 0x1) << 2;
    elem |= ((y >> 1) & 0x1) << 3;
    elem |= ((x >> 2) & 0x1) << 4;
    elem |= ((y >> 2) & 0x1) << 5;

    switch (arrayMode) {
    case kArrayMode2dTiledXThick:
    case kArrayMode3dTiledXThick:
      elem |= ((z >> 2) & 0x1) << 8;
      [[fallthrough]]; // xthick also gets the thick z bits below
    case kArrayMode1dTiledThick:
    case kArrayMode2dTiledThick:
    case kArrayMode3dTiledThick:
    case kArrayModeTiledThickPrt:
    case kArrayMode2dTiledThickPrt:
    case kArrayMode3dTiledThickPrt:
      elem |= ((z >> 0) & 0x1) << 6;
      elem |= ((z >> 1) & 0x1) << 7;
      [[fallthrough]]; // thin modes contribute no z bits
    default:
      break;
    }
  } else if (microTileMode == kMicroTileModeThick) {
    // Thick: z bits are interleaved with x/y, position depends on size.
    switch (arrayMode) {
    case kArrayMode2dTiledXThick:
    case kArrayMode3dTiledXThick:
      elem |= ((z >> 2) & 0x1) << 8;
      [[fallthrough]]; // xthick shares the thick swizzle below
    case kArrayMode1dTiledThick:
    case kArrayMode2dTiledThick:
    case kArrayMode3dTiledThick:
    case kArrayModeTiledThickPrt:
    case kArrayMode2dTiledThickPrt:
    case kArrayMode3dTiledThickPrt:
      if (bitsPerElement == 8 || bitsPerElement == 16) {
        elem |= ((x >> 0) & 0x1) << 0;
        elem |= ((y >> 0) & 0x1) << 1;
        elem |= ((x >> 1) & 0x1) << 2;
        elem |= ((y >> 1) & 0x1) << 3;
        elem |= ((z >> 0) & 0x1) << 4;
        elem |= ((z >> 1) & 0x1) << 5;
        elem |= ((x >> 2) & 0x1) << 6;
        elem |= ((y >> 2) & 0x1) << 7;
      } else if (bitsPerElement == 32) {
        elem |= ((x >> 0) & 0x1) << 0;
        elem |= ((y >> 0) & 0x1) << 1;
        elem |= ((x >> 1) & 0x1) << 2;
        elem |= ((z >> 0) & 0x1) << 3;
        elem |= ((y >> 1) & 0x1) << 4;
        elem |= ((z >> 1) & 0x1) << 5;
        elem |= ((x >> 2) & 0x1) << 6;
        elem |= ((y >> 2) & 0x1) << 7;
      } else if (bitsPerElement == 64 || bitsPerElement == 128) {
        elem |= ((x >> 0) & 0x1) << 0;
        elem |= ((y >> 0) & 0x1) << 1;
        elem |= ((z >> 0) & 0x1) << 2;
        elem |= ((x >> 1) & 0x1) << 3;
        elem |= ((y >> 1) & 0x1) << 4;
        elem |= ((z >> 1) & 0x1) << 5;
        elem |= ((x >> 2) & 0x1) << 6;
        elem |= ((y >> 2) & 0x1) << 7;
      } else {
        std::abort();
      }
      break;
    default:
      std::abort();
    }
  }

  return elem;
}
// Selects the memory pipe for macro-tile coordinates (x, y) using the
// hardware's XOR-based hashing for the given pipe configuration.
// Aborts on an unsupported configuration.
constexpr uint32_t getPipeIndex(uint32_t x, uint32_t y, PipeConfig pipeCfg) {
  uint32_t pipe = 0;
  switch (pipeCfg) {
  case kPipeConfigP8_32x32_8x16:
    pipe |= (((x >> 4) ^ (y >> 3) ^ (x >> 5)) & 0x1) << 0;
    pipe |= (((x >> 3) ^ (y >> 4)) & 0x1) << 1;
    pipe |= (((x >> 5) ^ (y >> 5)) & 0x1) << 2;
    break;
  case kPipeConfigP8_32x32_16x16:
    pipe |= (((x >> 3) ^ (y >> 3) ^ (x >> 4)) & 0x1) << 0;
    pipe |= (((x >> 4) ^ (y >> 4)) & 0x1) << 1;
    pipe |= (((x >> 5) ^ (y >> 5)) & 0x1) << 2;
    break;
  case kPipeConfigP16:
    // Same low three bits as P8_32x32_16x16 plus a fourth pipe bit.
    pipe |= (((x >> 3) ^ (y >> 3) ^ (x >> 4)) & 0x1) << 0;
    pipe |= (((x >> 4) ^ (y >> 4)) & 0x1) << 1;
    pipe |= (((x >> 5) ^ (y >> 5)) & 0x1) << 2;
    pipe |= (((x >> 6) ^ (y >> 5)) & 0x1) << 3;
    break;
  default:
    std::abort();
  }
  return pipe;
}
// Selects the memory bank for coordinates (x, y) using the hardware's
// XOR-based bank hashing. bank_width * num_pipes and bank_height must be
// powers of two (their log2 is taken via std::countr_zero).
// Aborts on an unsupported bank count.
constexpr uint32_t getBankIndex(std::uint32_t x, std::uint32_t y,
                                std::uint32_t bank_width,
                                std::uint32_t bank_height,
                                std::uint32_t num_banks,
                                std::uint32_t num_pipes) {
  std::uint32_t x_shift_offset = std::countr_zero(bank_width * num_pipes);
  std::uint32_t y_shift_offset = std::countr_zero(bank_height);
  std::uint32_t xs = x >> x_shift_offset;
  std::uint32_t ys = y >> y_shift_offset;
  std::uint32_t bank = 0;
  switch (num_banks) {
  case 2:
    bank |= (((xs >> 3) ^ (ys >> 3)) & 0x1) << 0;
    break;
  case 4:
    bank |= (((xs >> 3) ^ (ys >> 4)) & 0x1) << 0;
    bank |= (((xs >> 4) ^ (ys >> 3)) & 0x1) << 1;
    break;
  case 8:
    bank |= (((xs >> 3) ^ (ys >> 5)) & 0x1) << 0;
    bank |= (((xs >> 4) ^ (ys >> 4) ^ (ys >> 5)) & 0x1) << 1;
    bank |= (((xs >> 5) ^ (ys >> 3)) & 0x1) << 2;
    break;
  case 16:
    bank |= (((xs >> 3) ^ (ys >> 6)) & 0x1) << 0;
    bank |= (((xs >> 4) ^ (ys >> 5) ^ (ys >> 6)) & 0x1) << 1;
    bank |= (((xs >> 5) ^ (ys >> 4)) & 0x1) << 2;
    bank |= (((xs >> 6) ^ (ys >> 3)) & 0x1) << 3;
    break;
  default:
    std::abort();
  }
  return bank;
}
// Number of memory pipes implied by a pipe configuration.
// Aborts on an unsupported configuration.
constexpr std::uint32_t getPipeCount(PipeConfig pipeConfig) {
  if (pipeConfig == kPipeConfigP16) {
    return 16;
  }

  if (pipeConfig == kPipeConfigP8_32x32_8x16 ||
      pipeConfig == kPipeConfigP8_32x32_16x16) {
    return 8;
  }

  std::abort();
}
// Computes the full layout (per-mip offsets/sizes, total size) of a surface
// described either by explicit parameters or by a gnm::TBuffer descriptor.
// Implemented in src/tiler.cpp.
SurfaceInfo computeSurfaceInfo(TileMode tileMode, gnm::TextureType type,
                               gnm::DataFormat dfmt, std::uint32_t width,
                               std::uint32_t height, std::uint32_t depth,
                               std::uint32_t pitch, int baseArrayLayer,
                               int arrayCount, int baseMipLevel, int mipCount,
                               bool pow2pad);
SurfaceInfo computeSurfaceInfo(const gnm::TBuffer &tbuffer, TileMode tileMode);
} // namespace amdgpu

View file

@ -0,0 +1,14 @@
#pragma once
#include "gnm/constants.hpp"
#include "tiler.hpp"
#include <cstdint>
namespace amdgpu {
// Computes the byte offset of texel/fragment (x, y, z, fragmentIndex) of the
// given mip level and array slice inside a tiled surface, for CPU-side
// (de)tiling.
std::uint64_t getTiledOffset(gnm::TextureType texType, bool isPow2Padded,
                             int numFragments, gnm::DataFormat dfmt,
                             amdgpu::TileMode tileMode,
                             amdgpu::MacroTileMode macroTileMode, int mipLevel,
                             int arraySlice, int width, int height, int depth,
                             int pitch, int x, int y, int z, int fragmentIndex);
} // namespace amdgpu

View file

@ -0,0 +1,24 @@
#pragma once
#include "tiler.hpp"
#include <Scheduler.hpp>
#include <memory>
namespace amdgpu {
// GPU-side tiler: converts surfaces between tiled layouts and linear memory
// by submitting work through the given Scheduler. Implementation details
// (pipelines, descriptors) are hidden behind the pimpl.
struct GpuTiler {
  struct Impl; // private implementation state
  GpuTiler();
  ~GpuTiler();
  // Reads the tiled surface at srcTiledAddress and writes mip `mipLevel` of
  // slices [baseArray, baseArray + arrayCount) in linear layout to
  // dstLinearAddress, using the layout described by `info`/`tileMode`.
  void detile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info,
              amdgpu::TileMode tileMode, std::uint64_t srcTiledAddress,
              std::uint64_t dstLinearAddress, int mipLevel, int baseArray,
              int arrayCount);
  // Inverse of detile: reads linear data at srcLinearAddress and writes the
  // tiled representation to dstTiledAddress.
  void tile(Scheduler &scheduler, const amdgpu::SurfaceInfo &info,
            amdgpu::TileMode tileMode, std::uint64_t srcLinearAddress,
            std::uint64_t dstTiledAddress, int mipLevel, int baseArray,
            int arrayCount);
private:
  std::unique_ptr<Impl> mImpl; // owned implementation
};
} // namespace amdgpu

View file

@ -0,0 +1,76 @@
#version 460
#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
#extension GL_EXT_shader_atomic_int64 : enable
#extension GL_EXT_shader_atomic_float : enable
#extension GL_EXT_shader_image_load_formatted : enable
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_EXT_shared_memory_block : enable
#extension GL_EXT_scalar_block_layout : enable
#extension GL_EXT_null_initializer : enable
#extension GL_EXT_buffer_reference2 : enable
#extension GL_EXT_buffer_reference_uvec2 : enable
#include "tiler.glsl"
// Detiles one element per invocation: reads the element at
// gl_GlobalInvocationID from the 1D-(micro)tiled surface at config.srcAddress
// and stores it at the matching row-major position at config.dstAddress.
void main() {
  uvec3 pos = gl_GlobalInvocationID;
  uint64_t tiledSliceOffset = 0;
  uint64_t linearSliceOffset = 0;
  // A non-zero per-slice size means z selects an array slice whose layout
  // repeats: turn z into explicit byte offsets and address within slice 0.
  // Widen to 64 bits first so sliceIndex * sliceSize cannot overflow 32 bits
  // on large surfaces.
  if (config.tiledSurfaceSize != 0) {
    tiledSliceOffset = uint64_t(pos.z) * config.tiledSurfaceSize;
    linearSliceOffset = uint64_t(pos.z) * config.linearSurfaceSize;
    pos.z = 0;
  }
  uint64_t tiledByteOffset = getTiledBitOffset1D(
    config.tileMode,
    pos,
    config.dataSize,
    config.bitsPerElement
  ) / 8;
  tiledByteOffset += tiledSliceOffset;
  uint64_t linearByteOffset = computeLinearElementByteOffset(
    pos,
    0,
    config.dataSize.x,
    config.dataSize.x * config.dataSize.y,
    config.bitsPerElement,
    1 << config.numFragments // numFragments is log2(fragment count)
  );
  linearByteOffset += linearSliceOffset;
  // Copy one element of (bitsPerElement+7)/8 bytes through buffer_reference
  // pointers; elements wider than 8 bytes move as multiple 64-bit words.
  switch ((config.bitsPerElement + 7) / 8) {
  case 1:
    buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data;
    break;
  case 2:
    buffer_reference_uint16_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint16_t(config.srcAddress + tiledByteOffset).data;
    break;
  case 4:
    buffer_reference_uint32_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint32_t(config.srcAddress + tiledByteOffset).data;
    break;
  case 8:
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data;
    break;
  case 16:
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data;
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 8).data;
    break;
  case 32:
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data;
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 8).data;
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 16).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 16).data;
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 24).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 24).data;
    break;
  }
}

View file

@ -0,0 +1,76 @@
#version 460
#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
#extension GL_EXT_shader_atomic_int64 : enable
#extension GL_EXT_shader_atomic_float : enable
#extension GL_EXT_shader_image_load_formatted : enable
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_EXT_shared_memory_block : enable
#extension GL_EXT_scalar_block_layout : enable
#extension GL_EXT_null_initializer : enable
#extension GL_EXT_buffer_reference2 : enable
#extension GL_EXT_buffer_reference_uvec2 : enable
#include "tiler.glsl"
// Detiles one element per invocation: reads the element at
// gl_GlobalInvocationID from the 1D-(micro)tiled surface at config.srcAddress
// and stores it at the matching row-major position at config.dstAddress.
void main() {
  uvec3 pos = gl_GlobalInvocationID;
  uint64_t tiledSliceOffset = 0;
  uint64_t linearSliceOffset = 0;
  // A non-zero per-slice size means z selects an array slice whose layout
  // repeats: turn z into explicit byte offsets and address within slice 0.
  // Widen to 64 bits first so sliceIndex * sliceSize cannot overflow 32 bits
  // on large surfaces.
  if (config.tiledSurfaceSize != 0) {
    tiledSliceOffset = uint64_t(pos.z) * config.tiledSurfaceSize;
    linearSliceOffset = uint64_t(pos.z) * config.linearSurfaceSize;
    pos.z = 0;
  }
  uint64_t tiledByteOffset = getTiledBitOffset1D(
    config.tileMode,
    pos,
    config.dataSize,
    config.bitsPerElement
  ) / 8;
  tiledByteOffset += tiledSliceOffset;
  uint64_t linearByteOffset = computeLinearElementByteOffset(
    pos,
    0,
    config.dataSize.x,
    config.dataSize.x * config.dataSize.y,
    config.bitsPerElement,
    1 << config.numFragments // numFragments is log2(fragment count)
  );
  linearByteOffset += linearSliceOffset;
  // Copy one element of (bitsPerElement+7)/8 bytes through buffer_reference
  // pointers; elements wider than 8 bytes move as multiple 64-bit words.
  switch ((config.bitsPerElement + 7) / 8) {
  case 1:
    buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data;
    break;
  case 2:
    buffer_reference_uint16_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint16_t(config.srcAddress + tiledByteOffset).data;
    break;
  case 4:
    buffer_reference_uint32_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint32_t(config.srcAddress + tiledByteOffset).data;
    break;
  case 8:
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data;
    break;
  case 16:
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data;
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 8).data;
    break;
  case 32:
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data;
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 8).data;
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 16).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 16).data;
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 24).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 24).data;
    break;
  }
}

View file

@ -0,0 +1,76 @@
#version 460
#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
#extension GL_EXT_shader_atomic_int64 : enable
#extension GL_EXT_shader_atomic_float : enable
#extension GL_EXT_shader_image_load_formatted : enable
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_EXT_shared_memory_block : enable
#extension GL_EXT_scalar_block_layout : enable
#extension GL_EXT_null_initializer : enable
#extension GL_EXT_buffer_reference2 : enable
#extension GL_EXT_buffer_reference_uvec2 : enable
#include "tiler.glsl"
// Detiles one element per invocation for linear-aligned array modes: the
// "tiled" source is itself row-major, but padded to the surface pitch, so
// the source offset comes from computeLinearOffset rather than a micro-tile
// swizzle. Result is written densely at config.dstAddress.
void main() {
  uvec3 pos = gl_GlobalInvocationID;
  uint64_t tiledSliceOffset = 0;
  uint64_t linearSliceOffset = 0;
  // A non-zero per-slice size means z selects an array slice whose layout
  // repeats: turn z into explicit byte offsets and address within slice 0.
  // Widen to 64 bits first so sliceIndex * sliceSize cannot overflow 32 bits
  // on large surfaces.
  if (config.tiledSurfaceSize != 0) {
    tiledSliceOffset = uint64_t(pos.z) * config.tiledSurfaceSize;
    linearSliceOffset = uint64_t(pos.z) * config.linearSurfaceSize;
    pos.z = 0;
  }
  uint64_t tiledByteOffset = computeLinearOffset(
    config.bitsPerElement,
    config.dataSize.y,
    config.dataSize.x,
    pos
  ) / 8;
  tiledByteOffset += tiledSliceOffset;
  uint64_t linearByteOffset = computeLinearElementByteOffset(
    pos,
    0,
    config.dataSize.x,
    config.dataSize.x * config.dataSize.y,
    config.bitsPerElement,
    1 << config.numFragments // numFragments is log2(fragment count)
  );
  linearByteOffset += linearSliceOffset;
  // Copy one element of (bitsPerElement+7)/8 bytes through buffer_reference
  // pointers; elements wider than 8 bytes move as multiple 64-bit words.
  switch ((config.bitsPerElement + 7) / 8) {
  case 1:
    buffer_reference_uint8_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint8_t(config.srcAddress + tiledByteOffset).data;
    break;
  case 2:
    buffer_reference_uint16_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint16_t(config.srcAddress + tiledByteOffset).data;
    break;
  case 4:
    buffer_reference_uint32_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint32_t(config.srcAddress + tiledByteOffset).data;
    break;
  case 8:
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data;
    break;
  case 16:
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data;
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 8).data;
    break;
  case 32:
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset).data;
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 8).data;
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 16).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 16).data;
    buffer_reference_uint64_t(config.dstAddress + linearByteOffset + 24).data = buffer_reference_uint64_t(config.srcAddress + tiledByteOffset + 24).data;
    break;
  }
}

View file

@ -0,0 +1,716 @@
// Applies OP to every scalar type a tiler copy may move.
// NOTE: the last expansion line must NOT carry a trailing backslash — a
// continuation there would splice the next preprocessor line into this
// macro's body.
#define FOR_ALL_BASE_TYPES(OP) \
  OP(int8_t) \
  OP(uint8_t) \
  OP(int16_t) \
  OP(uint16_t) \
  OP(float16_t) \
  OP(int32_t) \
  OP(uint32_t) \
  OP(float32_t) \
  OP(int64_t) \
  OP(uint64_t) \
  OP(float64_t)

// Declares buffer_reference_<TYPE>: a physical-pointer wrapper used to read
// or write a single TYPE at an arbitrary 64-bit device address.
#define DEFINE_BUFFER_REFERENCE(TYPE) \
  layout(buffer_reference) buffer buffer_reference_##TYPE { \
    TYPE data; \
  };

FOR_ALL_BASE_TYPES(DEFINE_BUFFER_REFERENCE)

// Fetches BITCOUNT bits starting at absolute bit START from an array of
// 32-/64-bit words. The field must not straddle a word boundary, and
// BITCOUNT must be smaller than the word size.
#define U32ARRAY_FETCH_BITS(ARRAY, START, BITCOUNT) ((ARRAY[(START) >> 5] >> ((START) & 31)) & ((1 << (BITCOUNT)) - 1))
#define U64ARRAY_FETCH_BITS(ARRAY, START, BITCOUNT) ((ARRAY[(START) >> 6] >> ((START) & 63)) & ((uint64_t(1) << (BITCOUNT)) - 1))
// Decoders for the 256-bit T# (image resource) descriptor, passed as four
// 64-bit words. Each accessor extracts one bitfield with
// U64ARRAY_FETCH_BITS; every field below lies within a single 64-bit word,
// as that macro requires.
// NOTE(review): bit positions look like the GCN T# layout — confirm against
// the ISA reference when touching them.
uint64_t tbuffer_base(u64vec4 tbuffer) {
  return U64ARRAY_FETCH_BITS(tbuffer, 0, 38);
}
uint32_t tbuffer_mtype_L2(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 38, 2));
}
uint32_t tbuffer_min_lod(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 40, 12));
}
// Data format (kDataFormat*) and numeric format (kNumericFormat*).
uint32_t tbuffer_dfmt(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 52, 6));
}
uint32_t tbuffer_nfmt(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 58, 4));
}
// Split field: low two bits at 62..63, third bit stored at bit 122.
uint32_t tbuffer_mtype_l1(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 62, 2) | (U64ARRAY_FETCH_BITS(tbuffer, 122, 1) << 2));
}
uint32_t tbuffer_width(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 64, 14));
}
uint32_t tbuffer_height(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 78, 14));
}
uint32_t tbuffer_perfMod(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 92, 3));
}
bool tbuffer_interlaced(u64vec4 tbuffer) {
  return U64ARRAY_FETCH_BITS(tbuffer, 95, 1) != 0;
}
// Per-channel component swizzle selectors.
uint32_t tbuffer_dst_sel_x(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 96, 3));
}
uint32_t tbuffer_dst_sel_y(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 99, 3));
}
uint32_t tbuffer_dst_sel_z(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 102, 3));
}
uint32_t tbuffer_dst_sel_w(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 105, 3));
}
// Mip range covered by the resource.
uint32_t tbuffer_base_level(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 108, 4));
}
uint32_t tbuffer_last_level(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 112, 4));
}
// Index into the tile-mode table (selects the TileMode register value).
uint32_t tbuffer_tiling_idx(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 116, 5));
}
bool tbuffer_pow2pad(u64vec4 tbuffer) {
  return U64ARRAY_FETCH_BITS(tbuffer, 121, 1) != 0;
}
// Resource dimensionality (kTextureType*).
uint32_t tbuffer_type(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 124, 4));
}
uint32_t tbuffer_depth(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 128, 13));
}
uint32_t tbuffer_pitch(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 141, 14));
}
// Array-slice range covered by the resource.
uint32_t tbuffer_base_array(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 160, 13));
}
uint32_t tbuffer_last_array(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 173, 13));
}
uint32_t tbuffer_min_lod_warn(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 192, 12));
}
uint32_t tbuffer_counter_bank_id(u64vec4 tbuffer) {
  return uint32_t(U64ARRAY_FETCH_BITS(tbuffer, 204, 8));
}
bool tbuffer_LOD_hdw_cnt_en(u64vec4 tbuffer) {
  return U64ARRAY_FETCH_BITS(tbuffer, 212, 1) != 0;
}
// T# resource type field values (tbuffer_type).
const int kTextureType1D = 8;
const int kTextureType2D = 9;
const int kTextureType3D = 10;
const int kTextureTypeCube = 11;
const int kTextureTypeArray1D = 12;
const int kTextureTypeArray2D = 13;
const int kTextureTypeMsaa2D = 14;
const int kTextureTypeMsaaArray2D = 15;
// Fixed tiling geometry: micro tiles are 8x8 elements.
const uint32_t kMicroTileWidth = 8;
const uint32_t kMicroTileHeight = 8;
const uint32_t kDramRowSize = 0x400;
const uint32_t kPipeInterleaveBytes = 256;
// Data formats (T# dfmt field values).
const uint32_t kDataFormatInvalid = 0x00000000;
const uint32_t kDataFormat8 = 0x00000001;
const uint32_t kDataFormat16 = 0x00000002;
const uint32_t kDataFormat8_8 = 0x00000003;
const uint32_t kDataFormat32 = 0x00000004;
const uint32_t kDataFormat16_16 = 0x00000005;
const uint32_t kDataFormat10_11_11 = 0x00000006;
const uint32_t kDataFormat11_11_10 = 0x00000007;
const uint32_t kDataFormat10_10_10_2 = 0x00000008;
const uint32_t kDataFormat2_10_10_10 = 0x00000009;
const uint32_t kDataFormat8_8_8_8 = 0x0000000a;
const uint32_t kDataFormat32_32 = 0x0000000b;
const uint32_t kDataFormat16_16_16_16 = 0x0000000c;
const uint32_t kDataFormat32_32_32 = 0x0000000d;
const uint32_t kDataFormat32_32_32_32 = 0x0000000e;
const uint32_t kDataFormat5_6_5 = 0x00000010;
const uint32_t kDataFormat1_5_5_5 = 0x00000011;
const uint32_t kDataFormat5_5_5_1 = 0x00000012;
const uint32_t kDataFormat4_4_4_4 = 0x00000013;
const uint32_t kDataFormat8_24 = 0x00000014;
const uint32_t kDataFormat24_8 = 0x00000015;
const uint32_t kDataFormatX24_8_32 = 0x00000016;
const uint32_t kDataFormatGB_GR = 0x00000020;
const uint32_t kDataFormatBG_RG = 0x00000021;
const uint32_t kDataFormat5_9_9_9 = 0x00000022;
// Block-compressed formats (4x4 texel blocks).
const uint32_t kDataFormatBc1 = 0x00000023;
const uint32_t kDataFormatBc2 = 0x00000024;
const uint32_t kDataFormatBc3 = 0x00000025;
const uint32_t kDataFormatBc4 = 0x00000026;
const uint32_t kDataFormatBc5 = 0x00000027;
const uint32_t kDataFormatBc6 = 0x00000028;
const uint32_t kDataFormatBc7 = 0x00000029;
// FMASK formats: Fmask<bits>_S<samples>_F<fragments>.
const uint32_t kDataFormatFmask8_S2_F1 = 0x0000002C;
const uint32_t kDataFormatFmask8_S4_F1 = 0x0000002D;
const uint32_t kDataFormatFmask8_S8_F1 = 0x0000002E;
const uint32_t kDataFormatFmask8_S2_F2 = 0x0000002F;
const uint32_t kDataFormatFmask8_S4_F2 = 0x00000030;
const uint32_t kDataFormatFmask8_S4_F4 = 0x00000031;
const uint32_t kDataFormatFmask16_S16_F1 = 0x00000032;
const uint32_t kDataFormatFmask16_S8_F2 = 0x00000033;
const uint32_t kDataFormatFmask32_S16_F2 = 0x00000034;
const uint32_t kDataFormatFmask32_S8_F4 = 0x00000035;
const uint32_t kDataFormatFmask32_S8_F8 = 0x00000036;
const uint32_t kDataFormatFmask64_S16_F4 = 0x00000037;
const uint32_t kDataFormatFmask64_S16_F8 = 0x00000038;
const uint32_t kDataFormat4_4 = 0x00000039;
const uint32_t kDataFormat6_5_5 = 0x0000003A;
const uint32_t kDataFormat1 = 0x0000003B;
const uint32_t kDataFormat1Reversed = 0x0000003C;
// Numeric formats (T# nfmt field values).
const uint32_t kNumericFormatUNorm = 0x00000000;
const uint32_t kNumericFormatSNorm = 0x00000001;
const uint32_t kNumericFormatUScaled = 0x00000002;
const uint32_t kNumericFormatSScaled = 0x00000003;
const uint32_t kNumericFormatUInt = 0x00000004;
const uint32_t kNumericFormatSInt = 0x00000005;
const uint32_t kNumericFormatSNormNoZero = 0x00000006;
const uint32_t kNumericFormatFloat = 0x00000007;
const uint32_t kNumericFormatSrgb = 0x00000009;
const uint32_t kNumericFormatUBNorm = 0x0000000A;
const uint32_t kNumericFormatUBNormNoZero = 0x0000000B;
const uint32_t kNumericFormatUBInt = 0x0000000C;
const uint32_t kNumericFormatUBScaled = 0x0000000D;
// Array (tiling) modes: linear, 1D micro-tiled, 2D/3D macro-tiled, PRT.
const uint32_t kArrayModeLinearGeneral = 0x00000000;
const uint32_t kArrayModeLinearAligned = 0x00000001;
const uint32_t kArrayMode1dTiledThin = 0x00000002;
const uint32_t kArrayMode1dTiledThick = 0x00000003;
const uint32_t kArrayMode2dTiledThin = 0x00000004;
const uint32_t kArrayModeTiledThinPrt = 0x00000005;
const uint32_t kArrayMode2dTiledThinPrt = 0x00000006;
const uint32_t kArrayMode2dTiledThick = 0x00000007;
const uint32_t kArrayMode2dTiledXThick = 0x00000008;
const uint32_t kArrayModeTiledThickPrt = 0x00000009;
const uint32_t kArrayMode2dTiledThickPrt = 0x0000000a;
const uint32_t kArrayMode3dTiledThinPrt = 0x0000000b;
const uint32_t kArrayMode3dTiledThin = 0x0000000c;
const uint32_t kArrayMode3dTiledThick = 0x0000000d;
const uint32_t kArrayMode3dTiledXThick = 0x0000000e;
const uint32_t kArrayMode3dTiledThickPrt = 0x0000000f;
// Micro tile modes: select the element interleave inside an 8x8 micro tile.
const uint32_t kMicroTileModeDisplay = 0x00000000;
const uint32_t kMicroTileModeThin = 0x00000001;
const uint32_t kMicroTileModeDepth = 0x00000002;
const uint32_t kMicroTileModeRotated = 0x00000003;
const uint32_t kMicroTileModeThick = 0x00000004;
// Pipe configurations used by this tiler.
const uint32_t kPipeConfigP8_32x32_8x16 = 0x0000000a;
const uint32_t kPipeConfigP8_32x32_16x16 = 0x0000000c;
const uint32_t kPipeConfigP16 = 0x00000012;
// Number of z-slices stored in one micro tile for the given array mode:
// 8 for xthick modes, 4 for thick modes, 1 for everything else (including
// unknown values).
uint32_t getMicroTileThickness(uint32_t arrayMode) {
  if (arrayMode == kArrayMode2dTiledXThick ||
      arrayMode == kArrayMode3dTiledXThick) {
    return 8;
  }
  if (arrayMode == kArrayMode1dTiledThick ||
      arrayMode == kArrayMode2dTiledThick ||
      arrayMode == kArrayMode3dTiledThick ||
      arrayMode == kArrayModeTiledThickPrt ||
      arrayMode == kArrayMode2dTiledThickPrt ||
      arrayMode == kArrayMode3dTiledThickPrt) {
    return 4;
  }
  return 1;
}
// True for the 2D/3D/PRT macro-tiled array modes; linear modes, 1D
// micro-tiled modes and unknown values are not macro tiled.
bool isMacroTiled(uint32_t arrayMode) {
  switch (arrayMode) {
  case kArrayMode2dTiledThin:
  case kArrayMode2dTiledThick:
  case kArrayMode2dTiledXThick:
  case kArrayMode3dTiledThin:
  case kArrayMode3dTiledThick:
  case kArrayMode3dTiledXThick:
  case kArrayModeTiledThinPrt:
  case kArrayModeTiledThickPrt:
  case kArrayMode2dTiledThinPrt:
  case kArrayMode2dTiledThickPrt:
  case kArrayMode3dTiledThinPrt:
  case kArrayMode3dTiledThickPrt:
    return true;
  }
  return false;
}
// True for the partially-resident-texture (PRT) array modes; all other
// modes (and unknown values) are non-PRT.
bool isPrt(uint32_t arrayMode) {
  switch (arrayMode) {
  case kArrayModeTiledThinPrt:
  case kArrayModeTiledThickPrt:
  case kArrayMode2dTiledThinPrt:
  case kArrayMode2dTiledThickPrt:
  case kArrayMode3dTiledThinPrt:
  case kArrayMode3dTiledThickPrt:
    return true;
  }
  return false;
}
// Texels packed into one stored element: BC blocks hold 4x4 texels, 1-bit
// formats pack 8 per byte, GB_GR/BG_RG share chroma across 2 texels; all
// remaining formats store one texel per element.
int getTexelsPerElement(uint32_t dfmt) {
  if (dfmt >= kDataFormatBc1 && dfmt <= kDataFormatBc7) {
    return 16;
  }
  if (dfmt == kDataFormat1 || dfmt == kDataFormat1Reversed) {
    return 8;
  }
  if (dfmt == kDataFormatGB_GR || dfmt == kDataFormatBG_RG) {
    return 2;
  }
  return 1;
}
// Storage bits per element for a data format (per element, not per texel —
// e.g. BC1 packs 16 texels at 4 bits each). Returns -1 for unknown formats.
int getBitsPerElement(uint32_t dfmt) {
  switch (dfmt) {
  case kDataFormatInvalid:
    return 0;
  case kDataFormat1:
  case kDataFormat1Reversed:
    return 1;
  case kDataFormatBc1:
  case kDataFormatBc4:
    return 4;
  case kDataFormat8:
  case kDataFormatBc2:
  case kDataFormatBc3:
  case kDataFormatBc5:
  case kDataFormatBc6:
  case kDataFormatBc7:
  case kDataFormatFmask8_S2_F1:
  case kDataFormatFmask8_S4_F1:
  case kDataFormatFmask8_S8_F1:
  case kDataFormatFmask8_S2_F2:
  case kDataFormatFmask8_S4_F2:
  case kDataFormatFmask8_S4_F4:
  case kDataFormat4_4:
    return 8;
  case kDataFormat16:
  case kDataFormat8_8:
  case kDataFormat5_6_5:
  case kDataFormat1_5_5_5:
  case kDataFormat5_5_5_1:
  case kDataFormat4_4_4_4:
  case kDataFormatGB_GR:
  case kDataFormatBG_RG:
  case kDataFormatFmask16_S16_F1:
  case kDataFormatFmask16_S8_F2:
  case kDataFormat6_5_5:
    return 16;
  case kDataFormat32:
  case kDataFormat16_16:
  case kDataFormat10_11_11:
  case kDataFormat11_11_10:
  case kDataFormat10_10_10_2:
  case kDataFormat2_10_10_10:
  case kDataFormat8_8_8_8:
  case kDataFormat8_24:
  case kDataFormat24_8:
  case kDataFormat5_9_9_9:
  case kDataFormatFmask32_S16_F2:
  case kDataFormatFmask32_S8_F4:
  case kDataFormatFmask32_S8_F8:
    return 32;
  case kDataFormat32_32:
  case kDataFormat16_16_16_16:
  case kDataFormatX24_8_32:
  case kDataFormatFmask64_S16_F4:
  case kDataFormatFmask64_S16_F8:
    return 64;
  case kDataFormat32_32_32:
    return 96;
  case kDataFormat32_32_32_32:
    return 128;
  }
  return -1;
}
// Total bits an element occupies across all texels it packs
// (per-element bits times texels per element).
int getTotalBitsPerElement(uint32_t dfmt) {
  int bits = getBitsPerElement(dfmt);
  int texels = getTexelsPerElement(dfmt);
  return bits * texels;
}
// Number of color/data components encoded by a data format.
// Returns -1 for unknown formats, 0 for the invalid format.
int getNumComponentsPerElement(uint32_t dfmt) {
  switch (dfmt) {
  case kDataFormatInvalid:
    return 0;
  case kDataFormat8:
  case kDataFormat16:
  case kDataFormat32:
  case kDataFormatBc4:
  case kDataFormat1:
  case kDataFormat1Reversed:
    return 1;
  case kDataFormat8_8:
  case kDataFormat16_16:
  case kDataFormat32_32:
  case kDataFormat8_24:
  case kDataFormat24_8:
  case kDataFormatX24_8_32:
  case kDataFormatBc5:
  case kDataFormatFmask8_S2_F1:
  case kDataFormatFmask8_S4_F1:
  case kDataFormatFmask8_S8_F1:
  case kDataFormatFmask8_S2_F2:
  case kDataFormatFmask8_S4_F2:
  case kDataFormatFmask8_S4_F4:
  case kDataFormatFmask16_S16_F1:
  case kDataFormatFmask16_S8_F2:
  case kDataFormatFmask32_S16_F2:
  case kDataFormatFmask32_S8_F4:
  case kDataFormatFmask32_S8_F8:
  case kDataFormatFmask64_S16_F4:
  case kDataFormatFmask64_S16_F8:
  case kDataFormat4_4:
    return 2;
  case kDataFormat10_11_11:
  case kDataFormat11_11_10:
  case kDataFormat32_32_32:
  case kDataFormat5_6_5:
  case kDataFormatGB_GR:
  case kDataFormatBG_RG:
  case kDataFormat5_9_9_9:
  case kDataFormatBc6:
  case kDataFormat6_5_5:
    return 3;
  case kDataFormat10_10_10_2:
  case kDataFormat2_10_10_10:
  case kDataFormat8_8_8_8:
  case kDataFormat16_16_16_16:
  case kDataFormat32_32_32_32:
  case kDataFormat1_5_5_5:
  case kDataFormat5_5_5_1:
  case kDataFormat4_4_4_4:
  case kDataFormatBc1:
  case kDataFormatBc2:
  case kDataFormatBc3:
  case kDataFormatBc7:
    return 4;
  }
  return -1;
}
// Field extractors for a packed tile-mode register value.
uint32_t tileMode_getArrayMode(uint32_t tileMode) {
  return (tileMode & 0x0000003c) >> 2; // bits [5:2] -> kArrayMode*
}
uint32_t tileMode_getPipeConfig(uint32_t tileMode) {
  return (tileMode & 0x000007c0) >> 6; // bits [10:6] -> kPipeConfig*
}
uint32_t tileMode_getTileSplit(uint32_t tileMode) {
  return (tileMode & 0x00003800) >> 11; // bits [13:11]
}
uint32_t tileMode_getMicroTileMode(uint32_t tileMode) {
  return (tileMode & 0x01c00000) >> 22; // bits [24:22] -> kMicroTileMode*
}
uint32_t tileMode_getSampleSplit(uint32_t tileMode) {
  return (tileMode & 0x06000000) >> 25; // bits [26:25]
}
// Rounds x up to the next power of two; values that are already powers of
// two are returned unchanged. Note: bit_ceil(0) yields 0 (wraps), matching
// the original bit-smearing implementation.
uint32_t bit_ceil(uint32_t x) {
  uint32_t v = x - 1;
  for (uint32_t shift = 1; shift <= 16; shift <<= 1) {
    v |= v >> shift;
  }
  return v + 1;
}
// Index of an element inside its micro tile. The x/y (and, for thick
// modes, z) coordinate bits are interleaved; the exact pattern depends on
// the micro tile mode and, for display mode, on the element size.
uint32_t getElementIndex(uvec3 pos, uint32_t bitsPerElement, uint32_t microTileMode, uint32_t arrayMode) {
  uint32_t elem = 0;
  if (microTileMode == kMicroTileModeDisplay) {
    // Display mode uses a different x/y bit order per element width.
    switch (bitsPerElement) {
    case 8:
      elem |= ((pos.x >> 0) & 0x1) << 0;
      elem |= ((pos.x >> 1) & 0x1) << 1;
      elem |= ((pos.x >> 2) & 0x1) << 2;
      elem |= ((pos.y >> 1) & 0x1) << 3;
      elem |= ((pos.y >> 0) & 0x1) << 4;
      elem |= ((pos.y >> 2) & 0x1) << 5;
      break;
    case 16:
      elem |= ((pos.x >> 0) & 0x1) << 0;
      elem |= ((pos.x >> 1) & 0x1) << 1;
      elem |= ((pos.x >> 2) & 0x1) << 2;
      elem |= ((pos.y >> 0) & 0x1) << 3;
      elem |= ((pos.y >> 1) & 0x1) << 4;
      elem |= ((pos.y >> 2) & 0x1) << 5;
      break;
    case 32:
      elem |= ((pos.x >> 0) & 0x1) << 0;
      elem |= ((pos.x >> 1) & 0x1) << 1;
      elem |= ((pos.y >> 0) & 0x1) << 2;
      elem |= ((pos.x >> 2) & 0x1) << 3;
      elem |= ((pos.y >> 1) & 0x1) << 4;
      elem |= ((pos.y >> 2) & 0x1) << 5;
      break;
    case 64:
      elem |= ((pos.x >> 0) & 0x1) << 0;
      elem |= ((pos.y >> 0) & 0x1) << 1;
      elem |= ((pos.x >> 1) & 0x1) << 2;
      elem |= ((pos.x >> 2) & 0x1) << 3;
      elem |= ((pos.y >> 1) & 0x1) << 4;
      elem |= ((pos.y >> 2) & 0x1) << 5;
      break;
    }
  } else if (microTileMode == kMicroTileModeThin ||
             microTileMode == kMicroTileModeDepth) {
    // Thin/depth modes share one x/y interleave regardless of element size.
    elem |= ((pos.x >> 0) & 0x1) << 0;
    elem |= ((pos.y >> 0) & 0x1) << 1;
    elem |= ((pos.x >> 1) & 0x1) << 2;
    elem |= ((pos.y >> 1) & 0x1) << 3;
    elem |= ((pos.x >> 2) & 0x1) << 4;
    elem |= ((pos.y >> 2) & 0x1) << 5;
    switch (arrayMode) {
    // Intentional fallthrough: xthick modes add z bit 2, then fall into the
    // thick cases which add z bits 0 and 1.
    case kArrayMode2dTiledXThick:
    case kArrayMode3dTiledXThick:
      elem |= ((pos.z >> 2) & 0x1) << 8;
    case kArrayMode1dTiledThick:
    case kArrayMode2dTiledThick:
    case kArrayMode3dTiledThick:
    case kArrayModeTiledThickPrt:
    case kArrayMode2dTiledThickPrt:
    case kArrayMode3dTiledThickPrt:
      elem |= ((pos.z >> 0) & 0x1) << 6;
      elem |= ((pos.z >> 1) & 0x1) << 7;
    default:
      break;
    }
  } else if (microTileMode == kMicroTileModeThick) {
    switch (arrayMode) {
    // Intentional fallthrough, as above: xthick adds z bit 2 first.
    case kArrayMode2dTiledXThick:
    case kArrayMode3dTiledXThick:
      elem |= ((pos.z >> 2) & 0x1) << 8;
    case kArrayMode1dTiledThick:
    case kArrayMode2dTiledThick:
    case kArrayMode3dTiledThick:
    case kArrayModeTiledThickPrt:
    case kArrayMode2dTiledThickPrt:
    case kArrayMode3dTiledThickPrt:
      // Thick mode interleaves z into the low bits; the split point depends
      // on the element width.
      if (bitsPerElement == 8 || bitsPerElement == 16) {
        elem |= ((pos.x >> 0) & 0x1) << 0;
        elem |= ((pos.y >> 0) & 0x1) << 1;
        elem |= ((pos.x >> 1) & 0x1) << 2;
        elem |= ((pos.y >> 1) & 0x1) << 3;
        elem |= ((pos.z >> 0) & 0x1) << 4;
        elem |= ((pos.z >> 1) & 0x1) << 5;
        elem |= ((pos.x >> 2) & 0x1) << 6;
        elem |= ((pos.y >> 2) & 0x1) << 7;
      } else if (bitsPerElement == 32) {
        elem |= ((pos.x >> 0) & 0x1) << 0;
        elem |= ((pos.y >> 0) & 0x1) << 1;
        elem |= ((pos.x >> 1) & 0x1) << 2;
        elem |= ((pos.z >> 0) & 0x1) << 3;
        elem |= ((pos.y >> 1) & 0x1) << 4;
        elem |= ((pos.z >> 1) & 0x1) << 5;
        elem |= ((pos.x >> 2) & 0x1) << 6;
        elem |= ((pos.y >> 2) & 0x1) << 7;
      } else if (bitsPerElement == 64 || bitsPerElement == 128) {
        elem |= ((pos.x >> 0) & 0x1) << 0;
        elem |= ((pos.y >> 0) & 0x1) << 1;
        elem |= ((pos.z >> 0) & 0x1) << 2;
        elem |= ((pos.x >> 1) & 0x1) << 3;
        elem |= ((pos.y >> 1) & 0x1) << 4;
        elem |= ((pos.z >> 1) & 0x1) << 5;
        elem |= ((pos.x >> 2) & 0x1) << 6;
        elem |= ((pos.y >> 2) & 0x1) << 7;
      }
      break;
    }
  }
  return elem;
}
// Byte offset of element (pos, fragmentIndex) in a linear surface whose
// fragments are interleaved per pixel.
// Widen the coordinate products to 64 bits before summing: with 32-bit
// arithmetic, pos.z * slicePitchElems (or pos.y * pitch) can overflow for
// large array/volume surfaces before the assignment to uint64_t happens.
uint64_t computeLinearElementByteOffset(
    uvec3 pos, uint32_t fragmentIndex, uint32_t pitch,
    uint32_t slicePitchElems, uint32_t bitsPerElement,
    uint32_t numFragmentsPerPixel) {
  uint64_t absoluteElementIndex =
      uint64_t(pos.z) * slicePitchElems + uint64_t(pos.y) * pitch + pos.x;
  return ((absoluteElementIndex * bitsPerElement * numFragmentsPerPixel) +
          (bitsPerElement * fragmentIndex)) / 8;
}
// Bit offset of element pos in a row-major layout with the given pitch and
// height (used for the linear-aligned array modes).
uint64_t computeLinearOffset(uint32_t bitsPerElement, uint height, uint pitch, uvec3 pos) {
  uint paddedHeight = height;
  uint paddedWidth = pitch;
  if (bitsPerElement == 1) {
    // 1-bit formats pack 8 texels per byte: re-express the row in bytes.
    bitsPerElement *= 8;
    paddedWidth = max((paddedWidth + 7) / 8, 1);
  }
  uint64_t tiledRowSizeBits = uint64_t(bitsPerElement) * paddedWidth;
  uint64_t tiledSliceBits = uint64_t(paddedWidth) * paddedHeight * bitsPerElement;
  return tiledSliceBits * pos.z + tiledRowSizeBits * pos.y + bitsPerElement * pos.x;
}
// Bit offset of element pos inside a 1D-tiled surface (micro tiling only,
// no pipe/bank macro tiling) of dataSize.x by dataSize.y elements.
uint64_t getTiledBitOffset1D(uint32_t tileMode, uvec3 pos, uvec2 dataSize, uint32_t bitsPerElement) {
  uint32_t arrayMode = tileMode_getArrayMode(tileMode);
  uint32_t paddedWidth = dataSize.x;
  uint32_t paddedHeight = dataSize.y;
  // 1D-thick micro tiles hold 4 z-slices; thin ones hold 1.
  int tileThickness = (arrayMode == kArrayMode1dTiledThick) ? 4 : 1;
  uint64_t tileBytes = (kMicroTileWidth * kMicroTileHeight * tileThickness * bitsPerElement + 7) / 8;
  uint32_t tilesPerRow = paddedWidth / kMicroTileWidth;
  uint32_t tilesPerSlice = max(tilesPerRow * (paddedHeight / kMicroTileHeight), 1);
  // Position of the element within its 8x8(xN) micro tile.
  uint64_t elementIndex = getElementIndex(pos, bitsPerElement,
                                          tileMode_getMicroTileMode(tileMode), arrayMode);
  uint64_t sliceOffset = (pos.z / tileThickness) * tilesPerSlice * tileBytes;
  uint64_t tileRowIndex = pos.y / kMicroTileHeight;
  uint64_t tileColumnIndex = pos.x / kMicroTileWidth;
  uint64_t tileOffset =
      (tileRowIndex * tilesPerRow + tileColumnIndex) * tileBytes;
  uint64_t elementOffset = elementIndex * bitsPerElement;
  return (sliceOffset + tileOffset) * 8 + elementOffset;
}
// Per-dispatch parameters shared by the tiler shaders. Addresses are raw
// 64-bit device pointers dereferenced through buffer_reference casts.
layout(binding=0) uniform Config {
  uint64_t srcAddress;        // copy source (tiled or linear, per shader)
  uint64_t dstAddress;        // copy destination
  uvec2 dataSize;             // surface extent in elements (x also used as pitch)
  uint32_t tileMode;          // packed tile-mode register value
  uint32_t numFragments;      // log2 of fragments per pixel (shaders use 1 << numFragments)
  uint32_t bitsPerElement;    // element size in bits
  uint32_t tiledSurfaceSize;  // bytes per tiled slice; 0 -> z addressed directly
  uint32_t linearSurfaceSize; // bytes per linear slice
} config;

View file

@ -0,0 +1,76 @@
#version 460
#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
#extension GL_EXT_shader_atomic_int64 : enable
#extension GL_EXT_shader_atomic_float : enable
#extension GL_EXT_shader_image_load_formatted : enable
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_EXT_shared_memory_block : enable
#extension GL_EXT_scalar_block_layout : enable
#extension GL_EXT_null_initializer : enable
#extension GL_EXT_buffer_reference2 : enable
#extension GL_EXT_buffer_reference_uvec2 : enable
#include "tiler.glsl"
// Tiles one element per invocation: reads the element at
// gl_GlobalInvocationID from the row-major linear buffer at config.srcAddress
// and stores it at its 1D-(micro)tiled position at config.dstAddress.
void main() {
  uvec3 pos = gl_GlobalInvocationID;
  uint64_t tiledSliceOffset = 0;
  uint64_t linearSliceOffset = 0;
  // A non-zero per-slice size means z selects an array slice whose layout
  // repeats: turn z into explicit byte offsets and address within slice 0.
  // Widen to 64 bits first so sliceIndex * sliceSize cannot overflow 32 bits
  // on large surfaces.
  if (config.tiledSurfaceSize != 0) {
    tiledSliceOffset = uint64_t(pos.z) * config.tiledSurfaceSize;
    linearSliceOffset = uint64_t(pos.z) * config.linearSurfaceSize;
    pos.z = 0;
  }
  uint64_t tiledByteOffset = getTiledBitOffset1D(
    config.tileMode,
    pos,
    config.dataSize,
    config.bitsPerElement
  ) / 8;
  tiledByteOffset += tiledSliceOffset;
  uint64_t linearByteOffset = computeLinearElementByteOffset(
    pos,
    0,
    config.dataSize.x,
    config.dataSize.x * config.dataSize.y,
    config.bitsPerElement,
    1 << config.numFragments // numFragments is log2(fragment count)
  );
  linearByteOffset += linearSliceOffset;
  // Copy one element of (bitsPerElement+7)/8 bytes through buffer_reference
  // pointers; elements wider than 8 bytes move as multiple 64-bit words.
  switch ((config.bitsPerElement + 7) / 8) {
  case 1:
    buffer_reference_uint8_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint8_t(config.srcAddress + linearByteOffset).data;
    break;
  case 2:
    buffer_reference_uint16_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint16_t(config.srcAddress + linearByteOffset).data;
    break;
  case 4:
    buffer_reference_uint32_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint32_t(config.srcAddress + linearByteOffset).data;
    break;
  case 8:
    buffer_reference_uint64_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset).data;
    break;
  case 16:
    buffer_reference_uint64_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset).data;
    buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 8).data;
    break;
  case 32:
    buffer_reference_uint64_t(config.dstAddress + tiledByteOffset).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset).data;
    buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 8).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 8).data;
    buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 16).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 16).data;
    buffer_reference_uint64_t(config.dstAddress + tiledByteOffset + 24).data = buffer_reference_uint64_t(config.srcAddress + linearByteOffset + 24).data;
    break;
  }
}

View file

@ -0,0 +1,76 @@
#version 460

#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
#extension GL_EXT_shader_atomic_int64 : enable
#extension GL_EXT_shader_atomic_float : enable
#extension GL_EXT_shader_image_load_formatted : enable
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_EXT_shared_memory_block : enable
#extension GL_EXT_scalar_block_layout : enable
#extension GL_EXT_null_initializer : enable
#extension GL_EXT_buffer_reference2 : enable
#extension GL_EXT_buffer_reference_uvec2 : enable

#include "tiler.glsl"

// One invocation per element: reads the element at the linear (row-major)
// position from the source buffer and stores it at its 1D-tiled position in
// the destination buffer.
void main() {
  uvec3 coord = gl_GlobalInvocationID;

  // For multi-slice surfaces each z layer is an independent slice; fold the
  // per-slice byte offsets out and address the remaining plane at z == 0.
  uint64_t tiledBase = 0;
  uint64_t linearBase = 0;
  if (config.tiledSurfaceSize != 0) {
    tiledBase = coord.z * config.tiledSurfaceSize;
    linearBase = coord.z * config.linearSurfaceSize;
    coord.z = 0;
  }

  uint64_t dst = config.dstAddress + tiledBase +
                 getTiledBitOffset1D(config.tileMode, coord, config.dataSize,
                                     config.bitsPerElement) / 8;
  uint64_t src = config.srcAddress + linearBase +
                 computeLinearElementByteOffset(
                     coord, 0, config.dataSize.x,
                     config.dataSize.x * config.dataSize.y,
                     config.bitsPerElement, 1 << config.numFragments);

  // Copy exactly one element; the copy width follows the element byte size.
  switch ((config.bitsPerElement + 7) / 8) {
  case 1:
    buffer_reference_uint8_t(dst).data = buffer_reference_uint8_t(src).data;
    break;
  case 2:
    buffer_reference_uint16_t(dst).data = buffer_reference_uint16_t(src).data;
    break;
  case 4:
    buffer_reference_uint32_t(dst).data = buffer_reference_uint32_t(src).data;
    break;
  case 8:
    buffer_reference_uint64_t(dst).data = buffer_reference_uint64_t(src).data;
    break;
  case 16:
    buffer_reference_uint64_t(dst).data = buffer_reference_uint64_t(src).data;
    buffer_reference_uint64_t(dst + 8).data =
        buffer_reference_uint64_t(src + 8).data;
    break;
  case 32:
    buffer_reference_uint64_t(dst).data = buffer_reference_uint64_t(src).data;
    buffer_reference_uint64_t(dst + 8).data =
        buffer_reference_uint64_t(src + 8).data;
    buffer_reference_uint64_t(dst + 16).data =
        buffer_reference_uint64_t(src + 16).data;
    buffer_reference_uint64_t(dst + 24).data =
        buffer_reference_uint64_t(src + 24).data;
    break;
  }
}

View file

@ -0,0 +1,76 @@
#version 460

#extension GL_GOOGLE_include_directive : enable
#extension GL_EXT_shader_explicit_arithmetic_types : enable
#extension GL_EXT_shader_atomic_int64 : enable
#extension GL_EXT_shader_atomic_float : enable
#extension GL_EXT_shader_image_load_formatted : enable
#extension GL_KHR_memory_scope_semantics : enable
#extension GL_EXT_shared_memory_block : enable
#extension GL_EXT_scalar_block_layout : enable
#extension GL_EXT_null_initializer : enable
#extension GL_EXT_buffer_reference2 : enable
#extension GL_EXT_buffer_reference_uvec2 : enable

#include "tiler.glsl"

// One invocation per element: copies from the tightly packed linear source
// into the (linearly addressed, possibly padded-pitch) destination surface.
void main() {
  uvec3 coord = gl_GlobalInvocationID;

  // Multi-slice surfaces: separate the per-slice byte offsets, then address
  // the rest of the element at z == 0.
  uint64_t tiledBase = 0;
  uint64_t linearBase = 0;
  if (config.tiledSurfaceSize != 0) {
    tiledBase = coord.z * config.tiledSurfaceSize;
    linearBase = coord.z * config.linearSurfaceSize;
    coord.z = 0;
  }

  uint64_t dst = config.dstAddress + tiledBase +
                 computeLinearOffset(config.bitsPerElement, config.dataSize.y,
                                     config.dataSize.x, coord) / 8;
  uint64_t src = config.srcAddress + linearBase +
                 computeLinearElementByteOffset(
                     coord, 0, config.dataSize.x,
                     config.dataSize.x * config.dataSize.y,
                     config.bitsPerElement, 1 << config.numFragments);

  // Copy exactly one element; the copy width follows the element byte size.
  switch ((config.bitsPerElement + 7) / 8) {
  case 1:
    buffer_reference_uint8_t(dst).data = buffer_reference_uint8_t(src).data;
    break;
  case 2:
    buffer_reference_uint16_t(dst).data = buffer_reference_uint16_t(src).data;
    break;
  case 4:
    buffer_reference_uint32_t(dst).data = buffer_reference_uint32_t(src).data;
    break;
  case 8:
    buffer_reference_uint64_t(dst).data = buffer_reference_uint64_t(src).data;
    break;
  case 16:
    buffer_reference_uint64_t(dst).data = buffer_reference_uint64_t(src).data;
    buffer_reference_uint64_t(dst + 8).data =
        buffer_reference_uint64_t(src + 8).data;
    break;
  case 32:
    buffer_reference_uint64_t(dst).data = buffer_reference_uint64_t(src).data;
    buffer_reference_uint64_t(dst + 8).data =
        buffer_reference_uint64_t(src + 8).data;
    buffer_reference_uint64_t(dst + 16).data =
        buffer_reference_uint64_t(src + 16).data;
    buffer_reference_uint64_t(dst + 24).data =
        buffer_reference_uint64_t(src + 24).data;
    break;
  }
}

View file

@ -0,0 +1,387 @@
#include "gnm/constants.hpp"
#include <amdgpu/tiler.hpp>
#include <gnm/gnm.hpp>
#include <bit>
using namespace amdgpu;
// Computes the memory layout of a 1D-(micro)tiled surface: per-mip data
// dimensions, per-mip tiled/linear byte sizes, per-mip offsets, and the
// total size across all array slices. All sizes/offsets are in bytes.
static constexpr SurfaceInfo
computeTexture1dInfo(ArrayMode arrayMode, gnm::TextureType type,
                     gnm::DataFormat dfmt, std::uint32_t width,
                     std::uint32_t height, std::uint32_t depth,
                     std::uint32_t pitch, int baseArrayLayer, int arrayCount,
                     int baseMipLevel, int mipCount, bool pow2pad) {
  bool isCubemap = type == gnm::TextureType::Cube;
  bool isVolume = type == gnm::TextureType::Dim3D;
  auto bitsPerFragment = getBitsPerElement(dfmt);

  // For non-volume textures the incoming `depth` carries the array layer
  // count; a cubemap stores six faces per layer, a volume has exactly one
  // array slice.
  std::uint32_t arraySliceCount = depth;
  if (isCubemap) {
    arraySliceCount *= 6;
  } else if (isVolume) {
    arraySliceCount = 1;
  }

  // NOTE(review): for MSAA types log2(fragments per pixel) is derived from
  // the array range (baseArrayLayer + arrayCount - 1) -- confirm this
  // matches the T# encoding used by callers.
  int numFragments = (type == gnm::TextureType::Msaa2D ||
                      type == gnm::TextureType::MsaaArray2D)
                         ? (baseArrayLayer + arrayCount - 1)
                         : 0;
  auto numFragmentsPerPixel = 1 << numFragments;
  auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
  auto bitsPerElement = bitsPerFragment;
  // From here on `depth` is only the z extent of a volume (1 otherwise).
  depth = isVolume ? depth : 1;

  // Block-compressed formats are addressed per element (one compressed
  // block), not per texel: scale the element size up accordingly. 1-bpp
  // packs 8 texels per byte; 4/8-bpp BC formats pack 16 texels per block.
  if (isBlockCompressed) {
    switch (bitsPerFragment) {
    case 1:
      bitsPerElement *= 8;
      break;
    case 4:
    case 8:
      bitsPerElement *= 16;
      break;
    case 16:
      // 16-bpp block-compressed formats are not supported here.
      std::abort();
      break;
    default:
      std::abort();
      break;
    }
  }

  if (pow2pad) {
    arraySliceCount = std::bit_ceil(arraySliceCount);
  }

  std::uint64_t surfaceOffset = 0; // running byte offset of the current mip
  std::uint64_t surfaceSize = 0;   // tiled byte size of one slice of the mip
  SurfaceInfo result;
  result.width = width;
  result.height = height;
  result.depth = depth;
  result.pitch = pitch;
  result.numFragments = numFragments;
  result.bitsPerElement = bitsPerElement;
  result.arrayLayerCount = arraySliceCount;

  // Micro tile thickness: number of z slices interleaved within one micro
  // tile for this array mode.
  auto thickness = getMicroTileThickness(arrayMode);

  for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) {
    std::uint32_t elemWidth = std::max<std::uint64_t>(width >> mipLevel, 1);
    std::uint32_t elemPitch = std::max<std::uint64_t>(pitch >> mipLevel, 1);
    std::uint32_t elemHeight = std::max<std::uint64_t>(height >> mipLevel, 1);
    std::uint32_t elemDepth = std::max<std::uint64_t>(depth >> mipLevel, 1);

    std::uint32_t linearPitch = elemPitch;
    std::uint32_t linearWidth = elemWidth;
    std::uint32_t linearHeight = elemHeight;
    std::uint32_t linearDepth = elemDepth;

    // Convert texel dimensions into element (compressed block) dimensions.
    if (isBlockCompressed) {
      switch (bitsPerFragment) {
      case 1:
        linearWidth = std::max<std::uint64_t>((linearWidth + 7) / 8, 1);
        linearPitch = std::max<std::uint64_t>((linearPitch + 7) / 8, 1);
        break;
      case 4:
      case 8:
        linearWidth = std::max<std::uint64_t>((linearWidth + 3) / 4, 1);
        linearPitch = std::max<std::uint64_t>((linearPitch + 3) / 4, 1);
        linearHeight = std::max<std::uint64_t>((linearHeight + 3) / 4, 1);
        break;
      case 16:
        std::abort();
        break;
      default:
        std::abort();
        break;
      }
    }

    if (pow2pad) {
      linearPitch = std::bit_ceil(linearPitch);
      linearWidth = std::bit_ceil(linearWidth);
      linearHeight = std::bit_ceil(linearHeight);
      linearDepth = std::bit_ceil(linearDepth);
    }

    // An explicit pitch only constrains the base level; non-base mips are
    // packed to their own width.
    if (mipLevel > 0 && pitch > 0) {
      linearPitch = linearWidth;
    }

    // Pad pitch/height to whole micro tiles.
    std::uint32_t paddedPitch =
        (linearPitch + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1);
    std::uint32_t paddedHeight =
        (linearHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1);
    std::uint32_t paddedDepth = linearDepth;
    // NOTE(review): depth padding is skipped only for a cubemap's base
    // level (depth == 1 there) -- confirm this condition against the
    // reference tiler.
    if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) {
      if (isCubemap) {
        linearDepth = std::bit_ceil(linearDepth);
      }
      paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1);
    }

    // Grow the pitch in micro-tile steps until one physical slice is a
    // multiple of the pipe interleave size.
    std::uint32_t tempPitch = paddedPitch;
    std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) *
                                          paddedHeight * bitsPerElement *
                                          numFragmentsPerPixel;
    logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
    uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
    while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) {
      tempPitch += kMicroTileWidth;
      logicalSliceSizeBytes = std::uint64_t(tempPitch) * paddedHeight *
                              bitsPerElement * numFragmentsPerPixel;
      logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
      physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
    }
    surfaceSize = logicalSliceSizeBytes * paddedDepth;

    // Unpadded size of this mip -- the byte count a tightly packed linear
    // copy of it occupies.
    auto linearSize =
        linearDepth *
        (linearPitch * linearHeight * bitsPerElement * numFragmentsPerPixel +
         7) /
        8;

    result.setSubresourceInfo(mipLevel, {
                                            .dataWidth = linearPitch,
                                            .dataHeight = linearHeight,
                                            .dataDepth = linearDepth,
                                            .offset = surfaceOffset,
                                            .tiledSize = surfaceSize,
                                            .linearSize = linearSize,
                                        });

    // All array slices of a mip are stored contiguously before the next mip.
    surfaceOffset += arraySliceCount * surfaceSize;
  }

  result.totalSize = surfaceOffset;
  return result;
}
// Computes the memory layout of a linearly addressed surface.
// LinearGeneral uses the caller's pitch exactly as given; LinearAligned
// pads the pitch and slice size to the hardware alignment requirements.
static constexpr SurfaceInfo computeTextureLinearInfo(
    ArrayMode arrayMode, gnm::TextureType type, gnm::DataFormat dfmt,
    std::uint32_t width, std::uint32_t height, std::uint32_t depth,
    std::uint32_t pitch, int baseArrayLayer, int arrayCount, int baseMipLevel,
    int mipCount, bool pow2pad) {
  bool isCubemap = type == gnm::TextureType::Cube;
  bool isVolume = type == gnm::TextureType::Dim3D;
  auto bitsPerFragment = getBitsPerElement(dfmt);

  // Non-volume `depth` is the array layer count (x6 faces for cubemaps).
  std::uint32_t arraySliceCount = depth;
  if (isCubemap) {
    arraySliceCount *= 6;
  } else if (isVolume) {
    arraySliceCount = 1;
  }

  // NOTE(review): MSAA log2(fragment count) derived from the array range --
  // confirm against the T# encoding (same scheme as computeTexture1dInfo).
  int numFragments = (type == gnm::TextureType::Msaa2D ||
                      type == gnm::TextureType::MsaaArray2D)
                         ? (baseArrayLayer + arrayCount - 1)
                         : 0;
  auto numFragmentsPerPixel = 1 << numFragments;
  auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
  auto bitsPerElement = bitsPerFragment;
  // From here on `depth` is only the volume z extent (1 otherwise).
  depth = isVolume ? depth : 1;

  // Block-compressed formats address whole blocks: scale the element size.
  if (isBlockCompressed) {
    switch (bitsPerFragment) {
    case 1:
      bitsPerElement *= 8;
      break;
    case 4:
    case 8:
      bitsPerElement *= 16;
      break;
    case 16:
      // 16-bpp block-compressed formats are not supported here.
      std::abort();
      break;
    default:
      std::abort();
      break;
    }
  }

  if (pow2pad) {
    arraySliceCount = std::bit_ceil(arraySliceCount);
  }

  std::uint64_t surfaceOffset = 0; // running byte offset of the current mip
  std::uint64_t surfaceSize = 0;   // byte size of one slice of the mip
  SurfaceInfo result;
  result.width = width;
  result.height = height;
  result.depth = depth;
  result.pitch = pitch;
  result.numFragments = numFragments;
  result.bitsPerElement = bitsPerElement;
  result.arrayLayerCount = arraySliceCount;

  for (int mipLevel = 0; mipLevel < baseMipLevel + mipCount; mipLevel++) {
    std::uint32_t elemWidth = std::max<std::uint64_t>(width >> mipLevel, 1);
    std::uint32_t elemPitch = std::max<std::uint64_t>(pitch >> mipLevel, 1);
    std::uint32_t elemHeight = std::max<std::uint64_t>(height >> mipLevel, 1);
    std::uint32_t elemDepth = std::max<std::uint64_t>(depth >> mipLevel, 1);

    std::uint32_t linearPitch = elemPitch;
    std::uint32_t linearWidth = elemWidth;
    std::uint32_t linearHeight = elemHeight;
    std::uint32_t linearDepth = elemDepth;

    // Texel dimensions -> element (compressed block) dimensions.
    if (isBlockCompressed) {
      switch (bitsPerFragment) {
      case 1:
        linearWidth = std::max<std::uint64_t>((linearWidth + 7) / 8, 1);
        linearPitch = std::max<std::uint64_t>((linearPitch + 7) / 8, 1);
        break;
      case 4:
      case 8:
        linearWidth = std::max<std::uint64_t>((linearWidth + 3) / 4, 1);
        linearPitch = std::max<std::uint64_t>((linearPitch + 3) / 4, 1);
        linearHeight = std::max<std::uint64_t>((linearHeight + 3) / 4, 1);
        break;
      case 16:
        std::abort();
        break;
      default:
        std::abort();
        break;
      }
    }

    if (pow2pad) {
      linearPitch = std::bit_ceil(linearPitch);
      linearWidth = std::bit_ceil(linearWidth);
      linearHeight = std::bit_ceil(linearHeight);
      linearDepth = std::bit_ceil(linearDepth);
    }

    // Explicit pitch only applies to the base level.
    if (mipLevel > 0 && pitch > 0) {
      linearPitch = linearWidth;
    }

    if (arrayMode == kArrayModeLinearGeneral) {
      // LinearGeneral: no alignment at all, size is exactly
      // pitch * height * depth elements.
      surfaceSize = (static_cast<uint64_t>(linearPitch) *
                         (linearHeight) * bitsPerElement * numFragmentsPerPixel +
                     7) /
                    8;
      surfaceSize *= linearDepth;
      result.setSubresourceInfo(mipLevel, {
                                              .dataWidth = linearPitch,
                                              .dataHeight = linearHeight,
                                              .dataDepth = linearDepth,
                                              .offset = surfaceOffset,
                                              .tiledSize = surfaceSize,
                                              .linearSize = surfaceSize,
                                          });
    } else {
      // Redundant: already applied above before the branch; harmless.
      if (mipLevel > 0 && pitch > 0) {
        linearPitch = linearWidth;
      }
      // LinearAligned: pitch aligned to 8 elements or 64 bytes, whichever
      // is larger in elements.
      auto pitchAlign = std::max(8UL, 64UL / ((bitsPerElement + 7) / 8UL));
      std::uint32_t paddedPitch =
          (linearPitch + pitchAlign - 1) & ~(pitchAlign - 1);
      std::uint32_t paddedHeight = linearHeight;
      std::uint32_t paddedDepth = linearDepth;
      // NOTE(review): same cubemap depth-padding exception as in
      // computeTexture1dInfo -- confirm intent.
      if (!isCubemap || (mipLevel > 0 && linearDepth > 1)) {
        if (isCubemap) {
          linearDepth = std::bit_ceil(linearDepth);
        }
        auto thickness = getMicroTileThickness(arrayMode);
        paddedDepth = (linearDepth + thickness - 1) & ~(thickness - 1);
      }
      // Grow the pitch until a slice holds a whole number of pipe
      // interleaves (at least 64 pixels).
      std::uint32_t pixelsPerPipeInterleave =
          kPipeInterleaveBytes / ((bitsPerElement + 7) / 8);
      std::uint32_t sliceAlignInPixel =
          pixelsPerPipeInterleave < 64 ? 64 : pixelsPerPipeInterleave;
      auto pixelsPerSlice = static_cast<uint64_t>(paddedPitch) * paddedHeight *
                            numFragmentsPerPixel;
      while (pixelsPerSlice % sliceAlignInPixel) {
        paddedPitch += pitchAlign;
        pixelsPerSlice = static_cast<uint64_t>(paddedPitch) * paddedHeight *
                         numFragmentsPerPixel;
      }
      surfaceSize = (pixelsPerSlice * bitsPerElement + 7) / 8 * paddedDepth;
      result.setSubresourceInfo(mipLevel, {
                                              .dataWidth = paddedPitch,
                                              .dataHeight = paddedHeight,
                                              .dataDepth = paddedDepth,
                                              .offset = surfaceOffset,
                                              .tiledSize = surfaceSize,
                                              .linearSize = surfaceSize,
                                          });
    }

    // All array slices of a mip precede the next mip.
    surfaceOffset += arraySliceCount * surfaceSize;
  }

  result.totalSize = surfaceOffset;
  return result;
}
// Computes the memory layout of a surface by dispatching on its array mode.
// Linear and 1D-tiled layouts are implemented; 2D/3D/PRT macro-tiled
// layouts are not supported yet and abort.
SurfaceInfo amdgpu::computeSurfaceInfo(
    TileMode tileMode, gnm::TextureType type, gnm::DataFormat dfmt,
    std::uint32_t width, std::uint32_t height, std::uint32_t depth,
    std::uint32_t pitch, int baseArrayLayer, int arrayCount, int baseMipLevel,
    int mipCount, bool pow2pad) {
  auto const mode = tileMode.arrayMode();

  if (mode == kArrayModeLinearGeneral || mode == kArrayModeLinearAligned) {
    return computeTextureLinearInfo(mode, type, dfmt, width, height, depth,
                                    pitch, baseArrayLayer, arrayCount,
                                    baseMipLevel, mipCount, pow2pad);
  }

  if (mode == kArrayMode1dTiledThin || mode == kArrayMode1dTiledThick) {
    return computeTexture1dInfo(mode, type, dfmt, width, height, depth, pitch,
                                baseArrayLayer, arrayCount, baseMipLevel,
                                mipCount, pow2pad);
  }

  // 2D/3D tiled and PRT modes: unimplemented.
  std::abort();
}
// Convenience overload: unpacks a T# image descriptor into explicit
// parameters. Dimension and range fields in the descriptor are stored
// biased by one.
SurfaceInfo amdgpu::computeSurfaceInfo(const gnm::TBuffer &tbuffer,
                                       TileMode tileMode) {
  auto const arrayCount = tbuffer.last_array - tbuffer.base_array + 1;
  auto const mipCount = tbuffer.last_level - tbuffer.base_level + 1;

  return computeSurfaceInfo(tileMode, tbuffer.type, tbuffer.dfmt,
                            tbuffer.width + 1, tbuffer.height + 1,
                            tbuffer.depth + 1, tbuffer.pitch + 1,
                            tbuffer.base_array, arrayCount,
                            tbuffer.base_level, mipCount,
                            tbuffer.pow2pad != 0);
}

View file

@ -0,0 +1,441 @@
#include "amdgpu/tiler_cpu.hpp"
#include "amdgpu/tiler.hpp"
#include "gnm/gnm.hpp"
// Computes the BIT offset of texel (x, y, z) within a 1D-(micro)tiled
// resource, including the offset of the requested mip level and array
// slice. Note the result is in bits (see the *8 in the return); callers
// divide by 8 for a byte offset.
constexpr std::uint64_t
getTiledOffset1D(gnm::TextureType texType, bool isPow2Padded,
                 gnm::DataFormat dfmt, amdgpu::TileMode tileMode, int mipLevel,
                 int arraySlice, int numFragments, int width, int height,
                 int depth, int pitch, int x, int y, int z) {
  using namespace amdgpu;
  bool isCubemap = texType == gnm::TextureType::Cube;
  bool isVolume = texType == gnm::TextureType::Dim3D;
  auto bitsPerFragment = getBitsPerElement(dfmt);

  // Non-volume `depth` is the array layer count (x6 faces for cubemaps).
  uint32_t arraySliceCount = depth;
  if (isCubemap) {
    arraySliceCount *= 6;
  } else if (isVolume) {
    arraySliceCount = 1;
  }
  auto numFragmentsPerPixel = 1 << numFragments;
  auto isBlockCompressed = getTexelsPerElement(dfmt) > 1;
  auto arrayMode = tileMode.arrayMode();
  auto bitsPerElement = bitsPerFragment;
  // Dimensions of the requested mip level; an explicit pitch replaces the
  // width for non-base mips.
  auto paddedWidth = std::max((mipLevel != 0 ? pitch : width) >> mipLevel, 1);
  auto paddedHeight = std::max(height >> mipLevel, 1);
  // Thick 1D tiling interleaves 4 z slices per micro tile.
  auto tileThickness = (arrayMode == amdgpu::kArrayMode1dTiledThick) ? 4 : 1;

  // Block-compressed formats are addressed per block element.
  if (isBlockCompressed) {
    switch (bitsPerFragment) {
    case 1:
      bitsPerElement *= 8;
      paddedWidth = std::max((paddedWidth + 7) / 8, 1);
      break;
    case 4:
    case 8:
      bitsPerElement *= 16;
      paddedWidth = std::max((paddedWidth + 3) / 4, 1);
      paddedHeight = std::max((paddedHeight + 3) / 4, 1);
      break;
    case 16:
      // 16-bpp block-compressed formats are not supported here.
      std::abort();
      break;
    default:
      std::abort();
      break;
    }
  }

  if (isPow2Padded) {
    arraySliceCount = std::bit_ceil(arraySliceCount);
    paddedWidth = std::bit_ceil(unsigned(paddedWidth));
    paddedHeight = std::bit_ceil(unsigned(paddedHeight));
  }

  // Walk the mip chain up to the requested level, accumulating the offsets
  // of all preceding mips (each mip stores all its array slices first).
  // This mirrors the size computation in computeTexture1dInfo.
  uint64_t finalSurfaceOffset = 0;
  uint64_t finalSurfaceSize = 0;
  auto thickness = getMicroTileThickness(arrayMode);
  for (int i = 0; i <= mipLevel; i++) {
    finalSurfaceOffset += arraySliceCount * finalSurfaceSize;
    std::uint32_t elemWidth =
        std::max<std::uint64_t>((i > 0 ? pitch : width) >> i, 1);
    std::uint32_t elemHeight = std::max<std::uint64_t>(height >> i, 1);
    std::uint32_t elemDepth =
        std::max<std::uint64_t>((isVolume ? depth : 1) >> i, 1);

    if (isBlockCompressed) {
      switch (bitsPerFragment) {
      case 1:
        elemWidth = std::max<std::uint64_t>((elemWidth + 7) / 8, 1);
        break;
      case 4:
      case 8:
        elemWidth = std::max<std::uint64_t>((elemWidth + 3) / 4, 1);
        elemHeight = std::max<std::uint64_t>((elemHeight + 3) / 4, 1);
        break;
      case 16:
        std::abort();
        break;
      default:
        std::abort();
        break;
      }
    }
    if (isPow2Padded) {
      elemWidth = std::bit_ceil(elemWidth);
      elemHeight = std::bit_ceil(elemHeight);
      elemDepth = std::bit_ceil(elemDepth);
    }

    // Pad to whole micro tiles in x/y and to the tile thickness in z.
    elemWidth = (elemWidth + kMicroTileWidth - 1) & ~(kMicroTileWidth - 1);
    elemHeight = (elemHeight + kMicroTileHeight - 1) & ~(kMicroTileHeight - 1);
    elemDepth = (elemDepth + thickness - 1) & ~(thickness - 1);

    // Grow the pitch until one physical slice is pipe-interleave aligned.
    std::uint32_t tempPitch = elemWidth;
    std::uint64_t logicalSliceSizeBytes = std::uint64_t(tempPitch) *
                                          elemHeight * bitsPerElement *
                                          numFragmentsPerPixel;
    logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
    uint64_t physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
    while ((physicalSliceSizeBytes % kPipeInterleaveBytes) != 0) {
      tempPitch += 8;
      logicalSliceSizeBytes = std::uint64_t(tempPitch) * elemHeight *
                              bitsPerElement * numFragmentsPerPixel;
      logicalSliceSizeBytes = (logicalSliceSizeBytes + 7) / 8;
      physicalSliceSizeBytes = logicalSliceSizeBytes * thickness;
    }
    finalSurfaceSize = logicalSliceSizeBytes * elemDepth;
  }
  // Skip the preceding array slices of the requested mip.
  finalSurfaceOffset += finalSurfaceSize * (uint64_t)arraySlice;

  // Address within the mip: micro tiles are laid out row-major; the element
  // order inside a tile comes from getElementIndex.
  auto tileBytes =
      (kMicroTileWidth * kMicroTileHeight * tileThickness * bitsPerElement +
       7) /
      8;
  auto tilesPerRow = paddedWidth / kMicroTileWidth;
  auto tilesPerSlice =
      std::max(tilesPerRow * (paddedHeight / kMicroTileHeight), 1U);
  uint64_t elementIndex = getElementIndex(x, y, z, bitsPerElement,
                                          tileMode.microTileMode(), arrayMode);
  uint64_t sliceOffset = (z / tileThickness) * tilesPerSlice * tileBytes;
  uint64_t tileRowIndex = y / kMicroTileHeight;
  uint64_t tileColumnIndex = x / kMicroTileWidth;
  uint64_t tileOffset =
      (tileRowIndex * tilesPerRow + tileColumnIndex) * tileBytes;
  uint64_t elementOffset = elementIndex * bitsPerElement;
  uint64_t finalOffset = (sliceOffset + tileOffset) * 8 + elementOffset;
  // Bit offset: byte offsets scaled by 8 plus the in-tile bit offset.
  return finalOffset + finalSurfaceOffset * 8;
}
// Computes the BIT offset of texel (x, y, z) in a linear (row-major)
// surface: slice-major, then row, then column.
constexpr std::uint64_t getTiledOffsetLinear(gnm::DataFormat dfmt, int height,
                                             int pitch, int x, int y, int z) {
  auto bitsPerElement = getBitsPerElement(dfmt);
  auto rowElements = pitch;

  // 1-bpp formats pack 8 texels into a single byte-sized element.
  if (getBitsPerElement(dfmt) == 1) {
    bitsPerElement *= 8;
    rowElements = std::max((rowElements + 7) / 8, 1);
  }

  uint64_t rowBits = bitsPerElement * rowElements;
  uint64_t sliceBits = rowElements * height * bitsPerElement;
  return sliceBits * z + rowBits * y + bitsPerElement * x;
}
// Computes the BIT offset of texel (x, y, z) / fragment `fragmentIndex` in
// a 2D/3D macro-tiled surface, applying the GCN bank/pipe swizzle. The
// final address is assembled as
//   [offset | bank | pipe | pipe-interleave] bytes, plus a sub-byte bit
// offset.
// NOTE(review): mipLevel, isPow2Padded, width and depth are currently
// unused here -- only the base level appears to be addressed; confirm
// whether mip support is still TODO.
constexpr std::uint64_t
getTiledOffset2D(gnm::TextureType texType, bool isPow2Padded,
                 gnm::DataFormat dfmt, amdgpu::TileMode tileMode,
                 amdgpu::MacroTileMode macroTileMode, int mipLevel,
                 int arraySlice, int numFragments, int width, int height,
                 int depth, int pitch, int x, int y, int z, int fragmentIndex) {
  using namespace amdgpu;
  bool isCubemap = texType == gnm::TextureType::Cube;
  bool isVolume = texType == gnm::TextureType::Dim3D;
  auto m_bitsPerFragment = getBitsPerElement(dfmt);
  auto m_isBlockCompressed = getTexelsPerElement(dfmt) > 1;
  // Surface tile swizzle; always zero here (no BASE_256B swizzle applied).
  auto tileSwizzleMask = 0;
  auto numFragmentsPerPixel = 1 << numFragments;
  auto arrayMode = tileMode.arrayMode();

  // Micro tile thickness (z slices per tile): thin = 1, thick = 4,
  // xthick = 8.
  auto tileThickness = 1;
  switch (arrayMode) {
  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayModeTiledThinPrt:
  case amdgpu::kArrayMode2dTiledThinPrt:
  case amdgpu::kArrayMode3dTiledThinPrt:
    tileThickness = 1;
    break;
  case amdgpu::kArrayMode1dTiledThick:
  case amdgpu::kArrayMode2dTiledThick:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayModeTiledThickPrt:
  case amdgpu::kArrayMode2dTiledThickPrt:
  case amdgpu::kArrayMode3dTiledThickPrt:
    tileThickness = 4;
    break;
  case amdgpu::kArrayMode2dTiledXThick:
  case amdgpu::kArrayMode3dTiledXThick:
    tileThickness = 8;
    break;
  default:
    break;
  }

  // Block-compressed formats are addressed per block element.
  auto bitsPerElement = m_bitsPerFragment;
  auto paddedWidth = pitch;
  auto paddedHeight = height;
  if (m_isBlockCompressed) {
    switch (m_bitsPerFragment) {
    case 1:
      bitsPerElement *= 8;
      paddedWidth = std::max((paddedWidth + 7) / 8, 1);
      break;
    case 4:
    case 8:
      bitsPerElement *= 16;
      paddedWidth = std::max((paddedWidth + 3) / 4, 1);
      paddedHeight = std::max((paddedHeight + 3) / 4, 1);
      break;
    case 16:
      // 16-bpp block-compressed formats are not supported here.
      std::abort();
      break;
    default:
      std::abort();
      break;
    }
  }

  // Macro tile geometry from the macro tile mode register (fields are
  // log2-encoded; numBanks is 2 << n).
  auto bankWidthHW = macroTileMode.bankWidth();
  auto bankHeightHW = macroTileMode.bankHeight();
  auto macroAspectHW = macroTileMode.macroTileAspect();
  auto numBanksHW = macroTileMode.numBanks();
  auto bankWidth = 1 << bankWidthHW;
  auto bankHeight = 1 << bankHeightHW;
  unsigned numBanks = 2 << numBanksHW;
  auto macroTileAspect = 1 << macroAspectHW;

  // Tile split: depth surfaces use the explicit TILE_SPLIT field, others
  // derive it from the sample split, clamped to the DRAM row size.
  uint32_t tileBytes1x =
      (tileThickness * bitsPerElement * kMicroTileWidth * kMicroTileHeight +
       7) /
      8;
  auto sampleSplitHw = tileMode.sampleSplit();
  auto tileSplitHw = tileMode.tileSplit();
  uint32_t sampleSplit = 1 << sampleSplitHw;
  uint32_t tileSplitC =
      (tileMode.microTileMode() == amdgpu::kMicroTileModeDepth)
          ? (64 << tileSplitHw)
          : std::max(256U, tileBytes1x * sampleSplit);
  auto tileSplitBytes = std::min(kDramRowSize, tileSplitC);

  auto numPipes = getPipeCount(tileMode.pipeConfig());
  auto pipeInterleaveBits = std::countr_zero(kPipeInterleaveBytes);
  auto pipeInterleaveMask = (1 << pipeInterleaveBits) - 1;
  auto pipeBits = std::countr_zero(numPipes);
  auto bankBits = std::countr_zero(numBanks);
  // auto pipeMask = (numPipes - 1) << pipeInterleaveBits;
  auto bankSwizzleMask = tileSwizzleMask;
  auto pipeSwizzleMask = 0;
  auto macroTileWidth =
      (kMicroTileWidth * bankWidth * numPipes) * macroTileAspect;
  auto macroTileHeight =
      (kMicroTileHeight * bankHeight * numBanks) / macroTileAspect;

  auto microTileMode = tileMode.microTileMode();
  uint64_t elementIndex =
      getElementIndex(x, y, z, bitsPerElement, microTileMode, arrayMode);

  // PRT surfaces repeat the swizzle pattern per macro tile.
  uint32_t xh = x, yh = y;
  if (arrayMode == amdgpu::kArrayModeTiledThinPrt ||
      arrayMode == amdgpu::kArrayModeTiledThickPrt) {
    xh %= macroTileWidth;
    yh %= macroTileHeight;
  }
  uint64_t pipe = getPipeIndex(xh, yh, tileMode.pipeConfig());
  uint64_t bank =
      getBankIndex(xh, yh, bankWidth, bankHeight, numBanks, numPipes);

  uint32_t tileBytes = (kMicroTileWidth * kMicroTileHeight * tileThickness *
                        bitsPerElement * numFragmentsPerPixel +
                        7) /
                       8;

  // Depth micro tiling interleaves fragments per pixel; display/other modes
  // store each fragment's plane contiguously.
  uint64_t elementOffset = 0;
  if (microTileMode == amdgpu::kMicroTileModeDepth) {
    uint64_t pixelOffset = elementIndex * bitsPerElement * numFragmentsPerPixel;
    elementOffset = pixelOffset + (fragmentIndex * bitsPerElement);
  } else {
    uint64_t fragmentOffset =
        fragmentIndex * (tileBytes / numFragmentsPerPixel) * 8;
    elementOffset = fragmentOffset + (elementIndex * bitsPerElement);
  }

  // Split oversized thin tiles across multiple slices.
  uint64_t slicesPerTile = 1;
  uint64_t tileSplitSlice = 0;
  if (tileBytes > tileSplitBytes && tileThickness == 1) {
    slicesPerTile = tileBytes / tileSplitBytes;
    tileSplitSlice = elementOffset / (tileSplitBytes * 8);
    elementOffset %= (tileSplitBytes * 8);
    tileBytes = tileSplitBytes;
  }

  // Byte offset of the containing macro tile (macro tiles are row-major;
  // each stores only 1/(numPipes*numBanks) of its bytes per channel).
  uint64_t macroTileBytes = (macroTileWidth / kMicroTileWidth) *
                            (macroTileHeight / kMicroTileHeight) * tileBytes /
                            (numPipes * numBanks);
  uint64_t macroTilesPerRow = paddedWidth / macroTileWidth;
  uint64_t macroTileRowIndex = y / macroTileHeight;
  uint64_t macroTileColumnIndex = x / macroTileWidth;
  uint64_t macroTileIndex =
      (macroTileRowIndex * macroTilesPerRow) + macroTileColumnIndex;
  uint64_t macro_tile_offset = macroTileIndex * macroTileBytes;
  uint64_t macroTilesPerSlice =
      macroTilesPerRow * (paddedHeight / macroTileHeight);
  uint64_t sliceBytes = macroTilesPerSlice * macroTileBytes;

  // NOTE(review): sliceOffset is computed from z before `slice` is replaced
  // by arraySlice; the replaced value only feeds the swizzle rotations
  // below -- confirm against the reference addrlib behavior.
  uint32_t slice = z;
  uint64_t sliceOffset =
      (tileSplitSlice + slicesPerTile * slice / tileThickness) * sliceBytes;
  if (arraySlice != 0) {
    slice = arraySlice;
  }

  // Micro tile position within its macro tile / bank.
  uint64_t tileRowIndex = (y / kMicroTileHeight) % bankHeight;
  uint64_t tileColumnIndex = ((x / kMicroTileWidth) / numPipes) % bankWidth;
  uint64_t tileIndex = (tileRowIndex * bankWidth) + tileColumnIndex;
  uint64_t tileOffset = tileIndex * tileBytes;

  // Per-slice pipe rotation (3D modes only).
  uint64_t bankSwizzle = bankSwizzleMask;
  uint64_t pipeSwizzle = pipeSwizzleMask;
  uint64_t pipeSliceRotation = 0;
  switch (arrayMode) {
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayMode3dTiledXThick:
    pipeSliceRotation =
        std::max(1UL, (numPipes / 2UL) - 1UL) * (slice / tileThickness);
    break;
  default:
    break;
  }
  pipeSwizzle += pipeSliceRotation;
  pipeSwizzle &= (numPipes - 1);
  pipe = pipe ^ pipeSwizzle;

  // Per-slice bank rotation (2D and 3D families rotate differently).
  uint32_t sliceRotation = 0;
  switch (arrayMode) {
  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayMode2dTiledThick:
  case amdgpu::kArrayMode2dTiledXThick:
    sliceRotation = ((numBanks / 2) - 1) * (slice / tileThickness);
    break;
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode3dTiledThick:
  case amdgpu::kArrayMode3dTiledXThick:
    sliceRotation = std::max(1UL, (numPipes / 2UL) - 1UL) *
                    (slice / tileThickness) / numPipes;
    break;
  default:
    break;
  }

  // Additional bank rotation for tile-split slices (thin modes).
  uint64_t tileSplitSliceRotation = 0;
  switch (arrayMode) {
  case amdgpu::kArrayMode2dTiledThin:
  case amdgpu::kArrayMode3dTiledThin:
  case amdgpu::kArrayMode2dTiledThinPrt:
  case amdgpu::kArrayMode3dTiledThinPrt:
    tileSplitSliceRotation = ((numBanks / 2) + 1) * tileSplitSlice;
    break;
  default:
    break;
  }
  bank ^= bankSwizzle + sliceRotation;
  bank ^= tileSplitSliceRotation;
  bank &= (numBanks - 1);

  // Merge pipe/bank bits back into the byte address above the pipe
  // interleave, keeping the sub-byte bit offset separate.
  uint64_t totalOffset =
      (sliceOffset + macro_tile_offset + tileOffset) * 8 + elementOffset;
  uint64_t bitOffset = totalOffset & 0x7;
  totalOffset /= 8;
  uint64_t pipeInterleaveOffset = totalOffset & pipeInterleaveMask;
  uint64_t offset = totalOffset >> pipeInterleaveBits;
  uint64_t finalByteOffset =
      pipeInterleaveOffset | (pipe << (pipeInterleaveBits)) |
      (bank << (pipeInterleaveBits + pipeBits)) |
      (offset << (pipeInterleaveBits + pipeBits + bankBits));
  return (finalByteOffset << 3) | bitOffset;
}
// Computes the BIT offset of texel (x, y, z) by routing to the addressing
// routine that matches the tile mode's array-mode family. Aborts on an
// array mode outside the known enumerators.
std::uint64_t amdgpu::getTiledOffset(gnm::TextureType texType,
                                     bool isPow2Padded, int numFragments,
                                     gnm::DataFormat dfmt,
                                     amdgpu::TileMode tileMode,
                                     amdgpu::MacroTileMode macroTileMode,
                                     int mipLevel, int arraySlice, int width,
                                     int height, int depth, int pitch, int x,
                                     int y, int z, int fragmentIndex) {
  auto const mode = tileMode.arrayMode();

  switch (mode) {
  case kArrayModeLinearGeneral:
  case kArrayModeLinearAligned:
    // Linear layouts ignore the tiling parameters entirely.
    return getTiledOffsetLinear(dfmt, height, pitch, x, y, z);

  case kArrayMode1dTiledThin:
  case kArrayMode1dTiledThick:
    // Micro-tiled only; no bank/pipe swizzle.
    return getTiledOffset1D(texType, isPow2Padded, dfmt, tileMode, mipLevel,
                            arraySlice, numFragments, width, height, depth,
                            pitch, x, y, z);

  case kArrayMode2dTiledThin:
  case kArrayMode2dTiledThick:
  case kArrayMode2dTiledXThick:
  case kArrayMode3dTiledThin:
  case kArrayMode3dTiledThick:
  case kArrayMode3dTiledXThick:
  case kArrayModeTiledThinPrt:
  case kArrayModeTiledThickPrt:
  case kArrayMode2dTiledThinPrt:
  case kArrayMode2dTiledThickPrt:
  case kArrayMode3dTiledThinPrt:
  case kArrayMode3dTiledThickPrt:
    // Macro-tiled: bank/pipe swizzled addressing.
    return getTiledOffset2D(texType, isPow2Padded, dfmt, tileMode,
                            macroTileMode, mipLevel, arraySlice, numFragments,
                            width, height, depth, pitch, x, y, z,
                            fragmentIndex);
  }

  std::abort();
}

View file

@ -0,0 +1,354 @@
#include "amdgpu/tiler_vulkan.hpp"
#include "Scheduler.hpp"
#include "amdgpu/tiler.hpp"
#include <bit>
#include <cstring>
#include <memory>
#include <vk.hpp>
#include <shaders/detiler1d.comp.h>
#include <shaders/detiler2d.comp.h>
#include <shaders/detilerLinear.comp.h>
#include <shaders/tiler1d.comp.h>
#include <shaders/tiler2d.comp.h>
#include <shaders/tilerLinear.comp.h>
// RAII wrapper for the tiler's descriptor set layout: a single uniform
// buffer at binding 0, visible to the compute stage.
// NOTE(review): type name is missing an 's' ("Decriptor"); kept as-is for
// source compatibility with existing users.
struct TilerDecriptorSetLayout {
  VkDescriptorSetLayout layout;

  TilerDecriptorSetLayout() {
    VkDescriptorSetLayoutBinding binding{
        .binding = 0,
        .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
        .descriptorCount = 1,
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
    };

    VkDescriptorSetLayoutCreateInfo layoutInfo{
        .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
        .bindingCount = 1,
        .pBindings = &binding,
    };

    // Create with the same allocation callbacks the destructor passes to
    // vkDestroyDescriptorSetLayout: creating with a null allocator and
    // destroying with a non-null one violates the Vulkan allocator rules.
    VK_VERIFY(vkCreateDescriptorSetLayout(vk::context->device, &layoutInfo,
                                          vk::context->allocator, &layout));
  }

  // Owns a Vulkan handle; copying would double-destroy it.
  TilerDecriptorSetLayout(const TilerDecriptorSetLayout &) = delete;
  TilerDecriptorSetLayout &operator=(const TilerDecriptorSetLayout &) = delete;

  ~TilerDecriptorSetLayout() {
    vkDestroyDescriptorSetLayout(vk::context->device, layout,
                                 vk::context->allocator);
  }
};
// RAII wrapper for one tiler compute shader (VK_EXT_shader_object),
// created from a precompiled SPIR-V blob against the shared set layout.
struct TilerShader {
  VkShaderEXT shader;

  TilerShader(TilerDecriptorSetLayout &setLayout,
              std::span<const std::uint32_t> spirv) {
    VkShaderCreateInfoEXT shaderInfo{
        .sType = VK_STRUCTURE_TYPE_SHADER_CREATE_INFO_EXT,
        .flags = 0,
        .stage = VK_SHADER_STAGE_COMPUTE_BIT,
        .nextStage = 0,
        .codeType = VK_SHADER_CODE_TYPE_SPIRV_EXT,
        .codeSize = spirv.size_bytes(),
        .pCode = spirv.data(),
        .pName = "main",
        .setLayoutCount = 1,
        .pSetLayouts = &setLayout.layout,
        .pushConstantRangeCount = 0,
        .pPushConstantRanges = nullptr,
        .pSpecializationInfo = nullptr,
    };

    // Create with the same allocation callbacks the destructor passes to
    // DestroyShaderEXT; mixing null/non-null allocators is invalid.
    VK_VERIFY(vk::CreateShadersEXT(vk::context->device, 1, &shaderInfo,
                                   vk::context->allocator, &shader));
  }

  // Owns a Vulkan handle; copying would double-destroy it.
  TilerShader(const TilerShader &) = delete;
  TilerShader &operator=(const TilerShader &) = delete;

  ~TilerShader() {
    vk::DestroyShaderEXT(vk::context->device, shader, vk::context->allocator);
  }
};
struct amdgpu::GpuTiler::Impl {
TilerDecriptorSetLayout descriptorSetLayout;
std::mutex descriptorMtx;
VkDescriptorSet descriptorSets[4]{};
VkDescriptorPool descriptorPool;
std::uint32_t inUseDescriptorSets = 0;
vk::Buffer configData;
// One compute shader per layout family and direction; each is built from
// its matching precompiled SPIR-V blob (all six are generated, see the
// shaders/*.comp.h includes above).
TilerShader detilerLinear{descriptorSetLayout, spirv_detilerLinear_comp};
TilerShader detiler1d{descriptorSetLayout, spirv_detiler1d_comp};
// Fixed copy/paste mismatch: was spirv_detilerLinear_comp.
TilerShader detiler2d{descriptorSetLayout, spirv_detiler2d_comp};
// Fixed copy/paste mismatch: was spirv_tiler2d_comp.
TilerShader tilerLinear{descriptorSetLayout, spirv_tilerLinear_comp};
TilerShader tiler1d{descriptorSetLayout, spirv_tiler1d_comp};
TilerShader tiler2d{descriptorSetLayout, spirv_tiler2d_comp};
VkPipelineLayout pipelineLayout;
struct Config {
uint64_t srcAddress;
uint64_t dstAddress;
uint32_t dataWidth;
uint32_t dataHeight;
uint32_t tileMode;
uint32_t numFragments;
uint32_t bitsPerElement;
uint32_t tiledSurfaceSize;
uint32_t linearSurfaceSize;
};
Impl() {
std::size_t count = 256;
configData = vk::Buffer::Allocate(
vk::getHostVisibleMemory(), sizeof(Config) * count,
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT);
VkPipelineLayoutCreateInfo piplineLayoutInfo{
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 1,
.pSetLayouts = &descriptorSetLayout.layout,
};
VK_VERIFY(vkCreatePipelineLayout(vk::context->device, &piplineLayoutInfo,
nullptr, &pipelineLayout));
{
VkDescriptorPoolSize poolSizes[]{{
.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.descriptorCount = 1,
}};
VkDescriptorPoolCreateInfo info{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
.maxSets = static_cast<std::uint32_t>(std::size(descriptorSets)) * 4,
.poolSizeCount = static_cast<uint32_t>(std::size(poolSizes)),
.pPoolSizes = poolSizes,
};
VK_VERIFY(vkCreateDescriptorPool(
vk::context->device, &info, vk::context->allocator, &descriptorPool));
}
VkDescriptorSetAllocateInfo info{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
.descriptorPool = descriptorPool,
.descriptorSetCount = 1,
.pSetLayouts = &descriptorSetLayout.layout,
};
for (std::size_t i = 0; i < std::size(descriptorSets); ++i) {
VK_VERIFY(vkAllocateDescriptorSets(vk::context->device, &info,
descriptorSets + i));
}
}
~Impl() {
vkDestroyDescriptorPool(vk::context->device, descriptorPool,
vk::context->allocator);
vkDestroyPipelineLayout(vk::context->device, pipelineLayout,
vk::context->allocator);
}
std::uint32_t allocateDescriptorSlot() {
std::lock_guard lock(descriptorMtx);
auto result = std::countl_one(inUseDescriptorSets);
rx::dieIf(result >= std::size(descriptorSets),
"out of tiler descriptor sets");
inUseDescriptorSets |= (1 << result);
return result;
}
void releaseDescriptorSlot(std::uint32_t slot) {
std::lock_guard lock(descriptorMtx);
inUseDescriptorSets &= ~(1u << slot);
}
};
// Constructs the pimpl; all Vulkan objects are created eagerly by Impl().
amdgpu::GpuTiler::GpuTiler() : mImpl(std::make_unique<Impl>()) {}
// Defined out of line so std::unique_ptr<Impl> is destroyed where Impl is a
// complete type.
amdgpu::GpuTiler::~GpuTiler() = default;
// Records a compute dispatch that converts one mip level of a tiled surface
// to linear layout. With arrayCount > 1 a range of array slices starting at
// baseArray is converted (one Z workgroup layer per slice); otherwise a
// single, possibly volume, subresource is converted (one Z layer per depth
// slice). The claimed descriptor slot is held until the command buffer is
// submitted.
void amdgpu::GpuTiler::detile(Scheduler &scheduler,
const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode,
std::uint64_t srcTiledAddress,
std::uint64_t dstLinearAddress, int mipLevel,
int baseArray, int arrayCount) {
auto commandBuffer = scheduler.getCommandBuffer();
auto slot = mImpl->allocateDescriptorSlot();
auto configOffset = slot * sizeof(Impl::Config);
// The Config record lives in host-visible memory: written now, read by the
// shader when the dispatch executes, so the slot must stay reserved until
// submission (see afterSubmit below).
auto config = reinterpret_cast<Impl::Config *>(mImpl->configData.getData() +
configOffset);
auto &subresource = info.getSubresourceInfo(mipLevel);
// NOTE(review): the mip offset is added to the tiled source only, while the
// linear destination gets just the per-slice offset -- presumably callers
// pass a destination that already points at this mip; confirm.
config->srcAddress = srcTiledAddress + subresource.offset +
(subresource.tiledSize * baseArray);
config->dstAddress = dstLinearAddress + (subresource.linearSize * baseArray);
config->dataWidth = subresource.dataWidth;
config->dataHeight = subresource.dataHeight;
config->tileMode = tileMode.raw;
config->numFragments = info.numFragments;
config->bitsPerElement = info.bitsPerElement;
// Nonzero per-slice sizes tell the shader to step between array slices.
uint32_t groupCountZ = subresource.dataDepth;
if (arrayCount > 1) {
config->tiledSurfaceSize = subresource.tiledSize;
config->linearSurfaceSize = subresource.linearSize;
groupCountZ = arrayCount;
} else {
config->tiledSurfaceSize = 0;
config->linearSurfaceSize = 0;
}
// Bind the detiler variant matching the surface's array mode.
VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
switch (tileMode.arrayMode()) {
case amdgpu::kArrayModeLinearGeneral:
case amdgpu::kArrayModeLinearAligned:
vk::CmdBindShadersEXT(commandBuffer, 1, stages,
&mImpl->detilerLinear.shader);
break;
case amdgpu::kArrayMode1dTiledThin:
case amdgpu::kArrayMode1dTiledThick:
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler1d.shader);
break;
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayModeTiledThinPrt:
case amdgpu::kArrayMode2dTiledThinPrt:
case amdgpu::kArrayMode2dTiledThick:
case amdgpu::kArrayMode2dTiledXThick:
case amdgpu::kArrayModeTiledThickPrt:
case amdgpu::kArrayMode2dTiledThickPrt:
case amdgpu::kArrayMode3dTiledThinPrt:
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayMode3dTiledThickPrt:
// 2d/3d detiling is not implemented yet: abort on purpose. The bind
// below is unreachable until the abort is removed.
std::abort();
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->detiler2d.shader);
break;
}
// Point this slot's descriptor set at the freshly written Config record.
VkDescriptorBufferInfo bufferInfo{
.buffer = mImpl->configData.getHandle(),
.offset = configOffset,
.range = sizeof(Impl::Config),
};
VkWriteDescriptorSet writeDescSet{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = mImpl->descriptorSets[slot],
.dstBinding = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.pBufferInfo = &bufferInfo,
};
vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
mImpl->pipelineLayout, 0, 1,
&mImpl->descriptorSets[slot], 0, nullptr);
// Workgroup counts are the subresource dimensions in X/Y, slices in Z.
vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight,
groupCountZ);
// Recycle the descriptor slot once the GPU work has been submitted.
scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); });
}
// Records a compute dispatch that converts one mip level from linear to
// tiled layout -- the inverse of detile() above; the Z dimension covers
// either depth slices or (when arrayCount > 1) array slices. The claimed
// descriptor slot is held until the command buffer is submitted.
void amdgpu::GpuTiler::tile(Scheduler &scheduler,
const amdgpu::SurfaceInfo &info,
amdgpu::TileMode tileMode,
std::uint64_t srcLinearAddress,
std::uint64_t dstTiledAddress, int mipLevel,
int baseArray, int arrayCount) {
auto commandBuffer = scheduler.getCommandBuffer();
auto slot = mImpl->allocateDescriptorSlot();
auto configOffset = slot * sizeof(Impl::Config);
// Host-visible Config record written now, read at dispatch execution.
auto config = reinterpret_cast<Impl::Config *>(mImpl->configData.getData() +
configOffset);
auto &subresource = info.getSubresourceInfo(mipLevel);
config->srcAddress = srcLinearAddress + subresource.offset +
subresource.linearSize * baseArray;
// NOTE(review): unlike detile(), the tiled destination gets neither the
// mip offset nor the baseArray slice offset -- presumably the shader or
// caller accounts for it; confirm before using baseArray > 0 here.
config->dstAddress = dstTiledAddress;
config->dataWidth = subresource.dataWidth;
config->dataHeight = subresource.dataHeight;
config->tileMode = tileMode.raw;
config->numFragments = info.numFragments;
config->bitsPerElement = info.bitsPerElement;
// Nonzero per-slice sizes tell the shader to step between array slices.
uint32_t groupCountZ = subresource.dataDepth;
if (arrayCount > 1) {
config->tiledSurfaceSize = subresource.tiledSize;
config->linearSurfaceSize = subresource.linearSize;
groupCountZ = arrayCount;
} else {
config->tiledSurfaceSize = 0;
config->linearSurfaceSize = 0;
}
// Bind the tiler variant matching the surface's array mode.
VkShaderStageFlagBits stages[]{VK_SHADER_STAGE_COMPUTE_BIT};
switch (tileMode.arrayMode()) {
case amdgpu::kArrayModeLinearGeneral:
case amdgpu::kArrayModeLinearAligned:
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tilerLinear.shader);
break;
case amdgpu::kArrayMode1dTiledThin:
case amdgpu::kArrayMode1dTiledThick:
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler1d.shader);
break;
case amdgpu::kArrayMode2dTiledThin:
case amdgpu::kArrayModeTiledThinPrt:
case amdgpu::kArrayMode2dTiledThinPrt:
case amdgpu::kArrayMode2dTiledThick:
case amdgpu::kArrayMode2dTiledXThick:
case amdgpu::kArrayModeTiledThickPrt:
case amdgpu::kArrayMode2dTiledThickPrt:
case amdgpu::kArrayMode3dTiledThinPrt:
case amdgpu::kArrayMode3dTiledThin:
case amdgpu::kArrayMode3dTiledThick:
case amdgpu::kArrayMode3dTiledXThick:
case amdgpu::kArrayMode3dTiledThickPrt:
// 2d/3d tiling is not implemented yet: abort on purpose. The bind below
// is unreachable until the abort is removed.
std::abort();
vk::CmdBindShadersEXT(commandBuffer, 1, stages, &mImpl->tiler2d.shader);
break;
}
// Point this slot's descriptor set at the freshly written Config record.
VkDescriptorBufferInfo bufferInfo{
.buffer = mImpl->configData.getHandle(),
.offset = configOffset,
.range = sizeof(Impl::Config),
};
VkWriteDescriptorSet writeDescSet{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.dstSet = mImpl->descriptorSets[slot],
.dstBinding = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,
.pBufferInfo = &bufferInfo,
};
vkUpdateDescriptorSets(vk::context->device, 1, &writeDescSet, 0, nullptr);
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
mImpl->pipelineLayout, 0, 1,
&mImpl->descriptorSets[slot], 0, nullptr);
vkCmdDispatch(commandBuffer, subresource.dataWidth, subresource.dataHeight,
groupCountZ);
// Recycle the descriptor slot once the GPU work has been submitted.
scheduler.afterSubmit([this, slot] { mImpl->releaseDescriptorSlot(slot); });
}

View file

@ -0,0 +1,48 @@
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/dialect/)
# Generate the SPIR-V dialect header with the spv-gen tool. The generator
# runs inside the SPIRV-Headers unified1 grammar directory so it can read
# the machine-readable grammar files.
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/include/dialect/spv.hpp
COMMAND $<TARGET_FILE:spv-gen> ${CMAKE_CURRENT_BINARY_DIR}/include/dialect/spv.hpp
DEPENDS spv-gen
WORKING_DIRECTORY $<TARGET_PROPERTY:SPIRV-Headers,INTERFACE_INCLUDE_DIRECTORIES>/spirv/unified1
COMMENT "Generating ${CMAKE_CURRENT_BINARY_DIR}/include/dialect/spv.hpp..."
)
# Interface target that carries the generated header and its include path;
# consumers depend on it to force generation before compilation.
add_custom_target(shader-spv-dialect-gen DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/include/dialect/spv.hpp)
add_library(shader-spv-dialect INTERFACE)
add_dependencies(shader-spv-dialect shader-spv-dialect-gen)
target_include_directories(shader-spv-dialect INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/include/)
# The GCN -> SPIR-V shader translator library.
add_library(gcn-shader STATIC
src/analyze.cpp
src/eval.cpp
src/Evaluator.cpp
src/gcn.cpp
src/GcnConverter.cpp
src/GcnInstruction.cpp
src/glsl.cpp
src/ModuleInfo.cpp
src/opt.cpp
src/SemanticModuleInfo.cpp
src/spv.cpp
src/SpvConverter.cpp
src/SpvTypeInfo.cpp
src/transform.cpp
)
# Public headers live in include/; sources include project headers via the
# shorter include/shader prefix.
target_include_directories(gcn-shader PUBLIC include PRIVATE include/shader)
target_link_libraries(gcn-shader
PUBLIC
shader-spv-dialect
rx
PRIVATE
glslang::glslang
glslang::SPIRV
SPIRV-Tools
SPIRV-Tools-opt
spirv-cross-c-shared
)
add_subdirectory(shaders)

View file

@ -0,0 +1,26 @@
#pragma once

namespace shader {

// Bit flags describing how a resource is accessed.
enum class Access {
  None = 0,
  Read = 1 << 0,
  Write = 1 << 1,
  ReadWrite = Read | Write
};

// Flag-style bitwise operators so Access values compose like plain bitmasks.
constexpr Access operator|(Access a, Access b) {
  return static_cast<Access>(static_cast<int>(a) | static_cast<int>(b));
}

constexpr Access operator&(Access a, Access b) {
  return static_cast<Access>(static_cast<int>(a) & static_cast<int>(b));
}

constexpr Access operator~(Access a) {
  return static_cast<Access>(~static_cast<int>(a));
}

constexpr Access &operator|=(Access &a, Access b) {
  a = a | b;
  return a;
}

constexpr Access &operator&=(Access &a, Access b) {
  a = a & b;
  return a;
}

} // namespace shader

View file

@ -0,0 +1,20 @@
#pragma once
#include "eval.hpp"
#include <map>
namespace shader::eval {
// Memoizing evaluator over IR values: results are cached per ir::Value, and
// subclasses can override the virtual eval overloads to extend evaluation.
class Evaluator {
// Cache of already evaluated nodes.
std::map<ir::Value, Value> values;
public:
virtual ~Evaluator() = default;
// Drops the cached result for `node`, forcing re-evaluation on next eval.
void invalidate(ir::Value node) { values.erase(node); }
// Stores/overrides the value associated with `node`.
void setValue(ir::Value node, Value value) { values[node] = value; }
// Evaluates an operand, optionally coercing to `type` (see eval.cpp).
Value eval(const ir::Operand &op, ir::Value type = nullptr);
virtual Value eval(ir::Value op);
virtual Value eval(ir::InstructionId instId,
std::span<const ir::Operand> operands);
};
} // namespace shader::eval

View file

@ -0,0 +1,131 @@
#pragma once
#include "gcn.hpp"
#include "rx/MemoryTable.hpp"
#include <cstdint>
#include <optional>
#include <vector>
namespace shader::gcn {
// VGPR inputs preloaded by hardware for a pixel shader, in declaration
// order; used to describe which v-registers the converted shader expects.
enum class PsVGprInput {
IPerspSample,
JPerspSample,
IPerspCenter,
JPerspCenter,
IPerspCentroid,
JPerspCentroid,
IW,
JW,
_1W,
ILinearSample,
JLinearSample,
ILinearCenter,
JLinearCenter,
ILinearCentroid,
JLinearCentroid,
X,
Y,
Z,
W,
FrontFace,
Ancillary,
SampleCoverage,
PosFixed,
Count
};
// Kind of runtime value a shader config slot carries; `data` in ConfigSlot
// is interpreted according to this type.
enum class ConfigType {
Imm,
UserSgpr,
ResourceSlot,
MemoryTable,
Gds,
PsInputVGpr,
VsPrimType,
CbCompSwap,
ViewPortOffsetX,
ViewPortOffsetY,
ViewPortOffsetZ,
ViewPortScaleX,
ViewPortScaleY,
ViewPortScaleZ,
};
// One entry of the shader configuration buffer: a type tag plus a payload
// whose meaning depends on that tag.
struct ConfigSlot {
ConfigType type;
std::uint64_t data;
};
// Resources (pointers, textures, buffers, samplers) referenced by a
// converted shader, each tied to a resource slot.
struct Resources {
struct Resource {
std::uint32_t resourceSlot;
};
struct Pointer : Resource {
std::uint32_t size;
ir::Value base;
ir::Value offset;
};
struct Texture : Resource {
Access access;
// Raw descriptor words (up to 256-bit image descriptors).
ir::Value words[8];
};
struct Buffer : Resource {
Access access;
ir::Value words[4];
};
struct Sampler : Resource {
bool unorm;
ir::Value words[4];
};
spv::Context context;
// Set when a resource could not be identified statically.
bool hasUnknown = false;
std::uint32_t slots = 0;
std::vector<Pointer> pointers;
std::vector<Texture> textures;
std::vector<Buffer> buffers;
std::vector<Sampler> samplers;
void print(std::ostream &os, ir::NameStorage &ns) const;
void dump();
};
// Aggregated metadata produced while converting a shader.
struct ShaderInfo {
std::vector<ConfigSlot> configSlots;
rx::MemoryAreaTable<> memoryMap;
std::vector<std::pair<int, std::uint32_t>> requiredSgprs;
Resources resources;
// Returns the index of an existing slot with identical type/data, or
// appends a new slot and returns its index (slots are deduplicated).
std::uint32_t create(ConfigType type, std::uint64_t data) {
for (std::size_t slotIndex = 0; auto &slotInfo : configSlots) {
if (slotInfo.type == type && slotInfo.data == data) {
return slotIndex;
}
slotIndex++;
}
configSlots.push_back({
.type = type,
.data = data,
});
return configSlots.size() - 1;
}
};
// Result of a successful conversion: SPIR-V words plus shader metadata.
struct ConvertedShader {
std::vector<std::uint32_t> spv;
ShaderInfo info;
};
// Converts a GCN IR body to SPIR-V; returns std::nullopt on failure.
std::optional<ConvertedShader>
convertToSpv(Context &context, ir::Region body,
const SemanticModuleInfo &semanticModule, Stage stage,
const Environment &state);
} // namespace shader::gcn

View file

@ -0,0 +1,256 @@
#pragma once
#include "dialect.hpp"
#include "ir/Kind.hpp"
#include <functional>
#include <ostream>
#include <span>
#include <type_traits>
namespace shader {
// One operand of a decoded GCN instruction. The anonymous union below holds
// the payload; which members are active depends on `kind`.
struct GcnOperand {
enum class Kind : std::uint8_t {
Invalid,
Constant,
Immediate,
VccLo,
VccHi,
M0,
ExecLo,
ExecHi,
Scc,
VccZ,
ExecZ,
LdsDirect,
Vgpr,
Sgpr,
Attr,
Buffer,
Texture128,
Texture256,
Sampler,
Pointer,
};
// Access flag bits stored in `access`.
static constexpr auto R = 1 << 0;
static constexpr auto W = 1 << 1;
union {
std::uint32_t value;
std::uint64_t address = 0;
// Active for Kind::Attr.
struct {
std::uint16_t attrId;
std::uint16_t attrChannel;
};
// Active for composite kinds (Buffer/Texture*/Sampler/Pointer): the
// first register holding the descriptor plus kind-specific extras.
struct {
Kind firstRegisterKind;
union {
struct {
Kind pointerOffsetKind;
std::uint16_t pointeeSize;
};
bool samplerUnorm;
};
std::uint32_t firstRegisterIndex;
union {
std::uint32_t pointerOffsetValue;
std::uint64_t pointerOffsetAddress;
};
};
};
Kind kind = Kind::Invalid;
std::uint8_t access = 0;
// VOP output modifier (omod) and input modifiers, packed as bitfields.
std::uint8_t omod : 4 = 0;
bool abs : 1 = false;
bool clamp : 1 = false;
bool neg : 1 = false;
// Returns the `offset`-th register of a composite operand as a plain
// register operand.
constexpr GcnOperand getUnderlyingOperand(int offset = 0) const {
return {
.value = firstRegisterIndex + offset,
.kind = firstRegisterKind,
};
}
// Returns the pointer-offset part of a Kind::Pointer operand.
constexpr GcnOperand getPointerOffsetOperand() const {
return {
.address = pointerOffsetAddress,
.kind = pointerOffsetKind,
};
}
static constexpr GcnOperand createImmediateConstant(std::uint64_t address) {
return GcnOperand{
.address = address,
.kind = Kind::Immediate,
.access = R,
};
}
static constexpr GcnOperand createConstant(std::uint32_t value) {
return GcnOperand{
.value = value,
.kind = Kind::Constant,
.access = R,
};
}
static constexpr GcnOperand createConstant(bool value) {
return createConstant(std::uint32_t(value ? 1 : 0));
}
static constexpr GcnOperand createConstant(float value) {
return createConstant(std::bit_cast<std::uint32_t>(value));
}
static constexpr GcnOperand createVgpr(std::uint32_t index) {
return {
.value = index,
.kind = Kind::Vgpr,
};
}
static constexpr GcnOperand createSgpr(std::uint32_t index) {
return {
.value = index,
.kind = Kind::Sgpr,
};
}
// NOTE(review): the composite factories below squeeze firstReg.value
// through std::uint8_t before storing it in the 32-bit firstRegisterIndex,
// which is lossy for indices > 255 -- presumably GCN register indices
// always fit, but confirm.
static constexpr GcnOperand createSampler(GcnOperand firstReg, bool unorm) {
return {
.firstRegisterKind = firstReg.kind,
.samplerUnorm = unorm,
.firstRegisterIndex = static_cast<std::uint8_t>(firstReg.value),
.kind = Kind::Sampler,
};
}
static constexpr GcnOperand createTexture(GcnOperand firstReg, bool is128) {
return {
.firstRegisterKind = firstReg.kind,
.firstRegisterIndex = static_cast<std::uint8_t>(firstReg.value),
.kind = (is128 ? Kind::Texture128 : Kind::Texture256),
};
}
static constexpr GcnOperand createBuffer(GcnOperand firstReg) {
return {
.firstRegisterKind = firstReg.kind,
.firstRegisterIndex = static_cast<std::uint8_t>(firstReg.value),
.kind = Kind::Buffer,
};
}
static constexpr GcnOperand
createPointer(GcnOperand firstReg, std::uint16_t size, GcnOperand offset) {
return {
.firstRegisterKind = firstReg.kind,
.pointerOffsetKind = offset.kind,
.pointeeSize = size,
.firstRegisterIndex = static_cast<std::uint8_t>(firstReg.value),
.pointerOffsetAddress = offset.address,
.kind = Kind::Pointer,
};
}
static constexpr GcnOperand createAttr(std::uint16_t id,
std::uint16_t channel) {
return {
.attrId = id,
.attrChannel = channel,
.kind = Kind::Attr,
};
}
// Fluent copies with modified access/modifier flags.
constexpr GcnOperand withRW() const { return withAccess(R | W); }
constexpr GcnOperand withR() const { return withAccess(R); }
constexpr GcnOperand withW() const { return withAccess(W); }
constexpr GcnOperand withAccess(std::uint8_t access) const {
GcnOperand result = *this;
result.access = access;
return result;
}
constexpr GcnOperand withNeg(bool value) const {
GcnOperand result = *this;
result.neg = value;
return result;
}
constexpr GcnOperand withAbs(bool value) const {
GcnOperand result = *this;
result.abs = value;
return result;
}
constexpr GcnOperand withClamp(bool value) const {
GcnOperand result = *this;
result.clamp = value;
return result;
}
constexpr GcnOperand withOutputModifier(std::uint8_t value) const {
GcnOperand result = *this;
result.omod = value;
return result;
}
// Special-register factories.
static constexpr GcnOperand createVccLo() { return {.kind = Kind::VccLo}; }
static constexpr GcnOperand createVccHi() { return {.kind = Kind::VccHi}; }
static constexpr GcnOperand createM0() { return {.kind = Kind::M0}; }
static constexpr GcnOperand createExecLo() { return {.kind = Kind::ExecLo}; }
static constexpr GcnOperand createExecHi() { return {.kind = Kind::ExecHi}; }
static constexpr GcnOperand createVccZ() { return {.kind = Kind::VccZ}; }
static constexpr GcnOperand createExecZ() { return {.kind = Kind::ExecZ}; }
static constexpr GcnOperand createScc() { return {.kind = Kind::Scc}; }
static constexpr GcnOperand createLdsDirect() {
return {.kind = Kind::LdsDirect};
}
void print(std::ostream &os) const;
void dump() const;
};
// A decoded GCN instruction: dialect kind + opcode plus up to 16 operands
// stored inline.
struct GcnInstruction {
ir::Kind kind = ir::Kind::Builtin;
unsigned op = ir::builtin::INVALID_INSTRUCTION;
GcnOperand operands[16];
std::size_t operandCount{};
std::span<const GcnOperand> getOperands() const {
return {operands, operandCount};
}
// Bounds-checked operand access; aborts on an out-of-range index.
const GcnOperand &getOperand(std::size_t index) const {
if (index >= operandCount) {
std::abort();
}
return operands[index];
}
// Appends an operand; aborts when the fixed-size operand array is full.
void addOperand(GcnOperand op) {
if (operandCount >= std::size(operands)) {
std::abort();
}
operands[operandCount++] = op;
}
// Compares against a dialect opcode constant; the opcode's dialect Kind is
// derived from the constant's type via kOpToKind.
template <typename T>
bool operator==(T testOp)
requires(ir::kOpToKind<std::remove_cvref_t<T>> != ir::Kind::Count)
{
return ir::kOpToKind<std::remove_cvref_t<T>> == kind && op == testOp;
}
void print(std::ostream &os) const;
void dump() const;
};
// Decodes one instruction at `address` (advancing it past the instruction),
// fetching 32-bit words through `readMemory`.
void readGcnInst(GcnInstruction &isaInst, std::uint64_t &address,
const std::function<std::uint32_t(std::uint64_t)> &readMemory);
} // namespace shader

View file

@ -0,0 +1,28 @@
#pragma once
#include "Access.hpp"
#include "ir/Value.hpp"
#include "spv.hpp"
#include <map>
#include <vector>
namespace shader {
// Per-module access information: for each function, which variables it
// touches (and how) plus its signature types.
struct ModuleInfo {
struct Param {
ir::Value type;
Access access = Access::None;
};
struct Function {
// Module-level variables accessed by the function body.
std::map<ir::Value, Access> variables;
std::vector<Param> parameters;
ir::Value returnType;
};
std::map<ir::Value, Function> functions;
};
// Collects access information for a single function into moduleInfo and
// returns the stored entry.
ModuleInfo::Function &collectFunctionInfo(ModuleInfo &moduleInfo,
ir::Value function);
// Collects access information for every function in the binary layout.
void collectModuleInfo(ModuleInfo &moduleInfo, const spv::BinaryLayout &layout);
} // namespace shader

View file

@ -0,0 +1,46 @@
#pragma once

#include "ModuleInfo.hpp"
#include "SpvTypeInfo.hpp"

namespace shader {

// Module info extended with an instruction-id -> semantic-body mapping.
struct SemanticModuleInfo : ModuleInfo {
  std::unordered_map<ir::InstructionId, ir::Value> semantics;

  // Returns the semantic implementation registered for `sem`, or a null
  // value when none exists.
  ir::Value findSemanticOf(ir::InstructionId sem) const {
    if (auto it = semantics.find(sem); it != semantics.end()) {
      return it->second;
    }
    return nullptr;
  }
};

// Lightweight, type-level description of instruction semantics.
struct SemanticInfo {
  struct Param {
    spv::TypeInfo type;
    Access access = Access::None;
  };

  struct Function {
    std::unordered_map<int, Access> registerAccesses;
    std::vector<Param> parameters;
    spv::TypeInfo returnType;
    Access bufferAccess = Access::None;
  };

  std::unordered_map<ir::InstructionId, Function> semantics;

  // Returns the semantic description for `sem`, or nullptr when unknown.
  const Function *findSemantic(ir::InstructionId sem) const {
    auto it = semantics.find(sem);
    if (it == semantics.end()) {
      return nullptr;
    }
    return &it->second;
  }
};

void collectSemanticModuleInfo(SemanticModuleInfo &moduleInfo,
                               const spv::BinaryLayout &layout);

} // namespace shader

View file

@ -0,0 +1,154 @@
#pragma once
#include "SpvTypeInfo.hpp"
#include "dialect/spv.hpp"
#include "spv.hpp"
namespace shader::spv {
// Clone map used when importing IR nodes from another context.
struct Import : ir::CloneMap {
ir::Node getOrCloneImpl(ir::Context &context, ir::Node node,
bool isOperand) override;
};
// IR context specialized for SPIR-V construction: tracks the module layout,
// deduplicated globals/constants/types and the shader interface variables
// (inputs, outputs, per-vertex block).
struct Context : ir::Context {
BinaryLayout layout;
ir::Location rootLocation;
ir::NameStorage ns;
ir::Value perVertex;
// Interface variables keyed by location index.
std::map<int, ir::Value> outputs;
std::map<int, ir::Value> inputs;
ir::RegionLike localVariables;
ir::RegionLike epilogue;
ir::Value entryPoint;
// Deduplicated global declarations/constants, grouped by instruction id.
std::map<ir::InstructionId, std::vector<ir::Value>> globals;
std::map<ir::InstructionId, std::vector<ir::Value>> constants;
Context();
ir::Value createRegionWithLabel(ir::Location loc);
void setName(ir::spv::IdRef inst, std::string name);
void setConstantName(ir::Value constant);
ir::Value getOrCreateConstant(ir::Value typeValue, const ir::Operand &value);
ir::Value getType(ir::spv::Op baseType, int width, bool isSigned);
ir::Value getType(const TypeInfo &info);
// Constant helpers: deduplicated immediates of the named type/width.
ir::Value imm64(std::uint64_t value) {
return getOrCreateConstant(getTypeUInt64(), value);
}
ir::Value imm32(std::uint32_t value) {
return getOrCreateConstant(getTypeUInt32(), value);
}
ir::Value simm64(std::int64_t value) {
return getOrCreateConstant(getTypeSInt64(), value);
}
ir::Value simm32(std::int32_t value) {
return getOrCreateConstant(getTypeSInt32(), value);
}
ir::Value fimm64(double value) {
return getOrCreateConstant(getTypeFloat(64), value);
}
ir::Value fimm32(float value) {
return getOrCreateConstant(getTypeFloat(32), value);
}
ir::Value getBool(bool value) { return value ? getTrue() : getFalse(); }
ir::Value getTrue() {
return getOrCreateGlobal(ir::spv::OpConstantTrue, {{getTypeBool()}});
}
ir::Value getFalse() {
return getOrCreateGlobal(ir::spv::OpConstantFalse, {{getTypeBool()}});
}
ir::Value getIndex(std::int32_t index) { return simm32(index); }
void setTypeName(ir::Value type);
// Registers an already-built global declaration and names it.
void addGlobal(ir::Value type) {
globals[type.getInstId()].push_back(type);
setTypeName(type);
}
ir::Value findGlobal(ir::spv::Op op,
std::span<const ir::Operand> operands = {}) const;
ir::Value createGlobal(ir::spv::Op op, std::span<const ir::Operand> operands);
ir::Value getOrCreateGlobal(ir::spv::Op op,
std::span<const ir::Operand> operands = {});
// Type helpers: deduplicated SPIR-V type declarations.
ir::Value getTypeInt(int width, bool sign) {
return getOrCreateGlobal(ir::spv::OpTypeInt, {{width, sign ? 1 : 0}});
}
ir::Value getTypeFloat(int width) {
return getOrCreateGlobal(ir::spv::OpTypeFloat, {{width}});
}
ir::Value getTypeVoid() { return getOrCreateGlobal(ir::spv::OpTypeVoid); }
ir::Value getTypeBool() { return getOrCreateGlobal(ir::spv::OpTypeBool); }
ir::Value getTypeSampler() {
return getOrCreateGlobal(ir::spv::OpTypeSampler);
}
ir::Value getTypeArray(ir::Value elementType, ir::Value count) {
return getOrCreateGlobal(ir::spv::OpTypeArray, {{elementType, count}});
}
ir::Value getTypeVector(ir::Value elementType, int count) {
return getOrCreateGlobal(ir::spv::OpTypeVector, {{elementType, count}});
}
ir::Value getTypeStruct(auto... elements) {
return getOrCreateGlobal(ir::spv::OpTypeStruct, {{elements...}});
}
ir::Value getTypeSInt8() { return getTypeInt(8, true); }
ir::Value getTypeUInt8() { return getTypeInt(8, false); }
ir::Value getTypeSInt16() { return getTypeInt(16, true); }
ir::Value getTypeUInt16() { return getTypeInt(16, false); }
ir::Value getTypeSInt32() { return getTypeInt(32, true); }
ir::Value getTypeUInt32() { return getTypeInt(32, false); }
ir::Value getTypeSInt64() { return getTypeInt(64, true); }
ir::Value getTypeUInt64() { return getTypeInt(64, false); }
ir::Value getTypeFloat16() { return getTypeFloat(16); }
ir::Value getTypeFloat32() { return getTypeFloat(32); }
ir::Value getTypeFloat64() { return getTypeFloat(64); }
ir::Value getTypeFunction(ir::Value returnType,
std::span<const ir::Value> params) {
std::vector<ir::Operand> operands;
operands.reserve(1 + params.size());
operands.push_back(returnType);
for (auto param : params) {
operands.push_back(param);
}
return getOrCreateGlobal(ir::spv::OpTypeFunction, operands);
}
ir::Value getTypePointer(ir::spv::StorageClass storageClass,
ir::spv::IdRef pointeeType) {
return getOrCreateGlobal(ir::spv::OpTypePointer,
{{storageClass, pointeeType}});
}
ir::Value getTypeImage(ir::spv::IdRef sampledType, ir::spv::Dim dim,
std::int32_t depth, bool arrayed, bool multisampled,
std::int32_t sampled, ir::spv::ImageFormat format) {
return getOrCreateGlobal(
ir::spv::OpTypeImage,
{{sampledType, dim, depth, arrayed, multisampled, sampled, format}});
}
ir::Value getOperandValue(const ir::Operand &op, ir::Value type = {});
// Shader-interface construction helpers (implemented in SpvConverter.cpp).
void createPerVertex();
ir::Value createUniformBuffer(int descriptorSet, int binding,
ir::Value structType);
ir::Value createRuntimeArrayUniformBuffer(int descriptorSet, int binding,
ir::Value elementType);
ir::Value createOutput(ir::Location loc, int index);
ir::Value createInput(ir::Location loc, int index);
ir::Value createAttr(ir::Location loc, int attrId, bool perVertex, bool flat);
};
} // namespace shader::spv

View file

@ -0,0 +1,18 @@
#pragma once
#include "dialect/spv.hpp"
namespace shader::spv {
// Decomposed description of a SPIR-V scalar or vector type: the base type
// opcode, its component opcode/width/count and signedness.
struct TypeInfo {
ir::spv::Op baseType = {};
ir::spv::Op componentType = {};
// Bit width of one component; componentsCount is 1 for scalars.
int componentWidth = 0;
int componentsCount = 1;
bool isSigned = false;
// Total bit width of the whole type.
int width() const { return componentWidth * componentsCount; }
bool operator==(const TypeInfo &other) const = default;
};
// Computes the TypeInfo of a SPIR-V type value.
TypeInfo getTypeInfo(ir::Value type);
} // namespace shader::spv

View file

@ -0,0 +1,129 @@
#pragma once
#include <array>
#include <cstdint>
namespace shader {
// Fixed-size numeric vector with GLSL-like element-wise operators. Derives
// from std::array, so aggregate initialization and the array interface are
// available.
template <typename T, std::size_t N> struct Vector : std::array<T, N> {
using std::array<T, N>::array;
// Explicit element-wise conversion to a vector of another element type.
template<typename U>
constexpr explicit operator Vector<U, N>() const {
Vector<U, N> result;
for (std::size_t i = 0; i < N; ++i) {
result[i] = static_cast<U>((*this)[i]);
}
return result;
}
// Element-wise binary operator, enabled only when T supports OP. Two
// overloads: vector OP vector and vector OP scalar. The result element
// type follows the expression's type (so comparisons yield Vector<bool>).
#define DEFINE_BINOP(OP) \
  constexpr auto operator OP(const Vector &other) const \
    requires requires(T lhs, T rhs) { lhs OP rhs; } \
  { \
    using ResultElementT = \
        std::remove_cvref_t<decltype(std::declval<T>() OP std::declval<T>())>; \
    Vector<ResultElementT, N> result; \
    for (std::size_t i = 0; i < N; ++i) { \
      result[i] = (*this)[i] OP other[i]; \
    } \
    return result; \
  } \
  constexpr auto operator OP(const T &other) const \
    requires requires(T lhs, T rhs) { lhs OP rhs; } \
  { \
    using ResultElementT = \
        std::remove_cvref_t<decltype(std::declval<T>() OP std::declval<T>())>; \
    Vector<ResultElementT, N> result; \
    for (std::size_t i = 0; i < N; ++i) { \
      result[i] = (*this)[i] OP other; \
    } \
    return result; \
  }
// Element-wise unary operator, enabled only when T supports OP.
#define DEFINE_UNOP(OP) \
  constexpr auto operator OP() const \
    requires requires(T rhs) { OP rhs; } \
  { \
    using ResultElementT = \
        std::remove_cvref_t<decltype(OP std::declval<T>())>; \
    Vector<ResultElementT, N> result; \
    for (std::size_t i = 0; i < N; ++i) { \
      result[i] = OP(*this)[i]; \
    } \
    return result; \
  }
DEFINE_BINOP(+)
DEFINE_BINOP(-)
DEFINE_BINOP(*)
DEFINE_BINOP(/)
DEFINE_BINOP(%)
DEFINE_BINOP(&)
DEFINE_BINOP(|)
DEFINE_BINOP(^)
DEFINE_BINOP(>>)
DEFINE_BINOP(<<)
DEFINE_BINOP(&&)
DEFINE_BINOP(||)
DEFINE_BINOP(<)
DEFINE_BINOP(>)
DEFINE_BINOP(<=)
DEFINE_BINOP(>=)
DEFINE_BINOP(==)
DEFINE_BINOP(!=)
DEFINE_UNOP(-)
DEFINE_UNOP(~)
DEFINE_UNOP(!)
#undef DEFINE_BINOP
#undef DEFINE_UNOP
};
// _Float16 is a compiler extension type; availability depends on the
// target/toolchain.
using float16_t = _Float16;
using float32_t = float;
using float64_t = double;
// GLSL-style aliases for the common element types and sizes.
using u8vec2 = Vector<std::uint8_t, 2>;
using u8vec3 = Vector<std::uint8_t, 3>;
using u8vec4 = Vector<std::uint8_t, 4>;
using i8vec2 = Vector<std::int8_t, 2>;
using i8vec3 = Vector<std::int8_t, 3>;
using i8vec4 = Vector<std::int8_t, 4>;
using u16vec2 = Vector<std::uint16_t, 2>;
using u16vec3 = Vector<std::uint16_t, 3>;
using u16vec4 = Vector<std::uint16_t, 4>;
using i16vec2 = Vector<std::int16_t, 2>;
using i16vec3 = Vector<std::int16_t, 3>;
using i16vec4 = Vector<std::int16_t, 4>;
using u32vec2 = Vector<std::uint32_t, 2>;
using u32vec3 = Vector<std::uint32_t, 3>;
using u32vec4 = Vector<std::uint32_t, 4>;
using i32vec2 = Vector<std::int32_t, 2>;
using i32vec3 = Vector<std::int32_t, 3>;
using i32vec4 = Vector<std::int32_t, 4>;
using u64vec2 = Vector<std::uint64_t, 2>;
using u64vec3 = Vector<std::uint64_t, 3>;
using u64vec4 = Vector<std::uint64_t, 4>;
using i64vec2 = Vector<std::int64_t, 2>;
using i64vec3 = Vector<std::int64_t, 3>;
using i64vec4 = Vector<std::int64_t, 4>;
using f32vec2 = Vector<float32_t, 2>;
using f32vec3 = Vector<float32_t, 3>;
using f32vec4 = Vector<float32_t, 4>;
using f64vec2 = Vector<float64_t, 2>;
using f64vec3 = Vector<float64_t, 3>;
using f64vec4 = Vector<float64_t, 4>;
using f16vec2 = Vector<float16_t, 2>;
using f16vec3 = Vector<float16_t, 3>;
using f16vec4 = Vector<float16_t, 4>;
using bvec2 = Vector<bool, 2>;
using bvec3 = Vector<bool, 3>;
using bvec4 = Vector<bool, 4>;
} // namespace shader

View file

@ -0,0 +1,445 @@
#pragma once
#include "ModuleInfo.hpp"
#include "SemanticInfo.hpp"
#include "dialect/memssa.hpp"
#include "graph.hpp"
#include "ir/Instruction.hpp"
#include "ir/Value.hpp"
#include "rx/FunctionRef.hpp"
#include "rx/TypeId.hpp"
#include <map>
#include <ostream>
#include <utility>
#include <vector>
namespace shader {
struct DomTree;
struct PostDomTree;
// Control-flow graph over IR basic blocks. A node corresponds to a label
// value and records its terminator plus predecessor/successor sets; the CFG
// additionally caches pre/postorder traversals.
class CFG {
public:
class Node {
ir::Value mLabel;
ir::Instruction mTerminator;
std::unordered_set<Node *> mPredecessors;
std::unordered_set<Node *> mSuccessors;
public:
using Iterator = std::unordered_set<Node *>::iterator;
Node() = default;
Node(ir::Value label) : mLabel(label) {}
ir::Value getLabel() { return mLabel; }
void setTerminator(ir::Instruction inst) { mTerminator = inst; }
bool hasTerminator() { return mTerminator != nullptr; }
ir::Instruction getTerminator() { return mTerminator; }
// Adds a directed edge this -> to, updating both adjacency sets.
void addEdge(Node *to) {
to->mPredecessors.insert(this);
mSuccessors.insert(to);
}
bool hasPredecessor(Node *node) { return mPredecessors.contains(node); }
bool hasSuccessor(Node *node) { return mSuccessors.contains(node); }
auto &getPredecessors() { return mPredecessors; }
auto &getSuccessors() { return mSuccessors; }
std::size_t getPredecessorCount() { return mPredecessors.size(); }
std::size_t getSuccessorCount() { return mSuccessors.size(); }
bool hasPredecessors() { return !mPredecessors.empty(); }
bool hasSuccessors() { return !mSuccessors.empty(); }
// Iteration over the block's instructions; the variants exclude the
// label and/or the terminator from the range.
template <typename T = ir::Instruction> auto range() {
return ir::range<T>(mLabel, mTerminator.getNext());
}
template <typename T = ir::Instruction> auto rangeWithoutLabel() {
return ir::range<T>(mLabel.getNext(),
mTerminator ? mTerminator.getNext() : nullptr);
}
template <typename T = ir::Instruction> auto rangeWithoutTerminator() {
return ir::range<T>(mLabel, mTerminator);
}
template <typename T = ir::Instruction>
auto rangeWithoutLabelAndTerminator() {
return ir::range<T>(mLabel.getNext(), mTerminator);
}
};
private:
std::map<ir::Value, Node> mNodes;
std::vector<Node *> mPreorderNodes;
std::vector<Node *> mPostorderNodes;
Node *mEntryNode = nullptr;
public:
bool empty() { return mNodes.empty(); }
void clear() {
mNodes.clear();
mPreorderNodes.clear();
mPostorderNodes.clear();
mEntryNode = nullptr;
}
void addPreorderNode(Node *node) { mPreorderNodes.push_back(node); }
void addPostorderNode(Node *node) { mPostorderNodes.push_back(node); }
Node *getEntryNode() { return mEntryNode; }
ir::Value getEntryLabel() { return getEntryNode()->getLabel(); }
void setEntryNode(Node *node) { mEntryNode = node; }
std::span<Node *> getPreorderNodes() { return mPreorderNodes; }
std::span<Node *> getPostorderNodes() { return mPostorderNodes; }
// Returns the node for `label`, creating an empty one if necessary.
Node *getOrCreateNode(ir::Value label) {
return &mNodes.emplace(label, label).first->second;
}
Node *getNode(ir::Value label) {
if (auto it = mNodes.find(label); it != mNodes.end()) {
return &it->second;
}
return nullptr;
}
auto &getSuccessors(ir::Value label) {
return getNode(label)->getSuccessors();
}
auto &getPredecessors(ir::Value label) {
return getNode(label)->getPredecessors();
}
void print(std::ostream &os, ir::NameStorage &ns, bool subgraph = false,
std::string_view nameSuffix = "");
std::string genTest();
// Builds a sub-CFG rooted at `from`, optionally bounded by stop labels and
// a continue label (see analyze.cpp).
CFG buildView(CFG::Node *from, PostDomTree *domTree = nullptr,
const std::unordered_set<ir::Value> &stopLabels = {},
ir::Value continueLabel = nullptr);
CFG buildView(ir::Value from, PostDomTree *domTree = nullptr,
const std::unordered_set<ir::Value> &stopLabels = {},
ir::Value continueLabel = nullptr) {
return buildView(getNode(from), domTree, stopLabels, continueLabel);
}
};
/// Memory-SSA form built over a CFG: maps IR variables/pointers to SSA
/// variables and records, per user instruction, the reaching definition of
/// each variable it touches.
class MemorySSA {
public:
  ir::Context context;
  ir::Region region;
  std::map<ir::Value, ir::memssa::Var> variableToVar;
  std::map<ir::Instruction, std::map<ir::memssa::Var, ir::memssa::Def>>
      userDefs;

  ir::memssa::Var getVar(ir::Value variable, std::span<const ir::Operand> path);
  ir::memssa::Var getVar(ir::Value pointer);

  /// Reaching definition recorded for `var` at `user`; null Def when none.
  ir::memssa::Def getDef(ir::Instruction user, ir::memssa::Var var) {
    auto userIt = userDefs.find(user);
    if (userIt != userDefs.end()) {
      auto &defs = userIt->second;
      if (auto defIt = defs.find(var); defIt != defs.end()) {
        return defIt->second;
      }
    }
    return {};
  }

  /// Reaching definition for the variable behind `pointer`; null Def when
  /// the pointer has no SSA variable or no definition was recorded.
  ir::memssa::Def getDef(ir::Instruction user, ir::Value pointer) {
    auto var = getVar(pointer);
    if (!var) {
      return {};
    }
    return getDef(user, var);
  }

  /// Instruction that produced the reaching definition, or null.
  ir::Instruction getDefInst(ir::Instruction user, ir::Value pointer) {
    auto def = getDef(user, pointer);
    if (!def) {
      return {};
    }
    return def.getLinkedInst();
  }

  void print(std::ostream &os, ir::Region irRegion, ir::NameStorage &ns);
  void print(std::ostream &os, ir::NameStorage &ns);
  void dump();

private:
  ir::memssa::Var getVarImpl(ir::Value variable);
};
bool isWithoutSideEffects(ir::InstructionId id);
bool isTerminator(ir::Instruction inst);
bool isBranch(ir::Instruction inst);
ir::Value unwrapPointer(ir::Value pointer);
graph::DomTree<ir::Value> buildDomTree(CFG &cfg, ir::Value root = nullptr);
graph::DomTree<ir::Value> buildPostDomTree(CFG &cfg, ir::Value root);
CFG buildCFG(ir::Instruction firstInstruction,
const std::unordered_set<ir::Value> &exitLabels = {},
ir::Value continueLabel = nullptr);
MemorySSA buildMemorySSA(CFG &cfg, ModuleInfo *moduleInfo = nullptr);
MemorySSA buildMemorySSA(CFG &cfg, const SemanticInfo &instructionSemantic,
std::function<ir::Value(int)> getRegisterVarCb);
bool dominates(ir::Instruction a, ir::Instruction b, bool isPostDom,
graph::DomTree<ir::Value> &domTree);
ir::Value findNearestCommonDominator(ir::Instruction a, ir::Instruction b,
graph::DomTree<ir::Value> &domTree);
/// Stores, per header label, the set of labels that jump back to it.
class BackEdgeStorage {
  std::unordered_map<ir::Value, std::unordered_set<ir::Value>> backEdges;

public:
  BackEdgeStorage() = default;
  BackEdgeStorage(CFG &cfg);

  /// Back edges targeting `value`, or nullptr when it has none recorded.
  const std::unordered_set<ir::Value> *get(ir::Value value) {
    auto it = backEdges.find(value);
    if (it == backEdges.end()) {
      return nullptr;
    }
    return &it->second;
  }

  auto &all() { return backEdges; }
};
// Type-erased cache of analysis results keyed by rx::TypeId. Entries are
// never destroyed on invalidation; they are rebuilt in place on the next
// get<T>() call, so pointers/references returned earlier stay valid.
struct AnalysisStorage {
  // Marks each listed analysis stale. Returns true if at least one entry
  // transitioned from valid to invalid.
  template <typename... T>
    requires(sizeof...(T) > 0)
  bool invalidate() {
    bool invalidated = false;
    ((invalidated = invalidate(rx::TypeId::get<T>()) || invalidated), ...);
    return invalidated;
  }
  // Marks one analysis stale by type id; returns true on a valid->invalid
  // transition, false when absent or already invalid.
  bool invalidate(rx::TypeId id) {
    if (auto it = mStorage.find(id); it != mStorage.end()) {
      return std::exchange(it->second.invalid, true) == false;
    }
    return false;
  }
  // Marks every cached analysis stale without destroying any of them.
  void invalidateAll() {
    for (auto &entry : mStorage) {
      entry.second.invalid = true;
    }
  }
  // Returns the cached T, constructing it from `args` on first use and
  // re-assigning it in place when the entry was invalidated.
  // NOTE: `args` are only consumed when (re)construction happens.
  template <typename T, typename... ArgsT>
  T &get(ArgsT &&...args)
    requires requires { T(std::forward<ArgsT>(args)...); }
  {
    void *result = getImpl(
        rx::TypeId::get<T>(), getDeleter<T>(),
        [&] {
          return std::make_unique<T>(std::forward<ArgsT>(args)...).release();
        },
        [&](void *object) {
          *reinterpret_cast<T *>(object) = T(std::forward<ArgsT>(args)...);
        });
    return *static_cast<T *>(result);
  }
  // Same as above, but the value comes from a lazily-invoked builder, so
  // expensive construction is skipped entirely on a cache hit.
  template <typename T, typename BuilderFn>
  T &get(BuilderFn &&builder)
    requires requires { T(std::forward<BuilderFn>(builder)()); }
  {
    void *result = getImpl(
        rx::TypeId::get<T>(), getDeleter<T>(),
        [&] {
          return std::make_unique<T>(std::forward<BuilderFn>(builder)())
              .release();
        },
        [&](void *object) {
          *reinterpret_cast<T *>(object) = std::forward<BuilderFn>(builder)();
        });
    return *static_cast<T *>(result);
  }

private:
  // Type-erased deleter for T, stored alongside the owning unique_ptr.
  template <typename T> static void (*getDeleter())(void *) {
    return +[](void *data) { delete static_cast<T *>(data); };
  }
  // Shared slow path: inserts a placeholder entry, then either constructs
  // a fresh object (first use) or assigns over the stale one (invalidated).
  void *getImpl(rx::TypeId typeId, void (*deleter)(void *),
                rx::FunctionRef<void *()> constructor,
                rx::FunctionRef<void(void *)> placementConstructor) {
    auto [it, inserted] = mStorage.emplace(typeId, getNullPointer());
    if (inserted) {
      it->second.object =
          std::unique_ptr<void, void (*)(void *)>(constructor(), deleter);
    } else if (it->second.invalid) {
      placementConstructor(it->second.object.get());
      it->second.invalid = false;
    }
    return it->second.object.get();
  }
  // Empty owner with a no-op deleter, used as the placeholder value.
  static constexpr std::unique_ptr<void, void (*)(void *)> getNullPointer() {
    return {nullptr, [](void *) {}};
  }
  struct Entry {
    std::unique_ptr<void, void (*)(void *)> object;
    bool invalid = false; // stale: rebuild in place on next get<T>()
  };
  std::map<rx::TypeId, Entry> mStorage;
};
// Thin strong-typedef over DomTree<ir::Value> so AnalysisStorage can cache
// dominator and post-dominator trees as distinct types.
struct PostDomTree : graph::DomTree<ir::Value> {
  PostDomTree() = default;
  PostDomTree(graph::DomTree<ir::Value> &&other)
      : graph::DomTree<ir::Value>::DomTree(std::move(other)) {}
  // Builds the post-dominator tree of `cfg` rooted at `root`.
  PostDomTree(CFG &cfg, ir::Value root)
      : PostDomTree(buildPostDomTree(cfg, root)) {}
};
// Strong typedef for forward dominator trees; see PostDomTree above for
// the caching rationale.
struct DomTree : graph::DomTree<ir::Value> {
  DomTree() = default;
  DomTree(graph::DomTree<ir::Value> &&other)
      : graph::DomTree<ir::Value>::DomTree(std::move(other)) {}
  // Builds the dominator tree of `cfg`; null root means the CFG entry.
  DomTree(CFG &cfg, ir::Value root = nullptr)
      : DomTree(buildDomTree(cfg, root)) {}
};
// Tag<T, N>: a distinct type that behaves exactly like T. Used to cache
// several variants of the same analysis type (e.g. CFG with and without
// the loop-continue edge) under different keys in AnalysisStorage.
template <typename T, std::size_t> struct Tag : T {
  using T::T;
  using T::operator=;
  Tag(T &&other) : T(std::move(other)) {}
  Tag(const T &other) : T(other) {}
  Tag &operator=(T &&other) {
    T::operator=(std::move(other));
    return *this;
  }
  Tag &operator=(const T &other) {
    T::operator=(other);
    return *this;
  }
};
/// A node in the structured-control-flow construct tree. Each construct is
/// a single-entry region [header, merge); loop constructs additionally
/// carry the body and continue labels. Derived analyses (CFG views,
/// dom/post-dom trees, back edges) are cached lazily per construct.
struct Construct {
  // Initialized to nullptr: children.emplace_front() and the stack-local
  // in createTemporaryChild() default-construct Construct, which previously
  // left this raw pointer indeterminate until assigned.
  Construct *parent = nullptr;
  std::forward_list<Construct> children;
  ir::Value header;       // entry label of the region
  ir::Value merge;        // label where control re-converges (exclusive)
  ir::Value loopBody;     // loop constructs only
  ir::Value loopContinue; // loop constructs only
  AnalysisStorage analysis;

  /// Creates the root construct for `region`, seeding its CFG cache from
  /// the region's first instruction and taking the entry label as header.
  static std::unique_ptr<Construct> createRoot(ir::RegionLike region,
                                               ir::Value merge) {
    auto result = std::make_unique<Construct>();
    auto &cfg =
        result->analysis.get<CFG>([&] { return buildCFG(region.getFirst()); });
    result->header = cfg.getEntryLabel();
    result->merge = merge;
    return result;
  }

  /// Creates a child construct (selection) owned by this node.
  Construct *createChild(ir::Value header, ir::Value merge) {
    auto &result = children.emplace_front();
    result.parent = this;
    result.header = header;
    result.merge = merge;
    return &result;
  }

  /// Creates a child loop construct owned by this node.
  Construct *createChild(ir::Value header, ir::Value merge,
                         ir::Value loopContinue, ir::Value loopBody) {
    auto &result = children.emplace_front();
    result.parent = this;
    result.header = header;
    result.merge = merge;
    result.loopContinue = loopContinue;
    result.loopBody = loopBody;
    return &result;
  }

  /// Creates a construct that is NOT linked into `children` — useful for
  /// speculative analysis that may be discarded.
  Construct createTemporaryChild(ir::Value header, ir::Value merge) {
    Construct result;
    result.parent = this;
    result.header = header;
    result.merge = merge;
    return result;
  }

  /// CFG restricted to this construct: a view of the parent CFG bounded by
  /// {header, merge}, or a freshly built CFG for the root.
  CFG &getCfg() {
    return analysis.get<CFG>([this] {
      if (parent != nullptr) {
        return parent->getCfg().buildView(
            header,
            &parent->getPostDomTree(),
            {header, merge});
      }
      return buildCFG(header);
    });
  }

  /// Like getCfg(), but with the loop-continue edge cut; falls back to
  /// getCfg() for non-loop constructs.
  CFG &getCfgWithoutContinue() {
    if (loopContinue == nullptr) {
      return getCfg();
    }
    return analysis.get<Tag<CFG, kWithoutContinue>>([this] {
      if (parent != nullptr) {
        return parent->getCfg().buildView(
            header,
            &parent->getPostDomTree(),
            {header, merge}, loopContinue);
      }
      return buildCFG(header, {}, loopContinue);
    });
  }

  DomTree &getDomTree() { return analysis.get<DomTree>(getCfg(), header); }
  PostDomTree &getPostDomTree() {
    return analysis.get<PostDomTree>(getCfg(), merge);
  }
  BackEdgeStorage &getBackEdgeStorage() {
    return analysis.get<BackEdgeStorage>(getCfg());
  }
  BackEdgeStorage &getBackEdgeWithoutContinueStorage() {
    if (loopContinue == nullptr) {
      return getBackEdgeStorage();
    }
    return analysis.get<Tag<BackEdgeStorage, kWithoutContinue>>(
        getCfgWithoutContinue());
  }

  auto getBackEdges(ir::Value node) { return getBackEdgeStorage().get(node); }
  auto getBackEdgesWithoutContinue(ir::Value node) {
    return getBackEdgeWithoutContinueStorage().get(node);
  }
  auto getBackEdges() { return getBackEdges(header); }

  void invalidate();
  void invalidateAll();

  bool isNull() const { return header == nullptr; }
  void removeLastChild() { children.pop_front(); }

private:
  enum {
    kWithoutContinue, // AnalysisStorage tag for continue-edge-free variants
  };
};
} // namespace shader

View file

@ -0,0 +1,78 @@
#pragma once
#include "dialect/builtin.hpp" // IWYU pragma: export
#include "dialect/ds.hpp" // IWYU pragma: export
#include "dialect/exp.hpp" // IWYU pragma: export
#include "dialect/memssa.hpp" // IWYU pragma: export
#include "dialect/mimg.hpp" // IWYU pragma: export
#include "dialect/mtbuf.hpp" // IWYU pragma: export
#include "dialect/mubuf.hpp" // IWYU pragma: export
#include "dialect/smrd.hpp" // IWYU pragma: export
#include "dialect/sop1.hpp" // IWYU pragma: export
#include "dialect/sop2.hpp" // IWYU pragma: export
#include "dialect/sopc.hpp" // IWYU pragma: export
#include "dialect/sopk.hpp" // IWYU pragma: export
#include "dialect/sopp.hpp" // IWYU pragma: export
#include "dialect/vintrp.hpp" // IWYU pragma: export
#include "dialect/vop1.hpp" // IWYU pragma: export
#include "dialect/vop2.hpp" // IWYU pragma: export
#include "dialect/vop3.hpp" // IWYU pragma: export
#include "dialect/vopc.hpp" // IWYU pragma: export
#include "dialect/spv.hpp" // IWYU pragma: export
#include "dialect/amdgpu.hpp" // IWYU pragma: export
#include <concepts>
namespace shader::ir {
// Opcode-enum -> dialect Kind mapping. One specialization per dialect;
// getInstructionId() below relies on these to encode (kind, op) pairs.
template <> inline constexpr Kind kOpToKind<spv::Op> = Kind::Spv;
template <> inline constexpr Kind kOpToKind<builtin::Op> = Kind::Builtin;
template <> inline constexpr Kind kOpToKind<amdgpu::Op> = Kind::AmdGpu;
template <> inline constexpr Kind kOpToKind<vop2::Op> = Kind::Vop2;
template <> inline constexpr Kind kOpToKind<sop2::Op> = Kind::Sop2;
template <> inline constexpr Kind kOpToKind<sopk::Op> = Kind::Sopk;
template <> inline constexpr Kind kOpToKind<smrd::Op> = Kind::Smrd;
template <> inline constexpr Kind kOpToKind<vop3::Op> = Kind::Vop3;
template <> inline constexpr Kind kOpToKind<mubuf::Op> = Kind::Mubuf;
template <> inline constexpr Kind kOpToKind<mtbuf::Op> = Kind::Mtbuf;
template <> inline constexpr Kind kOpToKind<mimg::Op> = Kind::Mimg;
template <> inline constexpr Kind kOpToKind<ds::Op> = Kind::Ds;
template <> inline constexpr Kind kOpToKind<vintrp::Op> = Kind::Vintrp;
template <> inline constexpr Kind kOpToKind<exp::Op> = Kind::Exp;
template <> inline constexpr Kind kOpToKind<vop1::Op> = Kind::Vop1;
template <> inline constexpr Kind kOpToKind<vopc::Op> = Kind::Vopc;
template <> inline constexpr Kind kOpToKind<sop1::Op> = Kind::Sop1;
template <> inline constexpr Kind kOpToKind<sopc::Op> = Kind::Sopc;
template <> inline constexpr Kind kOpToKind<sopp::Op> = Kind::Sopp;
template <> inline constexpr Kind kOpToKind<memssa::Op> = Kind::MemSSA;
// Builds a full InstructionId from a dialect opcode, inferring the Kind
// from the enum's kOpToKind mapping. Only enabled for mapped enum types.
template <typename T>
  requires(kOpToKind<std::remove_cvref_t<T>> != Kind::Count)
constexpr InstructionId getInstructionId(T op) {
  return getInstructionId(kOpToKind<std::remove_cvref_t<T>>, op);
}
// An instruction equals an InstructionId when it is non-null and carries
// that id; a null instruction compares unequal to every id.
constexpr bool operator==(ir::Instruction lhs, InstructionId rhs) {
  if (!lhs) {
    return false;
  }
  return lhs.getInstId() == rhs;
}
// Heterogeneous comparison via instruction ids: enabled whenever
// getInstructionId(lhs) is comparable to rhs (e.g. comparing a dialect
// opcode against an Instruction). Constrained to L != R so it never
// competes with regular same-type operator==.
template <typename L, typename R>
constexpr bool operator==(L lhs, R rhs)
  requires requires {
    requires(!std::is_same_v<L, R>);
    { getInstructionId(lhs) == rhs } -> std::convertible_to<bool>;
  }
{
  return getInstructionId(lhs) == rhs;
}
// Analogous heterogeneous comparison for type ids.
template <typename L, typename R>
constexpr bool operator==(L lhs, R rhs)
  requires requires {
    requires(!std::is_same_v<L, R>);
    { getTypeId(lhs) == rhs } -> std::convertible_to<bool>;
  }
{
  return getTypeId(lhs) == rhs;
}
} // namespace shader::ir

View file

@ -0,0 +1,57 @@
#pragma once
namespace shader::ir::amdgpu {
// Synthetic ops introduced during GCN -> SPIR-V translation.
enum Op {
  EXEC_TEST,
  BRANCH,
  IMM,
  USER_SGPR,
  VBUFFER,
  SAMPLER,
  TBUFFER,
  POINTER,
  OMOD,
  NEG_ABS,
  PS_INPUT_VGPR,
  PS_COMP_SWAP,
  VS_GET_INDEX,
  RESOURCE_PHI,
  OpCount,
};

/// Mnemonic for a synthetic amdgpu op; nullptr for out-of-range ids.
inline const char *getInstructionName(unsigned op) {
  // Indexed by Op; keep in sync with the enum above.
  static constexpr const char *kNames[] = {
      "exec_test",    "branch",       "imm",          "user_sgpr",
      "vbuffer",      "sampler",      "tbuffer",      "pointer",
      "omod",         "neg_abs",      "ps_input_vgpr", "ps_comp_swap",
      "vs_get_index", "resource_phi",
  };
  static_assert(sizeof(kNames) / sizeof(kNames[0]) == OpCount);
  if (op >= OpCount) {
    return nullptr;
  }
  return kNames[op];
}
} // namespace shader::ir::amdgpu

View file

@ -0,0 +1,193 @@
#pragma once
#include "../ir/Block.hpp"
#include "../ir/Builder.hpp"
#include "../ir/Value.hpp"
namespace shader::ir {
// Primary template of the opcode-enum -> dialect Kind map. Kind::Count
// means "no mapping"; each dialect header adds a specialization (see
// instructions.hpp).
template <typename T> inline constexpr Kind kOpToKind = Kind::Count;
}
namespace shader::ir::builtin {
// Structural pseudo-ops used by the structurizer.
enum Op {
  INVALID_INSTRUCTION,
  BLOCK,
  IF_ELSE,
  LOOP,
};

/// Mnemonic for a builtin op; nullptr for out-of-range ids.
inline const char *getInstructionName(unsigned id) {
  static constexpr const char *kNames[] = {
      "<invalid instruction>",
      "block",
      "ifElse",
      "loop",
  };
  if (id >= sizeof(kNames) / sizeof(kNames[0])) {
    return nullptr;
  }
  return kNames[id];
}
template <typename ImplT>
struct Builder : BuilderFacade<Builder<ImplT>, ImplT> {
  /**
   * Creates an invalid instruction with the given location.
   *
   * @param location the location of the instruction
   *
   * @return the created invalid instruction
   */
  Instruction createInvalidInstruction(Location location) {
    return this->template create<Instruction>(location, Kind::Builtin,
                                              INVALID_INSTRUCTION);
  }

  /**
   * Creates an ifElse instruction from a condition and one or two blocks.
   * The false block is only appended as an operand when present.
   */
  Instruction createIfElse(Location location, Value cond, Block ifTrue,
                           Block ifFalse = {}) {
    std::vector<Operand> operands = {{cond, ifTrue}};
    if (ifFalse) {
      operands.push_back(ifFalse);
    }
    return this->template create<Instruction>(location, Kind::Builtin, IF_ELSE,
                                              operands);
  }

  /**
   * Creates a loop instruction wrapping the given body block.
   * Bug fix: previously emitted IF_ELSE as the opcode, so "loops" were
   * indistinguishable from selections.
   */
  Instruction createLoop(Location location, Block body) {
    return this->template create<Instruction>(location, Kind::Builtin, LOOP,
                                              {{body}});
  }

  /** Creates an empty block at the given location. */
  auto createBlock(Location location) {
    return this->template create<Block>(location);
  }

  /** Creates an empty region at the given location. */
  auto createRegion(Location location) {
    return this->getContext().template create<Region>(location);
  }

  /**
   * Creates an instruction with the given location, kind, op, and operands.
   *
   * @param location the location of the instruction
   * @param kind the kind of the instruction
   * @param op the opcode of the instruction
   * @param operands the operands of the instruction
   *
   * @return the created instruction
   */
  Instruction createInstruction(Location location, Kind kind, unsigned op,
                                std::span<const Operand> operands = {}) {
    return this->template create<Instruction>(location, kind, op, operands);
  }

  /**
   * Overload inferring the Kind from the opcode's enum type (kOpToKind).
   */
  template <typename OpT>
  Instruction createInstruction(Location location, OpT &&op,
                                std::span<const Operand> operands = {})
    requires requires {
      this->template create<Instruction>(
          location, kOpToKind<std::remove_cvref_t<OpT>>, op, operands);
    }
  {
    return this->template create<Instruction>(
        location, kOpToKind<std::remove_cvref_t<OpT>>, op, operands);
  }

  /**
   * Creates an Instruction object with the given location, kind, opcode, and
   * operands.
   *
   * @param location the location of the instruction
   * @param kind the kind of the instruction
   * @param op the opcode of the instruction
   * @param operands variadic parameter pack of operands for the instruction
   *
   * @return the created Instruction object
   */
  template <typename... T>
  Instruction createInstruction(Location location, Kind kind, unsigned op,
                                T &&...operands)
    requires requires {
      createInstruction(location, kind, op,
                        {{Operand(std::forward<T>(operands))...}});
    }
  {
    return createInstruction(location, kind, op,
                             {{Operand(std::forward<T>(operands))...}});
  }

  /** Variadic-operand overload with Kind inferred from the opcode type. */
  template <typename OpT, typename... T>
  Instruction createInstruction(Location location, OpT &&op, T &&...operands)
    requires requires {
      createInstruction(location, std::forward<OpT>(op),
                        {{Operand(std::forward<T>(operands))...}});
    }
  {
    return createInstruction(location, std::forward<OpT>(op),
                             {{Operand(std::forward<T>(operands))...}});
  }

  /**
   * Creates a Value object with the given location, kind, opcode, and
   * operands.
   *
   * @param location the location of the Value object
   * @param kind the kind of the Value object
   * @param op the opcode of the Value object
   * @param operands a span of operands for the Value object
   *
   * @return the created Value object
   */
  auto createValue(Location location, Kind kind, unsigned op,
                   std::span<const Operand> operands = {}) {
    return this->template create<Value>(location, kind, op, operands);
  }

  /** Overload inferring the Kind from the opcode's enum type. */
  template <typename OpT>
  auto createValue(Location location, OpT &&op,
                   std::span<const Operand> operands = {})
    requires requires {
      this->template create<Value>(
          location, kOpToKind<std::remove_cvref_t<OpT>>, op, operands);
    }
  {
    return this->template create<Value>(
        location, kOpToKind<std::remove_cvref_t<OpT>>, op, operands);
  }

  /**
   * Creates a Value object with the given location, kind, opcode, and
   * operands.
   *
   * @param location the location of the Value object
   * @param kind the kind of the Value object
   * @param op the opcode of the Value object
   * @param operands variadic parameter pack of operands for the Value object
   *
   * @return the created Value object
   */
  template <typename... T>
  auto createValue(Location location, Kind kind, unsigned op, T &&...operands)
    requires requires {
      createValue(location, kind, op,
                  {{Operand(std::forward<T>(operands))...}});
    }
  {
    return createValue(location, kind, op,
                       {{Operand(std::forward<T>(operands))...}});
  }

  /** Variadic-operand overload with Kind inferred from the opcode type. */
  template <typename OpT, typename... T>
    requires requires { kOpToKind<std::remove_cvref_t<OpT>>; }
  auto createValue(Location location, OpT &&op, T &&...operands)
    requires requires {
      createValue(location, std::forward<OpT>(op),
                  {{Operand(std::forward<T>(operands))...}});
    }
  {
    return createValue(location, std::forward<OpT>(op),
                       {{Operand(std::forward<T>(operands))...}});
  }
};
} // namespace shader::ir::builtin

View file

@ -0,0 +1,294 @@
#pragma once
namespace shader::ir::ds {
// LDS/GDS (DS-encoding) opcodes. Values mirror the hardware encoding, so
// explicit "= N" initializers mark gaps where encodings are reserved.
enum Op {
  ADD_U32,
  SUB_U32,
  RSUB_U32,
  INC_U32,
  DEC_U32,
  MIN_I32,
  MAX_I32,
  MIN_U32,
  MAX_U32,
  AND_B32,
  OR_B32,
  XOR_B32,
  MSKOR_B32,
  WRITE_B32,
  WRITE2_B32,
  WRITE2ST64_B32,
  CMPST_B32,
  CMPST_F32,
  MIN_F32,
  MAX_F32,
  NOP,
  GWS_SEMA_RELEASE_ALL = 24,
  GWS_INIT,
  GWS_SEMA_V,
  GWS_SEMA_BR,
  GWS_SEMA_P,
  GWS_BARRIER,
  WRITE_B8,
  WRITE_B16,
  ADD_RTN_U32,
  SUB_RTN_U32,
  RSUB_RTN_U32,
  INC_RTN_U32,
  DEC_RTN_U32,
  MIN_RTN_I32,
  MAX_RTN_I32,
  MIN_RTN_U32,
  MAX_RTN_U32,
  AND_RTN_B32,
  OR_RTN_B32,
  XOR_RTN_B32,
  MSKOR_RTN_B32,
  WRXCHG_RTN_B32,
  WRXCHG2_RTN_B32,
  WRXCHG2ST64_RTN_B32,
  CMPST_RTN_B32,
  CMPST_RTN_F32,
  MIN_RTN_F32,
  MAX_RTN_F32,
  WRAP_RTN_B32,
  SWIZZLE_B32,
  READ_B32,
  READ2_B32,
  READ2ST64_B32,
  READ_I8,
  READ_U8,
  READ_I16,
  READ_U16,
  CONSUME,
  APPEND,
  ORDERED_COUNT,
  ADD_U64,
  SUB_U64,
  RSUB_U64,
  INC_U64,
  DEC_U64,
  MIN_I64,
  MAX_I64,
  MIN_U64,
  MAX_U64,
  AND_B64,
  OR_B64,
  XOR_B64,
  MSKOR_B64,
  WRITE_B64,
  WRITE2_B64,
  WRITE2ST64_B64,
  CMPST_B64,
  CMPST_F64,
  MIN_F64,
  MAX_F64,
  ADD_RTN_U64 = 96,
  SUB_RTN_U64,
  RSUB_RTN_U64,
  INC_RTN_U64,
  DEC_RTN_U64,
  MIN_RTN_I64,
  MAX_RTN_I64,
  MIN_RTN_U64,
  MAX_RTN_U64,
  AND_RTN_B64,
  OR_RTN_B64,
  XOR_RTN_B64,
  MSKOR_RTN_B64,
  WRXCHG_RTN_B64,
  WRXCHG2_RTN_B64,
  WRXCHG2ST64_RTN_B64,
  CMPST_RTN_B64,
  CMPST_RTN_F64,
  MIN_RTN_F64,
  MAX_RTN_F64,
  READ_B64 = 118,
  READ2_B64,
  READ2ST64_B64,
  CONDXCHG32_RTN_B64 = 126,
  ADD_SRC2_U32 = 128,
  SUB_SRC2_U32,
  RSUB_SRC2_U32,
  INC_SRC2_U32,
  DEC_SRC2_U32,
  MIN_SRC2_I32,
  MAX_SRC2_I32,
  MIN_SRC2_U32,
  MAX_SRC2_U32,
  AND_SRC2_B32,
  OR_SRC2_B32,
  XOR_SRC2_B32,
  WRITE_SRC2_B32,
  MIN_SRC2_F32 = 146,
  MAX_SRC2_F32,
  ADD_SRC2_U64 = 192,
  SUB_SRC2_U64,
  RSUB_SRC2_U64,
  INC_SRC2_U64,
  DEC_SRC2_U64,
  MIN_SRC2_I64,
  MAX_SRC2_I64,
  MIN_SRC2_U64,
  MAX_SRC2_U64,
  AND_SRC2_B64,
  OR_SRC2_B64,
  XOR_SRC2_B64,
  WRITE_SRC2_B64,
  MIN_SRC2_F64 = 210,
  MAX_SRC2_F64,
  WRITE_B96 = 222,
  WRITE_B128,
  CONDXCHG32_RTN_B128 = 253,
  READ_B96,
  READ_B128,
  OpCount
};
// Mnemonic for a DS opcode. Returns nullptr for ids that fall in the
// reserved encoding gaps (see the enum above) or beyond OpCount.
inline const char *getInstructionName(unsigned id) {
  switch (id) {
  case ADD_U32: return "ds_add_u32";
  case SUB_U32: return "ds_sub_u32";
  case RSUB_U32: return "ds_rsub_u32";
  case INC_U32: return "ds_inc_u32";
  case DEC_U32: return "ds_dec_u32";
  case MIN_I32: return "ds_min_i32";
  case MAX_I32: return "ds_max_i32";
  case MIN_U32: return "ds_min_u32";
  case MAX_U32: return "ds_max_u32";
  case AND_B32: return "ds_and_b32";
  case OR_B32: return "ds_or_b32";
  case XOR_B32: return "ds_xor_b32";
  case MSKOR_B32: return "ds_mskor_b32";
  case WRITE_B32: return "ds_write_b32";
  case WRITE2_B32: return "ds_write2_b32";
  case WRITE2ST64_B32: return "ds_write2st64_b32";
  case CMPST_B32: return "ds_cmpst_b32";
  case CMPST_F32: return "ds_cmpst_f32";
  case MIN_F32: return "ds_min_f32";
  case MAX_F32: return "ds_max_f32";
  case NOP: return "ds_nop";
  case GWS_SEMA_RELEASE_ALL: return "ds_gws_sema_release_all";
  case GWS_INIT: return "ds_gws_init";
  case GWS_SEMA_V: return "ds_gws_sema_v";
  case GWS_SEMA_BR: return "ds_gws_sema_br";
  case GWS_SEMA_P: return "ds_gws_sema_p";
  case GWS_BARRIER: return "ds_gws_barrier";
  case WRITE_B8: return "ds_write_b8";
  case WRITE_B16: return "ds_write_b16";
  case ADD_RTN_U32: return "ds_add_rtn_u32";
  case SUB_RTN_U32: return "ds_sub_rtn_u32";
  case RSUB_RTN_U32: return "ds_rsub_rtn_u32";
  case INC_RTN_U32: return "ds_inc_rtn_u32";
  case DEC_RTN_U32: return "ds_dec_rtn_u32";
  case MIN_RTN_I32: return "ds_min_rtn_i32";
  case MAX_RTN_I32: return "ds_max_rtn_i32";
  case MIN_RTN_U32: return "ds_min_rtn_u32";
  case MAX_RTN_U32: return "ds_max_rtn_u32";
  case AND_RTN_B32: return "ds_and_rtn_b32";
  case OR_RTN_B32: return "ds_or_rtn_b32";
  case XOR_RTN_B32: return "ds_xor_rtn_b32";
  case MSKOR_RTN_B32: return "ds_mskor_rtn_b32";
  case WRXCHG_RTN_B32: return "ds_wrxchg_rtn_b32";
  case WRXCHG2_RTN_B32: return "ds_wrxchg2_rtn_b32";
  case WRXCHG2ST64_RTN_B32: return "ds_wrxchg2st64_rtn_b32";
  case CMPST_RTN_B32: return "ds_cmpst_rtn_b32";
  case CMPST_RTN_F32: return "ds_cmpst_rtn_f32";
  case MIN_RTN_F32: return "ds_min_rtn_f32";
  case MAX_RTN_F32: return "ds_max_rtn_f32";
  case WRAP_RTN_B32: return "ds_wrap_rtn_b32";
  case SWIZZLE_B32: return "ds_swizzle_b32";
  case READ_B32: return "ds_read_b32";
  case READ2_B32: return "ds_read2_b32";
  case READ2ST64_B32: return "ds_read2st64_b32";
  case READ_I8: return "ds_read_i8";
  case READ_U8: return "ds_read_u8";
  case READ_I16: return "ds_read_i16";
  case READ_U16: return "ds_read_u16";
  case CONSUME: return "ds_consume";
  case APPEND: return "ds_append";
  case ORDERED_COUNT: return "ds_ordered_count";
  case ADD_U64: return "ds_add_u64";
  case SUB_U64: return "ds_sub_u64";
  case RSUB_U64: return "ds_rsub_u64";
  case INC_U64: return "ds_inc_u64";
  case DEC_U64: return "ds_dec_u64";
  case MIN_I64: return "ds_min_i64";
  case MAX_I64: return "ds_max_i64";
  case MIN_U64: return "ds_min_u64";
  case MAX_U64: return "ds_max_u64";
  case AND_B64: return "ds_and_b64";
  case OR_B64: return "ds_or_b64";
  case XOR_B64: return "ds_xor_b64";
  case MSKOR_B64: return "ds_mskor_b64";
  case WRITE_B64: return "ds_write_b64";
  case WRITE2_B64: return "ds_write2_b64";
  case WRITE2ST64_B64: return "ds_write2st64_b64";
  case CMPST_B64: return "ds_cmpst_b64";
  case CMPST_F64: return "ds_cmpst_f64";
  case MIN_F64: return "ds_min_f64";
  case MAX_F64: return "ds_max_f64";
  case ADD_RTN_U64: return "ds_add_rtn_u64";
  case SUB_RTN_U64: return "ds_sub_rtn_u64";
  case RSUB_RTN_U64: return "ds_rsub_rtn_u64";
  case INC_RTN_U64: return "ds_inc_rtn_u64";
  case DEC_RTN_U64: return "ds_dec_rtn_u64";
  case MIN_RTN_I64: return "ds_min_rtn_i64";
  case MAX_RTN_I64: return "ds_max_rtn_i64";
  case MIN_RTN_U64: return "ds_min_rtn_u64";
  case MAX_RTN_U64: return "ds_max_rtn_u64";
  case AND_RTN_B64: return "ds_and_rtn_b64";
  case OR_RTN_B64: return "ds_or_rtn_b64";
  case XOR_RTN_B64: return "ds_xor_rtn_b64";
  case MSKOR_RTN_B64: return "ds_mskor_rtn_b64";
  case WRXCHG_RTN_B64: return "ds_wrxchg_rtn_b64";
  case WRXCHG2_RTN_B64: return "ds_wrxchg2_rtn_b64";
  case WRXCHG2ST64_RTN_B64: return "ds_wrxchg2st64_rtn_b64";
  case CMPST_RTN_B64: return "ds_cmpst_rtn_b64";
  case CMPST_RTN_F64: return "ds_cmpst_rtn_f64";
  case MIN_RTN_F64: return "ds_min_rtn_f64";
  case MAX_RTN_F64: return "ds_max_rtn_f64";
  case READ_B64: return "ds_read_b64";
  case READ2_B64: return "ds_read2_b64";
  case READ2ST64_B64: return "ds_read2st64_b64";
  case CONDXCHG32_RTN_B64: return "ds_condxchg32_rtn_b64";
  case ADD_SRC2_U32: return "ds_add_src2_u32";
  case SUB_SRC2_U32: return "ds_sub_src2_u32";
  case RSUB_SRC2_U32: return "ds_rsub_src2_u32";
  case INC_SRC2_U32: return "ds_inc_src2_u32";
  case DEC_SRC2_U32: return "ds_dec_src2_u32";
  case MIN_SRC2_I32: return "ds_min_src2_i32";
  case MAX_SRC2_I32: return "ds_max_src2_i32";
  case MIN_SRC2_U32: return "ds_min_src2_u32";
  case MAX_SRC2_U32: return "ds_max_src2_u32";
  case AND_SRC2_B32: return "ds_and_src2_b32";
  case OR_SRC2_B32: return "ds_or_src2_b32";
  case XOR_SRC2_B32: return "ds_xor_src2_b32";
  case WRITE_SRC2_B32: return "ds_write_src2_b32";
  case MIN_SRC2_F32: return "ds_min_src2_f32";
  case MAX_SRC2_F32: return "ds_max_src2_f32";
  case ADD_SRC2_U64: return "ds_add_src2_u64";
  case SUB_SRC2_U64: return "ds_sub_src2_u64";
  case RSUB_SRC2_U64: return "ds_rsub_src2_u64";
  case INC_SRC2_U64: return "ds_inc_src2_u64";
  case DEC_SRC2_U64: return "ds_dec_src2_u64";
  case MIN_SRC2_I64: return "ds_min_src2_i64";
  case MAX_SRC2_I64: return "ds_max_src2_i64";
  case MIN_SRC2_U64: return "ds_min_src2_u64";
  case MAX_SRC2_U64: return "ds_max_src2_u64";
  case AND_SRC2_B64: return "ds_and_src2_b64";
  case OR_SRC2_B64: return "ds_or_src2_b64";
  case XOR_SRC2_B64: return "ds_xor_src2_b64";
  case WRITE_SRC2_B64: return "ds_write_src2_b64";
  case MIN_SRC2_F64: return "ds_min_src2_f64";
  case MAX_SRC2_F64: return "ds_max_src2_f64";
  case WRITE_B96: return "ds_write_b96";
  case WRITE_B128: return "ds_write_b128";
  case CONDXCHG32_RTN_B128: return "ds_condxchg32_rtn_b128";
  case READ_B96: return "ds_read_b96";
  case READ_B128: return "ds_read_b128";
  }
  return nullptr;
}
} // namespace shader::ir::ds

View file

@ -0,0 +1,11 @@
#pragma once
namespace shader::ir::exp {
// EXP has a single opcode.
enum Op {
  EXP = 0,
  OpCount
};

/// Mnemonic for an EXP opcode. Now returns nullptr for unknown ids, for
/// consistency with every other dialect header (previously it returned
/// "exp" unconditionally, even for invalid opcodes).
inline const char *getInstructionName(unsigned op) {
  return op == EXP ? "exp" : nullptr;
}
} // namespace shader::ir::exp

View file

@ -0,0 +1,423 @@
#pragma once
#include "../ir/Block.hpp"
#include "../ir/Builder.hpp"
#include "../ir/Value.hpp"
#include "../ir/ValueImpl.hpp"
namespace shader::ir::memssa {
// Memory-SSA node kinds: variables, (phi-)definitions, uses, barriers and
// the scope terminators (jump/exit).
enum Op {
  OpVar,
  OpDef,
  OpPhi,
  OpUse,
  OpBarrier,
  OpJump,
  OpExit,
  OpCount,
};
// Mixin adding a back-link from a memory-SSA node to the IR instruction it
// models; the link is appended to the node's printed form.
template <typename BaseT> struct BaseImpl : BaseT {
  Instruction link;
  using BaseT::BaseT;
  using BaseT::operator=;
  void print(std::ostream &os, NameStorage &ns) const override {
    BaseT::print(os, ns);
    if (link) {
      os << " : ";
      link.print(os, ns);
    }
  }
};
// Wrapper-side accessor for the linked IR instruction.
template <typename ImplT, template <typename> typename BaseT>
struct BaseWrapper : BaseT<ImplT> {
  using BaseT<ImplT>::BaseT;
  using BaseT<ImplT>::operator=;
  Instruction getLinkedInst() const { return this->impl->link; }
};
// Impl types for the memory-SSA node kinds; each knows how to clone itself.
struct DefImpl : BaseImpl<ValueImpl> {
  using BaseImpl::BaseImpl;
  using BaseImpl::operator=;
  Node clone(Context &context, CloneMap &map) const override;
};
struct UseImpl : BaseImpl<InstructionImpl> {
  using BaseImpl::BaseImpl;
  using BaseImpl::operator=;
  Node clone(Context &context, CloneMap &map) const override;
};
struct VarImpl : BaseImpl<ValueImpl> {
  using BaseImpl::BaseImpl;
  using BaseImpl::operator=;
  Node clone(Context &context, CloneMap &map) const override;
};
// Phi shares DefImpl: a phi IS a definition with per-predecessor operands.
struct PhiImpl : DefImpl {
  using DefImpl::DefImpl;
  using DefImpl::operator=;
  Node clone(Context &context, CloneMap &map) const override;
};
using Use = BaseWrapper<UseImpl, InstructionWrapper>;
using Var = BaseWrapper<VarImpl, ValueWrapper>;
template <typename ImplT> struct DefWrapper : BaseWrapper<ImplT, ValueWrapper> {
  using BaseWrapper<ImplT, ValueWrapper>::BaseWrapper;
  using BaseWrapper<ImplT, ValueWrapper>::operator=;

  /// Adds `variable` and, transitively, all of its component variables as
  /// operands of this definition. The traversal mirrors buildMatchList in
  /// ScopeWrapper::findVarDef.
  void addVariable(Var variable) {
    this->addOperand(variable);
    std::vector<Var> workList;
    for (auto &comp : variable.getOperands()) {
      auto compVar = comp.getAsValue().staticCast<Var>();
      this->addOperand(compVar);
      if (compVar.getOperandCount() > 1) {
        workList.push_back(compVar);
      } else if (compVar.getOperandCount() == 1) {
        this->addOperand(compVar.getOperand(0).getAsValue().staticCast<Var>());
      }
    }
    while (!workList.empty()) {
      auto var = workList.back();
      workList.pop_back();
      for (auto &comp : var.getOperands()) {
        auto compVar = comp.getAsValue().staticCast<Var>();
        this->addOperand(compVar);
        if (compVar.getOperandCount() > 1) {
          // Bug fix: this used to push the already-popped `var`, which
          // re-enqueued the same node on every iteration (infinite loop)
          // and never descended into nested composites. Push the component
          // instead, matching the seeding loop above and findVarDef.
          workList.push_back(compVar);
        } else if (compVar.getOperandCount() == 1) {
          this->addOperand(
              compVar.getOperand(0).getAsValue().staticCast<Var>());
        }
      }
    }
  }

  /// The variable this node defines (operand 0).
  Var getRootVar() {
    return this->getOperand(0).getAsValue().template staticCast<Var>();
  }
  /// Component variable at `index` (0 is the root).
  Var getVar(std::size_t index) {
    return this->getOperand(index).getAsValue().template staticCast<Var>();
  }
};
// A memory-SSA scope: a block of memssa nodes linked to an IR label.
struct ScopeImpl : BaseImpl<ir::BlockImpl> {
  using BaseImpl::BaseImpl;
  using BaseImpl::operator=;
  Node clone(Context &context, CloneMap &map) const override;
};
template <typename ImplT> struct ScopeWrapper;
using Scope = ScopeWrapper<ScopeImpl>;
using Def = DefWrapper<DefImpl>;
template <typename ImplT> struct BarrierWrapper : DefWrapper<ImplT> {
  using DefWrapper<ImplT>::DefWrapper;
  using DefWrapper<ImplT>::operator=;
};
// NOTE(review): Barrier reuses PhiImpl rather than a dedicated impl —
// presumably intentional since a barrier defines "everything" like a phi
// merges definitions; confirm against the clone() implementations.
using Barrier = BarrierWrapper<PhiImpl>;
// Scope-level graph navigation and reaching-definition search.
template <typename ImplT>
struct ScopeWrapper : BaseWrapper<ImplT, ir::BlockWrapper> {
  using BaseWrapper<ImplT, ir::BlockWrapper>::BaseWrapper;
  using BaseWrapper<ImplT, ir::BlockWrapper>::operator=;
  // The unique successor scope, or null if the scope is empty, does not
  // end in a jump, or the jump has multiple targets.
  Scope getSingleSuccessor() {
    if (this->empty()) {
      return {};
    }
    auto terminator = this->getLast();
    if (terminator.getKind() != Kind::MemSSA || terminator.getOp() != OpJump) {
      return {};
    }
    if (terminator.getOperandCount() != 1) {
      return {};
    }
    return terminator.getOperand(0).getAsValue().template cast<Scope>();
  }
  // All successor scopes of the terminating jump (empty when no jump).
  std::vector<Scope> getSuccessors() {
    if (this->empty()) {
      return {};
    }
    auto terminator = this->getLast();
    if (terminator.getKind() != Kind::MemSSA || terminator.getOp() != OpJump) {
      return {};
    }
    std::vector<Scope> result;
    result.reserve(terminator.getOperandCount());
    for (auto &successor : terminator.getOperands()) {
      if (auto block = successor.getAsValue().template cast<Scope>()) {
        result.push_back(block);
      }
    }
    return result;
  }
  // Scopes whose jump instructions reference this scope.
  auto getPredecessors() {
    std::set<Scope> predecessors;
    for (auto &use : this->getUseList()) {
      if (use.user != OpJump) {
        continue;
      }
      if (auto userParent = use.user.getParent().template cast<Scope>()) {
        predecessors.insert(userParent);
      }
    }
    return predecessors;
  }
  // The unique predecessor scope, or null when there are zero or several.
  auto getSinglePredecessor() {
    Scope predecessor;
    for (auto &use : this->getUseList()) {
      if (use.user != OpJump) {
        continue;
      }
      if (auto userParent = use.user.getParent().template cast<Scope>()) {
        if (predecessor == nullptr) {
          predecessor = userParent;
        } else if (predecessor != userParent) {
          return Scope(nullptr);
        }
      }
    }
    return predecessor;
  }
  // Walks backwards from `point` (default: end of scope) looking for a
  // def/phi of `var` or of any of its component variables; a barrier
  // matches unconditionally since it defines everything. The component
  // set is built lazily on the first non-root candidate.
  Def findVarDef(Var var, Instruction point = nullptr) {
    if (point == nullptr) {
      point = this->getLast();
    }
    std::optional<std::set<Var>> compList;
    auto buildMatchList = [&] {
      std::set<Var> result;
      std::vector<Var> workList;
      for (auto comp : var.getOperands()) {
        auto compVar = comp.getAsValue().staticCast<Var>();
        result.insert(compVar);
        if (compVar.getOperandCount() > 1) {
          workList.push_back(compVar);
        } else if (compVar.getOperandCount() == 1) {
          result.insert(compVar.getOperand(0).getAsValue().staticCast<Var>());
        }
      }
      while (!workList.empty()) {
        auto var = workList.back();
        workList.pop_back();
        for (auto comp : var.getOperands()) {
          auto compVar = comp.getAsValue().staticCast<Var>();
          result.insert(compVar);
          if (compVar.getOperandCount() > 1) {
            workList.push_back(compVar);
          } else if (compVar.getOperandCount() == 1) {
            result.insert(compVar.getOperand(0).getAsValue().staticCast<Var>());
          }
        }
      }
      return result;
    };
    for (auto child : revRange(point)) {
      if (child.getKind() != Kind::MemSSA) {
        continue;
      }
      if (child.getOp() == OpDef || child.getOp() == OpPhi) {
        if (child.getOperand(0) == var) {
          return child.template staticCast<Def>();
        }
        if (!compList) {
          compList = buildMatchList();
        }
        if (compList->empty()) {
          continue;
        }
        if (compList->contains(
                child.getOperand(0).getAsValue().staticCast<Var>())) {
          return child.template staticCast<Def>();
        }
      }
      if (child.getOp() == OpBarrier) {
        // barrier is definition for everything
        return child.template staticCast<Def>();
      }
    }
    return {};
  }
};
// Phi node: operand 0 is the variable, followed by (scope, def) pairs,
// one per predecessor.
template <typename ImplT> struct PhiWrapper : ValueWrapper<ImplT> {
  using ValueWrapper<ImplT>::ValueWrapper;
  using ValueWrapper<ImplT>::operator=;
  // Appends a new (predecessor scope, definition) pair.
  void addValue(Scope scope, Def def) {
    this->addOperand(scope);
    this->addOperand(def);
  }
  // Set value for specified block or add new node
  // Returns true if node was added
  bool setValue(Scope pred, Def def) {
    for (std::size_t i = 1, end = this->getOperandCount(); i < end; i += 2) {
      if (pred == this->getOperand(i).getAsValue()) {
        this->replaceOperand(i + 1, def);
        return false;
      }
    }
    addValue(pred, def);
    return true;
  }
  // Definition incoming from predecessor `pred`, or null when absent.
  Def getDef(Scope pred) {
    for (std::size_t i = 1, end = this->getOperandCount(); i < end; i += 2) {
      if (pred == this->getOperand(i).getAsValue()) {
        return this->getOperand(i + 1).getAsValue().template staticCast<Def>();
      }
    }
    return {};
  }
  // True when the phi carries no (scope, def) pairs yet.
  bool empty() { return this->getOperandCount() < 2; }
  // If every incoming def is the same, returns it; otherwise null. Defs
  // live at even operand indices (2, 4, ...).
  Def getUniqDef() {
    if (empty()) {
      return {};
    }
    Def result = this->getOperand(2).getAsValue().template staticCast<Def>();
    for (std::size_t i = 4, end = this->getOperandCount(); i < end; i += 2) {
      if (this->getOperand(i) != result) {
        return {};
      }
    }
    return result;
  }
  // The merged variable (operand 0).
  Var getVar() {
    return this->getOperand(0).getAsValue().template staticCast<Var>();
  }
};
using Phi = PhiWrapper<PhiImpl>;
// Factory for memory-SSA nodes. Every node created here is tagged with
// Kind::MemSSA (where an opcode applies) and linked back to the IR
// instruction it mirrors via impl->link.
template <typename ImplT>
struct Builder : BuilderFacade<Builder<ImplT>, ImplT> {
  // Creates a Def node recording that `defInst` writes variable `var`.
  Def createDef(Instruction defInst, Var var) {
    Def node =
        this->template create<Def>(defInst.getLocation(), Kind::MemSSA, OpDef);
    node.impl->link = defInst;
    node.addOperand(var);
    return node;
  }

  // Creates a Scope node mirroring the label instruction `labelInst`.
  Scope createScope(ir::Instruction labelInst) {
    Scope node = this->template create<Scope>(labelInst.getLocation());
    node.impl->link = labelInst;
    return node;
  }

  // Creates a Phi node for `var`; incoming (scope, def) pairs are
  // attached later through the Phi wrapper.
  Phi createPhi(Var var) {
    Phi node =
        this->template create<Phi>(var.getLocation(), Kind::MemSSA, OpPhi);
    node.addOperand(var);
    return node;
  }

  // Creates a Use node recording that `useInst` reads memory.
  Use createUse(ir::Instruction useInst) {
    Use node =
        this->template create<Use>(useInst.getLocation(), Kind::MemSSA, OpUse);
    node.impl->link = useInst;
    return node;
  }

  // Convenience overload: creates a Use node already bound to the
  // definition `def` it reads.
  Use createUse(ir::Instruction useInst, Def def) {
    Use node = createUse(useInst);
    node.addOperand(def);
    return node;
  }

  // Creates a Var node mirroring the IR instruction `varInst`.
  Var createVar(ir::Instruction varInst) {
    Var node =
        this->template create<Var>(varInst.getLocation(), Kind::MemSSA, OpVar);
    node.impl->link = varInst;
    return node;
  }

  // Creates a Barrier node mirroring `barrierInst`.
  Barrier createBarrier(ir::Instruction barrierInst) {
    Barrier node = this->template create<Barrier>(barrierInst.getLocation(),
                                                  Kind::MemSSA, OpBarrier);
    node.impl->link = barrierInst;
    return node;
  }

  // Creates an unconditional control-flow marker at `loc`.
  Instruction createJump(Location loc) {
    return this->template create<Instruction>(loc, Kind::MemSSA, OpJump);
  }

  // Creates a function-exit marker at `loc`.
  Instruction createExit(Location loc) {
    return this->template create<Instruction>(loc, Kind::MemSSA, OpExit);
  }
};
// Returns the human-readable mnemonic for a memory-SSA opcode, or
// nullptr when `op` is not a memssa opcode.
inline const char *getInstructionName(unsigned op) {
  switch (op) {
  case OpVar: return "var";
  case OpDef: return "def";
  case OpPhi: return "phi";
  case OpUse: return "use";
  case OpBarrier: return "barrier";
  case OpJump: return "jump";
  case OpExit: return "exit";
  default: return nullptr;
  }
}
} // namespace shader::ir::memssa

View file

@ -0,0 +1,199 @@
#pragma once
namespace shader::ir::mimg {
// MIMG (image memory) opcodes; numeric values follow the GCN encoding,
// including the gaps between groups.
enum Op {
  LOAD,
  LOAD_MIP,
  LOAD_PCK,
  LOAD_PCK_SGN,
  LOAD_MIP_PCK,
  LOAD_MIP_PCK_SGN,
  STORE = 8,
  STORE_MIP,
  STORE_PCK,
  STORE_MIP_PCK,
  GET_RESINFO = 14,
  ATOMIC_SWAP,
  ATOMIC_CMPSWAP,
  ATOMIC_ADD,
  ATOMIC_SUB,
  ATOMIC_RSUB,
  ATOMIC_SMIN,
  ATOMIC_UMIN,
  ATOMIC_SMAX,
  ATOMIC_UMAX,
  ATOMIC_AND,
  ATOMIC_OR,
  ATOMIC_XOR,
  ATOMIC_INC,
  ATOMIC_DEC,
  ATOMIC_FCMPSWAP,
  ATOMIC_FMIN,
  ATOMIC_FMAX,
  SAMPLE,
  SAMPLE_CL,
  SAMPLE_D,
  SAMPLE_D_CL,
  SAMPLE_L,
  SAMPLE_B,
  SAMPLE_B_CL,
  SAMPLE_LZ,
  SAMPLE_C,
  SAMPLE_C_CL,
  SAMPLE_C_D,
  SAMPLE_C_D_CL,
  SAMPLE_C_L,
  SAMPLE_C_B,
  SAMPLE_C_B_CL,
  SAMPLE_C_LZ,
  SAMPLE_O,
  SAMPLE_CL_O,
  SAMPLE_D_O,
  SAMPLE_D_CL_O,
  SAMPLE_L_O,
  SAMPLE_B_O,
  SAMPLE_B_CL_O,
  SAMPLE_LZ_O,
  SAMPLE_C_O,
  SAMPLE_C_CL_O,
  SAMPLE_C_D_O,
  SAMPLE_C_D_CL_O,
  SAMPLE_C_L_O,
  SAMPLE_C_B_O,
  SAMPLE_C_B_CL_O,
  SAMPLE_C_LZ_O,
  GATHER4,
  GATHER4_CL,
  GATHER4_L = 68,
  GATHER4_B,
  GATHER4_B_CL,
  GATHER4_LZ,
  GATHER4_C,
  GATHER4_C_CL,
  GATHER4_C_L = 76,
  GATHER4_C_B,
  GATHER4_C_B_CL,
  GATHER4_C_LZ,
  GATHER4_O,
  GATHER4_CL_O,
  GATHER4_L_O = 84,
  GATHER4_B_O,
  GATHER4_B_CL_O,
  GATHER4_LZ_O,
  GATHER4_C_O,
  GATHER4_C_CL_O,
  GATHER4_C_L_O = 92,
  GATHER4_C_B_O,
  GATHER4_C_B_CL_O,
  GATHER4_C_LZ_O,
  GET_LOD,
  SAMPLE_CD = 104,
  SAMPLE_CD_CL,
  SAMPLE_C_CD,
  SAMPLE_C_CD_CL,
  SAMPLE_CD_O,
  SAMPLE_CD_CL_O,
  SAMPLE_C_CD_O,
  SAMPLE_C_CD_CL_O,
  OpCount
};

// Returns the assembly mnemonic for a MIMG opcode, or nullptr when `id`
// falls into an encoding gap or outside the opcode range. Opcodes are not
// contiguous, so the table is scanned; this is a debug/printing helper,
// not a hot path.
inline const char *getInstructionName(unsigned id) {
  static constexpr struct {
    unsigned op;
    const char *name;
  } kMnemonics[] = {
      {LOAD, "image_load"},
      {LOAD_MIP, "image_load_mip"},
      {LOAD_PCK, "image_load_pck"},
      {LOAD_PCK_SGN, "image_load_pck_sgn"},
      {LOAD_MIP_PCK, "image_load_mip_pck"},
      {LOAD_MIP_PCK_SGN, "image_load_mip_pck_sgn"},
      {STORE, "image_store"},
      {STORE_MIP, "image_store_mip"},
      {STORE_PCK, "image_store_pck"},
      {STORE_MIP_PCK, "image_store_mip_pck"},
      {GET_RESINFO, "image_get_resinfo"},
      {ATOMIC_SWAP, "image_atomic_swap"},
      {ATOMIC_CMPSWAP, "image_atomic_cmpswap"},
      {ATOMIC_ADD, "image_atomic_add"},
      {ATOMIC_SUB, "image_atomic_sub"},
      {ATOMIC_RSUB, "image_atomic_rsub"},
      {ATOMIC_SMIN, "image_atomic_smin"},
      {ATOMIC_UMIN, "image_atomic_umin"},
      {ATOMIC_SMAX, "image_atomic_smax"},
      {ATOMIC_UMAX, "image_atomic_umax"},
      {ATOMIC_AND, "image_atomic_and"},
      {ATOMIC_OR, "image_atomic_or"},
      {ATOMIC_XOR, "image_atomic_xor"},
      {ATOMIC_INC, "image_atomic_inc"},
      {ATOMIC_DEC, "image_atomic_dec"},
      {ATOMIC_FCMPSWAP, "image_atomic_fcmpswap"},
      {ATOMIC_FMIN, "image_atomic_fmin"},
      {ATOMIC_FMAX, "image_atomic_fmax"},
      {SAMPLE, "image_sample"},
      {SAMPLE_CL, "image_sample_cl"},
      {SAMPLE_D, "image_sample_d"},
      {SAMPLE_D_CL, "image_sample_d_cl"},
      {SAMPLE_L, "image_sample_l"},
      {SAMPLE_B, "image_sample_b"},
      {SAMPLE_B_CL, "image_sample_b_cl"},
      {SAMPLE_LZ, "image_sample_lz"},
      {SAMPLE_C, "image_sample_c"},
      {SAMPLE_C_CL, "image_sample_c_cl"},
      {SAMPLE_C_D, "image_sample_c_d"},
      {SAMPLE_C_D_CL, "image_sample_c_d_cl"},
      {SAMPLE_C_L, "image_sample_c_l"},
      {SAMPLE_C_B, "image_sample_c_b"},
      {SAMPLE_C_B_CL, "image_sample_c_b_cl"},
      {SAMPLE_C_LZ, "image_sample_c_lz"},
      {SAMPLE_O, "image_sample_o"},
      {SAMPLE_CL_O, "image_sample_cl_o"},
      {SAMPLE_D_O, "image_sample_d_o"},
      {SAMPLE_D_CL_O, "image_sample_d_cl_o"},
      {SAMPLE_L_O, "image_sample_l_o"},
      {SAMPLE_B_O, "image_sample_b_o"},
      {SAMPLE_B_CL_O, "image_sample_b_cl_o"},
      {SAMPLE_LZ_O, "image_sample_lz_o"},
      {SAMPLE_C_O, "image_sample_c_o"},
      {SAMPLE_C_CL_O, "image_sample_c_cl_o"},
      {SAMPLE_C_D_O, "image_sample_c_d_o"},
      {SAMPLE_C_D_CL_O, "image_sample_c_d_cl_o"},
      {SAMPLE_C_L_O, "image_sample_c_l_o"},
      {SAMPLE_C_B_O, "image_sample_c_b_o"},
      {SAMPLE_C_B_CL_O, "image_sample_c_b_cl_o"},
      {SAMPLE_C_LZ_O, "image_sample_c_lz_o"},
      {GATHER4, "image_gather4"},
      {GATHER4_CL, "image_gather4_cl"},
      {GATHER4_L, "image_gather4_l"},
      {GATHER4_B, "image_gather4_b"},
      {GATHER4_B_CL, "image_gather4_b_cl"},
      {GATHER4_LZ, "image_gather4_lz"},
      {GATHER4_C, "image_gather4_c"},
      {GATHER4_C_CL, "image_gather4_c_cl"},
      {GATHER4_C_L, "image_gather4_c_l"},
      {GATHER4_C_B, "image_gather4_c_b"},
      {GATHER4_C_B_CL, "image_gather4_c_b_cl"},
      {GATHER4_C_LZ, "image_gather4_c_lz"},
      {GATHER4_O, "image_gather4_o"},
      {GATHER4_CL_O, "image_gather4_cl_o"},
      {GATHER4_L_O, "image_gather4_l_o"},
      {GATHER4_B_O, "image_gather4_b_o"},
      {GATHER4_B_CL_O, "image_gather4_b_cl_o"},
      {GATHER4_LZ_O, "image_gather4_lz_o"},
      {GATHER4_C_O, "image_gather4_c_o"},
      {GATHER4_C_CL_O, "image_gather4_c_cl_o"},
      {GATHER4_C_L_O, "image_gather4_c_l_o"},
      {GATHER4_C_B_O, "image_gather4_c_b_o"},
      {GATHER4_C_B_CL_O, "image_gather4_c_b_cl_o"},
      {GATHER4_C_LZ_O, "image_gather4_c_lz_o"},
      {GET_LOD, "image_get_lod"},
      {SAMPLE_CD, "image_sample_cd"},
      {SAMPLE_CD_CL, "image_sample_cd_cl"},
      {SAMPLE_C_CD, "image_sample_c_cd"},
      {SAMPLE_C_CD_CL, "image_sample_c_cd_cl"},
      {SAMPLE_CD_O, "image_sample_cd_o"},
      {SAMPLE_CD_CL_O, "image_sample_cd_cl_o"},
      {SAMPLE_C_CD_O, "image_sample_c_cd_o"},
      {SAMPLE_C_CD_CL_O, "image_sample_c_cd_cl_o"},
  };
  for (const auto &entry : kMnemonics) {
    if (entry.op == id) {
      return entry.name;
    }
  }
  return nullptr;
}
} // namespace shader::ir::mimg

View file

@ -0,0 +1,37 @@
#pragma once
namespace shader::ir::mtbuf {
// MTBUF (typed buffer) opcodes; values are contiguous from zero per the
// GCN encoding.
enum Op {
  LOAD_FORMAT_X,
  LOAD_FORMAT_XY,
  LOAD_FORMAT_XYZ,
  LOAD_FORMAT_XYZW,
  STORE_FORMAT_X,
  STORE_FORMAT_XY,
  STORE_FORMAT_XYZ,
  STORE_FORMAT_XYZW,
  OpCount
};

// Returns the assembly mnemonic for an MTBUF opcode, or nullptr for
// out-of-range ids. Opcodes are contiguous, so the table is indexed
// directly.
inline const char *getInstructionName(unsigned id) {
  static constexpr const char *kMnemonics[] = {
      "tbuffer_load_format_x",   "tbuffer_load_format_xy",
      "tbuffer_load_format_xyz", "tbuffer_load_format_xyzw",
      "tbuffer_store_format_x",  "tbuffer_store_format_xy",
      "tbuffer_store_format_xyz", "tbuffer_store_format_xyzw",
  };
  static_assert(sizeof(kMnemonics) / sizeof(kMnemonics[0]) == OpCount,
                "mnemonic table out of sync with Op enum");
  return id < OpCount ? kMnemonics[id] : nullptr;
}
} // namespace shader::ir::mtbuf

View file

@ -0,0 +1,129 @@
#pragma once
namespace shader::ir::mubuf {
// MUBUF (untyped buffer) opcodes; numeric values follow the GCN encoding,
// including the gaps between groups.
enum Op {
  LOAD_FORMAT_X,
  LOAD_FORMAT_XY,
  LOAD_FORMAT_XYZ,
  LOAD_FORMAT_XYZW,
  STORE_FORMAT_X,
  STORE_FORMAT_XY,
  STORE_FORMAT_XYZ,
  STORE_FORMAT_XYZW,
  LOAD_UBYTE,
  LOAD_SBYTE,
  LOAD_USHORT,
  LOAD_SSHORT,
  LOAD_DWORD,
  LOAD_DWORDX2,
  LOAD_DWORDX4,
  LOAD_DWORDX3,
  STORE_BYTE = 24,
  STORE_SHORT = 26,
  STORE_DWORD = 28,
  STORE_DWORDX2,
  STORE_DWORDX4,
  STORE_DWORDX3,
  ATOMIC_SWAP = 48,
  ATOMIC_CMPSWAP,
  ATOMIC_ADD,
  ATOMIC_SUB,
  ATOMIC_RSUB,
  ATOMIC_SMIN,
  ATOMIC_UMIN,
  ATOMIC_SMAX,
  ATOMIC_UMAX,
  ATOMIC_AND,
  ATOMIC_OR,
  ATOMIC_XOR,
  ATOMIC_INC,
  ATOMIC_DEC,
  ATOMIC_FCMPSWAP,
  ATOMIC_FMIN,
  ATOMIC_FMAX,
  ATOMIC_SWAP_X2 = 80,
  ATOMIC_CMPSWAP_X2,
  ATOMIC_ADD_X2,
  ATOMIC_SUB_X2,
  ATOMIC_RSUB_X2,
  ATOMIC_SMIN_X2,
  ATOMIC_UMIN_X2,
  ATOMIC_SMAX_X2,
  ATOMIC_UMAX_X2,
  ATOMIC_AND_X2,
  ATOMIC_OR_X2,
  ATOMIC_XOR_X2,
  ATOMIC_INC_X2,
  ATOMIC_DEC_X2,
  ATOMIC_FCMPSWAP_X2,
  ATOMIC_FMIN_X2,
  ATOMIC_FMAX_X2,
  WBINVL1_SC_VOL = 112,
  WBINVL1,
  OpCount
};

// Returns the assembly mnemonic for a MUBUF opcode, or nullptr when `id`
// falls into an encoding gap or outside the opcode range. Opcodes are not
// contiguous, so the table is scanned; this is a debug/printing helper.
inline const char *getInstructionName(unsigned id) {
  static constexpr struct {
    unsigned op;
    const char *name;
  } kMnemonics[] = {
      {LOAD_FORMAT_X, "buffer_load_format_x"},
      {LOAD_FORMAT_XY, "buffer_load_format_xy"},
      {LOAD_FORMAT_XYZ, "buffer_load_format_xyz"},
      {LOAD_FORMAT_XYZW, "buffer_load_format_xyzw"},
      {STORE_FORMAT_X, "buffer_store_format_x"},
      {STORE_FORMAT_XY, "buffer_store_format_xy"},
      {STORE_FORMAT_XYZ, "buffer_store_format_xyz"},
      {STORE_FORMAT_XYZW, "buffer_store_format_xyzw"},
      {LOAD_UBYTE, "buffer_load_ubyte"},
      {LOAD_SBYTE, "buffer_load_sbyte"},
      {LOAD_USHORT, "buffer_load_ushort"},
      {LOAD_SSHORT, "buffer_load_sshort"},
      {LOAD_DWORD, "buffer_load_dword"},
      {LOAD_DWORDX2, "buffer_load_dwordx2"},
      {LOAD_DWORDX4, "buffer_load_dwordx4"},
      {LOAD_DWORDX3, "buffer_load_dwordx3"},
      {STORE_BYTE, "buffer_store_byte"},
      {STORE_SHORT, "buffer_store_short"},
      {STORE_DWORD, "buffer_store_dword"},
      {STORE_DWORDX2, "buffer_store_dwordx2"},
      {STORE_DWORDX4, "buffer_store_dwordx4"},
      {STORE_DWORDX3, "buffer_store_dwordx3"},
      {ATOMIC_SWAP, "buffer_atomic_swap"},
      {ATOMIC_CMPSWAP, "buffer_atomic_cmpswap"},
      {ATOMIC_ADD, "buffer_atomic_add"},
      {ATOMIC_SUB, "buffer_atomic_sub"},
      {ATOMIC_RSUB, "buffer_atomic_rsub"},
      {ATOMIC_SMIN, "buffer_atomic_smin"},
      {ATOMIC_UMIN, "buffer_atomic_umin"},
      {ATOMIC_SMAX, "buffer_atomic_smax"},
      {ATOMIC_UMAX, "buffer_atomic_umax"},
      {ATOMIC_AND, "buffer_atomic_and"},
      {ATOMIC_OR, "buffer_atomic_or"},
      {ATOMIC_XOR, "buffer_atomic_xor"},
      {ATOMIC_INC, "buffer_atomic_inc"},
      {ATOMIC_DEC, "buffer_atomic_dec"},
      {ATOMIC_FCMPSWAP, "buffer_atomic_fcmpswap"},
      {ATOMIC_FMIN, "buffer_atomic_fmin"},
      {ATOMIC_FMAX, "buffer_atomic_fmax"},
      {ATOMIC_SWAP_X2, "buffer_atomic_swap_x2"},
      {ATOMIC_CMPSWAP_X2, "buffer_atomic_cmpswap_x2"},
      {ATOMIC_ADD_X2, "buffer_atomic_add_x2"},
      {ATOMIC_SUB_X2, "buffer_atomic_sub_x2"},
      {ATOMIC_RSUB_X2, "buffer_atomic_rsub_x2"},
      {ATOMIC_SMIN_X2, "buffer_atomic_smin_x2"},
      {ATOMIC_UMIN_X2, "buffer_atomic_umin_x2"},
      {ATOMIC_SMAX_X2, "buffer_atomic_smax_x2"},
      {ATOMIC_UMAX_X2, "buffer_atomic_umax_x2"},
      {ATOMIC_AND_X2, "buffer_atomic_and_x2"},
      {ATOMIC_OR_X2, "buffer_atomic_or_x2"},
      {ATOMIC_XOR_X2, "buffer_atomic_xor_x2"},
      {ATOMIC_INC_X2, "buffer_atomic_inc_x2"},
      {ATOMIC_DEC_X2, "buffer_atomic_dec_x2"},
      {ATOMIC_FCMPSWAP_X2, "buffer_atomic_fcmpswap_x2"},
      {ATOMIC_FMIN_X2, "buffer_atomic_fmin_x2"},
      {ATOMIC_FMAX_X2, "buffer_atomic_fmax_x2"},
      {WBINVL1_SC_VOL, "buffer_wbinvl1_sc_vol"},
      {WBINVL1, "buffer_wbinvl1"},
  };
  for (const auto &entry : kMnemonics) {
    if (entry.op == id) {
      return entry.name;
    }
  }
  return nullptr;
}
} // namespace shader::ir::mubuf

View file

@ -0,0 +1,39 @@
#pragma once
namespace shader::ir::smrd {
// SMRD (scalar memory read) opcodes; numeric values follow the GCN
// encoding, including the gaps.
enum Op {
  LOAD_DWORD,
  LOAD_DWORDX2,
  LOAD_DWORDX4,
  LOAD_DWORDX8,
  LOAD_DWORDX16,
  BUFFER_LOAD_DWORD = 8,
  BUFFER_LOAD_DWORDX2,
  BUFFER_LOAD_DWORDX4,
  BUFFER_LOAD_DWORDX8,
  BUFFER_LOAD_DWORDX16,
  DCACHE_INV_VOL = 29,
  MEMTIME,
  DCACHE_INV,
  OpCount
};

// Returns the assembly mnemonic for an SMRD opcode, or nullptr when `id`
// falls into an encoding gap or outside the opcode range.
inline const char *getInstructionName(unsigned id) {
  static constexpr struct {
    unsigned op;
    const char *name;
  } kMnemonics[] = {
      {LOAD_DWORD, "s_load_dword"},
      {LOAD_DWORDX2, "s_load_dwordx2"},
      {LOAD_DWORDX4, "s_load_dwordx4"},
      {LOAD_DWORDX8, "s_load_dwordx8"},
      {LOAD_DWORDX16, "s_load_dwordx16"},
      {BUFFER_LOAD_DWORD, "s_buffer_load_dword"},
      {BUFFER_LOAD_DWORDX2, "s_buffer_load_dwordx2"},
      {BUFFER_LOAD_DWORDX4, "s_buffer_load_dwordx4"},
      {BUFFER_LOAD_DWORDX8, "s_buffer_load_dwordx8"},
      {BUFFER_LOAD_DWORDX16, "s_buffer_load_dwordx16"},
      {DCACHE_INV_VOL, "s_dcache_inv_vol"},
      {MEMTIME, "s_memtime"},
      {DCACHE_INV, "s_dcache_inv"},
  };
  for (const auto &entry : kMnemonics) {
    if (entry.op == id) {
      return entry.name;
    }
  }
  return nullptr;
}
} // namespace shader::ir::smrd

View file

@ -0,0 +1,109 @@
#pragma once
namespace shader::ir::sop1 {
// SOP1 (scalar, one source) opcodes; numeric values follow the GCN
// encoding, including the gaps.
enum Op {
  MOV_B32 = 3,
  MOV_B64,
  CMOV_B32,
  CMOV_B64,
  NOT_B32,
  NOT_B64,
  WQM_B32,
  WQM_B64,
  BREV_B32,
  BREV_B64,
  BCNT0_I32_B32,
  BCNT0_I32_B64,
  BCNT1_I32_B32,
  BCNT1_I32_B64,
  FF0_I32_B32,
  FF0_I32_B64,
  FF1_I32_B32,
  FF1_I32_B64,
  FLBIT_I32_B32,
  FLBIT_I32_B64,
  FLBIT_I32,
  FLBIT_I32_I64,
  SEXT_I32_I8,
  SEXT_I32_I16,
  BITSET0_B32,
  BITSET0_B64,
  BITSET1_B32,
  BITSET1_B64,
  GETPC_B64,
  SETPC_B64,
  SWAPPC_B64,
  AND_SAVEEXEC_B64 = 36,
  OR_SAVEEXEC_B64,
  XOR_SAVEEXEC_B64,
  ANDN2_SAVEEXEC_B64,
  ORN2_SAVEEXEC_B64,
  NAND_SAVEEXEC_B64,
  NOR_SAVEEXEC_B64,
  XNOR_SAVEEXEC_B64,
  QUADMASK_B32,
  QUADMASK_B64,
  MOVRELS_B32,
  MOVRELS_B64,
  MOVRELD_B32,
  MOVRELD_B64,
  CBRANCH_JOIN,
  ABS_I32 = 52,
  MOV_FED_B32,
  OpCount
};

// Returns the assembly mnemonic for a SOP1 opcode, or nullptr when `id`
// falls into an encoding gap or outside the opcode range.
inline const char *getInstructionName(unsigned id) {
  static constexpr struct {
    unsigned op;
    const char *name;
  } kMnemonics[] = {
      {MOV_B32, "s_mov_b32"},
      {MOV_B64, "s_mov_b64"},
      {CMOV_B32, "s_cmov_b32"},
      {CMOV_B64, "s_cmov_b64"},
      {NOT_B32, "s_not_b32"},
      {NOT_B64, "s_not_b64"},
      {WQM_B32, "s_wqm_b32"},
      {WQM_B64, "s_wqm_b64"},
      {BREV_B32, "s_brev_b32"},
      {BREV_B64, "s_brev_b64"},
      {BCNT0_I32_B32, "s_bcnt0_i32_b32"},
      {BCNT0_I32_B64, "s_bcnt0_i32_b64"},
      {BCNT1_I32_B32, "s_bcnt1_i32_b32"},
      {BCNT1_I32_B64, "s_bcnt1_i32_b64"},
      {FF0_I32_B32, "s_ff0_i32_b32"},
      {FF0_I32_B64, "s_ff0_i32_b64"},
      {FF1_I32_B32, "s_ff1_i32_b32"},
      {FF1_I32_B64, "s_ff1_i32_b64"},
      {FLBIT_I32_B32, "s_flbit_i32_b32"},
      {FLBIT_I32_B64, "s_flbit_i32_b64"},
      {FLBIT_I32, "s_flbit_i32"},
      {FLBIT_I32_I64, "s_flbit_i32_i64"},
      {SEXT_I32_I8, "s_sext_i32_i8"},
      {SEXT_I32_I16, "s_sext_i32_i16"},
      {BITSET0_B32, "s_bitset0_b32"},
      {BITSET0_B64, "s_bitset0_b64"},
      {BITSET1_B32, "s_bitset1_b32"},
      {BITSET1_B64, "s_bitset1_b64"},
      {GETPC_B64, "s_getpc_b64"},
      {SETPC_B64, "s_setpc_b64"},
      {SWAPPC_B64, "s_swappc_b64"},
      {AND_SAVEEXEC_B64, "s_and_saveexec_b64"},
      {OR_SAVEEXEC_B64, "s_or_saveexec_b64"},
      {XOR_SAVEEXEC_B64, "s_xor_saveexec_b64"},
      {ANDN2_SAVEEXEC_B64, "s_andn2_saveexec_b64"},
      {ORN2_SAVEEXEC_B64, "s_orn2_saveexec_b64"},
      {NAND_SAVEEXEC_B64, "s_nand_saveexec_b64"},
      {NOR_SAVEEXEC_B64, "s_nor_saveexec_b64"},
      {XNOR_SAVEEXEC_B64, "s_xnor_saveexec_b64"},
      {QUADMASK_B32, "s_quadmask_b32"},
      {QUADMASK_B64, "s_quadmask_b64"},
      {MOVRELS_B32, "s_movrels_b32"},
      {MOVRELS_B64, "s_movrels_b64"},
      {MOVRELD_B32, "s_movreld_b32"},
      {MOVRELD_B64, "s_movreld_b64"},
      {CBRANCH_JOIN, "s_cbranch_join"},
      {ABS_I32, "s_abs_i32"},
      {MOV_FED_B32, "s_mov_fed_b32"},
  };
  for (const auto &entry : kMnemonics) {
    if (entry.op == id) {
      return entry.name;
    }
  }
  return nullptr;
}
} // namespace shader::ir::sop1

View file

@ -0,0 +1,171 @@
#pragma once
#include "../ir.hpp"
namespace shader::ir::sop2 {
// SOP2 (scalar, two sources) opcodes; numeric values follow the GCN
// encoding, including the gap at 12-13.
enum Op {
  ADD_U32,
  SUB_U32,
  ADD_I32,
  SUB_I32,
  ADDC_U32,
  SUBB_U32,
  MIN_I32,
  MIN_U32,
  MAX_I32,
  MAX_U32,
  CSELECT_B32,
  CSELECT_B64,
  AND_B32 = 14,
  AND_B64,
  OR_B32,
  OR_B64,
  XOR_B32,
  XOR_B64,
  ANDN2_B32,
  ANDN2_B64,
  ORN2_B32,
  ORN2_B64,
  NAND_B32,
  NAND_B64,
  NOR_B32,
  NOR_B64,
  XNOR_B32,
  XNOR_B64,
  LSHL_B32,
  LSHL_B64,
  LSHR_B32,
  LSHR_B64,
  ASHR_I32,
  ASHR_I64,
  BFM_B32,
  BFM_B64,
  MUL_I32,
  BFE_U32,
  BFE_I32,
  BFE_U64,
  BFE_I64,
  CBRANCH_G_FORK,
  ABSDIFF_I32,
  LSHL1_ADD_U32,
  LSHL2_ADD_U32,
  LSHL3_ADD_U32,
  LSHL4_ADD_U32,
  PACK_LL_B32_B16,
  PACK_LH_B32_B16,
  PACK_HH_B32_B16,
  MUL_HI_U32,
  MUL_HI_I32,
  OpCount
};

// Returns the assembly mnemonic for a SOP2 opcode, or nullptr when `id`
// falls into an encoding gap or outside the opcode range.
inline const char *getInstructionName(unsigned id) {
  switch (id) {
  case ADD_U32: return "s_add_u32";
  case SUB_U32: return "s_sub_u32";
  case ADD_I32: return "s_add_i32";
  case SUB_I32: return "s_sub_i32";
  case ADDC_U32: return "s_addc_u32";
  case SUBB_U32: return "s_subb_u32";
  case MIN_I32: return "s_min_i32";
  case MIN_U32: return "s_min_u32";
  case MAX_I32: return "s_max_i32";
  case MAX_U32: return "s_max_u32";
  case CSELECT_B32: return "s_cselect_b32";
  case CSELECT_B64: return "s_cselect_b64";
  case AND_B32: return "s_and_b32";
  case AND_B64: return "s_and_b64";
  case OR_B32: return "s_or_b32";
  case OR_B64: return "s_or_b64";
  case XOR_B32: return "s_xor_b32";
  case XOR_B64: return "s_xor_b64";
  case ANDN2_B32: return "s_andn2_b32";
  case ANDN2_B64: return "s_andn2_b64";
  case ORN2_B32: return "s_orn2_b32";
  case ORN2_B64: return "s_orn2_b64";
  case NAND_B32: return "s_nand_b32";
  case NAND_B64: return "s_nand_b64";
  case NOR_B32: return "s_nor_b32";
  case NOR_B64: return "s_nor_b64";
  case XNOR_B32: return "s_xnor_b32";
  case XNOR_B64: return "s_xnor_b64";
  case LSHL_B32: return "s_lshl_b32";
  case LSHL_B64: return "s_lshl_b64";
  case LSHR_B32: return "s_lshr_b32";
  case LSHR_B64: return "s_lshr_b64";
  case ASHR_I32: return "s_ashr_i32";
  case ASHR_I64: return "s_ashr_i64";
  case BFM_B32: return "s_bfm_b32";
  case BFM_B64: return "s_bfm_b64";
  case MUL_I32: return "s_mul_i32";
  case BFE_U32: return "s_bfe_u32";
  case BFE_I32: return "s_bfe_i32";
  case BFE_U64: return "s_bfe_u64";
  case BFE_I64: return "s_bfe_i64";
  case CBRANCH_G_FORK: return "s_cbranch_g_fork";
  case ABSDIFF_I32: return "s_absdiff_i32";
  case LSHL1_ADD_U32: return "s_lshl1_add_u32";
  case LSHL2_ADD_U32: return "s_lshl2_add_u32";
  case LSHL3_ADD_U32: return "s_lshl3_add_u32";
  case LSHL4_ADD_U32: return "s_lshl4_add_u32";
  case PACK_LL_B32_B16: return "s_pack_ll_b32_b16";
  case PACK_LH_B32_B16: return "s_pack_lh_b32_b16";
  case PACK_HH_B32_B16: return "s_pack_hh_b32_b16";
  case MUL_HI_U32: return "s_mul_hi_u32";
  case MUL_HI_I32: return "s_mul_hi_i32";
  default: return nullptr;
  }
}
} // namespace shader::ir::sop2

View file

@ -0,0 +1,67 @@
#pragma once
namespace shader::ir::sopc {
// SOPC (scalar compare) opcodes; numeric values are contiguous from zero
// per the GCN encoding.
enum Op {
  CMP_EQ_I32,
  CMP_LG_I32,
  CMP_GT_I32,
  CMP_GE_I32,
  CMP_LT_I32,
  CMP_LE_I32,
  CMP_EQ_U32,
  CMP_LG_U32,
  CMP_GT_U32,
  CMP_GE_U32,
  CMP_LT_U32,
  CMP_LE_U32,
  BITCMP0_B32,
  BITCMP1_B32,
  BITCMP0_B64,
  BITCMP1_B64,
  SETVSKIP,
  ILLEGALD,
  OpCount
};

// Returns the assembly mnemonic for a SOPC opcode, or nullptr when `id`
// is outside the opcode range.
//
// Fix: the bitcmp/setvskip mnemonics previously lacked the scalar "s_"
// prefix that the GCN ISA uses (s_bitcmp0_b32 etc.) and that every other
// entry in this table carries.
inline const char *getInstructionName(unsigned id) {
  switch (id) {
  case CMP_EQ_I32: return "s_cmp_eq_i32";
  case CMP_LG_I32: return "s_cmp_lg_i32";
  case CMP_GT_I32: return "s_cmp_gt_i32";
  case CMP_GE_I32: return "s_cmp_ge_i32";
  case CMP_LT_I32: return "s_cmp_lt_i32";
  case CMP_LE_I32: return "s_cmp_le_i32";
  case CMP_EQ_U32: return "s_cmp_eq_u32";
  case CMP_LG_U32: return "s_cmp_lg_u32";
  case CMP_GT_U32: return "s_cmp_gt_u32";
  case CMP_GE_U32: return "s_cmp_ge_u32";
  case CMP_LT_U32: return "s_cmp_lt_u32";
  case CMP_LE_U32: return "s_cmp_le_u32";
  case BITCMP0_B32: return "s_bitcmp0_b32";
  case BITCMP1_B32: return "s_bitcmp1_b32";
  case BITCMP0_B64: return "s_bitcmp0_b64";
  case BITCMP1_B64: return "s_bitcmp1_b64";
  case SETVSKIP: return "s_setvskip";
  // NOTE(review): "illegald" is not a documented GCN mnemonic; kept
  // as-is pending confirmation against the ISA opcode tables.
  case ILLEGALD: return "illegald";
  default: return nullptr;
  }
}
} // namespace shader::ir::sopc

View file

@ -0,0 +1,73 @@
#pragma once
namespace shader::ir::sopk {
// SOPK (scalar with 16-bit immediate) opcodes; numeric values follow the
// GCN encoding, including the gap at 1.
enum Op {
  MOVK_I32,
  CMOVK_I32 = 2,
  CMPK_EQ_I32,
  CMPK_LG_I32,
  CMPK_GT_I32,
  CMPK_GE_I32,
  CMPK_LT_I32,
  CMPK_LE_I32,
  CMPK_EQ_U32,
  CMPK_LG_U32,
  CMPK_GT_U32,
  CMPK_GE_U32,
  CMPK_LT_U32,
  CMPK_LE_U32,
  ADDK_I32,
  MULK_I32,
  CBRANCH_I_FORK,
  GETREG_B32,
  SETREG_B32,
  SETREG_IMM,
  OpCount
};

// Returns the assembly mnemonic for a SOPK opcode, or nullptr when `id`
// falls into the encoding gap or outside the opcode range.
inline const char *getInstructionName(unsigned id) {
  switch (id) {
  case MOVK_I32: return "s_movk_i32";
  case CMOVK_I32: return "s_cmovk_i32";
  case CMPK_EQ_I32: return "s_cmpk_eq_i32";
  case CMPK_LG_I32: return "s_cmpk_lg_i32";
  case CMPK_GT_I32: return "s_cmpk_gt_i32";
  case CMPK_GE_I32: return "s_cmpk_ge_i32";
  case CMPK_LT_I32: return "s_cmpk_lt_i32";
  case CMPK_LE_I32: return "s_cmpk_le_i32";
  case CMPK_EQ_U32: return "s_cmpk_eq_u32";
  case CMPK_LG_U32: return "s_cmpk_lg_u32";
  case CMPK_GT_U32: return "s_cmpk_gt_u32";
  case CMPK_GE_U32: return "s_cmpk_ge_u32";
  case CMPK_LT_U32: return "s_cmpk_lt_u32";
  case CMPK_LE_U32: return "s_cmpk_le_u32";
  case ADDK_I32: return "s_addk_i32";
  case MULK_I32: return "s_mulk_i32";
  case CBRANCH_I_FORK: return "s_cbranch_i_fork";
  case GETREG_B32: return "s_getreg_b32";
  case SETREG_B32: return "s_setreg_b32";
  case SETREG_IMM: return "s_setreg_imm";
  default: return nullptr;
  }
}
} // namespace shader::ir::sopk

View file

@ -0,0 +1,89 @@
#pragma once
namespace shader::ir::sopp {
// SOPP (scalar program control) opcodes; numeric values follow the GCN
// encoding, including the gaps at 3 and 11.
enum Op {
  NOP,
  ENDPGM,
  BRANCH,
  CBRANCH_SCC0 = 4,
  CBRANCH_SCC1,
  CBRANCH_VCCZ,
  CBRANCH_VCCNZ,
  CBRANCH_EXECZ,
  CBRANCH_EXECNZ,
  BARRIER,
  WAITCNT = 12,
  SETHALT,
  SLEEP,
  SETPRIO,
  SENDMSG,
  SENDMSGHALT,
  TRAP,
  ICACHE_INV,
  INCPERFLEVEL,
  DECPERFLEVEL,
  TTRACEDATA,
  CBRANCH_CDBGSYS = 23,
  CBRANCH_CDBGUSER = 24,
  CBRANCH_CDBGSYS_OR_USER = 25,
  CBRANCH_CDBGSYS_AND_USER = 26,
  OpCount
};

// Returns the assembly mnemonic for a SOPP opcode, or nullptr when `id`
// falls into an encoding gap or outside the opcode range.
inline const char *getInstructionName(unsigned id) {
  switch (id) {
  case NOP: return "s_nop";
  case ENDPGM: return "s_endpgm";
  case BRANCH: return "s_branch";
  case CBRANCH_SCC0: return "s_cbranch_scc0";
  case CBRANCH_SCC1: return "s_cbranch_scc1";
  case CBRANCH_VCCZ: return "s_cbranch_vccz";
  case CBRANCH_VCCNZ: return "s_cbranch_vccnz";
  case CBRANCH_EXECZ: return "s_cbranch_execz";
  case CBRANCH_EXECNZ: return "s_cbranch_execnz";
  case BARRIER: return "s_barrier";
  case WAITCNT: return "s_waitcnt";
  case SETHALT: return "s_sethalt";
  case SLEEP: return "s_sleep";
  case SETPRIO: return "s_setprio";
  case SENDMSG: return "s_sendmsg";
  case SENDMSGHALT: return "s_sendmsghalt";
  case TRAP: return "s_trap";
  case ICACHE_INV: return "s_icache_inv";
  case INCPERFLEVEL: return "s_incperflevel";
  case DECPERFLEVEL: return "s_decperflevel";
  case TTRACEDATA: return "s_ttracedata";
  case CBRANCH_CDBGSYS: return "s_cbranch_cdbgsys";
  case CBRANCH_CDBGUSER: return "s_cbranch_cdbguser";
  case CBRANCH_CDBGSYS_OR_USER: return "s_cbranch_cdbgsys_or_user";
  case CBRANCH_CDBGSYS_AND_USER: return "s_cbranch_cdbgsys_and_user";
  default: return nullptr;
  }
}
} // namespace shader::ir::sopp

View file

@ -0,0 +1,23 @@
#pragma once
namespace shader::ir::vintrp {
// VINTRP (vector parameter interpolation) opcodes; values are contiguous
// from zero per the GCN encoding.
enum Op {
  P1_F32,
  P2_F32,
  MOV_F32,
  OpCount
};

// Returns the assembly mnemonic for a VINTRP opcode, or nullptr for
// out-of-range ids. Opcodes are contiguous, so the table is indexed
// directly.
inline const char *getInstructionName(unsigned id) {
  static constexpr const char *kMnemonics[] = {
      "v_interp_p1_f32",
      "v_interp_p2_f32",
      "v_interp_mov_f32",
  };
  static_assert(sizeof(kMnemonics) / sizeof(kMnemonics[0]) == OpCount,
                "mnemonic table out of sync with Op enum");
  return id < OpCount ? kMnemonics[id] : nullptr;
}
} // namespace shader::ir::vintrp

View file

@ -0,0 +1,259 @@
#pragma once
namespace shader::ir::vop1 {
// VOP1 (vector, one source) opcodes; numeric values follow the GCN
// encoding, including the gaps between groups.
enum Op {
  NOP,
  MOV_B32,
  READFIRSTLANE_B32,
  CVT_I32_F64,
  CVT_F64_I32,
  CVT_F32_I32,
  CVT_F32_U32,
  CVT_U32_F32,
  CVT_I32_F32,
  MOV_FED_B32,
  CVT_F16_F32,
  CVT_F32_F16,
  CVT_RPI_I32_F32,
  CVT_FLR_I32_F32,
  CVT_OFF_F32_I4,
  CVT_F32_F64,
  CVT_F64_F32,
  CVT_F32_UBYTE0,
  CVT_F32_UBYTE1,
  CVT_F32_UBYTE2,
  CVT_F32_UBYTE3,
  CVT_U32_F64,
  CVT_F64_U32,
  FRACT_F32 = 32,
  TRUNC_F32,
  CEIL_F32,
  RNDNE_F32,
  FLOOR_F32,
  EXP_F32,
  LOG_CLAMP_F32,
  LOG_F32,
  RCP_CLAMP_F32,
  RCP_LEGACY_F32,
  RCP_F32,
  RCP_IFLAG_F32,
  RSQ_CLAMP_F32,
  RSQ_LEGACY_F32,
  RSQ_F32,
  RCP_F64,
  RCP_CLAMP_F64,
  RSQ_F64,
  RSQ_CLAMP_F64,
  SQRT_F32,
  SQRT_F64,
  SIN_F32,
  COS_F32,
  NOT_B32,
  BFREV_B32,
  FFBH_U32,
  FFBL_B32,
  FFBH_I32,
  FREXP_EXP_I32_F64,
  FREXP_MANT_F64,
  FRACT_F64,
  FREXP_EXP_I32_F32,
  FREXP_MANT_F32,
  CLREXCP,
  MOVRELD_B32,
  MOVRELS_B32,
  MOVRELSD_B32,
  CVT_F16_U16 = 80,
  CVT_F16_I16,
  CVT_U16_F16,
  CVT_I16_F16,
  RCP_F16,
  SQRT_F16,
  RSQ_F16,
  LOG_F16,
  EXP_F16,
  FREXP_MANT_F16,
  FREXP_EXP_I16_F16,
  FLOOR_F16,
  CEIL_F16,
  TRUNC_F16,
  RNDNE_F16,
  FRACT_F16,
  SIN_F16,
  COS_F16,
  SAT_PK_U8_I16,
  CVT_NORM_I16_F16,
  CVT_NORM_U16_F16,
  SWAP_B32,
  OpCount
};

// Returns the assembly mnemonic for a VOP1 opcode, or nullptr when `id`
// falls into an encoding gap or outside the opcode range.
inline const char *getInstructionName(unsigned id) {
  switch (id) {
  case NOP: return "v_nop";
  case MOV_B32: return "v_mov_b32";
  case READFIRSTLANE_B32: return "v_readfirstlane_b32";
  case CVT_I32_F64: return "v_cvt_i32_f64";
  case CVT_F64_I32: return "v_cvt_f64_i32";
  case CVT_F32_I32: return "v_cvt_f32_i32";
  case CVT_F32_U32: return "v_cvt_f32_u32";
  case CVT_U32_F32: return "v_cvt_u32_f32";
  case CVT_I32_F32: return "v_cvt_i32_f32";
  case MOV_FED_B32: return "v_mov_fed_b32";
  case CVT_F16_F32: return "v_cvt_f16_f32";
  case CVT_F32_F16: return "v_cvt_f32_f16";
  case CVT_RPI_I32_F32: return "v_cvt_rpi_i32_f32";
  case CVT_FLR_I32_F32: return "v_cvt_flr_i32_f32";
  case CVT_OFF_F32_I4: return "v_cvt_off_f32_i4";
  case CVT_F32_F64: return "v_cvt_f32_f64";
  case CVT_F64_F32: return "v_cvt_f64_f32";
  case CVT_F32_UBYTE0: return "v_cvt_f32_ubyte0";
  case CVT_F32_UBYTE1: return "v_cvt_f32_ubyte1";
  case CVT_F32_UBYTE2: return "v_cvt_f32_ubyte2";
  case CVT_F32_UBYTE3: return "v_cvt_f32_ubyte3";
  case CVT_U32_F64: return "v_cvt_u32_f64";
  case CVT_F64_U32: return "v_cvt_f64_u32";
  case FRACT_F32: return "v_fract_f32";
  case TRUNC_F32: return "v_trunc_f32";
  case CEIL_F32: return "v_ceil_f32";
  case RNDNE_F32: return "v_rndne_f32";
  case FLOOR_F32: return "v_floor_f32";
  case EXP_F32: return "v_exp_f32";
  case LOG_CLAMP_F32: return "v_log_clamp_f32";
  case LOG_F32: return "v_log_f32";
  case RCP_CLAMP_F32: return "v_rcp_clamp_f32";
  case RCP_LEGACY_F32: return "v_rcp_legacy_f32";
  case RCP_F32: return "v_rcp_f32";
  case RCP_IFLAG_F32: return "v_rcp_iflag_f32";
  case RSQ_CLAMP_F32: return "v_rsq_clamp_f32";
  case RSQ_LEGACY_F32: return "v_rsq_legacy_f32";
  case RSQ_F32: return "v_rsq_f32";
  case RCP_F64: return "v_rcp_f64";
  case RCP_CLAMP_F64: return "v_rcp_clamp_f64";
  case RSQ_F64: return "v_rsq_f64";
  case RSQ_CLAMP_F64: return "v_rsq_clamp_f64";
  case SQRT_F32: return "v_sqrt_f32";
  case SQRT_F64: return "v_sqrt_f64";
  case SIN_F32: return "v_sin_f32";
  case COS_F32: return "v_cos_f32";
  case NOT_B32: return "v_not_b32";
  case BFREV_B32: return "v_bfrev_b32";
  case FFBH_U32: return "v_ffbh_u32";
  case FFBL_B32: return "v_ffbl_b32";
  case FFBH_I32: return "v_ffbh_i32";
  case FREXP_EXP_I32_F64: return "v_frexp_exp_i32_f64";
  case FREXP_MANT_F64: return "v_frexp_mant_f64";
  case FRACT_F64: return "v_fract_f64";
  case FREXP_EXP_I32_F32: return "v_frexp_exp_i32_f32";
  case FREXP_MANT_F32: return "v_frexp_mant_f32";
  case CLREXCP: return "v_clrexcp";
  case MOVRELD_B32: return "v_movreld_b32";
  case MOVRELS_B32: return "v_movrels_b32";
  case MOVRELSD_B32: return "v_movrelsd_b32";
  case CVT_F16_U16: return "v_cvt_f16_u16";
  case CVT_F16_I16: return "v_cvt_f16_i16";
  case CVT_U16_F16: return "v_cvt_u16_f16";
  case CVT_I16_F16: return "v_cvt_i16_f16";
  case RCP_F16: return "v_rcp_f16";
  case SQRT_F16: return "v_sqrt_f16";
  case RSQ_F16: return "v_rsq_f16";
  case LOG_F16: return "v_log_f16";
  case EXP_F16: return "v_exp_f16";
  case FREXP_MANT_F16: return "v_frexp_mant_f16";
  case FREXP_EXP_I16_F16: return "v_frexp_exp_i16_f16";
  case FLOOR_F16: return "v_floor_f16";
  case CEIL_F16: return "v_ceil_f16";
  case TRUNC_F16: return "v_trunc_f16";
  case RNDNE_F16: return "v_rndne_f16";
  case FRACT_F16: return "v_fract_f16";
  case SIN_F16: return "v_sin_f16";
  case COS_F16: return "v_cos_f16";
  case SAT_PK_U8_I16: return "v_sat_pk_u8_i16";
  case CVT_NORM_I16_F16: return "v_cvt_norm_i16_f16";
  case CVT_NORM_U16_F16: return "v_cvt_norm_u16_f16";
  case SWAP_B32: return "v_swap_b32";
  default: return nullptr;
  }
}
} // namespace shader::ir::vop1

View file

@ -0,0 +1,164 @@
#pragma once
namespace shader::ir::vop2 {
// VOP2 (vector, two sources) opcodes; values are contiguous from zero per
// the GCN encoding.
enum Op {
  CNDMASK_B32,
  READLANE_B32,
  WRITELANE_B32,
  ADD_F32,
  SUB_F32,
  SUBREV_F32,
  MAC_LEGACY_F32,
  MUL_LEGACY_F32,
  MUL_F32,
  MUL_I32_I24,
  MUL_HI_I32_I24,
  MUL_U32_U24,
  MUL_HI_U32_U24,
  MIN_LEGACY_F32,
  MAX_LEGACY_F32,
  MIN_F32,
  MAX_F32,
  MIN_I32,
  MAX_I32,
  MIN_U32,
  MAX_U32,
  LSHR_B32,
  LSHRREV_B32,
  ASHR_I32,
  ASHRREV_I32,
  LSHL_B32,
  LSHLREV_B32,
  AND_B32,
  OR_B32,
  XOR_B32,
  BFM_B32,
  MAC_F32,
  MADMK_F32,
  MADAK_F32,
  BCNT_U32_B32,
  MBCNT_LO_U32_B32,
  MBCNT_HI_U32_B32,
  ADD_I32,
  SUB_I32,
  SUBREV_I32,
  ADDC_U32,
  SUBB_U32,
  SUBBREV_U32,
  LDEXP_F32,
  CVT_PKACCUM_U8_F32,
  CVT_PKNORM_I16_F32,
  CVT_PKNORM_U16_F32,
  CVT_PKRTZ_F16_F32,
  CVT_PK_U16_U32,
  CVT_PK_I16_I32,
  OpCount
};

// Returns the assembly mnemonic for a VOP2 opcode, or nullptr when `id`
// is outside the opcode range.
inline const char *getInstructionName(unsigned id) {
  switch (id) {
  case CNDMASK_B32: return "v_cndmask_b32";
  case READLANE_B32: return "v_readlane_b32";
  case WRITELANE_B32: return "v_writelane_b32";
  case ADD_F32: return "v_add_f32";
  case SUB_F32: return "v_sub_f32";
  case SUBREV_F32: return "v_subrev_f32";
  case MAC_LEGACY_F32: return "v_mac_legacy_f32";
  case MUL_LEGACY_F32: return "v_mul_legacy_f32";
  case MUL_F32: return "v_mul_f32";
  case MUL_I32_I24: return "v_mul_i32_i24";
  case MUL_HI_I32_I24: return "v_mul_hi_i32_i24";
  case MUL_U32_U24: return "v_mul_u32_u24";
  case MUL_HI_U32_U24: return "v_mul_hi_u32_u24";
  case MIN_LEGACY_F32: return "v_min_legacy_f32";
  case MAX_LEGACY_F32: return "v_max_legacy_f32";
  case MIN_F32: return "v_min_f32";
  case MAX_F32: return "v_max_f32";
  case MIN_I32: return "v_min_i32";
  case MAX_I32: return "v_max_i32";
  case MIN_U32: return "v_min_u32";
  case MAX_U32: return "v_max_u32";
  case LSHR_B32: return "v_lshr_b32";
  case LSHRREV_B32: return "v_lshrrev_b32";
  case ASHR_I32: return "v_ashr_i32";
  case ASHRREV_I32: return "v_ashrrev_i32";
  case LSHL_B32: return "v_lshl_b32";
  case LSHLREV_B32: return "v_lshlrev_b32";
  case AND_B32: return "v_and_b32";
  case OR_B32: return "v_or_b32";
  case XOR_B32: return "v_xor_b32";
  case BFM_B32: return "v_bfm_b32";
  case MAC_F32: return "v_mac_f32";
  case MADMK_F32: return "v_madmk_f32";
  case MADAK_F32: return "v_madak_f32";
  case BCNT_U32_B32: return "v_bcnt_u32_b32";
  case MBCNT_LO_U32_B32: return "v_mbcnt_lo_u32_b32";
  case MBCNT_HI_U32_B32: return "v_mbcnt_hi_u32_b32";
  case ADD_I32: return "v_add_i32";
  case SUB_I32: return "v_sub_i32";
  case SUBREV_I32: return "v_subrev_i32";
  case ADDC_U32: return "v_addc_u32";
  case SUBB_U32: return "v_subb_u32";
  case SUBBREV_U32: return "v_subbrev_u32";
  case LDEXP_F32: return "v_ldexp_f32";
  case CVT_PKACCUM_U8_F32: return "v_cvt_pkaccum_u8_f32";
  case CVT_PKNORM_I16_F32: return "v_cvt_pknorm_i16_f32";
  case CVT_PKNORM_U16_F32: return "v_cvt_pknorm_u16_f32";
  case CVT_PKRTZ_F16_F32: return "v_cvt_pkrtz_f16_f32";
  case CVT_PK_U16_U32: return "v_cvt_pk_u16_u32";
  case CVT_PK_I16_I32: return "v_cvt_pk_i16_i32";
  default: return nullptr;
  }
}
} // namespace shader::ir::vop2

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,522 @@
#pragma once
namespace shader::ir::vopc {
// VOPC (vector compare) instruction opcodes.
// Enumerator order mirrors the hardware opcode numbering of the GCN ISA
// encoding tables; the explicit initializers below (176, 192) skip gaps in
// the opcode space. NOTE(review): assumed to match the AMD SI/CI/VI VOPC
// tables — confirm against the ISA manual before reordering anything.
enum Op {
CMP_F_F32,
CMP_LT_F32,
CMP_EQ_F32,
CMP_LE_F32,
CMP_GT_F32,
CMP_LG_F32,
CMP_GE_F32,
CMP_O_F32,
CMP_U_F32,
CMP_NGE_F32,
CMP_NLG_F32,
CMP_NGT_F32,
CMP_NLE_F32,
CMP_NEQ_F32,
CMP_NLT_F32,
CMP_TRU_F32,
CMPX_F_F32,
CMPX_LT_F32,
CMPX_EQ_F32,
CMPX_LE_F32,
CMPX_GT_F32,
CMPX_LG_F32,
CMPX_GE_F32,
CMPX_O_F32,
CMPX_U_F32,
CMPX_NGE_F32,
CMPX_NLG_F32,
CMPX_NGT_F32,
CMPX_NLE_F32,
CMPX_NEQ_F32,
CMPX_NLT_F32,
CMPX_TRU_F32,
CMP_F_F64,
CMP_LT_F64,
CMP_EQ_F64,
CMP_LE_F64,
CMP_GT_F64,
CMP_LG_F64,
CMP_GE_F64,
CMP_O_F64,
CMP_U_F64,
CMP_NGE_F64,
CMP_NLG_F64,
CMP_NGT_F64,
CMP_NLE_F64,
CMP_NEQ_F64,
CMP_NLT_F64,
CMP_TRU_F64,
CMPX_F_F64,
CMPX_LT_F64,
CMPX_EQ_F64,
CMPX_LE_F64,
CMPX_GT_F64,
CMPX_LG_F64,
CMPX_GE_F64,
CMPX_O_F64,
CMPX_U_F64,
CMPX_NGE_F64,
CMPX_NLG_F64,
CMPX_NGT_F64,
CMPX_NLE_F64,
CMPX_NEQ_F64,
CMPX_NLT_F64,
CMPX_TRU_F64,
CMPS_F_F32,
CMPS_LT_F32,
CMPS_EQ_F32,
CMPS_LE_F32,
CMPS_GT_F32,
CMPS_LG_F32,
CMPS_GE_F32,
CMPS_O_F32,
CMPS_U_F32,
CMPS_NGE_F32,
CMPS_NLG_F32,
CMPS_NGT_F32,
CMPS_NLE_F32,
CMPS_NEQ_F32,
CMPS_NLT_F32,
CMPS_TRU_F32,
CMPSX_F_F32,
CMPSX_LT_F32,
CMPSX_EQ_F32,
CMPSX_LE_F32,
CMPSX_GT_F32,
CMPSX_LG_F32,
CMPSX_GE_F32,
CMPSX_O_F32,
CMPSX_U_F32,
CMPSX_NGE_F32,
CMPSX_NLG_F32,
CMPSX_NGT_F32,
CMPSX_NLE_F32,
CMPSX_NEQ_F32,
CMPSX_NLT_F32,
CMPSX_TRU_F32,
CMPS_F_F64,
CMPS_LT_F64,
CMPS_EQ_F64,
CMPS_LE_F64,
CMPS_GT_F64,
CMPS_LG_F64,
CMPS_GE_F64,
CMPS_O_F64,
CMPS_U_F64,
CMPS_NGE_F64,
CMPS_NLG_F64,
CMPS_NGT_F64,
CMPS_NLE_F64,
CMPS_NEQ_F64,
CMPS_NLT_F64,
CMPS_TRU_F64,
CMPSX_F_F64,
CMPSX_LT_F64,
CMPSX_EQ_F64,
CMPSX_LE_F64,
CMPSX_GT_F64,
CMPSX_LG_F64,
CMPSX_GE_F64,
CMPSX_O_F64,
CMPSX_U_F64,
CMPSX_NGE_F64,
CMPSX_NLG_F64,
CMPSX_NGT_F64,
CMPSX_NLE_F64,
CMPSX_NEQ_F64,
CMPSX_NLT_F64,
CMPSX_TRU_F64,
CMP_F_I32,
CMP_LT_I32,
CMP_EQ_I32,
CMP_LE_I32,
CMP_GT_I32,
CMP_NE_I32,
CMP_GE_I32,
CMP_T_I32,
CMP_CLASS_F32,
CMP_LT_I16,
CMP_EQ_I16,
CMP_LE_I16,
CMP_GT_I16,
CMP_NE_I16,
CMP_GE_I16,
CMP_CLASS_F16,
CMPX_F_I32,
CMPX_LT_I32,
CMPX_EQ_I32,
CMPX_LE_I32,
CMPX_GT_I32,
CMPX_NE_I32,
CMPX_GE_I32,
CMPX_T_I32,
CMPX_CLASS_F32,
CMPX_LT_I16,
CMPX_EQ_I16,
CMPX_LE_I16,
CMPX_GT_I16,
CMPX_NE_I16,
CMPX_GE_I16,
CMPX_CLASS_F16,
CMP_F_I64,
CMP_LT_I64,
CMP_EQ_I64,
CMP_LE_I64,
CMP_GT_I64,
CMP_NE_I64,
CMP_GE_I64,
CMP_T_I64,
CMP_CLASS_F64,
CMP_LT_U16,
CMP_EQ_U16,
CMP_LE_U16,
CMP_GT_U16,
CMP_NE_U16,
CMP_GE_U16,
// Explicit value: jump over a gap in the opcode space.
CMPX_F_I64 = 176,
CMPX_LT_I64,
CMPX_EQ_I64,
CMPX_LE_I64,
CMPX_GT_I64,
CMPX_NE_I64,
CMPX_GE_I64,
CMPX_T_I64,
CMPX_CLASS_F64,
CMPX_LT_U16,
CMPX_EQ_U16,
CMPX_LE_U16,
CMPX_GT_U16,
CMPX_NE_U16,
CMPX_GE_U16,
// Explicit value: jump over a gap in the opcode space.
CMP_F_U32 = 192,
CMP_LT_U32,
CMP_EQ_U32,
CMP_LE_U32,
CMP_GT_U32,
CMP_NE_U32,
CMP_GE_U32,
CMP_T_U32,
CMP_F_F16,
CMP_LT_F16,
CMP_EQ_F16,
CMP_LE_F16,
CMP_GT_F16,
CMP_LG_F16,
CMP_GE_F16,
CMP_O_F16,
CMPX_F_U32,
CMPX_LT_U32,
CMPX_EQ_U32,
CMPX_LE_U32,
CMPX_GT_U32,
CMPX_NE_U32,
CMPX_GE_U32,
CMPX_T_U32,
CMPX_F_F16,
CMPX_LT_F16,
CMPX_EQ_F16,
CMPX_LE_F16,
CMPX_GT_F16,
CMPX_LG_F16,
CMPX_GE_F16,
CMPX_O_F16,
CMP_F_U64,
CMP_LT_U64,
CMP_EQ_U64,
CMP_LE_U64,
CMP_GT_U64,
CMP_NE_U64,
CMP_GE_U64,
CMP_T_U64,
CMP_U_F16,
CMP_NGE_F16,
CMP_NLG_F16,
CMP_NGT_F16,
CMP_NLE_F16,
CMP_NEQ_F16,
CMP_NLT_F16,
CMP_TRU_F16,
CMPX_F_U64,
CMPX_LT_U64,
CMPX_EQ_U64,
CMPX_LE_U64,
CMPX_GT_U64,
CMPX_NE_U64,
CMPX_GE_U64,
CMPX_T_U64,
CMPX_U_F16,
CMPX_NGE_F16,
CMPX_NLG_F16,
CMPX_NGT_F16,
CMPX_NLE_F16,
CMPX_NEQ_F16,
CMPX_NLT_F16,
CMPX_TRU_F16,
// One-past-the-last defined opcode (note: NOT the count of enumerators,
// since the explicit initializers above leave unused gaps).
OpCount
};
// Returns the assembler mnemonic ("v_cmp*...") for the given VOPC opcode,
// or nullptr when `id` is not a known Op value (including the gaps left by
// the explicit enum initializers).
inline const char *getInstructionName(unsigned id) {
switch (id) {
case CMP_F_F32: return "v_cmp_f_f32";
case CMP_LT_F32: return "v_cmp_lt_f32";
case CMP_EQ_F32: return "v_cmp_eq_f32";
case CMP_LE_F32: return "v_cmp_le_f32";
case CMP_GT_F32: return "v_cmp_gt_f32";
case CMP_LG_F32: return "v_cmp_lg_f32";
case CMP_GE_F32: return "v_cmp_ge_f32";
case CMP_O_F32: return "v_cmp_o_f32";
case CMP_U_F32: return "v_cmp_u_f32";
case CMP_NGE_F32: return "v_cmp_nge_f32";
case CMP_NLG_F32: return "v_cmp_nlg_f32";
case CMP_NGT_F32: return "v_cmp_ngt_f32";
case CMP_NLE_F32: return "v_cmp_nle_f32";
case CMP_NEQ_F32: return "v_cmp_neq_f32";
case CMP_NLT_F32: return "v_cmp_nlt_f32";
case CMP_TRU_F32: return "v_cmp_tru_f32";
case CMPX_F_F32: return "v_cmpx_f_f32";
case CMPX_LT_F32: return "v_cmpx_lt_f32";
case CMPX_EQ_F32: return "v_cmpx_eq_f32";
case CMPX_LE_F32: return "v_cmpx_le_f32";
case CMPX_GT_F32: return "v_cmpx_gt_f32";
case CMPX_LG_F32: return "v_cmpx_lg_f32";
case CMPX_GE_F32: return "v_cmpx_ge_f32";
case CMPX_O_F32: return "v_cmpx_o_f32";
case CMPX_U_F32: return "v_cmpx_u_f32";
case CMPX_NGE_F32: return "v_cmpx_nge_f32";
case CMPX_NLG_F32: return "v_cmpx_nlg_f32";
case CMPX_NGT_F32: return "v_cmpx_ngt_f32";
case CMPX_NLE_F32: return "v_cmpx_nle_f32";
case CMPX_NEQ_F32: return "v_cmpx_neq_f32";
case CMPX_NLT_F32: return "v_cmpx_nlt_f32";
case CMPX_TRU_F32: return "v_cmpx_tru_f32";
case CMP_F_F64: return "v_cmp_f_f64";
case CMP_LT_F64: return "v_cmp_lt_f64";
case CMP_EQ_F64: return "v_cmp_eq_f64";
case CMP_LE_F64: return "v_cmp_le_f64";
case CMP_GT_F64: return "v_cmp_gt_f64";
case CMP_LG_F64: return "v_cmp_lg_f64";
case CMP_GE_F64: return "v_cmp_ge_f64";
case CMP_O_F64: return "v_cmp_o_f64";
case CMP_U_F64: return "v_cmp_u_f64";
case CMP_NGE_F64: return "v_cmp_nge_f64";
case CMP_NLG_F64: return "v_cmp_nlg_f64";
case CMP_NGT_F64: return "v_cmp_ngt_f64";
case CMP_NLE_F64: return "v_cmp_nle_f64";
case CMP_NEQ_F64: return "v_cmp_neq_f64";
case CMP_NLT_F64: return "v_cmp_nlt_f64";
case CMP_TRU_F64: return "v_cmp_tru_f64";
case CMPX_F_F64: return "v_cmpx_f_f64";
case CMPX_LT_F64: return "v_cmpx_lt_f64";
case CMPX_EQ_F64: return "v_cmpx_eq_f64";
case CMPX_LE_F64: return "v_cmpx_le_f64";
case CMPX_GT_F64: return "v_cmpx_gt_f64";
case CMPX_LG_F64: return "v_cmpx_lg_f64";
case CMPX_GE_F64: return "v_cmpx_ge_f64";
case CMPX_O_F64: return "v_cmpx_o_f64";
case CMPX_U_F64: return "v_cmpx_u_f64";
case CMPX_NGE_F64: return "v_cmpx_nge_f64";
case CMPX_NLG_F64: return "v_cmpx_nlg_f64";
case CMPX_NGT_F64: return "v_cmpx_ngt_f64";
case CMPX_NLE_F64: return "v_cmpx_nle_f64";
case CMPX_NEQ_F64: return "v_cmpx_neq_f64";
case CMPX_NLT_F64: return "v_cmpx_nlt_f64";
case CMPX_TRU_F64: return "v_cmpx_tru_f64";
case CMPS_F_F32: return "v_cmps_f_f32";
case CMPS_LT_F32: return "v_cmps_lt_f32";
case CMPS_EQ_F32: return "v_cmps_eq_f32";
case CMPS_LE_F32: return "v_cmps_le_f32";
case CMPS_GT_F32: return "v_cmps_gt_f32";
case CMPS_LG_F32: return "v_cmps_lg_f32";
case CMPS_GE_F32: return "v_cmps_ge_f32";
case CMPS_O_F32: return "v_cmps_o_f32";
case CMPS_U_F32: return "v_cmps_u_f32";
case CMPS_NGE_F32: return "v_cmps_nge_f32";
case CMPS_NLG_F32: return "v_cmps_nlg_f32";
case CMPS_NGT_F32: return "v_cmps_ngt_f32";
case CMPS_NLE_F32: return "v_cmps_nle_f32";
case CMPS_NEQ_F32: return "v_cmps_neq_f32";
case CMPS_NLT_F32: return "v_cmps_nlt_f32";
case CMPS_TRU_F32: return "v_cmps_tru_f32";
case CMPSX_F_F32: return "v_cmpsx_f_f32";
case CMPSX_LT_F32: return "v_cmpsx_lt_f32";
case CMPSX_EQ_F32: return "v_cmpsx_eq_f32";
case CMPSX_LE_F32: return "v_cmpsx_le_f32";
case CMPSX_GT_F32: return "v_cmpsx_gt_f32";
case CMPSX_LG_F32: return "v_cmpsx_lg_f32";
case CMPSX_GE_F32: return "v_cmpsx_ge_f32";
case CMPSX_O_F32: return "v_cmpsx_o_f32";
case CMPSX_U_F32: return "v_cmpsx_u_f32";
case CMPSX_NGE_F32: return "v_cmpsx_nge_f32";
case CMPSX_NLG_F32: return "v_cmpsx_nlg_f32";
case CMPSX_NGT_F32: return "v_cmpsx_ngt_f32";
case CMPSX_NLE_F32: return "v_cmpsx_nle_f32";
case CMPSX_NEQ_F32: return "v_cmpsx_neq_f32";
case CMPSX_NLT_F32: return "v_cmpsx_nlt_f32";
case CMPSX_TRU_F32: return "v_cmpsx_tru_f32";
case CMPS_F_F64: return "v_cmps_f_f64";
case CMPS_LT_F64: return "v_cmps_lt_f64";
case CMPS_EQ_F64: return "v_cmps_eq_f64";
case CMPS_LE_F64: return "v_cmps_le_f64";
case CMPS_GT_F64: return "v_cmps_gt_f64";
case CMPS_LG_F64: return "v_cmps_lg_f64";
case CMPS_GE_F64: return "v_cmps_ge_f64";
case CMPS_O_F64: return "v_cmps_o_f64";
case CMPS_U_F64: return "v_cmps_u_f64";
case CMPS_NGE_F64: return "v_cmps_nge_f64";
case CMPS_NLG_F64: return "v_cmps_nlg_f64";
case CMPS_NGT_F64: return "v_cmps_ngt_f64";
case CMPS_NLE_F64: return "v_cmps_nle_f64";
case CMPS_NEQ_F64: return "v_cmps_neq_f64";
case CMPS_NLT_F64: return "v_cmps_nlt_f64";
case CMPS_TRU_F64: return "v_cmps_tru_f64";
case CMPSX_F_F64: return "v_cmpsx_f_f64";
case CMPSX_LT_F64: return "v_cmpsx_lt_f64";
case CMPSX_EQ_F64: return "v_cmpsx_eq_f64";
case CMPSX_LE_F64: return "v_cmpsx_le_f64";
case CMPSX_GT_F64: return "v_cmpsx_gt_f64";
case CMPSX_LG_F64: return "v_cmpsx_lg_f64";
case CMPSX_GE_F64: return "v_cmpsx_ge_f64";
case CMPSX_O_F64: return "v_cmpsx_o_f64";
case CMPSX_U_F64: return "v_cmpsx_u_f64";
case CMPSX_NGE_F64: return "v_cmpsx_nge_f64";
case CMPSX_NLG_F64: return "v_cmpsx_nlg_f64";
case CMPSX_NGT_F64: return "v_cmpsx_ngt_f64";
case CMPSX_NLE_F64: return "v_cmpsx_nle_f64";
case CMPSX_NEQ_F64: return "v_cmpsx_neq_f64";
case CMPSX_NLT_F64: return "v_cmpsx_nlt_f64";
case CMPSX_TRU_F64: return "v_cmpsx_tru_f64";
case CMP_F_I32: return "v_cmp_f_i32";
case CMP_LT_I32: return "v_cmp_lt_i32";
case CMP_EQ_I32: return "v_cmp_eq_i32";
case CMP_LE_I32: return "v_cmp_le_i32";
case CMP_GT_I32: return "v_cmp_gt_i32";
case CMP_NE_I32: return "v_cmp_ne_i32";
case CMP_GE_I32: return "v_cmp_ge_i32";
case CMP_T_I32: return "v_cmp_t_i32";
case CMP_CLASS_F32: return "v_cmp_class_f32";
case CMP_LT_I16: return "v_cmp_lt_i16";
case CMP_EQ_I16: return "v_cmp_eq_i16";
case CMP_LE_I16: return "v_cmp_le_i16";
case CMP_GT_I16: return "v_cmp_gt_i16";
case CMP_NE_I16: return "v_cmp_ne_i16";
case CMP_GE_I16: return "v_cmp_ge_i16";
case CMP_CLASS_F16: return "v_cmp_class_f16";
case CMPX_F_I32: return "v_cmpx_f_i32";
case CMPX_LT_I32: return "v_cmpx_lt_i32";
case CMPX_EQ_I32: return "v_cmpx_eq_i32";
case CMPX_LE_I32: return "v_cmpx_le_i32";
case CMPX_GT_I32: return "v_cmpx_gt_i32";
case CMPX_NE_I32: return "v_cmpx_ne_i32";
case CMPX_GE_I32: return "v_cmpx_ge_i32";
case CMPX_T_I32: return "v_cmpx_t_i32";
case CMPX_CLASS_F32: return "v_cmpx_class_f32";
case CMPX_LT_I16: return "v_cmpx_lt_i16";
case CMPX_EQ_I16: return "v_cmpx_eq_i16";
case CMPX_LE_I16: return "v_cmpx_le_i16";
case CMPX_GT_I16: return "v_cmpx_gt_i16";
case CMPX_NE_I16: return "v_cmpx_ne_i16";
case CMPX_GE_I16: return "v_cmpx_ge_i16";
case CMPX_CLASS_F16: return "v_cmpx_class_f16";
case CMP_F_I64: return "v_cmp_f_i64";
case CMP_LT_I64: return "v_cmp_lt_i64";
case CMP_EQ_I64: return "v_cmp_eq_i64";
case CMP_LE_I64: return "v_cmp_le_i64";
case CMP_GT_I64: return "v_cmp_gt_i64";
case CMP_NE_I64: return "v_cmp_ne_i64";
case CMP_GE_I64: return "v_cmp_ge_i64";
case CMP_T_I64: return "v_cmp_t_i64";
case CMP_CLASS_F64: return "v_cmp_class_f64";
case CMP_LT_U16: return "v_cmp_lt_u16";
case CMP_EQ_U16: return "v_cmp_eq_u16";
case CMP_LE_U16: return "v_cmp_le_u16";
case CMP_GT_U16: return "v_cmp_gt_u16";
case CMP_NE_U16: return "v_cmp_ne_u16";
case CMP_GE_U16: return "v_cmp_ge_u16";
case CMPX_F_I64: return "v_cmpx_f_i64";
case CMPX_LT_I64: return "v_cmpx_lt_i64";
case CMPX_EQ_I64: return "v_cmpx_eq_i64";
case CMPX_LE_I64: return "v_cmpx_le_i64";
case CMPX_GT_I64: return "v_cmpx_gt_i64";
case CMPX_NE_I64: return "v_cmpx_ne_i64";
case CMPX_GE_I64: return "v_cmpx_ge_i64";
case CMPX_T_I64: return "v_cmpx_t_i64";
case CMPX_CLASS_F64: return "v_cmpx_class_f64";
case CMPX_LT_U16: return "v_cmpx_lt_u16";
case CMPX_EQ_U16: return "v_cmpx_eq_u16";
case CMPX_LE_U16: return "v_cmpx_le_u16";
case CMPX_GT_U16: return "v_cmpx_gt_u16";
case CMPX_NE_U16: return "v_cmpx_ne_u16";
case CMPX_GE_U16: return "v_cmpx_ge_u16";
case CMP_F_U32: return "v_cmp_f_u32";
case CMP_LT_U32: return "v_cmp_lt_u32";
case CMP_EQ_U32: return "v_cmp_eq_u32";
case CMP_LE_U32: return "v_cmp_le_u32";
case CMP_GT_U32: return "v_cmp_gt_u32";
case CMP_NE_U32: return "v_cmp_ne_u32";
case CMP_GE_U32: return "v_cmp_ge_u32";
case CMP_T_U32: return "v_cmp_t_u32";
case CMP_F_F16: return "v_cmp_f_f16";
case CMP_LT_F16: return "v_cmp_lt_f16";
case CMP_EQ_F16: return "v_cmp_eq_f16";
case CMP_LE_F16: return "v_cmp_le_f16";
case CMP_GT_F16: return "v_cmp_gt_f16";
case CMP_LG_F16: return "v_cmp_lg_f16";
case CMP_GE_F16: return "v_cmp_ge_f16";
case CMP_O_F16: return "v_cmp_o_f16";
case CMPX_F_U32: return "v_cmpx_f_u32";
case CMPX_LT_U32: return "v_cmpx_lt_u32";
case CMPX_EQ_U32: return "v_cmpx_eq_u32";
case CMPX_LE_U32: return "v_cmpx_le_u32";
case CMPX_GT_U32: return "v_cmpx_gt_u32";
case CMPX_NE_U32: return "v_cmpx_ne_u32";
case CMPX_GE_U32: return "v_cmpx_ge_u32";
case CMPX_T_U32: return "v_cmpx_t_u32";
case CMPX_F_F16: return "v_cmpx_f_f16";
case CMPX_LT_F16: return "v_cmpx_lt_f16";
case CMPX_EQ_F16: return "v_cmpx_eq_f16";
case CMPX_LE_F16: return "v_cmpx_le_f16";
case CMPX_GT_F16: return "v_cmpx_gt_f16";
case CMPX_LG_F16: return "v_cmpx_lg_f16";
case CMPX_GE_F16: return "v_cmpx_ge_f16";
case CMPX_O_F16: return "v_cmpx_o_f16";
case CMP_F_U64: return "v_cmp_f_u64";
case CMP_LT_U64: return "v_cmp_lt_u64";
case CMP_EQ_U64: return "v_cmp_eq_u64";
case CMP_LE_U64: return "v_cmp_le_u64";
case CMP_GT_U64: return "v_cmp_gt_u64";
case CMP_NE_U64: return "v_cmp_ne_u64";
case CMP_GE_U64: return "v_cmp_ge_u64";
case CMP_T_U64: return "v_cmp_t_u64";
case CMP_U_F16: return "v_cmp_u_f16";
case CMP_NGE_F16: return "v_cmp_nge_f16";
case CMP_NLG_F16: return "v_cmp_nlg_f16";
case CMP_NGT_F16: return "v_cmp_ngt_f16";
case CMP_NLE_F16: return "v_cmp_nle_f16";
case CMP_NEQ_F16: return "v_cmp_neq_f16";
case CMP_NLT_F16: return "v_cmp_nlt_f16";
case CMP_TRU_F16: return "v_cmp_tru_f16";
case CMPX_F_U64: return "v_cmpx_f_u64";
case CMPX_LT_U64: return "v_cmpx_lt_u64";
case CMPX_EQ_U64: return "v_cmpx_eq_u64";
case CMPX_LE_U64: return "v_cmpx_le_u64";
case CMPX_GT_U64: return "v_cmpx_gt_u64";
case CMPX_NE_U64: return "v_cmpx_ne_u64";
case CMPX_GE_U64: return "v_cmpx_ge_u64";
case CMPX_T_U64: return "v_cmpx_t_u64";
case CMPX_U_F16: return "v_cmpx_u_f16";
case CMPX_NGE_F16: return "v_cmpx_nge_f16";
case CMPX_NLG_F16: return "v_cmpx_nlg_f16";
case CMPX_NGT_F16: return "v_cmpx_ngt_f16";
case CMPX_NLE_F16: return "v_cmpx_nle_f16";
case CMPX_NEQ_F16: return "v_cmpx_neq_f16";
case CMPX_NLT_F16: return "v_cmpx_nlt_f16";
case CMPX_TRU_F16: return "v_cmpx_tru_f16";
}
return nullptr;
}
}

View file

@ -0,0 +1,92 @@
#pragma once
#include "Vector.hpp"
#include "ir/Value.hpp"

#include <array>
#include <cstdint>
#include <optional>
#include <span>
#include <variant>
namespace shader::eval {
// Tagged-union runtime value used by the shader constant evaluator.
// Holds any scalar/vector type the evaluator understands; variant index 0
// (std::nullptr_t) encodes "no value".
struct Value {
using Storage = std::variant<
std::nullptr_t, std::int8_t, std::int16_t, std::int32_t, std::int64_t,
std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t, float16_t,
float32_t, float64_t, u8vec2, u8vec3, u8vec4, i8vec2, i8vec3, i8vec4,
u16vec2, u16vec3, u16vec4, i16vec2, i16vec3, i16vec4, u32vec2, u32vec3,
u32vec4, i32vec2, i32vec3, i32vec4, u64vec2, u64vec3, u64vec4, i64vec2,
i64vec3, i64vec4, f32vec2, f32vec3, f32vec4, f64vec2, f64vec3, f64vec4,
f16vec2, f16vec3, f16vec4, bool, bvec2, bvec3, bvec4, std::array<uint32_t, 8>>;
// Number of alternatives in Storage (including the empty state).
static constexpr auto StorageSize = std::variant_size_v<Storage>;
Storage storage;
// True when the value is non-empty (opposite of empty()).
explicit operator bool() const { return !empty(); }
// Empty iff the variant currently holds std::nullptr_t (index 0).
bool empty() const { return storage.index() == 0; }
// Default-constructed values are empty.
Value() : storage(nullptr) {}
// Implicit construction from anything Storage can hold.
template <typename T>
Value(T &&value)
requires requires { Storage(std::forward<T>(value)); }
: storage(std::forward<T>(value)) {}
// Builds a composite of the given IR type from per-element values.
static Value compositeConstruct(ir::Value type,
std::span<const Value> constituents);
// Extracts the element selected by `index` from a composite value.
Value compositeExtract(const Value &index) const;
// Value compositeInsert(const Value &object, std::size_t index) const;
// Floating-point classification; results are boolean-valued.
Value isNan() const;
Value isInf() const;
Value isFinite() const;
// Reinterpret integer content with the opposite signedness.
Value makeUnsigned() const;
Value makeSigned() const;
// Boolean reductions over vector components.
Value all() const;
Value any() const;
// Component-wise select: this value acts as the condition.
Value select(const Value &trueValue, const Value &falseValue) const;
// Integer width conversion; `isSigned` picks sign- vs zero-extension.
Value iConvert(ir::Value type, bool isSigned) const;
Value sConvert(ir::Value type) const { return iConvert(type, true); }
Value uConvert(ir::Value type) const { return iConvert(type, false); }
// Floating-point conversion to the given IR type.
Value fConvert(ir::Value type) const;
// Bit-pattern reinterpretation to the given IR type.
Value bitcast(ir::Value type) const;
// Scalar extraction widened to 64 bits; nullopt when not a scalar
// (presumably — TODO confirm against the implementation).
std::optional<std::uint64_t> zExtScalar() const;
std::optional<std::int64_t> sExtScalar() const;
// Unchecked access: throws std::bad_variant_access on type mismatch.
template <typename T>
requires requires { std::get<T>(storage); }
T get() const {
return std::get<T>(storage);
}
// Checked access: nullopt when the stored type is not T.
template <typename T>
requires requires { std::get<T>(storage); }
std::optional<T> as() const {
if (auto result = std::get_if<T>(&storage)) {
return *result;
}
return std::nullopt;
}
// Component-wise arithmetic/logic operators, implemented out of line.
Value operator+(const Value &rhs) const;
Value operator-(const Value &rhs) const;
Value operator*(const Value &rhs) const;
Value operator/(const Value &rhs) const;
Value operator%(const Value &rhs) const;
Value operator&(const Value &rhs) const;
Value operator|(const Value &rhs) const;
Value operator^(const Value &rhs) const;
Value operator>>(const Value &rhs) const;
Value operator<<(const Value &rhs) const;
Value operator&&(const Value &rhs) const;
Value operator||(const Value &rhs) const;
Value operator<(const Value &rhs) const;
Value operator>(const Value &rhs) const;
Value operator<=(const Value &rhs) const;
Value operator>=(const Value &rhs) const;
Value operator==(const Value &rhs) const;
Value operator!=(const Value &rhs) const;
Value operator-() const;
Value operator~() const;
Value operator!() const;
};
} // namespace shader::eval

View file

@ -0,0 +1,125 @@
#pragma once
#include "SemanticInfo.hpp"
#include "SpvConverter.hpp"
#include "analyze.hpp"
#include "rx/MemoryTable.hpp"
#include "spv.hpp"
#include <cstdint>
#include <functional>
namespace shader::gcn {
// Builder flavor used by the GCN frontend: SPIR-V ops plus builtin helpers.
using Builder = ir::Builder<ir::spv::Builder, ir::builtin::Builder>;
// Hardware shader stage of the program being translated. Combined names
// presumably denote the logical/hardware stage pairing (e.g. VsEs = vertex
// shader running on the ES hardware stage) — TODO confirm.
enum class Stage {
Ps,
VsVs,
VsEs,
VsLs,
Cs,
Gs,
GsVs,
Hs,
DsVs,
DsEs,
Invalid,
};
// Import hook that clones nodes from the semantic module into a target
// context (see spv::Import for the cloning protocol).
struct Import : spv::Import {
ir::Node getOrCloneImpl(ir::Context &context, ir::Node node,
bool isOperand) override;
};
// Semantic-module metadata extended with the module's register variables,
// keyed by register id.
struct SemanticModuleInfo : shader::SemanticModuleInfo {
std::map<int, ir::Value> registerVariables;
};
void canonicalizeSemantic(ir::Context &context,
const spv::BinaryLayout &semantic);
void collectSemanticModuleInfo(SemanticModuleInfo &moduleInfo,
const spv::BinaryLayout &layout);
SemanticInfo collectSemanticInfo(const SemanticModuleInfo &moduleInfo);
// Region wrapper that records the first instruction ever inserted into it.
// NOTE(review): `firstInstruction` is dereferenced unconditionally — the
// owner must point it at a valid (initially null) Instruction slot before
// any insertAfter call; confirm all construction sites do this.
struct InstructionRegion : ir::RegionLikeImpl {
ir::RegionLike base;
ir::Instruction *firstInstruction;
void insertAfter(ir::Instruction point, ir::Instruction node) {
if (!*firstInstruction) {
*firstInstruction = node;
}
base.insertAfter(point, node);
}
};
// Identifiers for the GCN machine registers / execution state modeled as
// variables during translation.
enum RegId {
Sgpr,
Vgpr,
M0,
Scc,
Vcc,
Exec,
VccZ,
ExecZ,
LdsDirect,
SgprCount,
VgprCount,
ThreadId,
MemoryTable,
Gds,
};
// Translation context for one GCN shader: tracks the decoded body, the
// guest memory map, register-variable state and per-address instruction
// labels/blocks.
struct Context : spv::Context {
ir::Region body;
rx::MemoryAreaTable<> memoryMap;
std::uint32_t requiredUserSgprs = 0;
std::map<RegId, ir::Value> registerVariables;
// Decoded instructions keyed by guest address.
std::map<std::uint64_t, ir::Instruction> instructions;
AnalysisStorage analysis;
// Returns the label for `address` and whether it was newly created.
std::pair<ir::Value, bool> getOrCreateLabel(ir::Location loc, ir::Region body,
std::uint64_t address);
Builder createBuilder(InstructionRegion &region, ir::Region bodyRegion,
std::uint64_t address);
ir::Value createCast(ir::Location loc, Builder &builder, ir::Value targetType,
ir::Value value);
void setRegisterVariable(RegId id, ir::Value value) {
registerVariables[id] = value;
}
ir::Value getOrCreateRegisterVariable(RegId id);
// Register access helpers; `lane` selects a lane for per-lane registers
// (presumably — TODO confirm).
ir::Value getRegisterRef(ir::Location loc, Builder &builder, RegId id,
const ir::Operand &index, ir::Value lane = nullptr);
ir::Value readReg(ir::Location loc, Builder &builder, ir::Value typeValue,
RegId id, const ir::Operand &index,
ir::Value lane = nullptr);
void writeReg(ir::Location loc, Builder &builder, RegId id,
const ir::Operand &index, ir::Value value,
ir::Value lane = nullptr);
ir::Value createRegisterAccess(Builder &builder, ir::Location loc,
ir::Value reg, const ir::Operand &index,
ir::Value lane = nullptr);
};
// Host/GPU capabilities and initial register state for a translation run.
struct Environment {
std::uint8_t vgprCount;
std::uint8_t sgprCount;
std::span<const std::uint32_t> userSgprs;
bool supportsBarycentric = true;
bool supportsInt8 = false;
bool supportsInt64Atomics = false;
};
// Decodes GCN machine code starting at `base`, reading guest memory one
// 32-bit word at a time through `readMemory`, and returns the resulting IR
// region.
ir::Region deserialize(Context &context, const Environment &environment,
const SemanticInfo &semanticInfo, std::uint64_t base,
std::function<std::uint32_t(std::uint64_t)> readMemory);
} // namespace shader::gcn

View file

@ -0,0 +1,31 @@
#pragma once
#include "ir/Location.hpp"
#include "spv.hpp"
#include <filesystem>
namespace shader::glsl {
// Pipeline stage the GLSL source is compiled as. `Library` presumably marks
// a linkable module rather than a standalone entry point — TODO confirm.
enum class Stage {
Library,
Vertex,
TessControl,
TessEvaluation,
Geometry,
Fragment,
Compute,
RayGen,
Intersect,
AnyHit,
ClosestHit,
Miss,
Callable,
Task,
Mesh,
};
// Compiles a GLSL file / in-memory source of the given stage into a SPIR-V
// binary layout; returns std::nullopt on failure (presumably a parse or
// compile error — TODO confirm).
std::optional<spv::BinaryLayout> parseFile(ir::Context &context, Stage stage,
const std::filesystem::path &path);
std::optional<spv::BinaryLayout> parseSource(ir::Context &context, Stage stage,
std::string_view source,
ir::Location loc = nullptr);
// Converts a SPIR-V word stream back to human-readable source text
// (debugging aid).
std::string decompile(std::span<const std::uint32_t> spv);
} // namespace shader::glsl

View file

@ -0,0 +1,320 @@
#pragma once
#include <map>
#include <vector>
namespace graph {
// Dominator tree over basic blocks identified by pointer-like handles.
// Query helpers (dominatedBy / dominates) rely on DFS interval numbering,
// so updateDFSNumbers() must run after the tree shape is final.
template <typename BasicBlockPtrT> class DomTree {
public:
struct Node {
BasicBlockPtrT block = nullptr;
// Immediate dominator; nullptr only for the root.
Node *immDom = nullptr;
// DFS entry/exit numbers: a node's [in, out] interval contains the
// intervals of everything it dominates.
unsigned dfsNumIn = ~0;
unsigned dfsNumOut = ~0;
// Depth in the dominator tree (root = 0).
unsigned level = 0;
std::vector<Node *> children;
bool isLeaf() const { return children.empty(); }
// Interval containment test; valid only after updateDFSNumbers().
bool dominatedBy(const Node *other) const {
return this->dfsNumIn >= other->dfsNumIn &&
this->dfsNumOut <= other->dfsNumOut;
}
};
private:
std::map<BasicBlockPtrT, Node> bbToNodes;
Node *rootNode = nullptr;
public:
// Returns the node for `bb`, or nullptr if the block is not in the tree.
Node *getNode(BasicBlockPtrT bb) {
auto it = bbToNodes.find(bb);
if (it != bbToNodes.end()) {
return &it->second;
}
return nullptr;
}
// Inserts `bb` as a child of `parent` in the dominator tree.
Node *createChild(BasicBlockPtrT bb, Node *parent) {
auto &child = bbToNodes[bb];
child.block = bb;
child.immDom = parent;
child.level = parent->level + 1;
parent->children.push_back(&child);
return &child;
}
Node *createRoot(BasicBlockPtrT bb) {
auto &root = bbToNodes[bb];
rootNode = &root;
root.block = bb;
return rootNode;
}
Node *getRootNode() { return rootNode; }
// Iterative pre/post-order walk assigning dfsNumIn/dfsNumOut so that
// dominance can be answered by interval containment.
void updateDFSNumbers() {
std::vector<std::pair<Node *, typename std::vector<Node *>::iterator>>
workStack;
auto root = getRootNode();
if (!root)
return;
workStack.push_back({root, root->children.begin()});
unsigned dfsNum = 0;
root->dfsNumIn = dfsNum++;
while (!workStack.empty()) {
auto node = workStack.back().first;
const auto childIt = workStack.back().second;
if (childIt == node->children.end()) {
// All children visited: close this node's interval.
node->dfsNumOut = dfsNum++;
workStack.pop_back();
} else {
auto child = *childIt;
++workStack.back().second;
workStack.push_back({child, child->children.begin()});
child->dfsNumIn = dfsNum++;
}
}
}
// True iff `a` dominates `b` (reflexive: a node dominates itself).
// Fast paths avoid the interval test for trivial relations.
bool dominates(Node *a, Node *b) {
if (a == b || b->immDom == a) {
return true;
}
if (a->immDom == b || a->level >= b->level) {
// A node strictly deeper than (or as deep as) b cannot dominate it.
return false;
}
return b->dominatedBy(a);
}
bool dominates(BasicBlockPtrT a, BasicBlockPtrT b) {
return dominates(getNode(a), getNode(b));
}
// Immediate dominator of `a`, or a default-constructed handle for the root.
BasicBlockPtrT getImmediateDominator(BasicBlockPtrT a) {
auto immDom = getNode(a)->immDom;
if (immDom) {
return immDom->block;
}
return{};
}
bool isImmediateDominator(BasicBlockPtrT block, BasicBlockPtrT immDomBlock) {
if (immDomBlock == nullptr) {
return false;
}
return getImmediateDominator(immDomBlock) == block;
}
// Nearest common dominator: repeatedly climb whichever node is deeper
// until the paths meet.
BasicBlockPtrT findNearestCommonDominator(BasicBlockPtrT a,
BasicBlockPtrT b) {
auto aNode = getNode(a);
auto bNode = getNode(b);
if (aNode == rootNode || bNode == rootNode) {
return rootNode->block;
}
while (aNode != bNode) {
if (aNode->level < bNode->level) {
std::swap(aNode, bNode);
}
aNode = aNode->immDom;
}
return aNode->block;
}
};
// Dominator tree construction following the Semi-NCA scheme (DFS + semi-
// dominators with path-compressing eval, then an NCA fix-up pass); the
// structure mirrors the classic Lengauer–Tarjan/Semi-NCA formulation.
template <typename BasicBlockPtrT> class DomTreeBuilder {
using DomTreeNode = typename DomTree<BasicBlockPtrT>::Node;
// Per-block bookkeeping; all of parent/semi/label are DFS indices or
// forest labels in the sense of the Semi-NCA algorithm.
struct NodeInfo {
unsigned dfsNum = 0;
unsigned parent = 0;
unsigned semi = 0;
BasicBlockPtrT label = nullptr;
BasicBlockPtrT immDom = nullptr;
// CFG predecessors of this block (reverse edges collected during DFS).
std::vector<BasicBlockPtrT> revChildren;
};
// DFS index -> block; index 0 is a sentinel, numbering starts at 1.
std::vector<BasicBlockPtrT> indexToNode = {nullptr};
std::map<BasicBlockPtrT, NodeInfo> nodeToInfo;
// Iterative DFS from `root`; assigns DFS numbers and records reverse
// edges. `walk(bb, cb)` must invoke cb for each CFG successor of bb.
template <typename WalkFn>
void runDFS(BasicBlockPtrT root, const WalkFn &walk) {
std::vector<BasicBlockPtrT> workList;
workList.reserve(10);
workList.push_back(root);
unsigned index = 0;
while (!workList.empty()) {
auto bb = workList.back();
workList.pop_back();
auto &bbInfo = nodeToInfo[bb];
if (bbInfo.dfsNum != 0) {
continue;
}
bbInfo.dfsNum = bbInfo.semi = ++index;
bbInfo.label = bb;
indexToNode.push_back(bb);
walk(bb, [&](BasicBlockPtrT successor) {
auto it = nodeToInfo.find(successor);
if (it != nodeToInfo.end() && it->second.dfsNum != 0) {
// Already numbered: just record the reverse edge (no self loops).
if (successor != bb) {
it->second.revChildren.push_back(bb);
}
return;
}
auto &succInfo = nodeToInfo[successor];
workList.push_back(successor);
succInfo.parent = index;
succInfo.revChildren.push_back(bb);
});
}
}
// Core Semi-NCA passes: compute semi-dominators (reverse DFS order), then
// hoist each tentative idom up to its semi-dominator's depth.
void runSemiNCA() {
const unsigned nextDFS = indexToNode.size();
for (unsigned i = 1; i < nextDFS; ++i) {
const BasicBlockPtrT node = indexToNode[i];
// NOTE(review): this local shadows the NodeInfo type — consider renaming.
auto &NodeInfo = nodeToInfo[node];
// Initial guess: immediate dominator = DFS parent.
NodeInfo.immDom = indexToNode[NodeInfo.parent];
}
std::vector<NodeInfo *> evalStack;
evalStack.reserve(10);
// Semi-dominator pass, bottom-up over DFS numbers (root excluded).
for (unsigned i = nextDFS - 1; i >= 2; --i) {
BasicBlockPtrT node = indexToNode[i];
auto &nodeInfo = nodeToInfo[node];
nodeInfo.semi = nodeInfo.parent;
for (const auto &child : nodeInfo.revChildren) {
// Skip predecessors that are unreachable from the root.
if (!nodeToInfo.contains(child)) {
continue;
}
unsigned childSemi = nodeToInfo[eval(child, i + 1, evalStack)].semi;
if (childSemi < nodeInfo.semi) {
nodeInfo.semi = childSemi;
}
}
}
// NCA pass: walk each tentative idom chain up until it is no deeper
// than the node's semi-dominator.
for (unsigned i = 2; i < nextDFS; ++i) {
const BasicBlockPtrT node = indexToNode[i];
auto &nodeInfo = nodeToInfo[node];
const unsigned sDomNum = nodeToInfo[indexToNode[nodeInfo.semi]].dfsNum;
BasicBlockPtrT immDom = nodeInfo.immDom;
while (nodeToInfo[immDom].dfsNum > sDomNum) {
immDom = nodeToInfo[immDom].immDom;
}
nodeInfo.immDom = immDom;
}
}
// Forest "eval" with path compression: returns the minimum-semi label on
// the path from `block` to its forest root among nodes linked at or after
// `LastLinked`.
BasicBlockPtrT eval(BasicBlockPtrT block, unsigned LastLinked,
std::vector<NodeInfo *> &stack) {
NodeInfo *blockInfo = &nodeToInfo[block];
if (blockInfo->parent < LastLinked)
return blockInfo->label;
// Collect the un-compressed path.
do {
stack.push_back(blockInfo);
blockInfo = &nodeToInfo[indexToNode[blockInfo->parent]];
} while (blockInfo->parent >= LastLinked);
// Compress: propagate the best (smallest-semi) label down the path.
const NodeInfo *pInfo = blockInfo;
const NodeInfo *pLabelInfo = &nodeToInfo[pInfo->label];
do {
blockInfo = stack.back();
stack.pop_back();
blockInfo->parent = pInfo->parent;
const NodeInfo *labelInfo = &nodeToInfo[blockInfo->label];
if (pLabelInfo->semi < labelInfo->semi) {
blockInfo->label = pInfo->label;
} else {
pLabelInfo = labelInfo;
}
pInfo = blockInfo;
} while (!stack.empty());
return blockInfo->label;
}
// Lazily materializes the DomTree node for BB, creating its ancestors
// (via idoms) first.
DomTreeNode *getNodeForBlock(BasicBlockPtrT BB, DomTree<BasicBlockPtrT> &DT) {
if (auto Node = DT.getNode(BB))
return Node;
BasicBlockPtrT IDom = getIDom(BB);
auto IDomNode = getNodeForBlock(IDom, DT);
return DT.createChild(BB, IDomNode);
}
BasicBlockPtrT getIDom(BasicBlockPtrT BB) const {
auto InfoIt = nodeToInfo.find(BB);
if (InfoIt == nodeToInfo.end())
return nullptr;
return InfoIt->second.immDom;
}
public:
// Builds the dominator tree rooted at `root`. `walkSuccessors(bb, cb)`
// must call cb once per CFG successor of bb. DFS numbers on the result
// are already computed.
template <typename WalkFn>
DomTree<BasicBlockPtrT> build(BasicBlockPtrT root,
const WalkFn &walkSuccessors) {
runDFS(root, walkSuccessors);
runSemiNCA();
DomTree<BasicBlockPtrT> domTree;
domTree.createRoot(root);
nodeToInfo[indexToNode[1]].immDom = root;
for (size_t i = 1, e = indexToNode.size(); i != e; ++i) {
BasicBlockPtrT node = indexToNode[i];
if (domTree.getNode(node))
continue;
BasicBlockPtrT immDom = getIDom(node);
auto immDomNode = getNodeForBlock(immDom, domTree);
domTree.createChild(node, immDomNode);
}
domTree.updateDFSNumbers();
return domTree;
}
};
/// Convenience wrapper: constructs a dominator tree rooted at `root` using a
/// throwaway DomTreeBuilder. `walkSuccessors(bb, cb)` must invoke `cb` once
/// per CFG successor of `bb` (enforced by the constraint below).
template <typename BasicBlockPtrT>
DomTree<BasicBlockPtrT> buildDomTree(BasicBlockPtrT root, auto &&walkSuccessors)
  requires requires(void (*cb)(BasicBlockPtrT)) { walkSuccessors(root, cb); }
{
  DomTreeBuilder<BasicBlockPtrT> builder;
  return builder.build(root, walkSuccessors);
}

View file

@ -0,0 +1,14 @@
#pragma once
#include "ir/Context.hpp" // IWYU pragma: export
#include "ir/Instruction.hpp" // IWYU pragma: export
#include "ir/Location.hpp" // IWYU pragma: export
#include "ir/Node.hpp" // IWYU pragma: export
#include "ir/Operand.hpp" // IWYU pragma: export
#include "ir/PointerWrapper.hpp" // IWYU pragma: export
#include "ir/PrintableWrapper.hpp" // IWYU pragma: export
#include "ir/Value.hpp" // IWYU pragma: export
#include "ir/Builder.hpp" // IWYU pragma: export
#include "ir/Region.hpp" // IWYU pragma: export
#include "ir/OperandPrint.hpp" // IWYU pragma: export
#include "ir/Impl.hpp" // IWYU pragma: export

View file

@ -0,0 +1,52 @@
#pragma once
#include "RegionLike.hpp"
#include "RegionLikeImpl.hpp"
#include "ValueImpl.hpp"
namespace shader::ir {
// Wrapper mixin combining region-like behavior with value semantics; the
// base for Block handles below.
template <typename ImplT>
struct BlockWrapper : RegionLikeWrapper<ImplT, ValueWrapper> {
using RegionLikeWrapper<ImplT, ValueWrapper>::RegionLikeWrapper;
using RegionLikeWrapper<ImplT, ValueWrapper>::operator=;
};
struct BlockImpl;
// Non-owning handle to a BlockImpl node.
struct Block : BlockWrapper<BlockImpl> {
using BlockWrapper<BlockImpl>::BlockWrapper;
using BlockWrapper<BlockImpl>::operator=;
};
// A block node: an IR value that also contains child instructions.
struct BlockImpl : ValueImpl, RegionLikeImpl {
BlockImpl(Location loc);
Node clone(Context &context, CloneMap &map) const override;
// Prints "%name = [operands...] { children }"; the operand list is
// omitted when empty.
void print(std::ostream &os, NameStorage &ns) const override {
os << '%' << ns.getNameOf(const_cast<BlockImpl *>(this));
os << " = ";
if (!getOperands().empty()) {
os << '[';
for (bool first = true; auto &operand : getOperands()) {
if (first) {
first = false;
} else {
os << ", ";
}
operand.print(os, ns);
}
os << "] ";
}
os << "{\n";
for (auto child : children()) {
os << "  ";
child.print(os, ns);
os << "\n";
}
os << "}";
}
};
} // namespace shader::ir

View file

@ -0,0 +1,84 @@
#pragma once
#include "Context.hpp"
#include "Node.hpp"
#include "RegionLikeImpl.hpp"
namespace shader::ir {
// CRTP facade mixed into builder interface extensions: forwards context
// access and node creation to the concrete builder type, so opcode-specific
// mixins can define typed create helpers without knowing the builder.
template <typename BuilderT, typename ImplT> struct BuilderFacade {
ImplT &instance() {
return *static_cast<ImplT *>(static_cast<BuilderT *>(this));
}
Context &getContext() { return instance().getContext(); }
// NOTE(review): declared to return Node while Builder::getInsertionStorage
// returns RegionLike — confirm this mismatch is intentional.
Node getInsertionStorage() { return instance().getInsertionStorage(); }
// Forwards to the builder's create<T>; T must wrap a NodeImpl-derived
// implementation constructible from ArgsT.
template <typename T, typename... ArgsT>
requires requires {
typename T::underlying_type;
requires std::is_constructible_v<typename T::underlying_type, ArgsT...>;
requires std::is_base_of_v<NodeImpl, typename T::underlying_type>;
}
T create(ArgsT &&...args) {
return instance().template create<T>(std::forward<ArgsT>(args)...);
}
};
/// IR instruction builder with a configurable insertion cursor.
///
/// `InterfaceTs` are CRTP mixins (see BuilderFacade) that contribute typed
/// creation helpers on top of the generic create<T>() below. The builder
/// tracks an insertion storage (a region-like container) and an insertion
/// point inside it; each created node is inserted right after the point and
/// the point then advances, so consecutive creations append in order.
template <template <typename> typename... InterfaceTs>
class Builder : public InterfaceTs<Builder<InterfaceTs...>>... {
  Context *mContext{};
  RegionLike mInsertionStorage;
  // Node after which the next creation is inserted; null means "at front".
  Instruction mInsertionPoint;

public:
  Builder() = default;
  Builder(Context &context) : mContext(&context) {}

  /// Cursor positioned immediately after `point`.
  static Builder createInsertAfter(Context &context, Instruction point) {
    auto result = Builder(context);
    result.mInsertionStorage = point.getParent();
    result.mInsertionPoint = point;
    return result;
  }

  /// Cursor positioned immediately before `point`: the insertion point is
  /// point's predecessor (null predecessor degenerates to a prepend).
  static Builder createInsertBefore(Context &context, Instruction point) {
    auto result = Builder(context);
    result.mInsertionStorage = point.getParent();
    result.mInsertionPoint = point.getPrev().cast<Instruction>();
    return result;
  }

  /// Cursor positioned after the last instruction of `storage` (append).
  static Builder createAppend(Context &context, RegionLike storage) {
    auto result = Builder(context);
    result.mInsertionStorage = storage;
    result.mInsertionPoint = storage.getLast().cast<Instruction>();
    return result;
  }

  /// Cursor positioned before the first instruction of `storage`: a null
  /// insertion point makes insertAfter place nodes at the front.
  static Builder createPrepend(Context &context, RegionLike storage) {
    auto result = Builder(context);
    result.mInsertionStorage = storage;
    result.mInsertionPoint = nullptr;
    return result;
  }

  Context &getContext() { return *mContext; }
  RegionLike getInsertionStorage() { return mInsertionStorage; }
  Instruction getInsertionPoint() { return mInsertionPoint; }
  void setInsertionPoint(Instruction inst) { mInsertionPoint = inst; }

  /// Creates a node of wrapper type `T` in the context, inserts it at the
  /// cursor, and, when the node converts to an Instruction, advances the
  /// cursor past it so subsequent creations keep program order.
  template <typename T, typename... ArgsT>
    requires requires {
      typename T::underlying_type;
      requires std::is_constructible_v<typename T::underlying_type, ArgsT...>;
      requires std::is_base_of_v<NodeImpl, typename T::underlying_type>;
    }
  T create(ArgsT &&...args) {
    auto result = getContext().template create<T>(std::forward<ArgsT>(args)...);
    // (removed: unused local alias `using InstanceType = typename
    // T::underlying_type;` — dead code left over from an earlier revision)
    getInsertionStorage().insertAfter(getInsertionPoint(), result);
    if constexpr (requires { mInsertionPoint = Instruction(result); }) {
      mInsertionPoint = Instruction(result);
    }
    return result;
  }
};
} // namespace ir

View file

@ -0,0 +1,84 @@
#pragma once
#include "Location.hpp"
#include "NodeImpl.hpp"
#include "Operand.hpp"
#include <forward_list>
#include <memory>
#include <set>
#include <type_traits>
#include <utility>
namespace shader::ir {
// Orders smart pointers by their pointees; used to deduplicate interned
// locations held by unique_ptr.
struct UniqPtrCompare {
  static bool operator()(const auto &lhs, const auto &rhs)
    requires requires { *lhs <=> *rhs; }
  {
    return (*lhs <=> *rhs) < 0;
  }
};
// Owns every IR node and interns source locations.
//
// Nodes allocated through create<>() live for the lifetime of the context.
// Locations are deduplicated: getLocation() keeps exactly one
// implementation per distinct location value (ordered by pointee via
// UniqPtrCompare), so equal locations share a pointer.
class Context {
  std::forward_list<std::unique_ptr<NodeImpl>> mNodes;
  std::set<std::unique_ptr<LocationImpl>, UniqPtrCompare> mLocations;
  std::unique_ptr<UnknownLocationImpl> mUnknownLocation;

public:
  Context() = default;
  Context(const Context &) = delete;
  Context(Context &&) = default;
  Context &operator=(Context &&) = default;

  // Allocates a node whose wrapper type is T; ownership stays with the
  // context, the returned handle is non-owning.
  template <typename T, typename... ArgsT>
    requires requires {
      typename T::underlying_type;
      requires std::is_constructible_v<typename T::underlying_type, ArgsT...>;
      requires std::is_base_of_v<NodeImpl, typename T::underlying_type>;
    }
  T create(ArgsT &&...args) {
    auto result = new typename T::underlying_type(std::forward<ArgsT>(args)...);
    mNodes.emplace_front(std::unique_ptr<NodeImpl>{result});
    return T(result);
  }

  // Returns the interned location equal to the one constructed from `args`;
  // a freshly built duplicate is discarded by the set insert.
  template <typename T, typename... ArgsT>
    requires requires {
      typename T::underlying_type;
      requires std::is_constructible_v<typename T::underlying_type, ArgsT...>;
      requires std::is_base_of_v<LocationImpl, typename T::underlying_type>;
    }
  T getLocation(ArgsT &&...args) {
    auto result = std::make_unique<typename T::underlying_type>(
        std::forward<ArgsT>(args)...);
    auto ptr = mLocations.insert(std::move(result)).first->get();
    return T(static_cast<typename T::underlying_type *>(ptr));
  }

  PathLocation getPathLocation(std::string path) {
    return getLocation<PathLocation>(std::move(path));
  }
  TextFileLocation getTextFileLocation(PathLocation location,
                                       std::uint64_t line,
                                       std::uint64_t column = 0) {
    return getLocation<TextFileLocation>(location, line, column);
  }
  TextFileLocation getTextFileLocation(std::string path, std::uint64_t line,
                                       std::uint64_t column = 0) {
    // `path` is a sink parameter: move it through instead of copying.
    return getLocation<TextFileLocation>(getPathLocation(std::move(path)),
                                         line, column);
  }
  OffsetLocation getOffsetLocation(Location baseLocation,
                                   std::uint64_t offset) {
    return getLocation<OffsetLocation>(baseLocation, offset);
  }
  MemoryLocation getMemoryLocation(std::uint64_t address, std::uint64_t size) {
    return getLocation<MemoryLocation>(address, size);
  }
  // Lazily created singleton "unknown" location.
  UnknownLocation getUnknownLocation() {
    if (mUnknownLocation == nullptr) {
      mUnknownLocation = std::make_unique<UnknownLocationImpl>();
    }
    return mUnknownLocation.get();
  }
};
} // namespace shader::ir

View file

@ -0,0 +1,361 @@
#pragma once
#include "../dialect/builtin.hpp"
#include "../dialect/memssa.hpp"
#include "Block.hpp"
#include "Context.hpp"
#include "InstructionImpl.hpp"
#include "NodeImpl.hpp"
#include "RegionImpl.hpp"
#include "ValueImpl.hpp"
namespace shader::ir {
// Appends an operand; if it refers to a Value, registers this instruction
// as a user of that value at the index the operand will occupy.
inline void InstructionImpl::addOperand(Operand operand) {
if (operand != nullptr) {
if (auto value = operand.getAsValue()) {
// operands.size() is the index after the pending append.
value.get()->addUse(this, operands.size());
}
}
operands.addOperand(std::move(operand));
}
// Replaces the operand at `index`, keeping value use lists consistent, and
// returns the previous operand. Aborts on out-of-range access.
inline Operand InstructionImpl::replaceOperand(int index, Operand operand) {
if (operands.size() <= unsigned(index)) {
std::abort();
}
if (!operands[index].isNull()) {
// Drop this instruction from the old value's use list.
if (auto value = operands[index].getAsValue()) {
value.get()->removeUse(this, index);
}
}
// Register the use of the incoming value, if any.
if (auto value = operand.getAsValue()) {
value.get()->addUse(this, index);
}
return std::exchange(operands[index], std::move(operand));
}
// Removes `count` operands starting at `index` and returns the first
// removed one. Goes through replaceOperand for every slot so value use
// lists stay consistent while the tail is shifted left.
inline Operand InstructionImpl::eraseOperand(int index, int count) {
// Fast path: the erased range is the tail, nothing needs to shift.
if (index + count == operands.size()) {
auto result = replaceOperand(index, nullptr);
for (int i = 1; i < count; ++i) {
replaceOperand(i + index, nullptr);
}
operands.resize(operands.size() - count);
return result;
}
// Save the first erased operand, null out the erased range, then shift the
// remaining tail left by `count`.
auto result = replaceOperand(index, replaceOperand(index + 1, nullptr));
for (int i = 1; i < count; ++i) {
replaceOperand(index + i, nullptr);
}
for (int i = index + 1; i < operands.size() - count; ++i) {
replaceOperand(i, replaceOperand(i + count, nullptr));
}
operands.resize(operands.size() - count);
return result;
}
// Detaches the instruction from the IR entirely: drops all of its operand
// uses, clears the operand list and unlinks it from its parent region.
// Aborts if this instruction is a Value that still has users.
inline void InstructionImpl::remove() {
if (auto value = Instruction(this).cast<Value>()) {
if (!value.isUnused()) {
std::abort();
}
}
for (int index = 0; auto &operand : operands) {
if (auto value = operand.getAsValue()) {
value.get()->removeUse(this, index);
}
index++;
}
operands.clear();
if (parent != nullptr) {
erase();
}
}
// Unlinks the instruction from its parent's intrusive list, fixing up the
// neighbours' prev/next and the parent's first/last pointers. Operand use
// lists are untouched (see remove() for full detachment).
inline void InstructionImpl::erase() {
assert(parent != nullptr);
if (prev != nullptr) {
prev.get()->next = next;
} else {
// This was the first instruction.
parent.get()->first = next;
}
if (next != nullptr) {
next.get()->prev = prev;
} else {
// This was the last instruction.
parent.get()->last = prev;
}
prev = nullptr;
next = nullptr;
parent = nullptr;
}
// Moves every instruction from `other` into this region, preserving order.
template <typename ImplT, template <typename> typename BaseWrapper>
void RegionLikeWrapper<ImplT, BaseWrapper>::appendRegion(RegionLike other) {
for (auto child = other.getFirst(); child != nullptr;) {
// Advance before erasing: erase() clears the node's next pointer.
auto node = child;
child = child.getNext();
node.erase();
this->addChild(node);
}
}
// Inserts detached `node` immediately after `point`; a null `point` means
// "insert at the front". `node` must not already be linked anywhere.
inline void RegionLikeImpl::insertAfter(Instruction point, Instruction node) {
assert(point == nullptr || point.getParent() == this);
assert(node.getParent() == nullptr);
assert(node.getPrev() == nullptr);
assert(node.getNext() == nullptr);
if (point == nullptr) {
prependChild(node);
return;
}
assert(first != nullptr);
assert(last != nullptr);
node.get()->parent = this;
node.get()->prev = point.get();
if (auto pointNext = point.getNext()) {
pointNext.get()->prev = node.get();
node.get()->next = pointNext.get();
} else {
// `point` was the tail; `node` becomes the new tail.
assert(last == point);
last = node.get();
}
point.get()->next = node.get();
}
// Links detached `node` as the first instruction of this region.
inline void RegionLikeImpl::prependChild(Instruction node) {
assert(node.getParent() == nullptr);
assert(node.getPrev() == nullptr);
assert(node.getNext() == nullptr);
node.get()->parent = this;
if (last == nullptr) {
// Region was empty; node is also the tail.
last = node;
} else {
first.get()->prev = node;
node.get()->next = first;
}
first = node;
}
// Links detached `node` as the last instruction of this region.
inline void RegionLikeImpl::addChild(Instruction node) {
assert(node.getParent() == nullptr);
assert(node.getPrev() == nullptr);
assert(node.getNext() == nullptr);
node.get()->parent = this;
if (first == nullptr) {
// Region was empty; node is also the head.
first = node;
} else {
last.get()->next = node;
node.get()->prev = last;
}
last = node;
}
// Prints the region body: each child instruction on its own indented line,
// wrapped in braces.
inline void RegionImpl::print(std::ostream &os, NameStorage &ns) const {
os << "{\n";
for (auto child : children()) {
os << " ";
child.print(os, ns);
os << "\n";
}
os << "}";
}
// Returns the operand as a Value handle, or a null handle when the operand
// holds anything other than a ValueImpl pointer.
inline Value Operand::getAsValue() const {
  auto ptr = std::get_if<ValueImpl *>(&value);
  if (ptr == nullptr) {
    return {};
  }
  return Value(*ptr);
}
// Clones `object` through `map`, which memoizes already-cloned nodes so
// shared subgraphs stay shared in the copy.
template <typename T>
T clone(T object, Context &context, CloneMap &map, bool isOperand = false)
requires requires {
map.getOrClone(context, object, isOperand).template staticCast<T>();
}
{
return map.getOrClone(context, object, isOperand).template staticCast<T>();
}
// Convenience overload: clone with a fresh, empty clone map.
template <typename T>
T clone(T object, Context &context)
requires requires(CloneMap map) { clone(object, context, map); }
{
CloneMap map;
return clone(object, context, map);
}
// Location overload: locations are interned per context, so cloning goes
// through LocationImpl::clone instead of the node clone map.
template <typename T>
T clone(T location, Context &context)
requires requires { Location(location).get()->clone(context); }
{
if (location == nullptr) {
return nullptr;
}
return Location(location).get()->clone(context).staticCast<T>();
}
namespace detail {
// Shared implementation for cloning instruction-like nodes: clones the
// location, constructs a new node of wrapper type T with `args` (typically
// kind and op), then clones every operand through the map.
template <typename T, typename U, typename... ArgsT>
requires(std::is_same_v<typename T::underlying_type, U>)
T cloneInstructionImpl(const U *object, Context &context, CloneMap &map,
ArgsT &&...args) {
auto result = context.create<T>(clone(object->getLocation(), context),
std::forward<ArgsT>(args)...);
for (auto &&operand : object->getOperands()) {
result.addOperand(operand.clone(context, map));
}
return result;
}
} // namespace detail
// Instructions and values clone via the shared instruction path.
inline Node InstructionImpl::clone(Context &context, CloneMap &map) const {
return detail::cloneInstructionImpl<Instruction>(this, context, map, kind,
op);
}
inline Node ValueImpl::clone(Context &context, CloneMap &map) const {
return detail::cloneInstructionImpl<Value>(this, context, map, kind, op);
}
// Regions additionally clone their child instructions recursively.
inline Node RegionImpl::clone(Context &context, CloneMap &map) const {
auto result = context.create<Region>(ir::clone(getLocation(), context));
for (auto &&child : children()) {
result.addChild(ir::clone(child, context, map));
}
return result;
}
// A block is a builtin-dialect value that also owns child instructions;
// cloning copies both the operands and the children.
inline BlockImpl::BlockImpl(Location loc)
: ValueImpl(loc, ir::Kind::Builtin, builtin::BLOCK) {}
inline Node BlockImpl::clone(Context &context, CloneMap &map) const {
auto result = context.create<Block>(ir::clone(getLocation(), context));
for (auto &&operand : getOperands()) {
result.addOperand(operand.clone(context, map));
}
for (auto &&child : children()) {
result.addChild(ir::clone(child, context, map));
}
return result;
}
// Value operands are cloned through the map (flagged as operand uses);
// every other payload is self-contained and copied as-is.
inline Operand Operand::clone(Context &context, CloneMap &map) const {
  auto asValue = getAsValue();
  if (!asValue) {
    return *this;
  }
  return ir::clone(asValue, context, map, true);
}
// memssa node cloning. Phi/Var/Use/Def are plain instruction-like nodes
// identified by (kind, op), so they reuse the generic instruction cloning
// path (clone location, construct with kind/op, clone operands through the
// map) exactly like InstructionImpl/ValueImpl above, instead of duplicating
// that loop four times.
inline Node memssa::PhiImpl::clone(Context &context, CloneMap &map) const {
  return ir::detail::cloneInstructionImpl<Phi>(this, context, map, kind, op);
}
inline Node memssa::VarImpl::clone(Context &context, CloneMap &map) const {
  return ir::detail::cloneInstructionImpl<Var>(this, context, map, kind, op);
}
inline Node memssa::UseImpl::clone(Context &context, CloneMap &map) const {
  return ir::detail::cloneInstructionImpl<Use>(this, context, map, kind, op);
}
inline Node memssa::DefImpl::clone(Context &context, CloneMap &map) const {
  return ir::detail::cloneInstructionImpl<Def>(this, context, map, kind, op);
}
// Scopes carry operands and children, so both are cloned recursively.
inline Node memssa::ScopeImpl::clone(Context &context, CloneMap &map) const {
auto self = Scope(const_cast<ScopeImpl *>(this));
auto result = context.create<Scope>(ir::clone(self.getLocation(), context));
for (auto &&operand : self.getOperands()) {
result.addOperand(operand.clone(context, map));
}
for (auto child : self.children()) {
result.addChild(ir::clone(child, context, map));
}
return result;
}
// Location cloning re-interns the location data in the target context.
inline Location PathLocationImpl::clone(Context &context) const {
return context.getPathLocation(data.path);
}
// NOTE(review): data.file / baseLocation are passed through without being
// re-interned in `context`, so the clone can keep referencing a location
// owned by the source context — confirm this is intentional.
inline Location TextFileLocationImpl::clone(Context &context) const {
return context.getTextFileLocation(data.file, data.line, data.column);
}
inline Location OffsetLocationImpl::clone(Context &context) const {
return context.getOffsetLocation(baseLocation, offset);
}
inline Location MemoryLocationImpl::clone(Context &context) const {
return context.getMemoryLocation(data.address, data.size);
}
inline Location UnknownLocationImpl::clone(Context &context) const {
return context.getUnknownLocation();
}
// Default deep-clone: delegate to the node's virtual clone, then memoize
// the copy so cloning a clone is a no-op.
inline Node CloneMap::getOrCloneImpl(Context &context, Node node, bool) {
  auto cloned = node.get()->clone(context, *this);
  overrides[node] = cloned;
  return cloned;
}
} // namespace shader::ir

View file

@ -0,0 +1,72 @@
#pragma once
#include "Kind.hpp"
#include "Node.hpp"
namespace shader::ir {
enum class InstructionId : std::uint32_t {};
constexpr InstructionId getInstructionId(ir::Kind kind, unsigned op) {
return static_cast<InstructionId>(static_cast<std::uint32_t>(kind) |
static_cast<std::uint32_t>(op) << 5);
}
constexpr ir::Kind getInstructionKind(InstructionId id) {
return static_cast<ir::Kind>(static_cast<std::uint32_t>(id) & 0x1f);
}
constexpr unsigned getInstructionOp(InstructionId id) {
return static_cast<unsigned>(static_cast<std::uint32_t>(id) >> 5);
}
struct Region;
struct InstructionImpl;
struct Instruction;
// Common handle API shared by Instruction and instruction-derived wrappers:
// identity (kind/op), sibling navigation, operand editing and traversal.
template <typename ImplT> struct InstructionWrapper : NodeWrapper<ImplT> {
using NodeWrapper<ImplT>::NodeWrapper;
using NodeWrapper<ImplT>::operator=;
Kind getKind() const { return this->impl->kind; }
unsigned getOp() const { return this->impl->op; }
// Packed (kind, op) identifier; see getInstructionId.
InstructionId getInstId() const {
return getInstructionId(getKind(), getOp());
}
auto getParent() const { return this->impl->parent; };
bool hasParent() const { return this->impl->parent != nullptr; }
auto getNext() const { return Instruction(this->impl->next); }
auto getPrev() const { return Instruction(this->impl->prev); }
void addOperand(Operand operand) const { this->impl->addOperand(operand); }
decltype(auto) replaceOperand(int index, Operand operand) const {
return this->impl->replaceOperand(index, operand);
}
decltype(auto) eraseOperand(int index, int count = 1) const {
return this->impl->eraseOperand(index, count);
}
void insertAfter(Node point, Node node) const {
this->impl->insertAfter(point, node);
}
void erase() const { this->impl->erase(); }
void remove() const { this->impl->remove(); }
template <typename T = Node> auto children() const {
return this->impl->template children<T>();
}
decltype(auto) getOperand(std::size_t i) const { return this->impl->getOperand(i); }
decltype(auto) getOperands() const { return this->impl->getOperands(); }
std::size_t getOperandCount() const { return getOperands().size(); }
// Enums decay to their underlying integer operand.
template <typename T>
requires std::is_enum_v<T>
void addOperand(T enumValue) {
addOperand(std::to_underlying(enumValue));
}
};
// Handle type for generic instructions.
struct Instruction : InstructionWrapper<InstructionImpl> {
using InstructionWrapper<InstructionImpl>::InstructionWrapper;
using InstructionWrapper<InstructionImpl>::operator=;
};
} // namespace shader::ir

View file

@ -0,0 +1,69 @@
#pragma once
#include "Instruction.hpp"
#include "Kind.hpp"
#include "Location.hpp"
#include "NodeImpl.hpp"
#include "PrintableWrapper.hpp"
#include "RegionLike.hpp"
#include <ostream>
#include <span>
namespace shader::ir {
// A node living in an intrusive doubly-linked list inside a region.
//
// An instruction identifies itself by (kind, op), owns its operand list and
// keeps back references to its parent region and list neighbours.
struct InstructionImpl : NodeImpl {
  Kind kind;
  unsigned op;
  RegionLike parent; // owning region; null while detached
  Instruction prev;  // previous sibling; null if first
  Instruction next;  // next sibling; null if last
  OperandList operands;
  InstructionImpl(Location location, Kind kind, unsigned op,
                  std::span<const Operand> operands = {})
      : kind(kind), op(op) {
    setLocation(location);
    // Span elements are const and can only be copied (the original's
    // std::move on them was a no-op); addOperand also registers this
    // instruction in each value operand's use list.
    for (auto &&operand : operands) {
      addOperand(operand);
    }
  }
  // Enums decay to their underlying integer operand.
  template <typename T>
    requires std::is_enum_v<T>
  void addOperand(T enumValue) {
    addOperand(std::to_underlying(enumValue));
  }
  void addOperand(Operand operand);
  Operand replaceOperand(int index, Operand operand);
  Operand eraseOperand(int index, int count);
  void remove();
  void erase();
  decltype(auto) getOperand(std::size_t i) const {
    return operands.getOperand(i);
  }
  decltype(auto) getOperands() const { return std::span(operands); }
  // Prints "<kind>.<name>(op0, op1, ...)"; the operand list is omitted when
  // empty.
  void print(std::ostream &os, NameStorage &ns) const override {
    os << getInstructionName(kind, op);
    if (!operands.empty()) {
      os << "(";
      // Iterate by const reference: Operand holds a variant with a string
      // alternative, so copying each element would allocate.
      for (bool first = true; const auto &operand : operands) {
        if (first) {
          first = false;
        } else {
          os << ", ";
        }
        operand.print(os, ns);
      }
      os << ")";
    }
  }
  Node clone(Context &context, CloneMap &map) const override;
};
} // namespace shader::ir

View file

@ -0,0 +1,205 @@
#pragma once
#include <string>
namespace shader::ir {
// Dialect/encoding families an instruction can belong to. The enumerator
// value is packed into the low 5 bits of InstructionId, so keep the total
// (Count) within 32 and only append before Count.
enum class Kind {
Spv,
Builtin,
AmdGpu,
Vop2,
Sop2,
Sopk,
Smrd,
Vop3,
Mubuf,
Mtbuf,
Mimg,
Ds,
Vintrp,
Exp,
Vop1,
Vopc,
Sop1,
Sopc,
Sopp,
MemSSA,
Count,
};
// Per-dialect opcode-name lookups, defined elsewhere; each returns null for
// an unknown opcode (see getInstructionShortName below).
namespace spv {
const char *getInstructionName(unsigned id);
}
namespace builtin {
const char *getInstructionName(unsigned id);
}
namespace amdgpu {
const char *getInstructionName(unsigned id);
}
namespace vop2 {
const char *getInstructionName(unsigned id);
}
namespace sop2 {
const char *getInstructionName(unsigned id);
}
namespace sopk {
const char *getInstructionName(unsigned id);
}
namespace smrd {
const char *getInstructionName(unsigned id);
}
namespace vop3 {
const char *getInstructionName(unsigned id);
}
namespace mubuf {
const char *getInstructionName(unsigned id);
}
namespace mtbuf {
const char *getInstructionName(unsigned id);
}
namespace mimg {
const char *getInstructionName(unsigned id);
}
namespace ds {
const char *getInstructionName(unsigned id);
}
namespace vintrp {
const char *getInstructionName(unsigned id);
}
namespace exp {
const char *getInstructionName(unsigned id);
}
namespace vop1 {
const char *getInstructionName(unsigned id);
}
namespace vopc {
const char *getInstructionName(unsigned id);
}
namespace sop1 {
const char *getInstructionName(unsigned id);
}
namespace sopc {
const char *getInstructionName(unsigned id);
}
namespace sopp {
const char *getInstructionName(unsigned id);
}
namespace memssa {
const char *getInstructionName(unsigned id);
}
// Maps a dialect kind to its lower-case printable name. The table is
// indexed by the enumerator value, which runs contiguously from Spv (0) to
// MemSSA; anything outside that range (including Kind::Count) yields
// "<invalid>".
inline const char *getKindName(Kind kind) {
  constexpr const char *kNames[] = {
      "spv",  "builtin", "amdgpu", "vop2",   "sop2", "sopk",
      "smrd", "vop3",    "mubuf",  "mtbuf",  "mimg", "ds",
      "vintrp", "exp",   "vop1",   "vopc",   "sop1", "sopc",
      "sopp", "memssa",
  };
  auto index = static_cast<unsigned>(kind);
  if (index < sizeof(kNames) / sizeof(kNames[0])) {
    return kNames[index];
  }
  return "<invalid>";
}
// Returns the opcode's dialect-local name by dispatching to the dialect's
// lookup, or null when the opcode is unknown or the kind is invalid.
inline const char *getInstructionShortName(Kind kind, unsigned op) {
switch (kind) {
case Kind::Spv:
return spv::getInstructionName(op);
case Kind::Builtin:
return builtin::getInstructionName(op);
case Kind::AmdGpu:
return amdgpu::getInstructionName(op);
case Kind::Vop2:
return vop2::getInstructionName(op);
case Kind::Sop2:
return sop2::getInstructionName(op);
case Kind::Sopk:
return sopk::getInstructionName(op);
case Kind::Smrd:
return smrd::getInstructionName(op);
case Kind::Vop3:
return vop3::getInstructionName(op);
case Kind::Mubuf:
return mubuf::getInstructionName(op);
case Kind::Mtbuf:
return mtbuf::getInstructionName(op);
case Kind::Mimg:
return mimg::getInstructionName(op);
case Kind::Ds:
return ds::getInstructionName(op);
case Kind::Vintrp:
return vintrp::getInstructionName(op);
case Kind::Exp:
return exp::getInstructionName(op);
case Kind::Vop1:
return vop1::getInstructionName(op);
case Kind::Vopc:
return vopc::getInstructionName(op);
case Kind::Sop1:
return sop1::getInstructionName(op);
case Kind::Sopc:
return sopc::getInstructionName(op);
case Kind::Sopp:
return sopp::getInstructionName(op);
case Kind::MemSSA:
return memssa::getInstructionName(op);
case Kind::Count:
break;
}
return nullptr;
}
// Builds the fully qualified "<kind>.<opname>" printable name; opcodes the
// dialect does not know are rendered as "<kind>.<invalid N>".
inline std::string getInstructionName(Kind kind, unsigned op) {
  std::string result = getKindName(kind);
  result += '.';
  auto shortName = getInstructionShortName(kind, op);
  if (shortName == nullptr) {
    result += "<invalid ";
    result += std::to_string(op);
    result += '>';
  } else {
    result += shortName;
  }
  return result;
}
} // namespace shader::ir

View file

@ -0,0 +1,200 @@
#pragma once
#include "PrintableWrapper.hpp"
#include <cstdint>
#include <string>
namespace shader::ir {
struct LocationImpl;
struct CloneMap;
class Context;
// Thin non-owning handle over a LocationImpl-derived object.
template <typename ImplT> struct LocationWrapper : PrintableWrapper<ImplT> {
using PrintableWrapper<ImplT>::PrintableWrapper;
using PrintableWrapper<ImplT>::operator=;
};
using Location = LocationWrapper<LocationImpl>;
// Base class for interned source locations. Heterogeneous location types
// get a total order through the virtual compare(), which Context's
// location set relies on for deduplication.
struct LocationImpl {
virtual ~LocationImpl() {}
virtual void print(std::ostream &os) = 0;
virtual std::strong_ordering compare(const LocationImpl &other) const = 0;
// Re-interns this location into another context.
virtual Location clone(Context &context) const = 0;
auto operator<=>(const LocationImpl &other) const { return compare(other); }
};
// Location identified by a file path.
struct PathLocationImpl final : LocationImpl {
struct Data {
std::string path;
auto operator<=>(const Data &other) const = default;
} data;
PathLocationImpl(std::string path) : data{.path = std::move(path)} {}
void print(std::ostream &os) override { os << data.path; }
std::strong_ordering compare(const LocationImpl &other) const override {
if (this == &other) {
return std::strong_ordering::equal;
}
if (auto p = dynamic_cast<const PathLocationImpl *>(&other)) {
return this->data <=> p->data;
}
// Different location types: fall back to ordering by object address,
// stable within a run but otherwise arbitrary.
return this <=> &other;
}
Location clone(Context &context) const override;
};
// Handle for path locations.
struct PathLocation : LocationWrapper<PathLocationImpl> {
using LocationWrapper::LocationWrapper;
using LocationWrapper::operator=;
const std::string &getPath() const { return impl->data.path; }
};
// Location inside a text file: path + line + column, printed as
// "path:line:column".
struct TextFileLocationImpl final : LocationImpl {
struct Data {
// The file handle compares by pointer identity, which is stable because
// path locations are interned per context.
PathLocation file;
std::uint64_t line;
std::uint64_t column;
auto operator<=>(const Data &other) const = default;
} data;
TextFileLocationImpl(PathLocation file, std::uint64_t line,
std::uint64_t column)
: data{.file = file, .line = line, .column = column} {}
void print(std::ostream &os) override {
data.file.print(os);
os << ':' << data.line << ':' << data.column;
}
auto operator<=>(const TextFileLocationImpl &other) const = default;
std::strong_ordering compare(const LocationImpl &other) const override {
if (this == &other) {
return std::strong_ordering::equal;
}
if (auto p = dynamic_cast<const TextFileLocationImpl *>(&other)) {
return *this <=> *p;
}
// Different location types: fall back to address ordering.
return this <=> &other;
}
Location clone(Context &context) const override;
};
// Handle for text-file locations.
struct TextFileLocation : LocationWrapper<TextFileLocationImpl> {
using LocationWrapper::LocationWrapper;
using LocationWrapper::operator=;
PathLocation getFile() const { return impl->data.file; }
std::uint64_t getLine() const { return impl->data.line; }
std::uint64_t getColumn() const { return impl->data.column; }
};
// Payload of an offset location: a base location plus a byte offset. The
// base compares by pointer identity (locations are interned per context).
struct OffsetLocationData {
Location baseLocation;
std::uint64_t offset;
OffsetLocationData(Location baseLocation, std::uint64_t offset)
: baseLocation(baseLocation), offset(offset) {}
auto operator<=>(const OffsetLocationData &other) const = default;
};
// Location expressed as "base+offset".
struct OffsetLocationImpl final : OffsetLocationData, LocationImpl {
OffsetLocationImpl(Location file, std::uint64_t offset)
: OffsetLocationData(file, offset) {}
void print(std::ostream &os) override {
baseLocation.print(os);
os << '+' << offset;
}
std::strong_ordering compare(const LocationImpl &other) const override {
if (this == &other) {
return std::strong_ordering::equal;
}
// Cross-cast through the non-polymorphic data base; it succeeds exactly
// for OffsetLocationImpl instances.
if (auto p = dynamic_cast<const OffsetLocationData *>(&other)) {
return static_cast<const OffsetLocationData &>(*this) <=> *p;
}
return this <=> &other;
}
Location clone(Context &context) const override;
};
// Handle for offset locations.
struct OffsetLocation : LocationWrapper<OffsetLocationImpl> {
using LocationWrapper::LocationWrapper;
using LocationWrapper::operator=;
Location getBaseLocation() const { return impl->baseLocation; }
std::uint64_t getOffset() const { return impl->offset; }
};
// Location describing a raw memory range, printed as "(address - size)".
struct MemoryLocationImpl final : LocationImpl {
struct Data {
std::uint64_t address;
std::uint64_t size;
auto operator<=>(const Data &other) const = default;
} data;
MemoryLocationImpl(std::uint64_t address, std::uint64_t size)
: data{.address = address, .size = size} {}
void print(std::ostream &os) override {
os << '(' << data.address << " - " << data.size << ')';
}
std::strong_ordering compare(const LocationImpl &other) const override {
if (this == &other) {
return std::strong_ordering::equal;
}
if (auto p = dynamic_cast<const MemoryLocationImpl *>(&other)) {
return data <=> p->data;
}
// Different location types: fall back to address ordering.
return this <=> &other;
}
Location clone(Context &context) const override;
};
// Handle for memory-range locations.
struct MemoryLocation : LocationWrapper<MemoryLocationImpl> {
using LocationWrapper::LocationWrapper;
using LocationWrapper::operator=;
std::uint64_t getAddress() const { return impl->data.address; }
std::uint64_t getSize() const { return impl->data.size; }
};
// Placeholder location for IR whose origin is unknown.
struct UnknownLocationImpl final : LocationImpl {
  void print(std::ostream &os) override { os << "unknown"; }
  std::strong_ordering compare(const LocationImpl &other) const override {
    if (this == &other) {
      return std::strong_ordering::equal;
    }
    // Any two unknown locations are equivalent. This previously tested for
    // MemoryLocationImpl (copy/paste error), which made an unknown location
    // compare equal to *every* memory location and broke the strict
    // ordering Context's interned-location set depends on.
    if (dynamic_cast<const UnknownLocationImpl *>(&other)) {
      return std::strong_ordering::equal;
    }
    // Different location types: fall back to address ordering.
    return this <=> &other;
  }
  Location clone(Context &context) const override;
};
// Handle for the context's singleton unknown location.
struct UnknownLocation : LocationWrapper<UnknownLocationImpl> {
using LocationWrapper::LocationWrapper;
using LocationWrapper::operator=;
};
} // namespace shader::ir

View file

@ -0,0 +1,90 @@
#pragma once
#include "Node.hpp"
#include <set>
#include <string>
#include <unordered_map>
namespace shader::ir {
// Assigns stable printable names to IR nodes for dumps.
//
// Names live in a std::set, whose elements never move on insertion, so the
// node-to-name map can safely hold pointers into it.
class NameStorage {
  std::set<std::string> mNames;
  std::unordered_map<const NodeImpl *, const std::string *> mNodeToName;

public:
  // Names `node` exactly `name` when it is free; otherwise probes
  // "name_1", "name_2", ... until an unused name is found.
  void setUniqueNameOf(Node node, std::string name) {
    auto [nodeIt, nodeInserted] = mNodeToName.try_emplace(node.impl, nullptr);
    if (!nodeInserted && *nodeIt->second == name) {
      return; // node already carries the requested name
    }
    auto [nameIt, nameInserted] = mNames.insert(name);
    if (!nameInserted) {
      // Requested name is taken; probe numbered suffixes.
      std::size_t i = 1;
      while (true) {
        auto newName = name + "_" + std::to_string(i);
        auto [newNameIt, newNameInserted] = mNames.insert(std::move(newName));
        if (!newNameInserted) {
          ++i;
          continue;
        }
        nameIt = newNameIt;
        break;
      }
    }
    nodeIt->second = &*nameIt;
  }

  // Names `node` `name`; unlike setUniqueNameOf, several nodes may share
  // the same name.
  void setNameOf(Node node, std::string name) {
    auto [nodeIt, nodeInserted] = mNodeToName.try_emplace(node.impl, nullptr);
    if (!nodeInserted && *nodeIt->second == name) {
      return;
    }
    // `name` is no longer read past this point, so move it into the set
    // instead of copying (the original copied here).
    auto nameIt = mNames.insert(std::move(name)).first;
    nodeIt->second = &*nameIt;
  }

  // Returns the node's name, or an empty view when it has none.
  std::string_view tryGetNameOf(Node node) const {
    auto it = mNodeToName.find(node.impl);
    if (it == mNodeToName.end()) {
      return {};
    }
    return *it->second;
  }

  // Returns the node's name, inventing a numeric one (starting from the
  // current name count + 1) on first request.
  const std::string &getNameOf(Node node) {
    auto [it, inserted] = mNodeToName.emplace(node.impl, nullptr);
    if (inserted) {
      std::size_t i = mNames.size() + 1;
      while (true) {
        auto newName = std::to_string(i);
        auto [newNameIt, newNameInserted] = mNames.insert(std::move(newName));
        if (!newNameInserted) {
          ++i;
          continue;
        }
        it->second = &*newNameIt;
        break;
      }
    }
    return *it->second;
  }

  // Drops all names and node associations.
  void clear() {
    mNames.clear();
    mNodeToName.clear();
  }
};
} // namespace shader::ir

View file

@ -0,0 +1,17 @@
#pragma once
#include "Operand.hpp"
#include "PrintableWrapper.hpp"
namespace shader::ir {
template <typename ImplT> struct NodeWrapper;
using Node = NodeWrapper<NodeImpl>;
// Handle over a NodeImpl that adds location access on top of printing.
template <typename ImplT> struct NodeWrapper : PrintableWrapper<ImplT> {
using PrintableWrapper<ImplT>::PrintableWrapper;
using PrintableWrapper<ImplT>::operator=;
auto getLocation() const { return this->impl->getLocation(); }
};
} // namespace shader::ir

View file

@ -0,0 +1,65 @@
#pragma once
#include "Location.hpp"
#include "Node.hpp"
#include "Operand.hpp"
#include <cassert>
#include <map>
namespace shader::ir {
struct NodeImpl;
struct CloneMap;
class NameStorage;
class Context;
// namespace debug {
// [[gnu::used, gnu::noinline]] void dump(Node object);
// [[gnu::used, gnu::noinline]] void dump(NodeImpl *object);
// } // namespace debug
// Memoization table used while deep-cloning IR graphs. It maps visited
// nodes to their copies so shared subgraphs and cycles clone correctly, and
// callers can pre-seed substitutions via setOverride().
struct CloneMap {
virtual ~CloneMap() = default;
std::map<Node, Node> overrides;
void setOverride(Node from, Node to) { overrides[from] = to; }
// Returns the recorded mapping for `from`, or a null node when absent.
Node getOverride(Node from) {
if (auto it = overrides.find(from); it != overrides.end()) {
return it->second;
}
return {};
}
// Returns the mapped clone of `node`, cloning it on first sight. A null
// node maps to itself.
virtual Node getOrClone(Context &context, Node node, bool isOperand) {
if (node == nullptr) {
return node;
}
auto [it, inserted] = overrides.insert({node, nullptr});
if (inserted) {
it->second = getOrCloneImpl(context, node, isOperand);
// Map the clone onto itself so cloning an already-cloned node is a
// no-op. std::map iterators stay valid across these insertions.
overrides[it->second] = it->second;
}
return it->second;
}
virtual Node getOrCloneImpl(Context &context, Node node, bool isOperand);
};
// Root of every IR object: carries a source location plus the virtual
// print/clone interface.
struct NodeImpl {
Location location;
virtual ~NodeImpl() = default;
void setLocation(Location newLocation) { location = newLocation; }
Location getLocation() const { return location; }
virtual void print(std::ostream &os, NameStorage &ns) const = 0;
virtual Node clone(Context &context, CloneMap &map) const = 0;
};
} // namespace shader::ir

View file

@ -0,0 +1,152 @@
#pragma once
#include "../Vector.hpp"
#include <bit>
#include <compare>
#include <cstddef>
#include <cstdint>
#include <span>
#include <string>
#include <type_traits>
#include <utility>
#include <variant>
#include <vector>
namespace shader::ir {
class NameStorage;
class Context;
struct ValueImpl;
struct Value;
struct NodeImpl;
struct CloneMap;
template <typename ImplT> struct NodeWrapper;
using Node = NodeWrapper<NodeImpl>;
// A single instruction operand: null, a reference to a Value, or an
// immediate (integer, float, bool or string).
struct Operand {
using UnderlyingT =
std::variant<std::nullptr_t, ValueImpl *, std::int64_t, std::int32_t,
double, float, bool, std::string>;
UnderlyingT value{nullptr};
// Non-integral payloads (and bool) are stored as-is.
template <typename T>
requires(!std::is_integral_v<std::remove_cvref_t<T>> ||
std::is_same_v<bool, std::remove_cvref_t<T>>)
Operand(T &&value)
requires requires { UnderlyingT{std::forward<T>(value)}; }
: value(std::forward<T>(value)) {}
// Integers up to 32 bits normalize to int32, 64-bit integers to int64, so
// comparisons are independent of the source integer type.
template <typename T>
Operand(T value)
requires requires {
requires(std::is_integral_v<std::remove_cvref_t<T>> &&
!std::is_same_v<bool, T> && sizeof(T) <= sizeof(std::int32_t));
UnderlyingT{static_cast<std::int32_t>(value)};
}
: value(static_cast<std::int32_t>(value)) {}
template <typename T>
Operand(T value)
requires requires {
requires(std::is_integral_v<std::remove_cvref_t<T>> &&
sizeof(T) == sizeof(std::int64_t));
UnderlyingT{static_cast<std::int64_t>(value)};
}
: value(static_cast<std::int64_t>(value)) {}
// Enums decay to their underlying integer.
template <typename T>
requires(std::is_enum_v<std::remove_cvref_t<T>>)
Operand(T value) : Operand(std::to_underlying(value)) {}
// Node handles (anything exposing .impl) become value references; passing
// a null handle is a programming error and aborts.
template <typename T>
Operand(T &&value)
requires requires { Operand(value.impl); }
: Operand(value.impl) {
if (value.impl == nullptr) {
std::abort();
}
}
Operand() = default;
Operand(const Operand &) = default;
Operand(Operand &&) = default;
Operand &operator=(const Operand &) = default;
Operand &operator=(Operand &&) = default;
template <typename T>
Operand &operator=(T &&other)
requires requires { value = std::forward<T>(other); }
{
value = std::forward<T>(other);
return *this;
}
// Typed accessors return null when the operand holds another payload.
template <typename T> const T *getAs() const {
if (auto node = std::get_if<T>(&value)) {
return node;
}
return {};
}
Value getAsValue() const;
const std::string *getAsString() const { return getAs<std::string>(); }
const std::int32_t *getAsInt32() const { return getAs<std::int32_t>(); }
const std::int64_t *getAsInt64() const { return getAs<std::int64_t>(); }
const double *getAsDouble() const { return getAs<double>(); }
const float *getAsFloat() const { return getAs<float>(); }
const bool *getAsBool() const { return getAs<bool>(); }
bool isNull() const { return std::get_if<std::nullptr_t>(&value) != nullptr; }
explicit operator bool() const { return !isNull(); }
void print(std::ostream &os, NameStorage &ns) const;
Operand clone(Context &context, CloneMap &map) const;
// Total order: first by payload type index, then by payload. Floats
// compare by bit pattern so NaNs still order deterministically.
std::partial_ordering operator<=>(const Operand &other) const {
auto result = value.index() <=> other.value.index();
if (result != 0) {
return result;
}
return std::visit(
[](auto &&lhs, auto &&rhs) -> std::partial_ordering {
using lhs_type = std::remove_cvref_t<decltype(lhs)>;
using rhs_type = std::remove_cvref_t<decltype(rhs)>;
if constexpr (std::is_same_v<lhs_type, rhs_type>) {
if constexpr (std::is_same_v<lhs_type, std::nullptr_t>) {
return std::strong_ordering::equal;
} else if constexpr (std::is_same_v<lhs_type, float>) {
return std::bit_cast<std::uint32_t>(lhs) <=>
std::bit_cast<std::uint32_t>(rhs);
} else if constexpr (std::is_same_v<lhs_type, double>) {
return std::bit_cast<std::uint64_t>(lhs) <=>
std::bit_cast<std::uint64_t>(rhs);
} else {
return lhs <=> rhs;
}
}
// Unreachable: both variants hold the same index at this point.
throw;
},
value, other.value);
}
bool operator==(const Operand &) const = default;
};
// Operand container; addOperand mirrors the enum convenience overload used
// by InstructionImpl.
struct OperandList : std::vector<Operand> {
using std::vector<Operand>::vector;
using std::vector<Operand>::operator=;
template <typename T>
requires std::is_enum_v<T>
void addOperand(T enumValue) {
addOperand(std::to_underlying(enumValue));
}
void addOperand(Operand operand) { push_back(std::move(operand)); }
const Operand &getOperand(std::size_t i) const { return at(i); }
};
} // namespace shader::ir

View file

@ -0,0 +1,43 @@
#pragma once
#include "NameStorage.hpp"
#include "Operand.hpp"
#include "ValueImpl.hpp" // IWYU pragma: keep
namespace shader::ir {
/// Render this operand to \p os; SSA values are named through \p ns.
/// Exactly one branch fires: the checks cover every variant alternative,
/// which are mutually exclusive.
inline void Operand::print(std::ostream &os, NameStorage &ns) const {
  if (auto val = getAsValue()) {
    os << '%' << ns.getNameOf(val);
  } else if (auto str = getAsString()) {
    os << '"' << *str << '"';
  } else if (auto i32 = getAsInt32()) {
    os << *i32 << "i32";
  } else if (auto i64 = getAsInt64()) {
    os << *i64 << "i64";
  } else if (auto f32 = getAsFloat()) {
    os << *f32 << 'f';
  } else if (auto f64 = getAsDouble()) {
    os << *f64 << 'd';
  } else if (auto b = getAsBool()) {
    os << (*b ? "true" : "false");
  } else if (isNull()) {
    os << "null";
  } else {
    // Should be unreachable: every alternative is handled above.
    os << "<invalid operand " << value.index() << ">";
  }
}
} // namespace shader::ir

View file

@ -0,0 +1,88 @@
#pragma once
#include <cassert>
#include <functional>
#include <type_traits>
namespace shader::ir {
/// Non-owning typed handle around a raw IR node implementation pointer.
/// Derived wrappers layer a typed API on top of ImplT*; copying a wrapper
/// never copies or owns the pointee.
template <typename ImplT> struct PointerWrapper {
  using underlying_type = ImplT;
  ImplT *impl = nullptr;
  PointerWrapper() = default;
  PointerWrapper(ImplT *impl) : impl(impl) {}
  // Implicit upcast: a wrapper of a derived impl converts to a wrapper of
  // its base impl.
  template <typename OtherT>
    requires std::is_base_of_v<ImplT, OtherT>
  PointerWrapper(PointerWrapper<OtherT> node) : impl(node.impl) {}
  explicit operator bool() const { return impl != nullptr; }
  bool operator==(std::nullptr_t) const { return impl == nullptr; }
  bool operator==(ImplT *other) const { return impl == other; }
  // Deducing-this assignment: assigning through a derived wrapper yields
  // the derived wrapper type, so chains keep their static type.
  template <typename Self> Self &operator=(this Self &self, ImplT *other) {
    self.impl = other;
    return self;
  }
  template <typename Self, typename OtherT>
    requires std::is_base_of_v<ImplT, OtherT>
  Self &operator=(this Self &self, PointerWrapper<OtherT> other) {
    self.impl = other.get();
    return self;
  }
  // ImplT *operator->() const { return impl; }
  ImplT *get() const { return impl; }
  // Ordering/equality compare the raw pointers only.
  auto operator<=>(const PointerWrapper &) const = default;
  bool operator==(const PointerWrapper &) const = default;
  /// Checked downcast: yields a null wrapper when the dynamic type does not
  /// match T's underlying impl type.
  template <typename T>
  T cast() const
    requires requires { static_cast<typename T::underlying_type *>(impl); }
  {
    return T(dynamic_cast<typename T::underlying_type *>(impl));
  }
  /// Unchecked downcast; asserts (in debug builds) that the checked cast
  /// would have succeeded.
  template <typename T>
  T staticCast() const
    requires requires { static_cast<typename T::underlying_type *>(impl); }
  {
    assert(impl == nullptr || cast<T>() != nullptr);
    return T(static_cast<typename T::underlying_type *>(impl));
  }
  /// True when the (non-null) pointee's dynamic type matches T's impl type.
  template <typename T> bool isa() const {
    if (impl == nullptr) {
      return false;
    }
    if constexpr (std::is_same_v<std::remove_cvref_t<T>,
                                 std::remove_cvref_t<ImplT>>) {
      return true;
    } else if constexpr (!requires { cast<T>() != nullptr; }) {
      // T's impl type is unrelated to ImplT: can never match.
      return false;
    } else {
      return cast<T>() != nullptr;
    }
  }
  /// Multi-type isa: true when any of the listed types matches.
  template <typename... T>
    requires(sizeof...(T) > 1)
  bool isa() const {
    return (isa<T>() || ...);
  }
};
} // namespace shader::ir
namespace std {
// Allow every PointerWrapper-derived handle to be used as a key in
// unordered containers by hashing the underlying raw pointer.
template <typename T>
  requires std::is_base_of_v<
      shader::ir::PointerWrapper<typename T::underlying_type>, T>
struct hash<T> {
  constexpr std::size_t operator()(const T &pointer) const noexcept {
    return hash<typename T::underlying_type *>{}(pointer.impl);
  }
};
} // namespace std

View file

@ -0,0 +1,136 @@
#pragma once
#include "InstructionImpl.hpp" // IWYU pragma: keep
namespace shader::ir {
/// Forward iteration over a sibling chain of instructions, filtered to
/// nodes of type T. The successor is prefetched *before* an element is
/// yielded, so the loop body may unlink or destroy the visited instruction
/// without breaking the iteration.
template <typename T> struct PreincNodeIterable {
  struct EndIterator {};
  struct Iterator {
    Instruction nextElem;    // prefetched successor of currentElem
    Instruction currentElem; // element currently pointed at
    Instruction endElem;     // exclusive end of the range (may be null)
    Iterator() = default;
    Iterator(Instruction elem, Instruction end)
        : currentElem(elem), endElem(end) {
      nextElem = currentElem ? currentElem.getNext() : nullptr;
      // Skip leading elements that are not of the requested type.
      if constexpr (!std::is_same_v<Instruction, T>) {
        while (currentElem != endElem && !currentElem.isa<T>()) {
          advance();
        }
      }
    }
    T operator*() const { return currentElem.staticCast<T>(); }
    Iterator &operator++() {
      advance();
      // Keep advancing until the next element of type T (or the end).
      if constexpr (!std::is_same_v<Instruction, T>) {
        while (currentElem != endElem && !currentElem.isa<T>()) {
          advance();
        }
      }
      return *this;
    }
    bool operator==(const Iterator &) const = default;
    bool operator==(const EndIterator &) const {
      return currentElem == endElem;
    }
    // Step to the prefetched element and prefetch the one after it.
    void advance() {
      currentElem = nextElem;
      if (nextElem) {
        nextElem = nextElem.getNext();
      }
    }
  };
  PreincNodeIterable(Instruction beginIt, Instruction endIt)
      : mBeginIt(beginIt), mEndIt(endIt) {}
  Iterator begin() const { return Iterator(mBeginIt, mEndIt); }
  EndIterator end() const { return EndIterator{}; }

private:
  Instruction mBeginIt;
  Instruction mEndIt;
};
/// Backward counterpart of PreincNodeIterable: walks the sibling chain via
/// getPrev() with the same prefetch scheme, so the visited instruction may
/// be unlinked or destroyed by the loop body.
template <typename T> struct RevPreincNodeIterable {
  struct EndIterator {};
  struct Iterator {
    Instruction nextElem;    // prefetched predecessor of currentElem
    Instruction currentElem; // element currently pointed at
    Instruction endElem;     // exclusive end of the range (may be null)
    Iterator() = default;
    Iterator(Instruction elem, Instruction end)
        : currentElem(elem), endElem(end) {
      nextElem = currentElem ? currentElem.getPrev() : nullptr;
      // Skip leading elements that are not of the requested type.
      if constexpr (!std::is_same_v<Instruction, T>) {
        while (currentElem != endElem && !currentElem.isa<T>()) {
          advance();
        }
      }
    }
    T operator*() const { return currentElem.staticCast<T>(); }
    Iterator &operator++() {
      advance();
      // Keep advancing until the next element of type T (or the end).
      if constexpr (!std::is_same_v<Instruction, T>) {
        while (currentElem != endElem && !currentElem.isa<T>()) {
          advance();
        }
      }
      return *this;
    }
    bool operator==(const Iterator &) const = default;
    bool operator==(const EndIterator &) const {
      return currentElem == endElem;
    }
    // Step to the prefetched element and prefetch the one before it.
    void advance() {
      currentElem = nextElem;
      if (nextElem) {
        nextElem = nextElem.getPrev();
      }
    }
  };
  RevPreincNodeIterable(Instruction beginIt, Instruction endIt)
      : mBeginIt(beginIt), mEndIt(endIt) {}
  Iterator begin() const { return Iterator(mBeginIt, mEndIt); }
  EndIterator end() const { return EndIterator{}; }

private:
  Instruction mBeginIt;
  Instruction mEndIt;
};
/// Build a forward, mutation-safe iterable over [begin, end), optionally
/// filtered to instructions of type T.
template <typename T = Instruction>
inline PreincNodeIterable<T> range(Instruction begin,
                                   Instruction end = nullptr) {
  return PreincNodeIterable<T>(begin, end);
}
/// Build a backward, mutation-safe iterable over [begin, end), optionally
/// filtered to instructions of type T.
template <typename T = Instruction>
inline RevPreincNodeIterable<T> revRange(Instruction begin,
                                         Instruction end = nullptr) {
  return RevPreincNodeIterable<T>(begin, end);
}
} // namespace shader::ir

View file

@ -0,0 +1,26 @@
#pragma once
#include "PointerWrapper.hpp"
#include <ostream>
namespace shader::ir {
class NameStorage;
/// Wrapper mixin that forwards printing to the implementation object.
/// Prefers the two-argument impl->print(os, ns) overload when the impl
/// provides one; otherwise falls back to impl->print(os).
template <typename T> struct PrintableWrapper : PointerWrapper<T> {
  using PointerWrapper<T>::PointerWrapper;
  using PointerWrapper<T>::operator=;
  void print(std::ostream &os, NameStorage &ns) const {
    if constexpr (requires { this->impl->print(os, ns); }) {
      this->impl->print(os, ns);
    } else {
      this->impl->print(os);
    }
  }
  // Only available when the impl can print without a NameStorage.
  void print(std::ostream &os) const
    requires requires { this->impl->print(os); }
  {
    this->impl->print(os);
  }
};
} // namespace shader::ir

View file

@ -0,0 +1,19 @@
#pragma once
#include "Node.hpp"
#include "RegionLike.hpp"
namespace shader::ir {
/// Wrapper for region nodes: a plain Node that also behaves as a container
/// of child instructions (via RegionLikeWrapper).
template <typename ImplT>
struct RegionWrapper : RegionLikeWrapper<ImplT, NodeWrapper> {
  using RegionLikeWrapper<ImplT, NodeWrapper>::RegionLikeWrapper;
  using RegionLikeWrapper<ImplT, NodeWrapper>::operator=;
};
struct RegionImpl;
/// Non-owning handle to a RegionImpl.
struct Region : RegionWrapper<RegionImpl> {
  using RegionWrapper<RegionImpl>::RegionWrapper;
  using RegionWrapper<RegionImpl>::operator=;
};
} // namespace shader::ir

View file

@ -0,0 +1,15 @@
#pragma once
#include "NameStorage.hpp"
#include "NodeImpl.hpp"
#include "Region.hpp"
#include "RegionLikeImpl.hpp"
#include <ostream>
namespace shader::ir {
/// Implementation node for Region: a Node that owns an intrusive list of
/// child instructions (RegionLikeImpl).
struct RegionImpl : NodeImpl, RegionLikeImpl {
  RegionImpl(Location loc) { setLocation(loc); }
  void print(std::ostream &os, NameStorage &ns) const override;
  Node clone(Context &context, CloneMap &map) const override;
};
} // namespace shader::ir

View file

@ -0,0 +1,38 @@
#pragma once
#include "Instruction.hpp"
namespace shader::ir {
struct RegionLike;
/// Mixin exposing the child-list API of a RegionLikeImpl through an
/// arbitrary base wrapper (NodeWrapper, PointerWrapper, ...).
template <typename ImplT, template <typename> typename BaseWrapper>
struct RegionLikeWrapper : BaseWrapper<ImplT> {
  using BaseWrapper<ImplT>::BaseWrapper;
  using BaseWrapper<ImplT>::operator=;
  // Append the children of \p other to this region (defined out of line).
  void appendRegion(RegionLike other);
  auto getFirst() { return this->impl->first; }
  auto getLast() { return this->impl->last; }
  bool empty() { return this->impl->first == nullptr; }
  void insertAfter(Instruction point, Instruction node) {
    this->impl->insertAfter(point, node);
  }
  void prependChild(Instruction node) { this->impl->prependChild(node); }
  void addChild(Instruction node) { this->impl->addChild(node); }
  // Mutation-safe iteration over children, optionally filtered by type.
  template <typename T = Instruction> auto children() {
    return this->impl->template children<T>();
  }
  template <typename T = Instruction> auto revChildren() {
    return this->impl->template revChildren<T>();
  }
};
struct RegionLikeImpl;
/// Type-erased handle to any region-like implementation.
struct RegionLike : RegionLikeWrapper<RegionLikeImpl, PointerWrapper> {
  using RegionLikeWrapper::RegionLikeWrapper;
  using RegionLikeWrapper::operator=;
};
} // namespace shader::ir

View file

@ -0,0 +1,25 @@
#pragma once
#include "PreincNodeIterable.hpp"
#include "RegionLike.hpp"
namespace shader::ir {
/// Shared state for all region-like nodes: an intrusive, doubly-linked
/// list of child instructions.
struct RegionLikeImpl {
  Instruction first = nullptr; // head of the child list
  Instruction last = nullptr;  // tail of the child list
  virtual ~RegionLikeImpl() = default;
  /// Forward iteration over children of type T; safe against unlinking the
  /// currently visited instruction.
  template <typename T = Instruction> auto children() const {
    return PreincNodeIterable<T>{first, nullptr};
  }
  /// Backward iteration over children of type T.
  template <typename T = Instruction> auto revChildren() const {
    return RevPreincNodeIterable<T>{last, nullptr};
  }
  virtual void insertAfter(Instruction point, Instruction node);
  virtual void prependChild(Instruction node);
  virtual void addChild(Instruction node);
};
} // namespace shader::ir

View file

@ -0,0 +1,36 @@
#pragma once
#include "Instruction.hpp"
#include "Operand.hpp"
namespace shader::ir {
struct Value;
/// Wrapper for instructions that produce a result and can therefore appear
/// as operands; adds use-list bookkeeping on top of InstructionWrapper.
template <typename T> struct ValueWrapper : InstructionWrapper<T> {
  using InstructionWrapper<T>::InstructionWrapper;
  using InstructionWrapper<T>::operator=;
  // Deduplicated set of instructions that reference this value.
  decltype(auto) getUserList() const { return this->impl->getUserList(); }
  // Raw use records (user instruction, operand index).
  auto & getUseList() const { return this->impl->uses; }
  void replaceAllUsesWith(Value other) const;
  bool isUnused() const { return this->impl->uses.empty(); }
};
struct ValueImpl;
struct Value : ValueWrapper<ValueImpl> {
  using ValueWrapper::ValueWrapper;
  using ValueWrapper::operator=;
};
// Defined out of line because it needs the complete Value type.
template <typename T>
void ValueWrapper<T>::replaceAllUsesWith(Value other) const {
  this->impl->replaceAllUsesWith(other);
}
/// One reference to a Value: \p user reads \p node through its operand at
/// \p operandIndex.
struct ValueUse {
  Instruction user;
  Value node;
  int operandIndex;
  auto operator<=>(const ValueUse &) const = default;
};
} // namespace shader::ir

View file

@ -0,0 +1,55 @@
#pragma once
#include "InstructionImpl.hpp"
#include "NameStorage.hpp"
#include "Node.hpp"
#include "Value.hpp"
namespace shader::ir {
/// Implementation of a result-producing instruction; maintains the ordered
/// set of uses so the value can be queried for users or replaced.
struct ValueImpl : InstructionImpl {
  std::set<ValueUse> uses;
  ValueImpl(Location location, Kind kind, unsigned op,
            std::span<const Operand> operands = {})
      : InstructionImpl(location, kind, op, operands) {}
  // Called by instructions when they start/stop referencing this value.
  void addUse(Instruction user, int operandIndex) {
    uses.insert({user, this, operandIndex});
  }
  void removeUse(Instruction user, int operandIndex) {
    uses.erase({user, this, operandIndex});
  }
  /// Deduplicated set of instructions that use this value.
  std::set<Node> getUserList() const {
    std::set<Node> list;
    for (auto use : uses) {
      list.insert(use.user);
    }
    return list;
  }
  /// Rewrite every use of this value to reference \p other (or a null
  /// operand when \p other is null). Each replaceOperand call is expected
  /// to remove the corresponding entry from `uses`, which is what makes
  /// the loop terminate.
  void replaceAllUsesWith(Value other) {
    if (other == this) {
      std::abort(); // replacing a value with itself would never terminate
    }
    while (!uses.empty()) {
      auto use = *uses.begin();
      if (other == nullptr) {
        use.user.replaceOperand(use.operandIndex, nullptr);
      } else {
        use.user.replaceOperand(use.operandIndex, other);
      }
    }
  }
  // Prints "%name = <instruction>".
  void print(std::ostream &os, NameStorage &ns) const override {
    os << '%' << ns.getNameOf(const_cast<ValueImpl *>(this));
    os << " = ";
    InstructionImpl::print(os, ns);
  }
  Node clone(Context &context, CloneMap &map) const override;
};
} // namespace shader::ir

View file

@ -0,0 +1,7 @@
#pragma once
#include "ir/Context.hpp"
#include "ir/Region.hpp"
namespace shader {
/// Optimize the instructions of \p region in place.
/// NOTE(review): the meaning of the bool result is not visible here -
/// presumably "something was changed"; confirm against the implementation.
bool optimize(ir::Context &context, ir::Region region);
}

View file

@ -0,0 +1,173 @@
#pragma once
#include "ir/Context.hpp"
#include "ir/Region.hpp"
#include "ir/RegionImpl.hpp"
#include <optional>
#include <span>
#include <spirv-tools/optimizer.hpp>
namespace shader::spv {
/// Logical sections of a SPIR-V module, kept in the order the SPIR-V
/// specification requires them to be serialized. Each section is an
/// optional IR region created on demand.
struct BinaryLayout {
  enum {
    kCapabilities,
    kExtensions,
    kExtInstImports,
    kMemoryModels,
    kEntryPoints,
    kExecutionModes,
    kDebugs,
    kAnnotations,
    kGlobals,
    kFunctionDeclarations,
    kFunctions,
    kRegionCount
  };
  ir::Region regions[kRegionCount];
  /// Lazily create the region for section \p index.
  ir::Region getOrCreateRegion(ir::Context &context, int index) {
    if (regions[index] == nullptr) {
      regions[index] = context.create<ir::Region>(context.getUnknownLocation());
    }
    return regions[index];
  }
  // Per-section convenience wrappers around getOrCreateRegion().
  ir::Region getOrCreateCapabilities(ir::Context &context) {
    return getOrCreateRegion(context, kCapabilities);
  }
  ir::Region getOrCreateExtensions(ir::Context &context) {
    return getOrCreateRegion(context, kExtensions);
  }
  ir::Region getOrCreateExtInstImports(ir::Context &context) {
    return getOrCreateRegion(context, kExtInstImports);
  }
  ir::Region getOrCreateMemoryModels(ir::Context &context) {
    return getOrCreateRegion(context, kMemoryModels);
  }
  ir::Region getOrCreateEntryPoints(ir::Context &context) {
    return getOrCreateRegion(context, kEntryPoints);
  }
  ir::Region getOrCreateExecutionModes(ir::Context &context) {
    return getOrCreateRegion(context, kExecutionModes);
  }
  ir::Region getOrCreateDebugs(ir::Context &context) {
    return getOrCreateRegion(context, kDebugs);
  }
  ir::Region getOrCreateAnnotations(ir::Context &context) {
    return getOrCreateRegion(context, kAnnotations);
  }
  ir::Region getOrCreateGlobals(ir::Context &context) {
    return getOrCreateRegion(context, kGlobals);
  }
  ir::Region getOrCreateFunctionDeclarations(ir::Context &context) {
    return getOrCreateRegion(context, kFunctionDeclarations);
  }
  ir::Region getOrCreateFunctions(ir::Context &context) {
    return getOrCreateRegion(context, kFunctions);
  }
  ///
  /// \brief Merge all regions into a single one.
  ///
  /// After calling this function, all regions in the object
  /// become empty.
  ///
  ir::Region merge(ir::Context &context) {
    auto result = context.create<ir::Region>(context.getUnknownLocation());
    for (auto &region : regions) {
      if (region == nullptr) {
        continue;
      }
      // Region is a pointer wrapper, so std::move is effectively a copy;
      // the explicit reset below is what empties the slot.
      result.appendRegion(std::move(region));
      region = {};
    }
    return result;
  }
};
///
/// Deserialize a SPIR-V binary into an intermediate representation.
///
/// \param context context to attach the IR to
/// \param spv SPIR-V binary
/// \param loc location to use for error reporting
/// \returns the deserialized IR, or std::nullopt if deserialization failed
///
std::optional<BinaryLayout> deserialize(ir::Context &context,
std::span<const std::uint32_t> spv,
ir::Location loc);
///
/// \brief Serialize SPIR-V from an IR region.
///
/// This function generates a SPIR-V binary from an IR region.
/// The SPIR-V binary is stored in the returned vector.
///
/// \returns A vector of u32 values representing the SPIR-V binary.
///
std::vector<std::uint32_t> serialize(ir::Region body);
inline std::vector<std::uint32_t> serialize(ir::Context &context,
BinaryLayout &&layout) {
return serialize(layout.merge(context));
}
///
/// \brief Returns true if the instruction is a terminator.
///
bool isTerminatorInst(ir::InstructionId inst);
///
/// \brief Disassemble a SPIR-V binary into text and print result to stderr.
///
/// \param spv The SPIR-V binary to disassemble.
/// \param pretty If true, emit friendly names for functions, variables, and
/// other values. If false, emit the SPIR-V ID for each value.
///
/// \note The SPIR-V binary is not validated or checked for errors. If the
/// input is invalid, the output is undefined.
void dump(std::span<const std::uint32_t> spv, bool pretty = false);
///
/// \brief Disassemble a SPIR-V binary into text.
///
/// \param spv The SPIR-V binary to disassemble.
/// \param pretty If true, emit friendly names for functions, variables, and
/// other values. If false, emit the SPIR-V ID for each value.
/// \return the assembly text
///
/// \note The SPIR-V binary is not validated or checked for errors. If the
/// input is invalid, the output is undefined.
std::string disassembly(std::span<const std::uint32_t> spv, bool pretty = false);
///
/// \brief Validates a given SPIR-V binary against the SPIR-V spec
///
/// \param spv the SPIR-V binary to validate
/// \return whether the SPIR-V binary is valid
///
/// This functions uses the SPIR-V Tools validator to check the given SPIR-V
/// binary against the SPIR-V spec. If the SPIR-V is invalid, the function
/// will print out the validation error messages and return false. If the
/// SPIR-V is valid, the function simply returns true.
bool validate(std::span<const std::uint32_t> spv);
///
/// \brief Optimize a SPIR-V module.
///
/// \param spv the SPIR-V binary to optimize
/// \return the optimized SPIR-V binary or an empty optional if binary is
/// invalid
///
/// This function takes a SPIR-V module and runs a series of optimization passes
/// on it using SPIR-V Tools opt. If the optimization is successful, the
/// optimized module is returned. Otherwise, an empty optional is returned.
///
std::optional<std::vector<std::uint32_t>>
optimize(std::span<const std::uint32_t> spv);
} // namespace shader::spv

View file

@ -0,0 +1,8 @@
#pragma once
#include "SpvConverter.hpp"
#include "ir.hpp"
namespace shader {
/// Rewrite the control flow of \p region into structured form suitable for
/// SPIR-V structured control-flow rules.
/// NOTE(review): the role of \p exitLabel is inferred from its name
/// (the label control transfers to on exit) - confirm against the
/// implementation.
void structurizeCfg(spv::Context &context, ir::RegionLike region,
                    ir::Value exitLabel);
}

View file

@ -0,0 +1,19 @@
# Generates rdna-semantic-spirv.hpp: a header embedding the SPIR-V binary
# compiled from rdna.glsl by shader-tool, exported through an INTERFACE
# library so consumers only pick up the include path and the dependency.
set(OUTPUT_FILENAME rdna-semantic-spirv.hpp)
set(INCLUDE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include)
set(OUTPUT_DIRECTORY ${INCLUDE_DIRECTORY}/shaders)
set(OUTPUT_FILE ${OUTPUT_DIRECTORY}/${OUTPUT_FILENAME})
set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/rdna.glsl)
file(MAKE_DIRECTORY ${OUTPUT_DIRECTORY})
# Re-run shader-tool whenever the tool itself or the GLSL source changes.
add_custom_command(
  OUTPUT ${OUTPUT_FILE}
  COMMAND $<TARGET_FILE:shader-tool> --output-type spirv-header --output-var-name g_rdna_semantic_spirv -i ${INPUT_FILE} -o ${OUTPUT_FILE}
  DEPENDS shader-tool ${INPUT_FILE}
  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  COMMENT "Generating ${OUTPUT_FILE}..."
)
add_custom_target(rdna-semantic-spirv-gen DEPENDS ${OUTPUT_FILE})
add_library(rdna-semantic-spirv INTERFACE)
add_dependencies(rdna-semantic-spirv rdna-semantic-spirv-gen)
target_include_directories(rdna-semantic-spirv INTERFACE ${INCLUDE_DIRECTORY})

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,274 @@
#include "Evaluator.hpp"
#include "dialect.hpp"
#include "ir.hpp"
using namespace shader;
/// Evaluate a single operand to a constant Value.
///
/// \param op   operand to fold: either an SSA value (evaluated recursively
///             and memoized in `values`) or an immediate literal.
/// \param type optional type instruction whose operands are
///             [0] = bit width, [1] = signedness flag (for ints); used to
///             give the literal its exact width/signedness. When null, the
///             literal is returned as-is.
/// \returns the folded value, or an empty Value when folding fails.
eval::Value eval::Evaluator::eval(const ir::Operand &op, ir::Value type) {
  if (auto val = op.getAsValue()) {
    // Memoize per-value results; the placeholder inserted by try_emplace
    // also terminates evaluation of self-referential IR.
    auto [it, inserted] = values.try_emplace(val, Value{});
    if (inserted) {
      it->second = eval(val);
    }
    return it->second;
  }
  if (auto result = op.getAsInt32()) {
    if (type != nullptr) {
      bool isSigned = *type.getOperand(1).getAsInt32() != 0;
      switch (*type.getOperand(0).getAsInt32()) {
      case 8:
        if (isSigned) {
          return static_cast<std::int8_t>(*result);
        }
        return static_cast<std::uint8_t>(*result);
      case 16:
        if (isSigned) {
          return static_cast<std::int16_t>(*result);
        }
        return static_cast<std::uint16_t>(*result);
      case 32:
        if (isSigned) {
          return static_cast<std::int32_t>(*result);
        }
        return static_cast<std::uint32_t>(*result);
      }
      // Unsupported width for a 32-bit literal.
      return {};
    }
    return *result;
  }
  if (auto result = op.getAsInt64()) {
    if (type != nullptr) {
      bool isSigned = *type.getOperand(1).getAsInt32() != 0;
      if (isSigned) {
        return static_cast<std::int64_t>(*result);
      }
      return static_cast<std::uint64_t>(*result);
    }
    return *result;
  }
  if (auto result = op.getAsBool()) {
    return *result;
  }
  if (auto result = op.getAsFloat()) {
    if (type != nullptr) {
      switch (*type.getOperand(0).getAsInt32()) {
      case 16:
        return static_cast<float16_t>(*result);
      case 64:
        // FIX: this previously returned static_cast<std::uint64_t>(*result),
        // silently turning a float constant into an integer value.
        return static_cast<double>(*result);
      }
      return *result;
    }
    return *result;
  }
  if (auto result = op.getAsDouble()) {
    return *result;
  }
  return {};
}
/// Constant-fold one instruction.
///
/// Operand layout follows the IR convention for value instructions:
/// operands[0] is the result type, the actual inputs start at operands[1].
/// Signed/unsigned/float opcode variants share one operator because the
/// evaluated Value carries its own type (the original merges them too).
/// \returns the folded value, or an empty Value for unsupported opcodes or
/// non-constant inputs.
eval::Value eval::Evaluator::eval(ir::InstructionId instId,
                                  std::span<const ir::Operand> operands) {
  if (instId == ir::spv::OpConstant) {
    return eval(operands[1], operands[0].getAsValue());
  }
  if (instId == ir::spv::OpBitcast) {
    return eval(operands[1]).bitcast(operands[0].getAsValue());
  }
  if (instId == ir::spv::OpSConvert || instId == ir::spv::OpUConvert) {
    if (auto rhs = eval(operands[1])) {
      return rhs.iConvert(operands[0].getAsValue(),
                          instId == ir::spv::OpSConvert);
    }
    return {};
  }
  if (instId == ir::spv::OpSelect) {
    return eval(operands[1]).select(eval(operands[2]), eval(operands[3]));
  }
  if (instId == ir::spv::OpIAdd || instId == ir::spv::OpFAdd) {
    return eval(operands[1]) + eval(operands[2]);
  }
  if (instId == ir::spv::OpISub || instId == ir::spv::OpFSub) {
    return eval(operands[1]) - eval(operands[2]);
  }
  if (instId == ir::spv::OpSDiv || instId == ir::spv::OpUDiv ||
      instId == ir::spv::OpFDiv) {
    return eval(operands[1]) / eval(operands[2]);
  }
  // Mod and Rem variants all map onto Value's operator% (merged identical
  // branches of the original).
  if (instId == ir::spv::OpSMod || instId == ir::spv::OpUMod ||
      instId == ir::spv::OpFMod || instId == ir::spv::OpSRem ||
      instId == ir::spv::OpFRem) {
    return eval(operands[1]) % eval(operands[2]);
  }
  if (instId == ir::spv::OpSNegate || instId == ir::spv::OpFNegate) {
    // FIX: operand 0 is the result type; the value to negate is operand 1.
    // The original evaluated operands[0], so negation never folded.
    return -eval(operands[1]);
  }
  if (instId == ir::spv::OpNot) {
    return ~eval(operands[1]);
  }
  if (instId == ir::spv::OpLogicalNot) {
    return !eval(operands[1]);
  }
  if (instId == ir::spv::OpLogicalEqual || instId == ir::spv::OpIEqual) {
    return eval(operands[1]) == eval(operands[2]);
  }
  if (instId == ir::spv::OpLogicalNotEqual || instId == ir::spv::OpINotEqual) {
    return eval(operands[1]) != eval(operands[2]);
  }
  if (instId == ir::spv::OpLogicalOr) {
    return eval(operands[1]) || eval(operands[2]);
  }
  if (instId == ir::spv::OpLogicalAnd) {
    return eval(operands[1]) && eval(operands[2]);
  }
  if (instId == ir::spv::OpUGreaterThan || instId == ir::spv::OpSGreaterThan) {
    return eval(operands[1]) > eval(operands[2]);
  }
  if (instId == ir::spv::OpUGreaterThanEqual ||
      instId == ir::spv::OpSGreaterThanEqual) {
    return eval(operands[1]) >= eval(operands[2]);
  }
  if (instId == ir::spv::OpULessThan || instId == ir::spv::OpSLessThan) {
    return eval(operands[1]) < eval(operands[2]);
  }
  if (instId == ir::spv::OpULessThanEqual ||
      instId == ir::spv::OpSLessThanEqual) {
    return eval(operands[1]) <= eval(operands[2]);
  }
  // Float comparisons: "ordered" forms are false when either input is NaN,
  // "unordered" forms are true when either input is NaN.
  if (instId == ir::spv::OpFOrdEqual) {
    return !eval(operands[1]).isNan() && !eval(operands[2]).isNan() &&
           eval(operands[1]) == eval(operands[2]);
  }
  if (instId == ir::spv::OpFUnordEqual) {
    return eval(operands[1]).isNan() || eval(operands[2]).isNan() ||
           eval(operands[1]) == eval(operands[2]);
  }
  if (instId == ir::spv::OpFOrdNotEqual) {
    return !eval(operands[1]).isNan() && !eval(operands[2]).isNan() &&
           eval(operands[1]) != eval(operands[2]);
  }
  if (instId == ir::spv::OpFUnordNotEqual) {
    return eval(operands[1]).isNan() || eval(operands[2]).isNan() ||
           eval(operands[1]) != eval(operands[2]);
  }
  if (instId == ir::spv::OpFOrdLessThan) {
    return !eval(operands[1]).isNan() && !eval(operands[2]).isNan() &&
           eval(operands[1]) < eval(operands[2]);
  }
  if (instId == ir::spv::OpFUnordLessThan) {
    return eval(operands[1]).isNan() || eval(operands[2]).isNan() ||
           eval(operands[1]) < eval(operands[2]);
  }
  if (instId == ir::spv::OpFOrdGreaterThan) {
    return !eval(operands[1]).isNan() && !eval(operands[2]).isNan() &&
           eval(operands[1]) > eval(operands[2]);
  }
  if (instId == ir::spv::OpFUnordGreaterThan) {
    return eval(operands[1]).isNan() || eval(operands[2]).isNan() ||
           eval(operands[1]) > eval(operands[2]);
  }
  if (instId == ir::spv::OpFOrdLessThanEqual) {
    return !eval(operands[1]).isNan() && !eval(operands[2]).isNan() &&
           eval(operands[1]) <= eval(operands[2]);
  }
  if (instId == ir::spv::OpFUnordLessThanEqual) {
    return eval(operands[1]).isNan() || eval(operands[2]).isNan() ||
           eval(operands[1]) <= eval(operands[2]);
  }
  if (instId == ir::spv::OpFOrdGreaterThanEqual) {
    return !eval(operands[1]).isNan() && !eval(operands[2]).isNan() &&
           eval(operands[1]) >= eval(operands[2]);
  }
  if (instId == ir::spv::OpFUnordGreaterThanEqual) {
    return eval(operands[1]).isNan() || eval(operands[2]).isNan() ||
           eval(operands[1]) >= eval(operands[2]);
  }
  // Both right-shift flavors use operator>>; the Value's own signedness
  // selects the logical/arithmetic behavior.
  if (instId == ir::spv::OpShiftRightLogical ||
      instId == ir::spv::OpShiftRightArithmetic) {
    return eval(operands[1]) >> eval(operands[2]);
  }
  if (instId == ir::spv::OpShiftLeftLogical) {
    return eval(operands[1]) << eval(operands[2]);
  }
  if (instId == ir::spv::OpBitwiseOr) {
    return eval(operands[1]) | eval(operands[2]);
  }
  if (instId == ir::spv::OpBitwiseXor) {
    return eval(operands[1]) ^ eval(operands[2]);
  }
  if (instId == ir::spv::OpBitwiseAnd) {
    return eval(operands[1]) & eval(operands[2]);
  }
  if (instId == ir::spv::OpIsNan) {
    return eval(operands[1]).isNan();
  }
  if (instId == ir::spv::OpIsInf) {
    return eval(operands[1]).isInf();
  }
  if (instId == ir::spv::OpIsFinite) {
    return eval(operands[1]).isFinite();
  }
  if (instId == ir::spv::OpCompositeConstruct) {
    std::vector<Value> constituents;
    constituents.reserve(operands.size() - 1);
    for (auto &op : operands.subspan(1)) {
      constituents.push_back(eval(op));
    }
    return Value::compositeConstruct(operands[0].getAsValue(), constituents);
  }
  if (instId == ir::spv::OpCompositeExtract) {
    auto composite = eval(operands[1].getAsValue());
    if (composite.empty()) {
      return {};
    }
    std::vector<Value> indexes;
    indexes.reserve(operands.size() - 2);
    for (auto &op : operands.subspan(2)) {
      indexes.push_back(eval(op));
    }
    // Only single-level extraction is supported for now.
    if (indexes.size() != 1) {
      return {};
    }
    return composite.compositeExtract(indexes[0]);
  }
  return {};
}
/// Fold a value-producing instruction by dispatching on its opcode and
/// operand list.
eval::Value eval::Evaluator::eval(ir::Value op) {
  return eval(op.getInstId(), op.getOperands());
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,106 @@
#include "ModuleInfo.hpp"
#include "analyze.hpp"
#include "dialect.hpp"
#include "ir.hpp"
/// Collect per-function information (parameter list, referenced non-local
/// variables, and their read/write access) for \p function, memoized in
/// \p moduleInfo. Callee information is computed recursively on
/// OpFunctionCall and folded into the caller.
shader::ModuleInfo::Function &
shader::collectFunctionInfo(ModuleInfo &moduleInfo, ir::Value function) {
  auto [fnIt, fnInserted] =
      moduleInfo.functions.try_emplace(function, ModuleInfo::Function{});
  if (!fnInserted) {
    // Already analyzed (or analysis in progress): return the memoized record.
    return fnIt->second;
  }
  auto &result = fnIt->second;
  std::map<ir::Value, int> params; // parameter value -> parameter index
  result.returnType = function.getOperand(0).getAsValue();
  // Record that \p pointer is accessed with \p access, attributing the
  // access either to one of our parameters or to a non-local variable.
  auto trackAccess = [&](ir::Value pointer, Access access) {
    pointer = unwrapPointer(pointer);
    if (auto it = params.find(pointer); it != params.end()) {
      result.parameters[it->second].access |= access;
      return;
    }
    if (pointer == ir::spv::OpVariable) {
      // Variable operand 1 holds the storage class.
      auto storagePtr = pointer.getOperand(1).getAsInt32();
      if (!storagePtr) {
        return;
      }
      auto storage = ir::spv::StorageClass(*storagePtr);
      // Function-local variables are invisible to callers; skip them.
      if (storage != ir::spv::StorageClass::Function) {
        result.variables[pointer] = access;
      }
    }
  };
  // Walk the function body up to its OpFunctionEnd.
  for (auto inst : ir::range(function.getNext())) {
    if (inst == ir::spv::OpFunctionEnd) {
      break;
    }
    if (inst == ir::spv::OpFunctionParameter) {
      auto type = inst.getOperand(0).getAsValue();
      params[inst.staticCast<ir::Value>()] = result.parameters.size();
      result.parameters.push_back({.type = type, .access = Access::None});
      continue;
    }
    if (inst == ir::spv::OpFunctionCall) {
      // Fold the callee's parameter and global accesses into this function.
      auto callee = inst.getOperand(1).getAsValue();
      auto &calleeInfo = collectFunctionInfo(moduleInfo, callee);
      auto args = inst.getOperands().subspan(2);
      for (std::size_t index = 0; auto &[_, access] : calleeInfo.parameters) {
        trackAccess(args[index++].getAsValue(), access);
      }
      for (auto &[global, access] : calleeInfo.variables) {
        trackAccess(global, access);
      }
      continue;
    }
    if (inst == ir::spv::OpLoad || inst == ir::spv::OpAtomicLoad) {
      trackAccess(inst.getOperand(1).getAsValue(), Access::Read);
      continue;
    }
    if (inst == ir::spv::OpStore || inst == ir::spv::OpAtomicStore) {
      trackAccess(inst.getOperand(0).getAsValue(), Access::Write);
      continue;
    }
    // Read-modify-write atomics touch their pointer both ways.
    if (inst == ir::spv::OpAtomicExchange ||
        inst == ir::spv::OpAtomicCompareExchange ||
        inst == ir::spv::OpAtomicCompareExchangeWeak ||
        inst == ir::spv::OpAtomicIIncrement ||
        inst == ir::spv::OpAtomicIDecrement || inst == ir::spv::OpAtomicIAdd ||
        inst == ir::spv::OpAtomicISub || inst == ir::spv::OpAtomicSMin ||
        inst == ir::spv::OpAtomicUMin || inst == ir::spv::OpAtomicSMax ||
        inst == ir::spv::OpAtomicUMax || inst == ir::spv::OpAtomicAnd ||
        inst == ir::spv::OpAtomicOr || inst == ir::spv::OpAtomicXor) {
      trackAccess(inst.getOperand(1).getAsValue(), Access::ReadWrite);
    }
  }
  return result;
}
/// Analyze every OpFunction in the module's functions region and record its
/// information in \p moduleInfo.
void shader::collectModuleInfo(ModuleInfo &moduleInfo,
                               const spv::BinaryLayout &layout) {
  auto functionsRegion = layout.regions[spv::BinaryLayout::kFunctions];
  if (functionsRegion == nullptr) {
    // Module has no functions section; nothing to analyze.
    return;
  }
  for (auto inst : functionsRegion.children<ir::Value>()) {
    if (inst != ir::spv::OpFunction) {
      continue;
    }
    collectFunctionInfo(moduleInfo, inst);
  }
}

View file

@ -0,0 +1,149 @@
#include "SemanticInfo.hpp"
#include "dialect.hpp"
using namespace shader;
/// Number of opcodes defined for an instruction kind. Returns 0 for kinds
/// that are deliberately excluded from semantic-name matching (Spv,
/// Builtin, MemSSA) and for Kind::Count.
static std::size_t getOpCount(ir::Kind kind) {
  switch (kind) {
  case ir::Kind::Spv:
  case ir::Kind::Builtin:
  case ir::Kind::MemSSA:
    break;
  case ir::Kind::AmdGpu:
    return ir::amdgpu::OpCount;
  case ir::Kind::Vop2:
    return ir::vop2::OpCount;
  case ir::Kind::Sop2:
    return ir::sop2::OpCount;
  case ir::Kind::Sopk:
    return ir::sopk::OpCount;
  case ir::Kind::Smrd:
    return ir::smrd::OpCount;
  case ir::Kind::Vop3:
    return ir::vop3::OpCount;
  case ir::Kind::Mubuf:
    return ir::mubuf::OpCount;
  case ir::Kind::Mtbuf:
    return ir::mtbuf::OpCount;
  case ir::Kind::Mimg:
    return ir::mimg::OpCount;
  case ir::Kind::Ds:
    return ir::ds::OpCount;
  case ir::Kind::Vintrp:
    return ir::vintrp::OpCount;
  case ir::Kind::Exp:
    // Export is a single instruction, not an enumerated opcode set.
    return 1;
  case ir::Kind::Vop1:
    return ir::vop1::OpCount;
  case ir::Kind::Vopc:
    return ir::vopc::OpCount;
  case ir::Kind::Sop1:
    return ir::sop1::OpCount;
  case ir::Kind::Sopc:
    return ir::sopc::OpCount;
  case ir::Kind::Sopp:
    return ir::sopp::OpCount;
  case ir::Kind::Count:
    break;
  }
  return 0;
}
/// Bind semantic functions in \p layout to hardware instruction ids via
/// their OpName debug names, filling moduleInfo.semantics.
///
/// Two name forms are recognized: the short form ("<inst>") and the wide
/// form prefixed with the kind name ("<kind>_<inst>"); the wide form takes
/// priority when both match.
void shader::collectSemanticModuleInfo(SemanticModuleInfo &moduleInfo,
                                       const spv::BinaryLayout &layout) {
  // Lazily built lookup: short instruction name -> matching ids (a short
  // name may exist in several kinds).
  static auto instNameToIds = [] {
    std::map<std::string, std::vector<ir::InstructionId>, std::less<>> result;
    for (std::size_t kind = 0; kind < std::size_t(ir::Kind::Count); ++kind) {
      auto opCount = getOpCount(ir::Kind(kind));
      for (unsigned op = 0; op < opCount; ++op) {
        auto name = getInstructionShortName(ir::Kind(kind), op);
        if (name == nullptr) {
          continue;
        }
        result[name].push_back(ir::getInstructionId(ir::Kind(kind), op));
      }
    }
    return result;
  }();
  collectModuleInfo(moduleInfo, layout);
  // Lazily built lookup: "<kind>_<name>" -> ids, disambiguated by kind.
  static auto wideInstNameToIds = [] {
    std::map<std::string, std::vector<ir::InstructionId>, std::less<>> result;
    for (std::size_t kind = 0; kind < std::size_t(ir::Kind::Count); ++kind) {
      auto opCount = getOpCount(ir::Kind(kind));
      if (opCount == 0) {
        continue;
      }
      for (unsigned op = 0; op < opCount; ++op) {
        auto name = getInstructionShortName(ir::Kind(kind), op);
        if (name == nullptr) {
          continue;
        }
        std::string wideName = getKindName(ir::Kind(kind));
        wideName += '_';
        wideName += name;
        result[std::move(wideName)].push_back(
            ir::getInstructionId(ir::Kind(kind), op));
      }
    }
    return result;
  }();
  for (auto &[fn, info] : moduleInfo.functions) {
    // Find the OpName debug instruction attached to this function.
    for (auto &use : fn.getUseList()) {
      if (use.user != ir::spv::OpName) {
        continue;
      }
      auto mangledNameString = use.user.getOperand(1).getAsString();
      if (mangledNameString == nullptr) {
        break;
      }
      auto mangledName = std::string_view(*mangledNameString);
      std::string_view name;
      // Strip the argument list from names like "v_add_f32(...)"; names
      // without parentheses are not semantic functions.
      if (auto pos = mangledName.find('('); pos != std::string_view::npos) {
        name = mangledName.substr(0, pos);
      } else {
        break;
      }
      std::vector<ir::InstructionId> *ids = nullptr;
      std::vector<ir::InstructionId> *wideIds = nullptr;
      if (auto it = wideInstNameToIds.find(name);
          it != wideInstNameToIds.end()) {
        wideIds = &it->second;
      }
      if (auto it = instNameToIds.find(name); it != instNameToIds.end()) {
        ids = &it->second;
      }
      if (ids == nullptr && wideIds == nullptr) {
        break;
      }
      if (wideIds != nullptr) {
        // Wide names overwrite any previous binding.
        for (auto id : *wideIds) {
          moduleInfo.semantics[id] = fn;
        }
      } else {
        // Short names only fill ids that are still unbound.
        // NOTE(review): the []= vs emplace asymmetry looks deliberate (wide
        // names take priority over short ones) - confirm.
        for (auto id : *ids) {
          moduleInfo.semantics.emplace(id, fn);
        }
      }
      break;
    }
  }
}

View file

@ -0,0 +1,641 @@
#include "SpvConverter.hpp"
#include "dialect.hpp"
#include "dialect/spv.hpp"
#include <string>
using namespace shader;
using Builder = ir::Builder<ir::spv::Builder, ir::builtin::Builder>;
static std::string getTypeName(ir::Value type);
/// Build a human-readable debug name for a constant instruction, or an
/// empty string when the constant's type is not handled.
static std::string getConstantName(ir::Value constant) {
  if (constant == ir::spv::OpConstant) {
    auto typeValue = constant.getOperand(0).getAsValue();
    auto value = constant.getOperand(1);
    if (typeValue == ir::spv::OpTypeInt) {
      auto width = *typeValue.getOperand(0).getAsInt32();
      if (width <= 32) {
        if (value.getAsInt32() == nullptr) {
          std::abort(); // literal width does not match the declared type
        }
        // NOTE(review): narrow ints use a bare "_" prefix while all other
        // cases use "c_" - confirm whether this asymmetry is intended.
        return "_" + std::to_string(*value.getAsInt32());
      }
      if (value.getAsInt64() == nullptr) {
        std::abort();
      }
      return "c_" + std::to_string(*value.getAsInt64());
    }
    if (typeValue == ir::spv::OpTypeFloat) {
      auto width = *typeValue.getOperand(0).getAsInt32();
      if (width == 32) {
        if (value.getAsFloat() == nullptr) {
          std::abort();
        }
        return "c_" + std::to_string(*value.getAsFloat());
      }
      if (value.getAsDouble() == nullptr) {
        std::abort();
      }
      return "c_" + std::to_string(*value.getAsDouble());
    }
    return {};
  }
  if (constant == ir::spv::OpConstantTrue) {
    return "true";
  }
  if (constant == ir::spv::OpConstantFalse) {
    return "false";
  }
  if (constant == ir::spv::OpConstantNull) {
    return "null_" + getTypeName(constant.getOperand(0).getAsValue());
  }
  return {};
}
// Builds a short human-readable debug name for a SPIR-V type, or an empty
// string for type opcodes this helper does not know about.
static std::string getTypeName(ir::Value type) {
  if (type == ir::spv::OpTypeVoid) {
    return "void";
  }
  if (type == ir::spv::OpTypeBool) {
    return "bool";
  }
  if (type == ir::spv::OpTypeSampler) {
    return "sampler";
  }
  if (type == ir::spv::OpTypeInt) {
    // Operand 0 is the bit width, operand 1 the signedness flag.
    auto width = std::to_string(*type.getOperand(0).getAsInt32());
    return (type.getOperand(1) != 0 ? "s" : "u") + width;
  }
  if (type == ir::spv::OpTypeFloat) {
    return "f" + std::to_string(*type.getOperand(0).getAsInt32());
  }
  if (type == ir::spv::OpTypeVector) {
    auto element = getTypeName(type.getOperand(0).getAsValue());
    return element + 'x' + std::to_string(*type.getOperand(1).getAsInt32());
  }
  if (type == ir::spv::OpTypeArray) {
    auto element = getTypeName(type.getOperand(0).getAsValue());
    // Print the element count only when it is a literal OpConstant.
    if (auto count = type.getOperand(1).getAsValue();
        count == ir::spv::OpConstant) {
      if (auto value = count.getOperand(1).getAsInt32()) {
        return element + '[' + std::to_string(*value) + ']';
      }
    }
    return element + "[N]";
  }
  if (type == ir::spv::OpTypeRuntimeArray) {
    return getTypeName(type.getOperand(0).getAsValue()) + "[]";
  }
  if (type == ir::spv::OpTypeStruct) {
    std::string result = "struct{";
    bool needSeparator = false;
    for (auto &member : type.getOperands()) {
      if (needSeparator) {
        result += ", ";
      }
      needSeparator = true;
      result += getTypeName(member.getAsValue());
    }
    result += "}";
    return result;
  }
  if (type == ir::spv::OpTypePointer) {
    // Operand 0 is the storage class, operand 1 the pointee type.
    return getTypeName(type.getOperand(1).getAsValue()) + "*";
  }
  return {};
}
// Sets up the per-function scratch regions: a region that will collect
// function-local OpVariable declarations and a labelled epilogue region.
spv::Context::Context() {
  localVariables = create<ir::Region>(getUnknownLocation());
  auto epilogueLabel = createRegionWithLabel(getUnknownLocation());
  epilogue = epilogueLabel.getParent();
}
// Imports `node` from a foreign SPIR-V module into this context's module.
//
// Layout-managed instructions are redirected into their proper sections:
// OpExtInstImport into the ext-inst-import section, types and constants into
// the globals section (deduplicated where the dialect allows it), non-local
// OpVariables into the globals section, and OpFunction together with its body
// into the functions section. Decorations and debug names attached to a
// cloned value are cloned alongside it. Anything else falls back to plain
// CloneMap cloning.
ir::Node spv::Import::getOrCloneImpl(ir::Context &context, ir::Node node,
                                     bool isOperand) {
  auto inst = node.cast<ir::Instruction>();
  if (inst == nullptr) {
    return CloneMap::getOrCloneImpl(context, node, isOperand);
  }

  auto &spvContext = static_cast<spv::Context &>(context);

  // Register the replacement so future lookups of `node` resolve to it.
  auto redefine = [&](ir::Node newNode) {
    setOverride(node, newNode);
    return newNode;
  };

  // Clones OpDecorate*/OpName* users of `inst` (defaults to `node`) into the
  // annotations/debugs sections and registers the demangled OpName string as
  // the value's name.
  auto cloneDecorationsAndDebugs = [&](ir::Node inst = nullptr) {
    if (inst == nullptr) {
      inst = node;
    }
    auto annotations = spvContext.layout.getOrCreateAnnotations(context);
    auto debugs = spvContext.layout.getOrCreateDebugs(context);
    auto value = inst.cast<ir::Value>();
    if (value == nullptr) {
      return;
    }
    for (auto &use : value.getUseList()) {
      if (use.user == ir::spv::OpDecorate ||
          use.user == ir::spv::OpMemberDecorate ||
          use.user == ir::spv::OpDecorationGroup ||
          use.user == ir::spv::OpGroupDecorate ||
          use.user == ir::spv::OpGroupMemberDecorate ||
          use.user == ir::spv::OpDecorateId) {
        annotations.addChild(ir::clone(use.user, context, *this));
      }
      if (use.user == ir::spv::OpName || use.user == ir::spv::OpMemberName) {
        auto cloned = ir::clone(use.user, context, *this);
        debugs.addChild(cloned);
        if (use.user == ir::spv::OpName) {
          // Strip a trailing "(...)" signature from mangled names.
          auto demangled =
              std::string_view(*cloned.getOperand(1).getAsString());
          if (auto pos = demangled.find('('); pos != std::string::npos) {
            demangled = demangled.substr(0, pos);
          }
          spvContext.setName(cloned.getOperand(0).getAsValue(),
                             std::string(demangled));
        }
      }
    }
  };

  auto hasDecoration = [&] {
    for (auto use : node.staticCast<ir::Value>().getUseList()) {
      if (use.user == ir::spv::OpDecorate ||
          use.user == ir::spv::OpMemberDecorate) {
        return true;
      }
    }
    return false;
  };

  if (inst.getKind() == ir::Kind::Spv) {
    if (inst.getOp() == ir::spv::OpExtInstImport) {
      auto extensions = spvContext.layout.getOrCreateExtInstImports(context);
      auto result = CloneMap::getOrCloneImpl(context, node, isOperand);
      extensions.addChild(result.staticCast<ir::Value>());
      return redefine(result);
    }

    if (ir::spv::isTypeOp(inst.getOp())) {
      std::vector<ir::Operand> operands;
      for (auto &op : inst.getOperands()) {
        operands.push_back(op.clone(context, *this));
      }

      auto typeOp = static_cast<ir::spv::Op>(inst.getOp());
      // Decorated arrays, runtime arrays, structs and non-function-storage
      // pointers must stay distinct; only the remaining kinds deduplicate.
      if ((inst != ir::spv::OpTypeArray || !hasDecoration()) &&
          inst != ir::spv::OpTypeRuntimeArray &&
          inst != ir::spv::OpTypeStruct) {
        if (inst != ir::spv::OpTypePointer ||
            inst.getOperand(0) == ir::spv::StorageClass::Function) {
          if (auto result = spvContext.findGlobal(typeOp, operands)) {
            return redefine(result);
          }
        }
      }

      auto result = spvContext.createGlobal(
          static_cast<ir::spv::Op>(inst.getOp()), operands);
      redefine(result);
      cloneDecorationsAndDebugs();
      return result;
    }
  }

  if (inst == ir::spv::OpConstant || inst == ir::spv::OpConstantComposite ||
      inst == ir::spv::OpConstantTrue || inst == ir::spv::OpConstantFalse ||
      inst == ir::spv::OpConstantNull || inst == ir::spv::OpConstantSampler ||
      inst == ir::spv::OpSpecConstantTrue ||
      inst == ir::spv::OpSpecConstantFalse || inst == ir::spv::OpSpecConstant ||
      inst == ir::spv::OpSpecConstantComposite) {
    std::vector<ir::Operand> operands;
    for (auto &op : inst.getOperands()) {
      operands.push_back(op.clone(context, *this));
    }

    auto result = spvContext.getOrCreateGlobal(
        static_cast<ir::spv::Op>(inst.getOp()), operands);
    return redefine(result);
  }

  if (isOperand && inst == ir::spv::OpVariable) {
    // Function-local variables are cloned in place; everything else becomes
    // a module-level global. (A redundant nested OpVariable re-check was
    // removed here.)
    auto storage = inst.getOperand(1).getAsInt32();
    if (*storage == int(ir::spv::StorageClass::Function)) {
      return CloneMap::getOrCloneImpl(context, node, isOperand);
    }

    auto globals = spvContext.layout.getOrCreateGlobals(context);
    auto result = CloneMap::getOrCloneImpl(context, node, isOperand);
    globals.addChild(result.staticCast<ir::Instruction>());
    cloneDecorationsAndDebugs();
    return result;
  }

  // Note: a dedicated OpConstant branch used to live here, but it was
  // unreachable — OpConstant is fully handled by the constant branch above,
  // which always returns.

  if (inst == ir::spv::OpFunction) {
    auto functions = spvContext.layout.getOrCreateFunctions(context);
    auto result = CloneMap::getOrCloneImpl(context, node, isOperand)
                      .staticCast<ir::Value>();
    functions.insertAfter(nullptr, result);
    redefine(result);
    cloneDecorationsAndDebugs();

    // Clone the function body up to and including OpFunctionEnd.
    ir::Instruction insertPoint = result;
    for (auto child : ir::range(inst.getNext())) {
      auto cloned = ir::clone(child, context, *this);
      functions.insertAfter(insertPoint, cloned);
      insertPoint = cloned;
      cloneDecorationsAndDebugs(child);

      if (child == ir::spv::OpFunctionEnd) {
        break;
      }
    }

    return result;
  }

  return CloneMap::getOrCloneImpl(context, node, isOperand);
}
// Creates a fresh region and appends an OpLabel to it; returns that label.
ir::Value spv::Context::createRegionWithLabel(ir::Location loc) {
  auto region = create<ir::Region>(loc);
  auto builder = Builder::createAppend(*this, region);
  return builder.createSpvLabel(loc);
}
// Records `name` for `inst` both in the local namespace and as an OpName
// entry in the module's debug section.
void spv::Context::setName(ir::spv::IdRef inst, std::string name) {
  ns.setNameOf(inst, name);
  Builder::createAppend(*this, layout.getOrCreateDebugs(*this))
      .createSpvName(getUnknownLocation(), inst, std::move(name));
}
// Assigns a derived debug name to `constant` when one can be produced.
void spv::Context::setConstantName(ir::Value constant) {
  if (auto name = getConstantName(constant); !name.empty()) {
    ns.setNameOf(constant, std::move(name));
  }
}
// Returns a deduplicated constant of `typeValue` with the given literal.
ir::Value spv::Context::getOrCreateConstant(ir::Value typeValue,
                                            const ir::Operand &value) {
  if (typeValue != getTypeBool()) {
    return getOrCreateGlobal(ir::spv::OpConstant, {{typeValue, value}});
  }
  // Booleans use the dedicated OpConstantTrue/OpConstantFalse singletons.
  return *value.getAsBool() ? getTrue() : getFalse();
}
// Returns the cached scalar type for the given base opcode; aborts for
// opcodes that are not scalar types.
ir::Value spv::Context::getType(ir::spv::Op baseType, int width,
                                bool isSigned) {
  if (baseType == ir::spv::OpTypeInt) {
    return getTypeInt(width, isSigned);
  }
  if (baseType == ir::spv::OpTypeFloat) {
    return getTypeFloat(width);
  }
  if (baseType == ir::spv::OpTypeBool) {
    return getTypeBool();
  }
  if (baseType == ir::spv::OpTypeVoid) {
    return getTypeVoid();
  }
  std::abort();
}
// Materializes the type described by `info`: scalars directly, vectors and
// arrays from their scalar component type; aborts on unsupported bases.
ir::Value spv::Context::getType(const TypeInfo &info) {
  if (info.baseType == ir::spv::OpTypeVector) {
    auto scalar =
        getType(info.componentType, info.componentWidth, info.isSigned);
    return getTypeVector(scalar, info.componentsCount);
  }
  if (info.baseType == ir::spv::OpTypeArray) {
    auto element =
        getType(info.componentType, info.componentWidth, info.isSigned);
    return getTypeArray(element, imm32(info.componentsCount));
  }
  switch (info.baseType) {
  case ir::spv::OpTypeInt:
  case ir::spv::OpTypeFloat:
  case ir::spv::OpTypeBool:
  case ir::spv::OpTypeVoid:
    return getType(info.baseType, info.componentWidth, info.isSigned);
  default:
    std::abort();
  }
}
// Assigns a derived debug name to `type` when one can be produced.
void spv::Context::setTypeName(ir::Value type) {
  if (auto name = getTypeName(type); !name.empty()) {
    ns.setNameOf(type, std::move(name));
  }
}
// Looks up an already-created global (type/constant) with the given opcode
// and exact operand list; returns nullptr when absent.
ir::Value
spv::Context::findGlobal(ir::spv::Op op,
                         std::span<const ir::Operand> operands) const {
  auto it = globals.find(ir::getInstructionId(ir::Kind::Spv, op));
  if (it == globals.end()) {
    return nullptr;
  }
  for (auto candidate : it->second) {
    if (candidate.getOperandCount() != operands.size()) {
      continue;
    }
    std::size_t index = 0;
    bool allEqual = true;
    for (auto &operand : candidate.getOperands()) {
      if (operands[index++] != operand) {
        allEqual = false;
        break;
      }
    }
    if (allEqual) {
      return candidate;
    }
  }
  return nullptr;
}
// Appends a new global (type or constant) instruction to the globals section,
// registers it in the lookup cache and assigns a debug name.
ir::Value
spv::Context::createGlobal(ir::spv::Op op,
                           std::span<const ir::Operand> operands) {
  auto builder = Builder::createAppend(*this, layout.getOrCreateGlobals(*this));
  auto result =
      builder.createValue(getUnknownLocation(), ir::Kind::Spv, op, operands);
  // Use the same explicit (Kind::Spv, op) key form as findGlobal so lookups
  // are guaranteed to see entries created here.
  globals[ir::getInstructionId(ir::Kind::Spv, op)].push_back(result);
  if (ir::spv::isTypeOp(op)) {
    setTypeName(result);
  } else {
    setConstantName(result);
  }
  return result;
}
// Returns an existing matching global or creates a fresh one.
ir::Value spv::Context::getOrCreateGlobal(
    ir::spv::Op op, std::span<const ir::Operand> operands) {
  auto existing = findGlobal(op, operands);
  return existing ? existing : createGlobal(op, operands);
}
// Resolves an operand to an ir::Value: pass-through for value operands,
// otherwise a constant is created. An explicit `type` overrides the default
// type inferred from the literal kind; aborts on unsupported operand kinds.
ir::Value spv::Context::getOperandValue(const ir::Operand &op,
                                        ir::Value type) {
  if (auto direct = op.getAsValue()) {
    return direct;
  }
  auto makeConstant = [&](auto literal, ir::Value defaultType) {
    return getOrCreateConstant(type ? type : defaultType, literal);
  };
  if (auto i32 = op.getAsInt32()) {
    return makeConstant(*i32, getTypeSInt32());
  }
  if (auto i64 = op.getAsInt64()) {
    return makeConstant(*i64, getTypeSInt64());
  }
  if (auto f32 = op.getAsFloat()) {
    return makeConstant(*f32, getTypeFloat32());
  }
  if (auto f64 = op.getAsDouble()) {
    return makeConstant(*f64, getTypeFloat64());
  }
  if (auto b = op.getAsBool()) {
    return makeConstant(*b, getTypeBool());
  }
  std::abort();
}
void spv::Context::createPerVertex() {
if (perVertex != nullptr) {
return;
}
auto loc = rootLocation;
auto float32 = getTypeFloat32();
auto arr1Float = getTypeArray(float32, getIndex(1));
auto float32x4 = getTypeVector(float32, 4);
auto gl_PerVertexStructT =
getTypeStruct(float32x4, float32, arr1Float, arr1Float);
auto gl_PerVertexPtrT =
getTypePointer(ir::spv::StorageClass::Output, gl_PerVertexStructT);
auto annotations =
Builder::createAppend(*this, layout.getOrCreateAnnotations(*this));
annotations.createSpvDecorate(loc, gl_PerVertexStructT,
ir::spv::Decoration::Block());
annotations.createSpvMemberDecorate(
loc, gl_PerVertexStructT, 0,
ir::spv::Decoration::BuiltIn(ir::spv::BuiltIn::Position));
annotations.createSpvMemberDecorate(
loc, gl_PerVertexStructT, 1,
ir::spv::Decoration::BuiltIn(ir::spv::BuiltIn::PointSize));
annotations.createSpvMemberDecorate(
loc, gl_PerVertexStructT, 2,
ir::spv::Decoration::BuiltIn(ir::spv::BuiltIn::ClipDistance));
annotations.createSpvMemberDecorate(
loc, gl_PerVertexStructT, 3,
ir::spv::Decoration::BuiltIn(ir::spv::BuiltIn::CullDistance));
auto globals = Builder::createAppend(*this, layout.getOrCreateGlobals(*this));
perVertex = globals.createSpvVariable(loc, gl_PerVertexPtrT,
ir::spv::StorageClass::Output);
}
// Creates a StorageBuffer-class variable of `structType` bound to the given
// descriptor set/binding and returns it.
ir::Value spv::Context::createUniformBuffer(int descriptorSet, int binding,
                                            ir::Value structType) {
  auto loc = getUnknownLocation();
  auto globals = Builder::createAppend(*this, layout.getOrCreateGlobals(*this));
  auto annotations =
      Builder::createAppend(*this, layout.getOrCreateAnnotations(*this));

  auto storageClass = ir::spv::StorageClass::StorageBuffer;
  auto pointerType = globals.createSpvTypePointer(loc, storageClass, structType);
  auto variable = globals.createSpvVariable(loc, pointerType, storageClass);

  annotations.createSpvDecorate(
      loc, variable, ir::spv::Decoration::DescriptorSet(descriptorSet));
  annotations.createSpvDecorate(loc, variable,
                                ir::spv::Decoration::Binding(binding));
  annotations.createSpvDecorate(loc, variable,
                                ir::spv::Decoration::Uniform());
  return variable;
}
// Creates a buffer variable whose content is a single runtime array of
// `elementType` wrapped in a Block-decorated struct, bound to the given
// descriptor set/binding.
ir::Value spv::Context::createRuntimeArrayUniformBuffer(
    int descriptorSet, int binding, ir::Value elementType) {
  auto loc = getUnknownLocation();
  auto globals = Builder::createAppend(*this, layout.getOrCreateGlobals(*this));
  auto annotations =
      Builder::createAppend(*this, layout.getOrCreateAnnotations(*this));

  auto runtimeArray = globals.createSpvTypeRuntimeArray(loc, elementType);
  // ArrayStride is expressed in bytes; width() reports bits.
  auto strideBytes = shader::spv::getTypeInfo(elementType).width() / 8;
  annotations.createSpvDecorate(
      loc, runtimeArray, ir::spv::Decoration::ArrayStride(strideBytes));

  auto wrapperStruct = globals.createSpvTypeStruct(loc, {{runtimeArray}});
  annotations.createSpvDecorate(loc, wrapperStruct,
                                ir::spv::Decoration::Block());
  annotations.createSpvMemberDecorate(loc, wrapperStruct, 0,
                                      ir::spv::Decoration::Offset(0));
  return createUniformBuffer(descriptorSet, binding, wrapperStruct);
}
// Returns the lazily-created vec4 Output variable decorated with
// Location `index`; subsequent calls return the cached variable.
ir::Value spv::Context::createOutput(ir::Location loc, int index) {
  auto &result = outputs[index];
  if (result == nullptr) {
    auto floatType = getTypeFloat32();
    auto float32x4Type = getTypeVector(floatType, 4);
    auto variableType =
        getTypePointer(ir::spv::StorageClass::Output, float32x4Type);
    auto globals =
        Builder::createAppend(*this, layout.getOrCreateGlobals(*this));
    auto annotations =
        Builder::createAppend(*this, layout.getOrCreateAnnotations(*this));
    auto variable = globals.createSpvVariable(loc, variableType,
                                              ir::spv::StorageClass::Output);
    annotations.createSpvDecorate(loc, variable,
                                  ir::spv::Decoration::Location(index));
    // setName emits the OpName into the debugs section, so the previously
    // unused local debugs builder was removed.
    setName(variable, "output" + std::to_string(index));
    result = variable;
  }
  return result;
}
// Returns the lazily-created vec4 Input variable decorated with
// Location `index`; subsequent calls return the cached variable.
ir::Value spv::Context::createInput(ir::Location loc, int index) {
  auto &result = inputs[index];
  if (result == nullptr) {
    auto floatType = getTypeFloat32();
    auto float32x4Type = getTypeVector(floatType, 4);
    auto variableType =
        getTypePointer(ir::spv::StorageClass::Input, float32x4Type);
    auto globals =
        Builder::createAppend(*this, layout.getOrCreateGlobals(*this));
    auto annotations =
        Builder::createAppend(*this, layout.getOrCreateAnnotations(*this));
    auto variable = globals.createSpvVariable(loc, variableType,
                                              ir::spv::StorageClass::Input);
    annotations.createSpvDecorate(loc, variable,
                                  ir::spv::Decoration::Location(index));
    // setName emits the OpName into the debugs section, so the previously
    // unused local debugs builder was removed.
    setName(variable, "input" + std::to_string(index));
    result = variable;
  }
  return result;
}
// Returns the lazily-created vec4 Input variable for interpolation attribute
// `attrId`, decorated with Location(attrId) and, depending on the flags,
// PerVertexKHR (array of 3 vec4, one per provoking vertex) or Flat.
//
// NOTE(review): this caches into the same `inputs` map as createInput, so an
// index used by both will collide — confirm callers keep the spaces disjoint.
// NOTE(review): the `perVertex` parameter shadows the `perVertex` member set
// by createPerVertex — confirm the shadowing is intentional.
ir::Value spv::Context::createAttr(ir::Location loc, int attrId,
                                   bool perVertex, bool flat) {
  auto &result = inputs[attrId];
  if (result == nullptr) {
    auto floatType = getTypeFloat32();
    auto float32x4Type = getTypeVector(floatType, 4);
    auto attrArrayType = getTypeArray(float32x4Type, imm32(3));
    auto variableType =
        getTypePointer(ir::spv::StorageClass::Input,
                       perVertex ? attrArrayType : float32x4Type);
    auto globals =
        Builder::createAppend(*this, layout.getOrCreateGlobals(*this));
    auto annotations =
        Builder::createAppend(*this, layout.getOrCreateAnnotations(*this));
    auto variable = globals.createSpvVariable(loc, variableType,
                                              ir::spv::StorageClass::Input);
    annotations.createSpvDecorate(loc, variable,
                                  ir::spv::Decoration::Location(attrId));
    if (perVertex) {
      annotations.createSpvDecorate(loc, variable,
                                    ir::spv::Decoration::PerVertexKHR());
    } else if (flat) {
      annotations.createSpvDecorate(loc, variable, ir::spv::Decoration::Flat());
    }
    // setName emits the OpName into the debugs section, so the previously
    // unused local debugs builder was removed.
    setName(variable, "attr" + std::to_string(attrId));
    result = variable;
  }
  return result;
}

View file

@ -0,0 +1,71 @@
#include "SpvTypeInfo.hpp"
#include "dialect.hpp"
using namespace shader;
// Computes a flattened TypeInfo descriptor (base opcode, component opcode,
// component bit width, component count, signedness) for a SPIR-V type value.
// Unrecognized type opcodes yield a descriptor with zero width and count.
shader::spv::TypeInfo shader::spv::getTypeInfo(ir::Value type) {
  if (type == ir::spv::OpTypeBool) {
    // Booleans are reported as 1-bit scalars.
    return {
        .baseType = ir::spv::OpTypeBool,
        .componentWidth = 1,
        .componentsCount = 1,
    };
  }
  if (type == ir::spv::OpTypeInt) {
    // Operand 0 is the bit width, operand 1 the signedness flag.
    return {
        .baseType = ir::spv::OpTypeInt,
        .componentWidth = *type.getOperand(0).getAsInt32(),
        .componentsCount = 1,
        .isSigned = *type.getOperand(1).getAsInt32() ? true : false,
    };
  }
  if (type == ir::spv::OpTypeFloat) {
    return {
        .baseType = ir::spv::OpTypeFloat,
        .componentWidth = *type.getOperand(0).getAsInt32(),
        .componentsCount = 1,
    };
  }
  if (type == ir::spv::OpTypeVector) {
    // Components inherit the scalar element's base type and total width.
    auto componentInfo = getTypeInfo(type.getOperand(0).getAsValue());
    return {
        .baseType = ir::spv::OpTypeVector,
        .componentType = componentInfo.baseType,
        .componentWidth = componentInfo.width(),
        .componentsCount = *type.getOperand(1).getAsInt32(),
    };
  }
  if (type == ir::spv::OpTypeArray) {
    auto elementInfo = getTypeInfo(type.getOperand(0).getAsValue());
    // NOTE(review): assumes the length operand is an OpConstant whose
    // operand 1 holds an int32 literal — a spec-constant length would
    // dereference null here; confirm callers guarantee this.
    auto countOfElements = type.getOperand(1).getAsValue();
    return {
        .baseType = ir::spv::OpTypeArray,
        .componentType = elementInfo.baseType,
        .componentWidth = elementInfo.width(),
        .componentsCount = *countOfElements.getOperand(1).getAsInt32(),
    };
  }
  if (type == ir::spv::OpTypeRuntimeArray) {
    // Runtime arrays have no static length; report a single element.
    auto elementInfo = getTypeInfo(type.getOperand(0).getAsValue());
    return {
        .baseType = ir::spv::OpTypeRuntimeArray,
        .componentType = elementInfo.baseType,
        .componentWidth = elementInfo.width(),
        .componentsCount = 1,
    };
  }
  // Fallback for unhandled type opcodes: keep the opcode, no size info.
  return {
      .baseType = static_cast<ir::spv::Op>(type.getOp()),
      .componentWidth = 0,
      .componentsCount = 0,
  };
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,688 @@
#include "eval.hpp"
#include "dialect.hpp"
#include "ir.hpp"
#include <cmath>
#include <concepts>
using namespace shader;
// Evaluates the optional filter `Cond` against a set of (decayed) argument
// types; a void Cond accepts everything. Cond is a stateless lambda type
// invoked with default-constructed prototypes of the argument types.
template <typename Cond, typename... Args> consteval bool testVisitCond() {
  if constexpr (std::is_same_v<Cond, void>) {
    return true;
  } else {
    return Cond{}(std::remove_cvref_t<Args>{}...);
  }
};
// Evaluates `Cond` against the U-th alternative of eval::Value's storage
// variant; indices past the end are rejected, a void Cond accepts any
// in-range alternative.
template <typename Cond, std::size_t U> consteval bool testVisitCond() {
  if constexpr (U >= eval::Value::StorageSize) {
    return false;
  } else if constexpr (std::is_same_v<Cond, void>) {
    return true;
  } else {
    return Cond{}(std::variant_alternative_t<U, eval::Value::Storage>{});
  }
};
// Invokes `fn` with the active alternative of `variant`'s storage, but only
// when that alternative satisfies `Cond`; returns an empty Value otherwise.
// Each instantiation dispatches a window of 64 alternatives and recurses for
// any remainder. Fix: the recursive tail call previously passed its
// arguments in (fn, variant) order, swapped relative to the (variant, fn)
// signature — latent until StorageSize exceeds 64.
template <typename Cond = void, size_t I = 0>
constexpr eval::Value visitImpl(const eval::Value &variant, auto &&fn) {
#define DEFINE_CASE(N) \
  case I + N: \
    if constexpr (testVisitCond<Cond, I + N>()) { \
      return std::forward<decltype(fn)>(fn)(std::get<I + N>(variant.storage)); \
    } else { \
      return {}; \
    }
  switch (variant.storage.index()) {
    DEFINE_CASE(0);
    DEFINE_CASE(1);
    DEFINE_CASE(2);
    DEFINE_CASE(3);
    DEFINE_CASE(4);
    DEFINE_CASE(5);
    DEFINE_CASE(6);
    DEFINE_CASE(7);
    DEFINE_CASE(8);
    DEFINE_CASE(9);
    DEFINE_CASE(10);
    DEFINE_CASE(11);
    DEFINE_CASE(12);
    DEFINE_CASE(13);
    DEFINE_CASE(14);
    DEFINE_CASE(15);
    DEFINE_CASE(16);
    DEFINE_CASE(17);
    DEFINE_CASE(18);
    DEFINE_CASE(19);
    DEFINE_CASE(20);
    DEFINE_CASE(21);
    DEFINE_CASE(22);
    DEFINE_CASE(23);
    DEFINE_CASE(24);
    DEFINE_CASE(25);
    DEFINE_CASE(26);
    DEFINE_CASE(27);
    DEFINE_CASE(28);
    DEFINE_CASE(29);
    DEFINE_CASE(30);
    DEFINE_CASE(31);
    DEFINE_CASE(32);
    DEFINE_CASE(33);
    DEFINE_CASE(34);
    DEFINE_CASE(35);
    DEFINE_CASE(36);
    DEFINE_CASE(37);
    DEFINE_CASE(38);
    DEFINE_CASE(39);
    DEFINE_CASE(40);
    DEFINE_CASE(41);
    DEFINE_CASE(42);
    DEFINE_CASE(43);
    DEFINE_CASE(44);
    DEFINE_CASE(45);
    DEFINE_CASE(46);
    DEFINE_CASE(47);
    DEFINE_CASE(48);
    DEFINE_CASE(49);
    DEFINE_CASE(50);
    DEFINE_CASE(51);
    DEFINE_CASE(52);
    DEFINE_CASE(53);
    DEFINE_CASE(54);
    DEFINE_CASE(55);
    DEFINE_CASE(56);
    DEFINE_CASE(57);
    DEFINE_CASE(58);
    DEFINE_CASE(59);
    DEFINE_CASE(60);
    DEFINE_CASE(61);
    DEFINE_CASE(62);
    DEFINE_CASE(63);
  }
#undef DEFINE_CASE

  constexpr auto NextIndex = I + 64;
  if constexpr (NextIndex < eval::Value::StorageSize) {
    // Arguments must stay in (variant, fn) order to match the signature.
    return visitImpl<Cond, NextIndex>(std::forward<decltype(variant)>(variant),
                                      std::forward<decltype(fn)>(fn));
  }
  return {};
}
// Invokes `cb` with a default-constructed prototype of the C++ scalar type
// corresponding to a SPIR-V scalar type (bool, s/u int 8-64, float 16-64),
// subject to `Cond`; returns an empty Value for unsupported types/widths.
template <typename Cond = void, typename Cb>
constexpr eval::Value visitScalarType(ir::Value type, Cb &&cb)
  requires requires {
    { std::forward<Cb>(cb)(int{}) } -> std::same_as<eval::Value>;
  }
{
  auto dispatch = [&](auto prototype) -> eval::Value {
    if constexpr (testVisitCond<Cond,
                                std::remove_cvref_t<decltype(prototype)>>()) {
      return std::forward<Cb>(cb)(prototype);
    }
    return {};
  };

  if (type == ir::spv::OpTypeBool) {
    return dispatch(bool{});
  }

  if (type == ir::spv::OpTypeInt) {
    bool isSigned = *type.getOperand(1).getAsInt32() != 0;
    switch (*type.getOperand(0).getAsInt32()) {
    case 8:
      return isSigned ? dispatch(std::int8_t{}) : dispatch(std::uint8_t{});
    case 16:
      return isSigned ? dispatch(std::int16_t{}) : dispatch(std::uint16_t{});
    case 32:
      return isSigned ? dispatch(std::int32_t{}) : dispatch(std::uint32_t{});
    case 64:
      return isSigned ? dispatch(std::int64_t{}) : dispatch(std::uint64_t{});
    default:
      return {};
    }
  }

  if (type == ir::spv::OpTypeFloat) {
    switch (*type.getOperand(0).getAsInt32()) {
    case 16:
      return dispatch(shader::float16_t{});
    case 32:
      return dispatch(shader::float32_t{});
    case 64:
      return dispatch(shader::float64_t{});
    default:
      return {};
    }
  }

  return {};
}
// Like visitScalarType, but additionally handles 2/3/4-component vector
// types by invoking `cb` with a shader::Vector prototype; other composite
// types yield an empty Value.
template <typename Cond = void, typename Cb>
constexpr eval::Value visitType(ir::Value type, Cb &&cb)
  requires requires {
    { std::forward<Cb>(cb)(int{}) } -> std::same_as<eval::Value>;
  }
{
  // Scalars are handled by the dedicated scalar visitor.
  if (type == ir::spv::OpTypeBool || type == ir::spv::OpTypeInt ||
      type == ir::spv::OpTypeFloat) {
    return visitScalarType<Cond>(type, cb);
  }

  auto dispatch = [&](auto prototype) -> eval::Value {
    if constexpr (testVisitCond<Cond,
                                std::remove_cvref_t<decltype(prototype)>>()) {
      return std::forward<Cb>(cb)(prototype);
    } else {
      return {};
    }
  };

  if (type != ir::spv::OpTypeVector) {
    return {};
  }

  auto scalarType = type.getOperand(0).getAsValue();
  switch (*type.getOperand(1).getAsInt32()) {
  case 2:
    return visitScalarType(scalarType, [&]<typename T>(T) {
      return dispatch(shader::Vector<T, 2>{});
    });
  case 3:
    return visitScalarType(scalarType, [&]<typename T>(T) {
      return dispatch(shader::Vector<T, 3>{});
    });
  case 4:
    return visitScalarType(scalarType, [&]<typename T>(T) {
      return dispatch(shader::Vector<T, 4>{});
    });
  default:
    return {};
  }
}
// Visits the active alternative of `value`, filtering out the empty
// (nullptr_t) state and any alternative rejected by `Cond`.
template <typename Cond = void, typename Cb>
eval::Value visit(const eval::Value &value, Cb &&cb) {
  // Wrap the user condition so the empty state never matches.
  using VisitCond = decltype([](auto &&storage) {
    using T = std::remove_cvref_t<decltype(storage)>;
    if constexpr (std::is_same_v<T, std::nullptr_t>) {
      return false;
    } else {
      return testVisitCond<Cond, T>();
    }
  });
  return visitImpl<VisitCond>(value, std::forward<Cb>(cb));
}
// Visits `value` only when the compile-time predicate `cond` holds.
// NOTE(review): `cond` must be constexpr-invocable with no arguments, and no
// caller is visible in this chunk — confirm this overload is still needed.
template <typename Cb>
eval::Value visit2(auto &&cond, const eval::Value &value, Cb &&cb) {
  if constexpr (cond()) {
    return visitImpl(value, std::forward<Cb>(cb));
  } else {
    return {};
  }
}
// Visits `value` and `type` together: first the stored value (filtered by
// ValueCond, never the empty state), then the SPIR-V type (filtered by
// TypeVisitCond), finally invoking `cb(type, value)` when the (type, value)
// pair passes TypeValueVisitCond.
template <typename ValueCond = void, typename TypeVisitCond = void,
          typename TypeValueVisitCond = void, typename Cb>
eval::Value visitWithType(const eval::Value &value, ir::Value type, Cb &&cb) {
  // Reject the empty (nullptr_t) state before applying ValueCond.
  using ValueVisitCond = decltype([](auto storage) {
    if constexpr (std::is_same_v<decltype(storage), std::nullptr_t>) {
      return false;
    } else {
      return testVisitCond<ValueCond, decltype(storage)>();
    }
  });
  return visitImpl<ValueVisitCond>(value, [&](auto &&value) -> eval::Value {
    return visitType<TypeVisitCond>(type, [&](auto type) -> eval::Value {
      if constexpr (testVisitCond<TypeValueVisitCond, decltype(type),
                                  decltype(value)>()) {
        return std::forward<Cb>(cb)(type, value);
      } else {
        return {};
      }
    });
  });
}
namespace {
// Maps a scalar, Vector, or std::array type to its element type.
template <typename T> struct ComponentTypeImpl {
  using type = T;
};
template <typename T, std::size_t N> struct ComponentTypeImpl<Vector<T, N>> {
  using type = T;
};
template <typename T, std::size_t N>
struct ComponentTypeImpl<std::array<T, N>> {
  using type = T;
};
// Signed counterpart of an integral type, distributing over Vector.
template <typename T> struct MakeSignedImpl {
  using type = std::make_signed_t<T>;
};
template <typename T, std::size_t N> struct MakeSignedImpl<Vector<T, N>> {
  using type = Vector<std::make_signed_t<T>, N>;
};
// Unsigned counterpart of an integral type, distributing over Vector.
template <typename T> struct MakeUnsignedImpl {
  using type = std::make_unsigned_t<T>;
};
template <typename T, std::size_t N> struct MakeUnsignedImpl<Vector<T, N>> {
  using type = Vector<std::make_unsigned_t<T>, N>;
};
} // namespace
// Convenience aliases over the trait structs above.
template <typename T> using ComponentType = typename ComponentTypeImpl<T>::type;
template <typename T> using MakeSigned = typename MakeSignedImpl<T>::type;
template <typename T> using MakeUnsigned = typename MakeUnsignedImpl<T>::type;
// Component count: 1 for scalars, N for Vector<T, N> / std::array<T, N>.
template <typename> constexpr std::size_t Components = 1;
template <typename T, std::size_t N>
constexpr std::size_t Components<Vector<T, N>> = N;
template <typename T, std::size_t N>
constexpr std::size_t Components<std::array<T, N>> = N;
// True only for std::array; used to exclude arrays from scalar/vector ops.
template <typename> constexpr bool IsArray = false;
template <typename T, std::size_t N>
constexpr bool IsArray<std::array<T, N>> = true;
// Builds a composite value of `type` from per-component constituents; fails
// (empty Value) when the count mismatches or a constituent has the wrong
// component type.
eval::Value
eval::Value::compositeConstruct(ir::Value type,
                                std::span<const eval::Value> constituents) {
  // Only multi-component types are eligible targets.
  using Cond =
      decltype([](auto type) { return Components<decltype(type)> > 1; });
  return visitType<Cond>(type, [&](auto prototype) -> Value {
    using ResultType = decltype(prototype);
    constexpr std::size_t N = Components<ResultType>;
    if (constituents.size() != N) {
      return {};
    }
    ResultType composite;
    for (std::size_t slot = 0; slot < N; ++slot) {
      auto element = constituents[slot].as<ComponentType<ResultType>>();
      if (!element) {
        return {};
      }
      composite[slot] = *element;
    }
    return composite;
  });
}
// Extracts the component at `index` from a multi-component value; fails on
// non-composites, non-integral indices, or out-of-range indices.
eval::Value eval::Value::compositeExtract(const Value &index) const {
  using Cond =
      decltype([](auto type) { return Components<decltype(type)> > 1; });
  auto extractIndex = index.zExtScalar();
  if (!extractIndex) {
    return {};
  }
  return visit<Cond>(*this, [&](auto &&composite) -> Value {
    constexpr std::size_t N =
        Components<std::remove_cvref_t<decltype(composite)>>;
    if (*extractIndex >= N) {
      return {};
    }
    return composite[*extractIndex];
  });
}
// Component-wise NaN test for non-array floating-point values; scalar in,
// bool out; vector in, bool-vector out.
eval::Value eval::Value::isNan() const {
  using Cond = decltype([](auto type) {
    return std::is_floating_point_v<ComponentType<decltype(type)>> &&
           !IsArray<decltype(type)>;
  });
  return visit<Cond>(*this, [](auto &&value) -> Value {
    using ValueType = std::remove_cvref_t<decltype(value)>;
    constexpr std::size_t N = Components<ValueType>;
    if constexpr (N == 1) {
      return std::isnan(value);
    } else {
      Vector<bool, N> flags;
      for (std::size_t lane = 0; lane < N; ++lane) {
        flags[lane] = std::isnan(value[lane]);
      }
      return flags;
    }
  });
}
// Component-wise infinity test for non-array floating-point values.
eval::Value eval::Value::isInf() const {
  using Cond = decltype([](auto type) {
    return std::is_floating_point_v<ComponentType<decltype(type)>> &&
           !IsArray<decltype(type)>;
  });
  return visit<Cond>(*this, [](auto &&value) -> Value {
    using ValueType = std::remove_cvref_t<decltype(value)>;
    constexpr std::size_t N = Components<ValueType>;
    if constexpr (N == 1) {
      return std::isinf(value);
    } else {
      Vector<bool, N> flags;
      for (std::size_t lane = 0; lane < N; ++lane) {
        flags[lane] = std::isinf(value[lane]);
      }
      return flags;
    }
  });
}
// Component-wise finiteness test for non-array floating-point values.
// Fix: added the !IsArray guard that the sibling isNan()/isInf() predicates
// already apply, so std::array alternatives are rejected consistently.
eval::Value eval::Value::isFinite() const {
  using Cond = decltype([](auto type) {
    return std::is_floating_point_v<ComponentType<decltype(type)>> &&
           !IsArray<decltype(type)>;
  });
  return visit<Cond>(*this, [](auto &&value) -> Value {
    constexpr std::size_t N = Components<std::remove_cvref_t<decltype(value)>>;
    if constexpr (N == 1) {
      return std::isfinite(value);
    } else {
      Vector<bool, N> result;
      for (std::size_t i = 0; i < N; ++i) {
        result[i] = std::isfinite(value[i]);
      }
      return result;
    }
  });
}
// Reinterprets a non-bool integral scalar/vector as its unsigned
// counterpart via component-wise static_cast.
eval::Value eval::Value::makeUnsigned() const {
  using Cond = decltype([](auto type) {
    return std::is_integral_v<ComponentType<decltype(type)>> &&
           !std::is_same_v<ComponentType<decltype(type)>, bool> &&
           !IsArray<decltype(type)>;
  });
  return visit<Cond>(*this, [](auto &&value) -> Value {
    using ValueType = std::remove_cvref_t<decltype(value)>;
    using Element = std::make_unsigned_t<ComponentType<ValueType>>;
    constexpr std::size_t N = Components<ValueType>;
    if constexpr (N == 1) {
      return static_cast<Element>(value);
    } else {
      Vector<Element, N> converted;
      for (std::size_t lane = 0; lane < N; ++lane) {
        converted[lane] = static_cast<Element>(value[lane]);
      }
      return converted;
    }
  });
}
// Reinterprets a non-bool integral scalar/vector as its signed counterpart
// via component-wise static_cast.
eval::Value eval::Value::makeSigned() const {
  using Cond = decltype([](auto type) {
    return std::is_integral_v<ComponentType<decltype(type)>> &&
           !std::is_same_v<ComponentType<decltype(type)>, bool> &&
           !IsArray<decltype(type)>;
  });
  return visit<Cond>(*this, [](auto &&value) -> Value {
    using ValueType = std::remove_cvref_t<decltype(value)>;
    using Element = std::make_signed_t<ComponentType<ValueType>>;
    constexpr std::size_t N = Components<ValueType>;
    if constexpr (N == 1) {
      return static_cast<Element>(value);
    } else {
      Vector<Element, N> converted;
      for (std::size_t lane = 0; lane < N; ++lane) {
        converted[lane] = static_cast<Element>(value[lane]);
      }
      return converted;
    }
  });
}
// OpAll: true iff every lane of a boolean vector is true; defined only for
// bool vectors with more than one component.
eval::Value eval::Value::all() const {
  using Cond = decltype([](auto type) {
    return std::is_same_v<ComponentType<decltype(type)>, bool> &&
           (Components<decltype(type)> > 1) && !IsArray<decltype(type)>;
  });
  return visit<Cond>(*this, [](auto &&flags) {
    constexpr std::size_t N = Components<std::remove_cvref_t<decltype(flags)>>;
    bool conjunction = true;
    for (std::size_t lane = 0; lane < N && conjunction; ++lane) {
      conjunction = flags[lane];
    }
    return conjunction;
  });
}
// OpAny: true iff at least one lane of a boolean vector is true; defined
// only for bool vectors with more than one component.
eval::Value eval::Value::any() const {
  using Cond = decltype([](auto type) {
    return std::is_same_v<ComponentType<decltype(type)>, bool> &&
           (Components<decltype(type)> > 1) && !IsArray<decltype(type)>;
  });
  return visit<Cond>(*this, [](auto &&flags) {
    constexpr std::size_t N = Components<std::remove_cvref_t<decltype(flags)>>;
    bool disjunction = false;
    for (std::size_t lane = 0; lane < N && !disjunction; ++lane) {
      disjunction = flags[lane];
    }
    return disjunction;
  });
}
// OpSelect: *this is the (scalar or vector) bool condition; returns the
// matching components of trueValue/falseValue. Both operands must have the
// condition's component count and identical types.
// Fixes: `FalseCond` was defined but never passed to `visit`; the vector
// result was declared `Vector<bool, N>` instead of the operand type, which
// truncated non-bool vector selects to booleans.
eval::Value eval::Value::select(const Value &trueValue,
                                const Value &falseValue) const {
  using Cond = decltype([](auto type) consteval {
    return std::is_same_v<ComponentType<decltype(type)>, bool> &&
           !IsArray<decltype(type)>;
  });
  return visit<Cond>(*this, [&](auto &&cond) -> Value {
    using CondType = std::remove_cvref_t<decltype(cond)>;
    using TrueCond = decltype([](auto type) consteval {
      return Components<decltype(type)> == Components<CondType>;
    });
    return visit<TrueCond>(trueValue, [&](auto &&trueValue) {
      using TrueValue = std::remove_cvref_t<decltype(trueValue)>;
      // Restrict the false-value visit to the true value's exact type.
      using FalseCond = decltype([](auto type) {
        return std::is_same_v<TrueValue, std::remove_cvref_t<decltype(type)>>;
      });
      return visit<FalseCond>(falseValue, [&](auto &&falseValue) -> Value {
        if constexpr (std::is_same_v<TrueValue, std::remove_cvref_t<
                                                    decltype(falseValue)>>) {
          constexpr std::size_t N = Components<CondType>;
          if constexpr (N == 1) {
            return cond ? trueValue : falseValue;
          } else {
            // The result has the operands' type, selected lane by lane.
            TrueValue result;
            for (std::size_t i = 0; i < N; ++i) {
              result[i] = cond[i] ? trueValue[i] : falseValue[i];
            }
            return result;
          }
        } else {
          return {};
        }
      });
    });
  });
}
// OpSConvert/OpUConvert: integer width conversion. Both the source value and
// the destination type must be non-bool, non-array integral scalars/vectors;
// the two types must differ and have matching component counts. `isSigned`
// selects sign- vs zero-extension by routing the cast through the signed or
// unsigned counterpart of the source type.
eval::Value eval::Value::iConvert(ir::Value type, bool isSigned) const {
  using Cond = decltype([](auto type) {
    using Type = std::remove_cvref_t<decltype(type)>;
    return std::is_integral_v<ComponentType<Type>> &&
           !std::is_same_v<bool, ComponentType<Type>> && !IsArray<decltype(type)>;
  });
  using PairCond = decltype([](auto lhs, auto rhs) {
    using Lhs = decltype(lhs);
    using Rhs = decltype(rhs);
    return !std::is_same_v<Lhs, Rhs> && Components<Lhs> == Components<Rhs>;
  });
  return visitWithType<Cond, Cond, PairCond>(
      *this, type, [&](auto type, auto &&value) -> Value {
        using Type = std::remove_cvref_t<decltype(type)>;
        using ValueType = std::remove_cvref_t<decltype(value)>;
        if (isSigned) {
          return static_cast<Type>(static_cast<MakeSigned<ValueType>>(value));
        } else {
          return static_cast<Type>(static_cast<MakeUnsigned<ValueType>>(value));
        }
      });
}
// OpFConvert: floating-point width conversion. Fix: `Cond` (float-only,
// non-array filter) was defined but never passed to visitWithType
// (`<void, void, PairCond>`), unlike the analogous iConvert, so non-float
// operands slipped through; it is now applied to both value and type.
eval::Value eval::Value::fConvert(ir::Value type) const {
  using Cond = decltype([](auto type) {
    return std::is_floating_point_v<ComponentType<decltype(type)>> &&
           !IsArray<decltype(type)>;
  });
  using PairCond = decltype([](auto lhs, auto rhs) {
    using Lhs = decltype(lhs);
    using Rhs = decltype(rhs);
    return !std::is_same_v<Lhs, Rhs> && Components<Lhs> == Components<Rhs>;
  });
  return visitWithType<Cond, Cond, PairCond>(
      *this, type, [&](auto type, auto &&value) -> Value {
        using Type = std::remove_cvref_t<decltype(type)>;
        return static_cast<Type>(value);
      });
}
// OpBitcast: reinterprets the stored value's bits as the destination type;
// only size-matching (type, value) pairs are considered. Fix: removed the
// unused `using Type` alias inside the condition lambda.
eval::Value eval::Value::bitcast(ir::Value type) const {
  using Cond = decltype([](auto type, auto value) {
    return sizeof(type) == sizeof(value);
  });
  return visitWithType<void, void, Cond>(
      *this, type, [](auto type, auto &&value) -> Value {
        return std::bit_cast<decltype(type)>(value);
      });
}
std::optional<std::uint64_t> eval::Value::zExtScalar() const {
using Cond = decltype([](auto type) {
return std::is_integral_v<ComponentType<decltype(type)>> &&
!std::is_same_v<ComponentType<decltype(type)>, bool> &&
Components<decltype(type)> == 1 && !IsArray<decltype(type)>;
});
auto result = visit<Cond>(*this, [&](auto value) -> Value {
return static_cast<std::uint64_t>(
static_cast<MakeUnsigned<decltype(value)>>(value));
});
if (result) {
return result.as<std::uint64_t>();
}
return {};
}
std::optional<std::int64_t> eval::Value::sExtScalar() const {
using Cond = decltype([](auto type) {
return std::is_integral_v<ComponentType<decltype(type)>> &&
!std::is_same_v<ComponentType<decltype(type)>, bool> &&
Components<decltype(type)> == 1 && !IsArray<decltype(type)>;
});
auto result = visit<Cond>(*this, [&](auto value) -> Value {
return static_cast<std::int64_t>(
static_cast<MakeSigned<decltype(value)>>(value));
});
if (result) {
return result.as<std::int64_t>();
}
return {};
}
// Defines eval::Value::operator OP(const Value &rhs). Both operands' runtime
// payloads are visited; the `requires` clauses filter instantiation down to
// the lhs/rhs type combinations for which `lhs OP rhs` is well formed and
// convertible to Value, so ill-formed pairings never compile.
#define DEFINE_BINARY_OP(OP) \
  eval::Value eval::Value::operator OP(const Value & rhs) const { \
    using LhsCond = decltype([](auto &&lhs) { \
      return requires { static_cast<Value>(lhs OP rhs); }; \
    }); \
    return visit<LhsCond>(*this, [&]<typename Lhs>(Lhs &&lhs) -> Value { \
      using RhsCond = decltype([](auto &&rhs) { \
        return requires(Lhs lhs) { static_cast<Value>(lhs OP rhs); }; \
      }); \
      return visit<RhsCond>(rhs, [&](auto &&rhs) -> Value { \
        return static_cast<Value>(lhs OP rhs); \
      }); \
    }); \
  }
// Defines eval::Value::operator OP() (unary). The visited payload type is
// accepted only when `OP rhs` is well formed and convertible to Value.
#define DEFINE_UNARY_OP(OP) \
  eval::Value eval::Value::operator OP() const { \
    using Cond = decltype([](auto rhs) { \
      return requires { static_cast<Value>(OP rhs); }; \
    }); \
    return visit<Cond>(*this, [&](auto &&rhs) -> Value { \
      return static_cast<Value>(OP rhs); \
    }); \
  }
// Instantiate the operator definitions. The trailing `;` after each
// expansion is a harmless empty declaration at namespace scope.
// Arithmetic.
DEFINE_BINARY_OP(+);
DEFINE_BINARY_OP(-);
DEFINE_BINARY_OP(*);
DEFINE_BINARY_OP(/);
DEFINE_BINARY_OP(%);
// Bitwise and shifts.
DEFINE_BINARY_OP(&);
DEFINE_BINARY_OP(|);
DEFINE_BINARY_OP(^);
DEFINE_BINARY_OP(>>);
DEFINE_BINARY_OP(<<);
// Logical.
DEFINE_BINARY_OP(&&);
DEFINE_BINARY_OP(||);
// Comparisons.
DEFINE_BINARY_OP(<);
DEFINE_BINARY_OP(>);
DEFINE_BINARY_OP(<=);
DEFINE_BINARY_OP(>=);
DEFINE_BINARY_OP(==);
DEFINE_BINARY_OP(!=);
// Unary.
DEFINE_UNARY_OP(-);
DEFINE_UNARY_OP(~);
DEFINE_UNARY_OP(!);

Some files were not shown because too many files have changed in this diff Show more