diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index 33d9f3448..3dcdf4da9 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -377,7 +377,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestScratchGPUBuffer(
       barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
       barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
       barrier.Transition.pResource = scratch_buffer_;
-      barrier.Transition.Subresource = 0;
+      barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
       barrier.Transition.StateBefore = scratch_buffer_state_;
       barrier.Transition.StateAfter = state;
       GetCurrentCommandList()->ResourceBarrier(1, &barrier);
@@ -489,6 +489,10 @@ bool D3D12CommandProcessor::SetupContext() {
 
   render_target_cache_ =
       std::make_unique<RenderTargetCache>(this, register_file_);
+  if (!render_target_cache_->Initialize()) {
+    XELOGE("Failed to initialize the render target cache");
+    return false;
+  }
 
   return true;
 }
diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc
index aa015c858..39d1585c1 100644
--- a/src/xenia/gpu/d3d12/render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/render_target_cache.cc
@@ -21,13 +21,176 @@ namespace xe {
 namespace gpu {
 namespace d3d12 {
 
+// Generated with `xb buildhlsl`.
+#include "xenia/gpu/d3d12/shaders/bin/edram_load_color_32bpp_cs.h"
+#include "xenia/gpu/d3d12/shaders/bin/edram_load_color_64bpp_cs.h"
+#include "xenia/gpu/d3d12/shaders/bin/edram_load_color_7e3_cs.h"
+#include "xenia/gpu/d3d12/shaders/bin/edram_load_depth_float_cs.h"
+#include "xenia/gpu/d3d12/shaders/bin/edram_load_depth_unorm_cs.h"
+#include "xenia/gpu/d3d12/shaders/bin/edram_store_color_32bpp_cs.h"
+#include "xenia/gpu/d3d12/shaders/bin/edram_store_color_64bpp_cs.h"
+#include "xenia/gpu/d3d12/shaders/bin/edram_store_color_7e3_cs.h"
+#include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_float_cs.h"
+#include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_unorm_cs.h"
+
+const RenderTargetCache::EDRAMLoadStorePipelineInfo
+    RenderTargetCache::edram_load_store_pipeline_info_[size_t(
+        RenderTargetCache::EDRAMLoadStorePipelineIndex::kCount)] = {
+        {edram_load_color_32bpp_cs, sizeof(edram_load_color_32bpp_cs),
+         L"EDRAM Load 32bpp Color"},  // kColor32bppLoad
+        {edram_store_color_32bpp_cs, sizeof(edram_store_color_32bpp_cs),
+         L"EDRAM Store 32bpp Color"},  // kColor32bppStore
+        {edram_load_color_64bpp_cs, sizeof(edram_load_color_64bpp_cs),
+         L"EDRAM Load 64bpp Color"},  // kColor64bppLoad
+        {edram_store_color_64bpp_cs, sizeof(edram_store_color_64bpp_cs),
+         L"EDRAM Store 64bpp Color"},  // kColor64bppStore
+        {edram_load_color_7e3_cs, sizeof(edram_load_color_7e3_cs),
+         L"EDRAM Load 7e3 Color"},  // kColor7e3Load
+        {edram_store_color_7e3_cs, sizeof(edram_store_color_7e3_cs),
+         L"EDRAM Store 7e3 Color"},  // kColor7e3Store
+        {edram_load_depth_unorm_cs, sizeof(edram_load_depth_unorm_cs),
+         L"EDRAM Load UNorm Depth"},  // kDepthUnormLoad
+        {edram_store_depth_unorm_cs, sizeof(edram_store_depth_unorm_cs),
+         L"EDRAM Store UNorm Depth"},  // kDepthUnormStore
+        {edram_load_depth_float_cs, sizeof(edram_load_depth_float_cs),
+         L"EDRAM Load Float Depth"},  // kDepthFloatLoad
+        {edram_store_depth_float_cs, sizeof(edram_store_depth_float_cs),
+         L"EDRAM Store Float Depth"},  // kDepthFloatStore
+};  // Entry order must match EDRAMLoadStorePipelineIndex.
+
 RenderTargetCache::RenderTargetCache(D3D12CommandProcessor* command_processor,
                                      RegisterFile* register_file)
     : command_processor_(command_processor), register_file_(register_file) {}
 
 RenderTargetCache::~RenderTargetCache() { Shutdown(); }
 
-void RenderTargetCache::Shutdown() { ClearCache(); }
+bool RenderTargetCache::Initialize() {
+  auto device =
+      command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
+
+  // Create the buffer (in a default heap) for reinterpreting EDRAM contents.
+  D3D12_RESOURCE_DESC edram_buffer_desc;
+  edram_buffer_desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+  edram_buffer_desc.Alignment = 0;
+  // First 10 MB is guest pixel data, second 10 MB is 32-bit depth when using
+  // D24FS8 so loads/stores don't corrupt multipass rendering.
+  edram_buffer_desc.Width = 2 * 2048 * 5120;  // 2 halves of 10485760 bytes.
+  edram_buffer_desc.Height = 1;
+  edram_buffer_desc.DepthOrArraySize = 1;
+  edram_buffer_desc.MipLevels = 1;
+  edram_buffer_desc.Format = DXGI_FORMAT_UNKNOWN;
+  edram_buffer_desc.SampleDesc.Count = 1;
+  edram_buffer_desc.SampleDesc.Quality = 0;
+  edram_buffer_desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+  edram_buffer_desc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
+  D3D12_HEAP_PROPERTIES edram_buffer_heap_properties = {};
+  edram_buffer_heap_properties.Type = D3D12_HEAP_TYPE_DEFAULT;
+  // Start in the UAV state because the first operation will be a clear.
+  edram_buffer_state_ = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+  if (FAILED(device->CreateCommittedResource(
+          &edram_buffer_heap_properties, D3D12_HEAP_FLAG_NONE,
+          &edram_buffer_desc, edram_buffer_state_, nullptr,
+          IID_PPV_ARGS(&edram_buffer_)))) {
+    XELOGE("Failed to create the EDRAM buffer");
+    return false;
+  }
+  edram_buffer_cleared_ = false;
+
+  // Create the root signature shared by all EDRAM load/store pipelines.
+  D3D12_ROOT_PARAMETER root_parameters[2];
+  // Parameter 0 is constants (changed for each render target binding).
+  root_parameters[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS;
+  root_parameters[0].Constants.ShaderRegister = 0;
+  root_parameters[0].Constants.RegisterSpace = 0;
+  root_parameters[0].Constants.Num32BitValues =
+      sizeof(EDRAMLoadStoreRootConstants) / sizeof(uint32_t);
+  root_parameters[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
+  // Parameter 1 is a descriptor table: source SRV (t0), then target UAV (u0).
+  D3D12_DESCRIPTOR_RANGE root_load_store_ranges[2];
+  root_load_store_ranges[0].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV;
+  root_load_store_ranges[0].NumDescriptors = 1;
+  root_load_store_ranges[0].BaseShaderRegister = 0;
+  root_load_store_ranges[0].RegisterSpace = 0;
+  root_load_store_ranges[0].OffsetInDescriptorsFromTableStart = 0;
+  root_load_store_ranges[1].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_UAV;
+  root_load_store_ranges[1].NumDescriptors = 1;
+  root_load_store_ranges[1].BaseShaderRegister = 0;
+  root_load_store_ranges[1].RegisterSpace = 0;
+  root_load_store_ranges[1].OffsetInDescriptorsFromTableStart = 1;
+  root_parameters[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
+  root_parameters[1].DescriptorTable.NumDescriptorRanges = 2;
+  root_parameters[1].DescriptorTable.pDescriptorRanges = root_load_store_ranges;
+  root_parameters[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
+  D3D12_ROOT_SIGNATURE_DESC root_signature_desc;
+  root_signature_desc.NumParameters = UINT(xe::countof(root_parameters));
+  root_signature_desc.pParameters = root_parameters;
+  root_signature_desc.NumStaticSamplers = 0;
+  root_signature_desc.pStaticSamplers = nullptr;
+  root_signature_desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE;
+  ID3DBlob* root_signature_blob;
+  ID3DBlob* root_signature_error_blob = nullptr;
+  if (FAILED(D3D12SerializeRootSignature(
+          &root_signature_desc, D3D_ROOT_SIGNATURE_VERSION_1,
+          &root_signature_blob, &root_signature_error_blob))) {
+    XELOGE("Failed to serialize the EDRAM buffer load/store root signature");
+    if (root_signature_error_blob != nullptr) {
+      XELOGE("%s", reinterpret_cast<const char*>(
+                       root_signature_error_blob->GetBufferPointer()));
+      root_signature_error_blob->Release();
+    }
+    Shutdown();  // Releases the EDRAM buffer created above.
+    return false;
+  }
+  if (root_signature_error_blob != nullptr) {
+    root_signature_error_blob->Release();
+  }
+  if (FAILED(device->CreateRootSignature(
+          0, root_signature_blob->GetBufferPointer(),
+          root_signature_blob->GetBufferSize(),
+          IID_PPV_ARGS(&edram_load_store_root_signature_)))) {
+    XELOGE("Failed to create the EDRAM buffer load/store root signature");
+    root_signature_blob->Release();
+    Shutdown();
+    return false;
+  }
+  root_signature_blob->Release();
+
+  // Create a compute pipeline for every EDRAMLoadStorePipelineIndex value.
+  D3D12_COMPUTE_PIPELINE_STATE_DESC pipeline_desc;
+  pipeline_desc.pRootSignature = edram_load_store_root_signature_;
+  pipeline_desc.NodeMask = 0;
+  pipeline_desc.CachedPSO.pCachedBlob = nullptr;
+  pipeline_desc.CachedPSO.CachedBlobSizeInBytes = 0;
+  pipeline_desc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE;
+  for (uint32_t i = 0; i < uint32_t(EDRAMLoadStorePipelineIndex::kCount); ++i) {
+    const EDRAMLoadStorePipelineInfo& pipeline_info =
+        edram_load_store_pipeline_info_[i];
+    pipeline_desc.CS.pShaderBytecode = pipeline_info.shader;
+    pipeline_desc.CS.BytecodeLength = pipeline_info.shader_size;
+    if (FAILED(device->CreateComputePipelineState(
+            &pipeline_desc, IID_PPV_ARGS(&edram_load_store_pipelines_[i])))) {
+      XELOGE("Failed to create EDRAM load/store pipeline for mode %u", i);
+      Shutdown();
+      return false;
+    }
+  }
+
+  return true;
+}
+
+void RenderTargetCache::Shutdown() {  // Releases what Initialize created.
+  ClearCache();
+  for (auto& pipeline : edram_load_store_pipelines_) {
+    if (pipeline != nullptr) pipeline->Release();
+    pipeline = nullptr;  // Entries may be null if Initialize failed midway.
+  }
+  if (edram_load_store_root_signature_ != nullptr) {
+    edram_load_store_root_signature_->Release();
+    edram_load_store_root_signature_ = nullptr;
+  }
+  if (edram_buffer_ != nullptr) edram_buffer_->Release();
+  edram_buffer_ = nullptr;  // Pointers are reset so repeated calls are safe.
+}
 
 void RenderTargetCache::ClearCache() {
   for (auto render_target_pair : render_targets_) {
@@ -334,7 +497,7 @@ bool RenderTargetCache::UpdateRenderTargets() {
     uint32_t heap_usage[5] = {};
     if (full_update) {
       // Export the currently bound render targets before we ruin the bindings.
-      WriteRenderTargetsToEDRAM();
+      StoreRenderTargetsToEDRAM();
 
       ClearBindings();
       current_surface_pitch_ = surface_pitch;
@@ -527,7 +690,7 @@ bool RenderTargetCache::UpdateRenderTargets() {
 }
 
 void RenderTargetCache::EndFrame() {
-  WriteRenderTargetsToEDRAM();
+  StoreRenderTargetsToEDRAM();
   ClearBindings();
 }
 
@@ -709,6 +872,7 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
   }
   ++descriptor_heap->descriptors_used;
 
+  // Get the layout for copying to the EDRAM buffer.
   RenderTarget* render_target = new RenderTarget;
   render_target->resource = resource;
   render_target->state = state;
@@ -716,11 +880,245 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
   render_target->key = key;
   render_target->heap_page_first = heap_page_first;
   render_target->heap_page_count = heap_page_count;
+  UINT64 copy_buffer_size;
+  device->GetCopyableFootprints(&resource_desc, 0, key.is_depth ? 2 : 1, 0,
+                                render_target->footprints, nullptr, nullptr,
+                                &copy_buffer_size);
+  render_target->copy_buffer_size = uint32_t(copy_buffer_size);
   render_targets_.insert(std::make_pair(key.value, render_target));
   return render_target;
 }
 
-void RenderTargetCache::WriteRenderTargetsToEDRAM() {}
+// Copies each bound render target with enough dirty data into the scratch
+// buffer and runs the matching store compute shader to write it to EDRAM.
+void RenderTargetCache::StoreRenderTargetsToEDRAM() {
+  auto command_list = command_processor_->GetCurrentCommandList();
+  if (command_list == nullptr) {
+    return;
+  }
+
+  uint32_t surface_pitch_ss =
+      current_surface_pitch_ *
+      (current_msaa_samples_ >= MsaaSamples::k4X ? 2 : 1);
+  uint32_t surface_pitch_tiles = (surface_pitch_ss + 79) / 80;
+  assert_true(surface_pitch_tiles != 0);
+
+  // TODO(Triang3l): Clear the buffer if calling for the first time.
+
+  uint32_t store_bindings[5];
+  uint32_t store_binding_count = 0;
+  D3D12_RESOURCE_BARRIER barriers[6];  // Up to 5 render targets + the EDRAM.
+  uint32_t barrier_count = 0;
+
+  // Extract only the render targets that need to be stored, transition them to
+  // copy sources and calculate intermediate buffer size.
+  uint32_t copy_buffer_size = 0;
+  for (uint32_t i = 0; i < 5; ++i) {
+    const RenderTargetBinding& binding = current_bindings_[i];
+    RenderTarget* render_target = binding.render_target;
+    // TODO(Triang3l): Change edram_dirty_length to dirty row count.
+    if (!binding.is_bound || render_target == nullptr ||
+        binding.edram_dirty_length < surface_pitch_tiles) {
+      continue;
+    }
+    store_bindings[store_binding_count] = i;
+    copy_buffer_size =
+        std::max(copy_buffer_size, render_target->copy_buffer_size);
+    ++store_binding_count;
+    if (render_target->state != D3D12_RESOURCE_STATE_COPY_SOURCE) {
+      D3D12_RESOURCE_BARRIER& barrier = barriers[barrier_count++];
+      barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+      barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+      barrier.Transition.pResource = render_target->resource;
+      barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
+      barrier.Transition.StateBefore = render_target->state;
+      barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
+      render_target->state = D3D12_RESOURCE_STATE_COPY_SOURCE;
+    }
+  }
+  if (store_binding_count == 0) {
+    return;
+  }
+  if (edram_buffer_state_ != D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
+    // Also transition the EDRAM buffer to UAV.
+    D3D12_RESOURCE_BARRIER& barrier = barriers[barrier_count++];
+    barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+    barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+    barrier.Transition.pResource = edram_buffer_;
+    barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
+    barrier.Transition.StateBefore = edram_buffer_state_;
+    barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+    edram_buffer_state_ = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+  }
+  if (barrier_count != 0) {
+    command_list->ResourceBarrier(barrier_count, barriers);
+  }
+
+  // Allocate descriptors for the buffers.
+  D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
+  D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
+  if (command_processor_->RequestViewDescriptors(0, 2, 2, descriptor_cpu_start,
+                                                 descriptor_gpu_start) == 0) {
+    return;
+  }
+
+  // Get the buffer for copying.
+  D3D12_RESOURCE_STATES copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST;
+  ID3D12Resource* copy_buffer = command_processor_->RequestScratchGPUBuffer(
+      copy_buffer_size, copy_buffer_state);
+  if (copy_buffer == nullptr) {
+    return;
+  }
+
+  // Prepare for writing.
+  auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
+  auto device = provider->GetDevice();
+  auto descriptor_size_view = provider->GetDescriptorSizeView();
+  D3D12_SHADER_RESOURCE_VIEW_DESC srv_desc;
+  srv_desc.Format = DXGI_FORMAT_R32_TYPELESS;
+  srv_desc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER;
+  srv_desc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
+  srv_desc.Buffer.FirstElement = 0;
+  srv_desc.Buffer.NumElements = copy_buffer_size >> 2;
+  srv_desc.Buffer.StructureByteStride = 0;
+  srv_desc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW;
+  device->CreateShaderResourceView(copy_buffer, &srv_desc,
+                                   descriptor_cpu_start);
+  D3D12_UNORDERED_ACCESS_VIEW_DESC uav_desc;
+  uav_desc.Format = DXGI_FORMAT_R32_TYPELESS;
+  uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
+  uav_desc.Buffer.FirstElement = 0;
+  uav_desc.Buffer.NumElements = 2 * 2048 * 1280;
+  uav_desc.Buffer.StructureByteStride = 0;
+  uav_desc.Buffer.CounterOffsetInBytes = 0;
+  uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW;
+  D3D12_CPU_DESCRIPTOR_HANDLE uav_cpu_handle;
+  uav_cpu_handle.ptr = descriptor_cpu_start.ptr + descriptor_size_view;
+  device->CreateUnorderedAccessView(edram_buffer_, nullptr, &uav_desc,
+                                    uav_cpu_handle);
+  command_list->SetComputeRootSignature(edram_load_store_root_signature_);
+  command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start);
+
+  // Sort the bindings in ascending order of EDRAM base so data in the render
+  // targets placed farther in EDRAM isn't lost in case of overlap.
+  std::sort(
+      store_bindings, store_bindings + store_binding_count,
+      [this](uint32_t a, uint32_t b) {
+        uint32_t base_a = current_bindings_[a].edram_base;
+        uint32_t base_b = current_bindings_[b].edram_base;
+        // Tie-break on the index only when bases are equal (strict ordering).
+        return base_a != base_b ? base_a < base_b : a < b;
+      });
+
+  // Store each render target.
+  for (uint32_t i = 0; i < store_binding_count; ++i) {
+    const RenderTargetBinding& binding = current_bindings_[store_bindings[i]];
+    const RenderTarget* render_target = binding.render_target;
+    EDRAMLoadStorePipelineIndex pipeline_index;
+    bool is_64bpp = false;
+    if (render_target->key.is_depth) {
+      if (DepthRenderTargetFormat(render_target->key.format) ==
+          DepthRenderTargetFormat::kD24FS8) {
+        pipeline_index = EDRAMLoadStorePipelineIndex::kDepthFloatStore;
+      } else {
+        pipeline_index = EDRAMLoadStorePipelineIndex::kDepthUnormStore;
+      }
+    } else {
+      switch (ColorRenderTargetFormat(render_target->key.format)) {
+        case ColorRenderTargetFormat::k_8_8_8_8:
+        case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
+        case ColorRenderTargetFormat::k_2_10_10_10:
+        case ColorRenderTargetFormat::k_16_16:
+        case ColorRenderTargetFormat::k_16_16_FLOAT:
+        case ColorRenderTargetFormat::k_2_10_10_10_AS_16_16_16_16:
+        case ColorRenderTargetFormat::k_32_FLOAT:
+          pipeline_index = EDRAMLoadStorePipelineIndex::kColor32bppStore;
+          break;
+        case ColorRenderTargetFormat::k_16_16_16_16:
+        case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
+        case ColorRenderTargetFormat::k_32_32_FLOAT:
+          pipeline_index = EDRAMLoadStorePipelineIndex::kColor64bppStore;
+          is_64bpp = true;
+          break;
+        case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
+        case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
+          pipeline_index = EDRAMLoadStorePipelineIndex::kColor7e3Store;
+          break;
+        default:
+          assert_unhandled_case(render_target->key.format);
+          continue;
+      }
+    }
+
+    D3D12_TEXTURE_COPY_LOCATION location_source, location_dest;
+    location_source.pResource = render_target->resource;
+    location_source.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+    location_source.SubresourceIndex = 0;
+    location_dest.pResource = copy_buffer;
+    location_dest.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
+    location_dest.PlacedFootprint = render_target->footprints[0];
+    // TODO(Triang3l): Box for color render targets.
+    command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
+                                    nullptr);
+    EDRAMLoadStoreRootConstants root_constants = {};  // Zero stencil if color.
+    root_constants.base_tiles = binding.edram_base;
+    root_constants.pitch_tiles = surface_pitch_tiles * (is_64bpp ? 2 : 1);
+    root_constants.rt_color_depth_pitch =
+        location_dest.PlacedFootprint.Footprint.RowPitch;
+    if (render_target->key.is_depth) {
+      location_source.SubresourceIndex = 1;
+      location_dest.PlacedFootprint = render_target->footprints[1];
+      command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source,
+                                      nullptr);
+      root_constants.rt_stencil_offset =
+          uint32_t(location_dest.PlacedFootprint.Offset);
+      root_constants.rt_stencil_pitch =
+          location_dest.PlacedFootprint.Footprint.RowPitch;
+    }
+
+    // Transition the copy buffer to SRV.
+    barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+    barriers[0].Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+    barriers[0].Transition.pResource = copy_buffer;
+    barriers[0].Transition.Subresource =
+        D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
+    barriers[0].Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST;
+    barriers[0].Transition.StateAfter =
+        D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
+    copy_buffer_state = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
+    command_list->ResourceBarrier(1, barriers);
+
+    // Store the data.
+    command_list->SetComputeRoot32BitConstants(
+        0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
+    command_processor_->SetPipeline(
+        edram_load_store_pipelines_[size_t(pipeline_index)]);
+    command_list->Dispatch(
+        root_constants.pitch_tiles,
+        binding.edram_dirty_length / root_constants.pitch_tiles, 1);
+
+    // Commit the UAV write and prepare for copying again.
+    barrier_count = 1;
+    barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;
+    barriers[0].Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+    barriers[0].UAV.pResource = edram_buffer_;
+    if (i + 1 < store_binding_count) {
+      barrier_count = 2;
+      barriers[1].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+      barriers[1].Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+      barriers[1].Transition.pResource = copy_buffer;
+      barriers[1].Transition.Subresource =
+          D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
+      barriers[1].Transition.StateBefore =
+          D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
+      barriers[1].Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_DEST;
+      copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST;
+    }
+    command_list->ResourceBarrier(barrier_count, barriers);
+  }
+
+  command_processor_->ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state);
+}
 
 }  // namespace d3d12
 }  // namespace gpu
diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h
index b300f3e8d..9f5ff69f2 100644
--- a/src/xenia/gpu/d3d12/render_target_cache.h
+++ b/src/xenia/gpu/d3d12/render_target_cache.h
@@ -201,6 +201,7 @@ class RenderTargetCache {
                     RegisterFile* register_file);
   ~RenderTargetCache();
 
+  bool Initialize();
   void Shutdown();
   void ClearCache();
 
@@ -233,6 +234,27 @@ class RenderTargetCache {
   }
 
  private:
+  enum class EDRAMLoadStorePipelineIndex {
+    kColor32bppLoad,
+    kColor32bppStore,
+    kColor64bppLoad,
+    kColor64bppStore,
+    kColor7e3Load,
+    kColor7e3Store,
+    kDepthUnormLoad,
+    kDepthUnormStore,
+    kDepthFloatLoad,
+    kDepthFloatStore,
+
+    kCount  // Number of pipelines; sizes the info and pipeline arrays.
+  };
+
+  struct EDRAMLoadStorePipelineInfo {
+    const void* shader;  // Compute shader bytecode passed to the pipeline.
+    size_t shader_size;  // Size of the bytecode in bytes.
+    const WCHAR* name;  // Wide-string label for the pipeline.
+  };
+
   union RenderTargetKey {
     struct {
       // Supersampled (_ss - scaled 2x if needed) dimensions, divided by 80x16.
@@ -267,8 +289,12 @@ class RenderTargetCache {
     RenderTargetKey key;
     // The first 4 MB page in the heaps.
     uint32_t heap_page_first;
-    // Number of 4 MB pages this render target uses.
+    // The number of 4 MB pages this render target uses.
     uint32_t heap_page_count;
+    // Color/depth and stencil layouts.
+    D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprints[2];
+    // Buffer size needed to copy the render target to the EDRAM buffer.
+    uint32_t copy_buffer_size;
   };
 
   struct RenderTargetBinding {
@@ -294,13 +320,34 @@ class RenderTargetCache {
   RenderTarget* FindOrCreateRenderTarget(RenderTargetKey key,
                                          uint32_t heap_page_first);
 
-  // Must be in a frame to call. Writes the dirty areas of the currently bound
+  // Must be in a frame to call. Stores the dirty areas of the currently bound
   // render targets and marks them as clean.
-  void WriteRenderTargetsToEDRAM();
+  void StoreRenderTargetsToEDRAM();
 
   D3D12CommandProcessor* command_processor_;
   RegisterFile* register_file_;
 
+  // The EDRAM buffer allowing color and depth data to be reinterpreted.
+  ID3D12Resource* edram_buffer_ = nullptr;
+  D3D12_RESOURCE_STATES edram_buffer_state_;
+  bool edram_buffer_cleared_;
+
+  // EDRAM buffer load/store root signature.
+  ID3D12RootSignature* edram_load_store_root_signature_ = nullptr;
+  struct EDRAMLoadStoreRootConstants {  // Root parameter 0 (32-bit values).
+    uint32_t base_tiles;  // EDRAM base of the render target, in tiles.
+    uint32_t pitch_tiles;  // Surface pitch in tiles (doubled for 64bpp).
+    uint32_t rt_color_depth_pitch;  // Row pitch of copied color/depth data.
+    uint32_t rt_stencil_offset;  // Offset of copied stencil data (depth only).
+    uint32_t rt_stencil_pitch;  // Row pitch of copied stencil (depth only).
+  };
+  // EDRAM buffer load/store pipelines.
+  static const EDRAMLoadStorePipelineInfo
+      edram_load_store_pipeline_info_[size_t(
+          EDRAMLoadStorePipelineIndex::kCount)];
+  ID3D12PipelineState* edram_load_store_pipelines_[size_t(
+      EDRAMLoadStorePipelineIndex::kCount)] = {};
+
   // 32 MB heaps backing used render targets resources, created when needed.
   // 24 MB proved to be not enough to store a single render target occupying the
   // entire EDRAM - a 32-bit depth/stencil one - at some resolution.
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl
new file mode 100644
index 000000000..cd4079c67
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl
@@ -0,0 +1,14 @@
+#include "edram_load_store.hlsli"
+
+[numthreads(20, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  // Each thread moves 4 dwords (16 bytes) from the tile to the target row.
+  uint2 dword_in_tile = xe_group_thread_id.xy * uint2(4u, 1u);
+  uint edram_address = XeEDRAMOffset(xe_group_id.xy, dword_in_tile);
+  uint target_address = xe_thread_id.x * 16u +
+                        xe_thread_id.y * xe_edram_rt_color_depth_pitch;
+  xe_edram_load_store_dest.Store4(
+      target_address, xe_edram_load_store_source.Load4(edram_address));
+}
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl
new file mode 100644
index 000000000..273ee41cf
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl
@@ -0,0 +1,19 @@
+#include "edram_load_store.hlsli"
+
+[numthreads(40, 8, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  // One tile contains 80x8 64bpp texels, so each render target row spans 2 of
+  // the tile's 16 rows of 80 dwords. Threads with X 0-19 handle the first of
+  // those 2 rows, threads with X 20-39 handle the second.
+  uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u);
+  [flatten] if (xe_group_thread_id.x >= 20u) {
+    tile_dword_index += uint2(uint(-80), 1u);  // Back to X 0, next tile row.
+  }
+  uint4 pixels = xe_edram_load_store_source.Load4(
+      XeEDRAMOffset(xe_group_id.xy, tile_dword_index));
+  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
+                   xe_thread_id.x * 16u;
+  xe_edram_load_store_dest.Store4(rt_offset, pixels);
+}
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl
new file mode 100644
index 000000000..dd8611ae6
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl
@@ -0,0 +1,20 @@
+#include "edram_load_store.hlsli"
+#include "pixel_formats.hlsli"
+
+[numthreads(40, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  // Each thread loads 2 dwords, converts each one and stores 4 dwords.
+  uint2 dword_in_tile = xe_group_thread_id.xy * uint2(2u, 1u);
+  uint2 packed_7e3 = xe_edram_load_store_source.Load2(
+      XeEDRAMOffset(xe_group_id.xy, dword_in_tile));
+  uint4 first_f16 = XeFloat7e3To16(packed_7e3.x);
+  uint4 second_f16 = XeFloat7e3To16(packed_7e3.y);
+  // Pack component pairs: X/Z in the low 16 bits, Y/W in the high 16 bits.
+  uint4 low_halves = uint4(first_f16.xz, second_f16.xz);
+  uint4 high_halves = uint4(first_f16.yw, second_f16.yw) << 16u;
+  uint target_address = xe_thread_id.x * 16u +
+                        xe_thread_id.y * xe_edram_rt_color_depth_pitch;
+  xe_edram_load_store_dest.Store4(target_address, low_halves | high_halves);
+}
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl
new file mode 100644
index 000000000..fc2644705
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl
@@ -0,0 +1,31 @@
+#include "edram_load_store.hlsli"
+#include "pixel_formats.hlsli"
+
+[numthreads(20, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  uint2 tile_dword_index = xe_group_thread_id.xy;
+  tile_dword_index.x *= 4u;
+  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index);
+  uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset);
+  uint4 depth24 = depth24_stencil & 0xFFFFFFu;
+  uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset);
+  // Depth. The 32-bit copy lives 10485760 bytes (10 MB) after the 24-bit one.
+  // If it still converts to the stored 24-bit depth, use it since it has more
+  // precision (and multipass rendering is possible); otherwise the 24-bit
+  // value was overwritten by aliasing, so convert that one instead.
+  uint4 depth24to32 = XeFloat20e4To32(depth24);
+  uint4 depth = depth24to32 + (depth32 - depth24to32) *
+                uint4(XeFloat32To20e4(depth32) == depth24);
+  uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch +
+                   xe_thread_id.x * 16u;
+  xe_edram_load_store_dest.Store4(rt_offset, depth);
+  // Stencil. Pack the top byte of each of the 4 dwords into a single dword.
+  uint4 stencil = (depth24_stencil >> 24u) << uint4(0u, 8u, 16u, 24u);
+  stencil.xy |= stencil.zw;
+  stencil.x |= stencil.y;
+  rt_offset = xe_edram_rt_stencil_offset +
+              xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u;
+  xe_edram_load_store_dest.Store(rt_offset, stencil.x);
+}
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl
new file mode 100644
index 000000000..0d85248c7
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl
@@ -0,0 +1,22 @@
+#include "edram_load_store.hlsli"
+
+[numthreads(20, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  uint2 dword_in_tile = xe_group_thread_id.xy * uint2(4u, 1u);
+  uint4 depth_stencil = xe_edram_load_store_source.Load4(
+      XeEDRAMOffset(xe_group_id.xy, dword_in_tile));
+  // The low 24 bits of each dword are depth, written as 4 dwords per thread.
+  uint depth_address = xe_thread_id.x * 16u +
+                       xe_thread_id.y * xe_edram_rt_color_depth_pitch;
+  xe_edram_load_store_dest.Store4(depth_address, depth_stencil & 0xFFFFFFu);
+  // The high 8 bits are stencil; the 4 values are packed into one dword with
+  // the first pixel in the lowest byte.
+  uint4 stencil_bytes = (depth_stencil >> 24u) << uint4(0u, 8u, 16u, 24u);
+  uint stencil_packed = stencil_bytes.x | stencil_bytes.y | stencil_bytes.z |
+                        stencil_bytes.w;
+  uint stencil_address = xe_edram_rt_stencil_offset + xe_thread_id.x * 4u +
+                         xe_thread_id.y * xe_edram_rt_stencil_pitch;
+  xe_edram_load_store_dest.Store(stencil_address, stencil_packed);
+}
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
new file mode 100644
index 000000000..f7636266a
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli
@@ -0,0 +1,21 @@
+#ifndef XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
+#define XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
+
+cbuffer XeEDRAMLoadStoreConstants : register(b0) {
+  uint xe_edram_base_tiles;  // Tile index added to every tile address.
+  uint xe_edram_pitch_tiles;  // Tiles added per tile_index.y step.
+  uint xe_edram_rt_color_depth_pitch;  // Bytes per row of color/depth data.
+  uint xe_edram_rt_stencil_offset;  // Byte offset of the stencil data.
+  uint xe_edram_rt_stencil_pitch;  // Bytes per row of stencil data.
+};
+
+ByteAddressBuffer xe_edram_load_store_source : register(t0);
+RWByteAddressBuffer xe_edram_load_store_dest : register(u0);
+
+uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) {  // In bytes.
+  uint tile = xe_edram_base_tiles + tile_index.x +
+              tile_index.y * xe_edram_pitch_tiles;
+  return tile * 5120u + tile_dword_index.y * 320u + tile_dword_index.x * 4u;
+}
+
+#endif  // XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_
diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl
new file mode 100644
index 000000000..584416fdb
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl
@@ -0,0 +1,17 @@
+#include "edram_load_store.hlsli"
+
+// Stores a 32bpp color render target into EDRAM. One thread group of 20x16
+// threads fills one tile, and each thread copies 16 bytes (4 dwords).
+[numthreads(20, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  // Position of the 4 dwords of this thread within the destination tile.
+  uint2 dword_in_tile = uint2(xe_group_thread_id.x * 4u, xe_group_thread_id.y);
+  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, dword_in_tile);
+  // Position of the same 4 dwords in the linear render target buffer.
+  uint source_offset = xe_thread_id.x * 16u +
+                       xe_thread_id.y * xe_edram_rt_color_depth_pitch;
+  xe_edram_load_store_dest.Store4(
+      edram_offset, xe_edram_load_store_source.Load4(source_offset));
+}
diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl
new file mode 100644
index 000000000..ec3cab476
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl
@@ -0,0 +1,21 @@
+#include "edram_load_store.hlsli"
+
+// Stores a 64bpp color render target into EDRAM. One thread group of 40x8
+// threads fills one tile, and each thread copies 16 bytes (2 texels).
+[numthreads(40, 8, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  uint source_offset = xe_thread_id.x * 16u +
+                       xe_thread_id.y * xe_edram_rt_color_depth_pitch;
+  uint4 texels = xe_edram_load_store_source.Load4(source_offset);
+  // One tile contains 80x8 texels, so 2 rows within an 80x16 tile contain data
+  // from 1 render target row rather than 2. Threads with X 0-19 write the first
+  // of these two tile rows, threads with X 20-39 write the second.
+  bool is_second_row = xe_group_thread_id.x >= 20u;
+  uint2 dword_in_tile;
+  dword_in_tile.x = (xe_group_thread_id.x - (is_second_row ? 20u : 0u)) * 4u;
+  dword_in_tile.y = xe_group_thread_id.y * 2u + (is_second_row ? 1u : 0u);
+  xe_edram_load_store_dest.Store4(
+      XeEDRAMOffset(xe_group_id.xy, dword_in_tile), texels);
+}
diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl
new file mode 100644
index 000000000..7b9c5cc03
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl
@@ -0,0 +1,22 @@
+#include "edram_load_store.hlsli"
+#include "pixel_formats.hlsli"
+
+// Stores a render target kept as 4x16-bit floats into EDRAM as 7e3-packed
+// dwords. Each thread converts 2 pixels (16 bytes read, 8 bytes written).
+[numthreads(40, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  uint source_offset = xe_thread_id.x * 16u +
+                       xe_thread_id.y * xe_edram_rt_color_depth_pitch;
+  // xy hold the first pixel and zw the second, two 16-bit channels per dword.
+  uint4 f16_pairs = xe_edram_load_store_source.Load4(source_offset);
+  // Move every channel to the low 16 bits of its own component.
+  const uint4 kChannelShifts = uint4(0u, 16u, 0u, 16u);
+  uint2 packed_7e3;
+  packed_7e3.x = XeFloat16To7e3(f16_pairs.xxyy >> kChannelShifts);
+  packed_7e3.y = XeFloat16To7e3(f16_pairs.zzww >> kChannelShifts);
+  uint2 dword_in_tile = uint2(xe_group_thread_id.x * 2u, xe_group_thread_id.y);
+  xe_edram_load_store_dest.Store2(
+      XeEDRAMOffset(xe_group_id.xy, dword_in_tile), packed_7e3);
+}
diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl
new file mode 100644
index 000000000..17cb1acdf
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl
@@ -0,0 +1,28 @@
+#include "edram_load_store.hlsli"
+#include "pixel_formats.hlsli"
+
+// Stores 32-bit float depth with stencil into EDRAM. Each thread handles 4
+// samples and writes them twice: packed as 20e4 depth plus stencil, and as the
+// original 32-bit depth.
+[numthreads(20, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  // Depth: 4 dwords per thread.
+  uint depth_offset = xe_thread_id.x * 16u +
+                      xe_thread_id.y * xe_edram_rt_color_depth_pitch;
+  uint4 depth_f32 = xe_edram_load_store_source.Load4(depth_offset);
+  // Stencil: 4 bytes per thread, each moved to bits 24-31 of its own sample.
+  uint stencil_offset = xe_edram_rt_stencil_offset + xe_thread_id.x * 4u +
+                        xe_thread_id.y * xe_edram_rt_stencil_pitch;
+  uint stencil_bytes = xe_edram_load_store_source.Load(stencil_offset);
+  uint4 stencil_high = (stencil_bytes.xxxx >> uint4(0u, 8u, 16u, 24u)) << 24u;
+  uint4 depth24_stencil = XeFloat32To20e4(depth_f32) | stencil_high;
+  uint2 dword_in_tile = uint2(xe_group_thread_id.x * 4u, xe_group_thread_id.y);
+  uint edram_offset = XeEDRAMOffset(xe_group_id.xy, dword_in_tile);
+  // Store 24-bit depth for aliasing and checking if 32-bit depth is up to date.
+  xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil);
+  // Store 32-bit depth so precision isn't lost when doing multipass rendering.
+  // This copy is placed 10485760 bytes (10 MiB) after the 24-bit one.
+  xe_edram_load_store_dest.Store4(10485760u + edram_offset, depth_f32);
+}
diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl
new file mode 100644
index 000000000..3a318645e
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl
@@ -0,0 +1,21 @@
+#include "edram_load_store.hlsli"
+
+// Stores 24-bit depth with stencil into EDRAM. Each thread packs 4 samples:
+// the low 24 bits of a depth dword and a stencil byte placed in bits 24-31.
+[numthreads(20, 16, 1)]
+void main(uint3 xe_group_id : SV_GroupID,
+          uint3 xe_group_thread_id : SV_GroupThreadID,
+          uint3 xe_thread_id : SV_DispatchThreadID) {
+  // Depth: 4 dwords per thread, only the low 24 bits of each are kept.
+  uint depth_offset = xe_thread_id.x * 16u +
+                      xe_thread_id.y * xe_edram_rt_color_depth_pitch;
+  uint4 depth24 = xe_edram_load_store_source.Load4(depth_offset) & 0xFFFFFFu;
+  // Stencil: 4 bytes per thread, each moved to bits 24-31 of its own sample.
+  uint stencil_offset = xe_edram_rt_stencil_offset + xe_thread_id.x * 4u +
+                        xe_thread_id.y * xe_edram_rt_stencil_pitch;
+  uint stencil_bytes = xe_edram_load_store_source.Load(stencil_offset);
+  uint4 stencil_high = (stencil_bytes.xxxx >> uint4(0u, 8u, 16u, 24u)) << 24u;
+  uint2 dword_in_tile = uint2(xe_group_thread_id.x * 4u, xe_group_thread_id.y);
+  xe_edram_load_store_dest.Store4(
+      XeEDRAMOffset(xe_group_id.xy, dword_in_tile), depth24 | stencil_high);
+}
diff --git a/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli
new file mode 100644
index 000000000..fbdbb0221
--- /dev/null
+++ b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli
@@ -0,0 +1,96 @@
+#ifndef XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_
+#define XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_
+
+// Conversions between packed floating-point render target formats and the
+// formats used by the load/store shaders. The bit manipulation is based on:
+// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
+
+// Packs 4 half-precision floats (the low 16 bits of each component) into one
+// dword: 7e3 red, green and blue (3-bit exponent, 7-bit mantissa) in bits 0-9,
+// 10-19 and 20-29, and 2-bit unorm alpha in bits 30-31.
+uint XeFloat16To7e3(uint4 rgba_f16u32) {
+  float4 rgba_f32 = f16tof32(rgba_f16u32);
+  uint3 rgb_f32u32 = asuint(rgba_f32.xyz);
+  // Keep only positive (high bit set means negative for both float and int) and
+  // saturate to 31.875 (also dropping NaNs).
+  rgb_f32u32 = uint3(clamp(int3(rgb_f32u32), 0, 0x41FF0000));
+  // Values of at least 0.25 only need the exponent rebiased.
+  uint3 normalized = rgb_f32u32 + 0xC2000000u;
+  // Smaller values become denormals. Only the low 5 bits of a shift amount are
+  // used, so the shift is limited to 24, which already clears all 24 bits.
+  uint3 denormalized = ((rgb_f32u32 & 0x7FFFFFu) | 0x800000u) >>
+                       min((125u).xxx - (rgb_f32u32 >> 23u), 24u);
+  uint3 rgb_f10u32 = normalized + (denormalized - normalized) *
+                     uint3(rgb_f32u32 < 0x3E800000u);
+  // Round to the nearest even and drop the 16 low bits.
+  rgb_f10u32 =
+      ((rgb_f10u32 + 0x7FFFu + ((rgb_f10u32 >> 16u) & 1u)) >> 16u) & 0x3FFu;
+  // Round alpha to the nearest of the 4 levels so that values produced by
+  // XeFloat7e3To16 (1/3 and 2/3 rounded to half precision) survive a round trip.
+  return rgb_f10u32.r | (rgb_f10u32.g << 10u) | (rgb_f10u32.b << 20u) |
+         (uint(saturate(rgba_f32.a) * 3.0 + 0.5) << 30u);
+}
+
+// Unpacks a dword in the XeFloat16To7e3 layout (7e3 RGB + 2-bit alpha) into 4
+// half-precision floats, one in the low 16 bits of each returned component.
+uint4 XeFloat7e3To16(uint rgba_packed) {
+  uint3 rgb_f10u32 = (rgba_packed.xxx >> uint3(0u, 10u, 20u)) & 0x3FFu;
+  uint3 mantissa = rgb_f10u32 & 0x7Fu;
+  uint3 exponent = rgb_f10u32 >> 7u;
+  // Normalize the values for the denormalized components.
+  // Exponent = 1;
+  // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x80) == 0);
+  uint3 is_denormalized = uint3(exponent == 0u);
+  uint3 mantissa_lzcnt = (7u).xxx - firstbithigh(mantissa);
+  exponent += ((1u).xxx - mantissa_lzcnt - exponent) * is_denormalized;
+  mantissa +=
+      (((mantissa << mantissa_lzcnt) & 0x7Fu) - mantissa) * is_denormalized;
+  // Combine into 32-bit float bits and clear zeros.
+  uint3 rgb_f32u32 = (((exponent + 124u) << 23u) | (mantissa << 16u)) *
+                     uint3(rgb_f10u32 != 0u);
+  return f32tof16(float4(asfloat(rgb_f32u32),
+                         float(rgba_packed >> 30u) * (1.0 / 3.0)));
+}
+
+// Based on CFloat24 from d3dref9.dll and the 6e4 code from:
+// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp
+// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2).
+// We also can't clamp the stored value to 1 as load->store->load must be exact.
+
+// Converts 4 32-bit floats (passed as bits) to 24-bit 20e4 floats: 4-bit
+// exponent in bits 20-23 and 20-bit mantissa in bits 0-19.
+uint4 XeFloat32To20e4(uint4 f32u32) {
+  // Keep only positive (high bit set means negative for both float and int) and
+  // saturate to the maximum representable value near 2 (also dropping NaNs).
+  f32u32 = uint4(clamp(int4(f32u32), 0, 0x3FFFFFF8));
+  uint4 normalized = f32u32 + 0xC8000000u;
+  // Only the low 5 bits of a shift amount are used, so an unlimited amount such
+  // as 113 (for 0.0) would act as 17 and leave a part of the implicit 1 in the
+  // result. Limiting it to 24 shifts all 24 bits out for tiny values.
+  uint4 denormalized = ((f32u32 & 0x7FFFFFu) | 0x800000u) >>
+                       min((113u).xxxx - (f32u32 >> 23u), 24u);
+  uint4 f24u32 =
+      normalized + (denormalized - normalized) * uint4(f32u32 < 0x38800000u);
+  // Round to the nearest even and drop the 3 low bits.
+  return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu;
+}
+
+// Converts 4 20e4 floats to 32-bit float bits. Assumes the upper 8 bits of each
+// component are zero (stencil already removed) - TODO confirm against callers.
+uint4 XeFloat20e4To32(uint4 f24u32) {
+  // The mantissa is the low 20 bits; bits 20 and above are the exponent.
+  uint4 mantissa = f24u32 & 0xFFFFFu;
+  uint4 exponent = f24u32 >> 20u;
+  // Normalize the values for the denormalized components.
+  // Exponent = 1;
+  // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x100000) == 0);
+  uint4 is_denormalized = uint4(exponent == 0u);
+  uint4 mantissa_lzcnt = (20u).xxxx - firstbithigh(mantissa);
+  exponent += ((1u).xxxx - mantissa_lzcnt - exponent) * is_denormalized;
+  mantissa +=
+      (((mantissa << mantissa_lzcnt) & 0xFFFFFu) - mantissa) * is_denormalized;
+  // Combine into 32-bit float bits and clear zeros.
+  return (((exponent + 112u) << 23u) | (mantissa << 3u)) * uint4(f24u32 != 0u);
+}
+
+#endif  // XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_
diff --git a/src/xenia/gpu/d3d12/shared_memory.cc b/src/xenia/gpu/d3d12/shared_memory.cc
index 033045336..7c60b696e 100644
--- a/src/xenia/gpu/d3d12/shared_memory.cc
+++ b/src/xenia/gpu/d3d12/shared_memory.cc
@@ -394,7 +394,7 @@ void SharedMemory::TransitionBuffer(D3D12_RESOURCE_STATES new_state,
   barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
   barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
   barrier.Transition.pResource = buffer_;
-  barrier.Transition.Subresource = 0;
+  barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
   barrier.Transition.StateBefore = buffer_state_;
   barrier.Transition.StateAfter = new_state;
   command_list->ResourceBarrier(1, &barrier);
diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc
index 8cb1e4b14..8e4db0f0c 100644
--- a/src/xenia/gpu/d3d12/texture_cache.cc
+++ b/src/xenia/gpu/d3d12/texture_cache.cc
@@ -741,7 +741,8 @@ bool TextureCache::LoadTextureData(Texture* texture) {
     if (copy_buffer_state != D3D12_RESOURCE_STATE_UNORDERED_ACCESS) {
       barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
       barriers[0].Transition.pResource = copy_buffer;
-      barriers[0].Transition.Subresource = 0;
+      barriers[0].Transition.Subresource =
+          D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
       barriers[0].Transition.StateBefore = copy_buffer_state;
       barriers[0].Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
       command_list->ResourceBarrier(1, barriers);
@@ -792,7 +793,8 @@ bool TextureCache::LoadTextureData(Texture* texture) {
     barriers[0].UAV.pResource = copy_buffer;
     barriers[1].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
     barriers[1].Transition.pResource = copy_buffer;
-    barriers[1].Transition.Subresource = 0;
+    barriers[1].Transition.Subresource =
+        D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
     barriers[1].Transition.StateBefore = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
     barriers[1].Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
     command_list->ResourceBarrier(2, barriers);