diff --git a/src/xenia/gpu/vulkan/buffer_cache.cc b/src/xenia/gpu/vulkan/buffer_cache.cc
index 4cace24ba..5b9f8a182 100644
--- a/src/xenia/gpu/vulkan/buffer_cache.cc
+++ b/src/xenia/gpu/vulkan/buffer_cache.cc
@@ -22,6 +22,9 @@ namespace vulkan {
 
 using xe::ui::vulkan::CheckResult;
 
+// Space kept between tail and head when wrapping.
+constexpr VkDeviceSize kDeadZone = 4 * 1024;
+
 BufferCache::BufferCache(RegisterFile* register_file,
                          ui::vulkan::VulkanDevice* device, size_t capacity)
     : register_file_(register_file),
@@ -187,6 +190,42 @@ BufferCache::~BufferCache() {
 
 VkDeviceSize BufferCache::UploadConstantRegisters(
     const Shader::ConstantRegisterMap& constant_register_map) {
+  // Fat struct, including all registers:
+  // struct {
+  //   vec4 float[512];
+  //   uint bool[8];
+  //   uint loop[32];
+  // };
+  size_t total_size = xe::round_up(
+      static_cast<VkDeviceSize>((512 * 4 * 4) + (32 * 4) + (8 * 4)),
+      uniform_buffer_alignment_);
+  auto offset = AllocateTransientData(uniform_buffer_alignment_, total_size);
+  if (offset == VK_WHOLE_SIZE) {
+    // OOM.
+    return VK_WHOLE_SIZE;
+  }
+
+  // Copy over all the registers.
+  const auto& values = register_file_->values;
+  uint8_t* dest_ptr =
+      reinterpret_cast<uint8_t*>(transient_buffer_data_) + offset;
+  std::memcpy(dest_ptr, &values[XE_GPU_REG_SHADER_CONSTANT_000_X].f32,
+              (512 * 4 * 4));
+  dest_ptr += 512 * 4 * 4;
+  std::memcpy(dest_ptr, &values[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32,
+              8 * 4);
+  dest_ptr += 8 * 4;
+  std::memcpy(dest_ptr, &values[XE_GPU_REG_SHADER_CONSTANT_LOOP_00].u32,
+              32 * 4);
+  dest_ptr += 32 * 4;
+
+  return offset;
+
+// Packed upload code.
+// This is not currently supported by the shaders, but would be awesome.
+// We should be able to use this for any shader that does not do dynamic
+// constant indexing.
+#if 0
   // Allocate space in the buffer for our data.
   auto offset = AllocateTransientData(uniform_buffer_alignment_,
                                       constant_register_map.packed_byte_length);
@@ -230,6 +269,7 @@ VkDeviceSize BufferCache::UploadConstantRegisters(
   }
 
   return offset;
+#endif  // 0
 }
 
 std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
@@ -282,17 +322,53 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
   return {transient_vertex_buffer_, offset};
 }
 
-VkDeviceSize BufferCache::AllocateTransientData(size_t alignment,
-                                                size_t length) {
-  // Try to add to end, wrapping if required.
-
-  // Check to ensure there is space.
-  if (false) {
-    // Consume all fences.
+VkDeviceSize BufferCache::AllocateTransientData(VkDeviceSize alignment,
+                                                VkDeviceSize length) {
+  // Try fast path (if we have space).
+  VkDeviceSize offset = TryAllocateTransientData(alignment, length);
+  if (offset != VK_WHOLE_SIZE) {
+    return offset;
   }
 
-  // Slice off our bit.
+  // Ran out of easy allocations.
+  // Try consuming fences before we panic.
+  assert_always("Reclamation not yet implemented");
 
+  // Try again. It may still fail if we didn't get enough space back.
+  return TryAllocateTransientData(alignment, length);
+}
+
+VkDeviceSize BufferCache::TryAllocateTransientData(VkDeviceSize alignment,
+                                                   VkDeviceSize length) {
+  if (transient_tail_offset_ >= transient_head_offset_) {
+    // Tail follows head, so things are easy:
+    // |    H----T   |
+    if (transient_tail_offset_ + length <= transient_capacity_) {
+      // Allocation fits from tail to end of buffer, so grow.
+      // |    H----**T |
+      VkDeviceSize offset = transient_tail_offset_;
+      transient_tail_offset_ += length;
+      return offset;
+    } else if (length + kDeadZone <= transient_head_offset_) {
+      // Can't fit at the end, but can fit if we wrap around.
+      // |**T H----....|
+      VkDeviceSize offset = 0;
+      transient_tail_offset_ = length;
+      return offset;
+    }
+  } else {
+    // Head follows tail, so we're reversed:
+    // |----T    H---|
+    if (transient_tail_offset_ + length + kDeadZone <= transient_head_offset_) {
+      // Fits from tail to head.
+      // |----***T H---|
+      VkDeviceSize offset = transient_tail_offset_;
+      transient_tail_offset_ += length;
+      return offset;
+    }
+  }
+
+  // No more space.
   return VK_WHOLE_SIZE;
 }
 
diff --git a/src/xenia/gpu/vulkan/buffer_cache.h b/src/xenia/gpu/vulkan/buffer_cache.h
index 661e30aa7..af42f23d8 100644
--- a/src/xenia/gpu/vulkan/buffer_cache.h
+++ b/src/xenia/gpu/vulkan/buffer_cache.h
@@ -78,8 +78,14 @@ class BufferCache {
 
  private:
   // Allocates a block of memory in the transient buffer.
+  // When memory is not available fences are checked and space is reclaimed.
   // Returns VK_WHOLE_SIZE if requested amount of memory is not available.
-  VkDeviceSize AllocateTransientData(size_t alignment, size_t length);
+  VkDeviceSize AllocateTransientData(VkDeviceSize alignment,
+                                     VkDeviceSize length);
+  // Tries to allocate a block of memory in the transient buffer.
+  // Returns VK_WHOLE_SIZE if requested amount of memory is not available.
+  VkDeviceSize TryAllocateTransientData(VkDeviceSize alignment,
+                                        VkDeviceSize length);
 
   RegisterFile* register_file_ = nullptr;
   VkDevice device_ = nullptr;
@@ -92,8 +98,10 @@ class BufferCache {
   VkBuffer transient_vertex_buffer_ = nullptr;
   VkDeviceMemory transient_buffer_memory_ = nullptr;
   void* transient_buffer_data_ = nullptr;
+  VkDeviceSize transient_head_offset_ = 0;
+  VkDeviceSize transient_tail_offset_ = 0;
 
-  // Required alignemnts for our various types.
+  // Required alignments for our various types.
   // All allocations must start at the appropriate alignment.
   VkDeviceSize uniform_buffer_alignment_ = 0;
   VkDeviceSize index_buffer_alignment_ = 0;