diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc index b83e545d2..cf6788f96 100644 --- a/src/xenia/base/memory.cc +++ b/src/xenia/base/memory.cc @@ -9,8 +9,8 @@ #include "xenia/base/memory.h" #include "xenia/base/cvar.h" -#include "xenia/base/platform.h" #include "xenia/base/logging.h" +#include "xenia/base/platform.h" #if XE_ARCH_ARM64 #include @@ -59,7 +59,7 @@ static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to, CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); - + for (uint32_t i = 0; i < num_lines_for_8k; ++i) { xe::swcache::CacheLine line0, line1, line2, line3; @@ -173,7 +173,12 @@ static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr, _movdir64b(physaddr + i, rdmapping + i); } } - +static void vastcpy_impl_repmovs(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length) { + __movsq((unsigned long long*)physaddr, (unsigned long long*)rdmapping, + written_length / 8); +} XE_COLD static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, CacheLine* XE_RESTRICT rdmapping, @@ -189,6 +194,9 @@ static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) { XELOGI("Selecting MOVDIR64M vastcpy."); dispatch_to_use = vastcpy_impl_movdir64m; + } else if (amd64::GetFeatureFlags() & amd64::kX64FastRepMovs) { + XELOGI("Selecting rep movs vastcpy."); + dispatch_to_use = vastcpy_impl_repmovs; } else { XELOGI("Selecting generic AVX vastcpy."); dispatch_to_use = vastcpy_impl_avx; diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc index a8aa7889c..88ecb145c 100644 --- a/src/xenia/base/threading_win.cc +++ b/src/xenia/base/threading_win.cc @@ -60,6 +60,12 @@ namespace xe { namespace threading { void EnableAffinityConfiguration() { + // chrispy: i don't think this is necessary, + // affinity always seems to be the system mask? 
research more + // also, maybe if ignore_thread_affinities is on we should use + // SetProcessAffinityUpdateMode to allow windows to dynamically update + // our process' affinity (by default windows cannot change the affinity itself + // at runtime, user code must do it) HANDLE process_handle = GetCurrentProcess(); DWORD_PTR process_affinity_mask; DWORD_PTR system_affinity_mask; @@ -117,7 +123,7 @@ void set_name(const std::string_view name) { // checked ntoskrnl, it does not modify delay, so we can place this as a // constant and avoid creating a stack variable -static const LARGE_INTEGER sleepdelay0_for_maybeyield{{0LL}}; +static const LARGE_INTEGER sleepdelay0_for_maybeyield{{~0u, -1}}; void MaybeYield() { #if 0 diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index fc236aa6a..70b09501e 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1712,8 +1712,10 @@ void D3D12CommandProcessor::ShutdownContext() { CommandProcessor::ShutdownContext(); } -// todo: bit-pack the bools and use bitarith to reduce branches -void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { + +XE_FORCEINLINE +void D3D12CommandProcessor::WriteRegisterForceinline(uint32_t index, + uint32_t value) { __m128i to_rangecheck = _mm_set1_epi16(static_cast(index)); __m128i lower_bounds = _mm_setr_epi16( @@ -1783,6 +1785,10 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { return; } } +// todo: bit-pack the bools and use bitarith to reduce branches +void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { + WriteRegisterForceinline(index, value); +} void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index, uint32_t* base, @@ -1911,8 +1917,22 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase( void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base, uint32_t num_registers) { - WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base, - num_registers); + // WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base, + // num_registers); + + RingBuffer::ReadRange range = + ring->BeginRead(num_registers * sizeof(uint32_t)); + + XE_LIKELY_IF(!range.second) { + WriteRegistersFromMemCommonSense( + base, reinterpret_cast(const_cast(range.first)), + num_registers); + + ring->EndRead(range); + } + else { + return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers); + } } template @@ -1926,6 +1946,145 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) { bounds_may_have_reg( last_reg); } +void D3D12CommandProcessor::WriteShaderConstantsFromMem( + uint32_t start_index, uint32_t* base, uint32_t num_registers) { + if (frame_open_) { + bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date; + bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date; + if (cbuffer_pixel_uptodate || cbuffer_vertex_uptodate) { + // super naive, could just do some bit magic and interval checking, + // but we just need this hoisted out of the copy so we do a bulk copy + // because its the actual load/swap/store we're getting murdered by + // this precheck followed by copy_and_swap_32_unaligned reduced the cpu + // usage from packettype0/writeregistersfrommem from 10-11% of cpu time + // spent on xenia to like 1% + // chrispy: todo, this can be reduced even further, should be split into + // two loops and should skip whole 
words, this could net us even bigger + // gains + uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4; + uint32_t end_map_index = + (start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4; + if (!cbuffer_vertex_uptodate) { + if (256 >= end_map_index) { + goto skip_map_checks; + } + map_index = 256; + } + for (; map_index < end_map_index; ++map_index) { + uint32_t float_constant_index = map_index; + if (float_constant_index >= 256) { + float_constant_index -= 256; + if (current_float_constant_map_pixel_[float_constant_index >> 6] & + (1ull << (float_constant_index & 63))) { + cbuffer_pixel_uptodate = false; + if (!cbuffer_vertex_uptodate) { + break; + } + } + } else { + if (current_float_constant_map_vertex_[float_constant_index >> 6] & + (1ull << (float_constant_index & 63))) { + cbuffer_vertex_uptodate = false; + if (!cbuffer_pixel_uptodate) { + break; + } else { + map_index = 255; // skip to checking pixel + continue; // increment will put us at 256, then the check will + // happen + } + } + } + } + skip_map_checks:; + } + cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate; + cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate; + } + // maybe use non-temporal copy if possible... + copy_and_swap_32_unaligned(®ister_file_->values[start_index], base, + num_registers); +} + +void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index, + uint32_t* base, + uint32_t num_registers) { + cbuffer_binding_bool_loop_.up_to_date = false; + copy_and_swap_32_unaligned(®ister_file_->values[start_index], base, + num_registers); +} +void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index, + uint32_t* base, + uint32_t num_registers) { + cbuffer_binding_fetch_.up_to_date = false; + + uint32_t first_fetch = + ((start_index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); + uint32_t last_fetch = // i think last_fetch should be inclusive if its modulo + // is nz... 
+ (((start_index + num_registers) - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / + 6); + texture_cache_->TextureFetchConstantsWritten(first_fetch, last_fetch); + + copy_and_swap_32_unaligned(®ister_file_->values[start_index], base, + num_registers); +} + +void D3D12CommandProcessor::WriteRegistersFromMemCommonSense( + uint32_t start_index, uint32_t* base, uint32_t num_registers) { + uint32_t end = start_index + num_registers; + + uint32_t current_index = start_index; + + auto get_end_before_qty = [&end, current_index](uint32_t regnum) { + return std::min(regnum, end) - current_index; + }; + +#define DO_A_RANGE_CALLBACK(start_range, end_range, index, base, n) \ + WriteRegisterRangeFromMem_WithKnownBound<(start_range), (end_range)>( \ + index, base, n) + +#define DO_A_RANGE(start_range, end_range, cb) \ + if (current_index < (end_range)) { \ + uint32_t ntowrite = get_end_before_qty(end_range); \ + cb((start_range), (end_range), current_index, base, ntowrite); \ + current_index += ntowrite; \ + base += ntowrite; \ + } \ + if (current_index >= end) { \ + return; \ + } + + if (start_index >= XE_GPU_REG_SHADER_CONSTANT_000_X) { // fairly common + goto shader_vars_start; + } + +#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \ + copy_and_swap_32_unaligned(®ister_file_->values[i], b, n) + DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK); + DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1, + DO_A_RANGE_CALLBACK); + DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X, + REGULAR_WRITE_CALLBACK); + +#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \ + n) \ + WriteShaderConstantsFromMem(index, base, n) +shader_vars_start: + DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X, + XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, + WRITE_SHADER_CONSTANTS_CALLBACK); + +#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \ + WriteFetchFromMem(ind, b, n) + DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, + XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1, + WRITE_FETCH_CONSTANTS_CALLBACK); +#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n) + DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031, + XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK); + DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536, + REGULAR_WRITE_CALLBACK); +} template XE_FORCEINLINE void D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( @@ -1935,12 +2094,21 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( constexpr auto bounds_has_bounds = bounds_may_have_bounds; + bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date; + bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date; + + bool skip_uptodate_checks = + (!cbuffer_pixel_uptodate && !cbuffer_vertex_uptodate) || (!frame_open_); for (uint32_t i = 0; i < num_registers; ++i) { uint32_t data = xe::load_and_swap(range + i); - - { - uint32_t index = base + i; - uint32_t value = data; + uint32_t index = base + i; + uint32_t value = data; + // cant if constexpr this one or we get unreferenced label errors, and if we + // move the label into the else we get errors about a jump from one if + // constexpr into another + if (register_lower_bound == 0 && register_upper_bound == 0xFFFF) { + D3D12CommandProcessor::WriteRegisterForceinline(index, value); + } else { XE_MSVC_ASSUME(index >= register_lower_bound && index < register_upper_bound); register_file_->values[index].u32 = value; @@ -1968,27 +2136,13 @@ 
D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( HandleSpecialRegisterWrite(index, value); goto write_done; } - XE_MSVC_ASSUME(index >= register_lower_bound && - index < register_upper_bound); - if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, - XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) { - if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 && - index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) { - cbuffer_binding_fetch_.up_to_date = false; - // texture cache is never nullptr - texture_cache_->TextureFetchConstantWritten( - (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); - - goto write_done; - } - } XE_MSVC_ASSUME(index >= register_lower_bound && index < register_upper_bound); if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X, XE_GPU_REG_SHADER_CONSTANT_511_W)) { if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { - if (frame_open_) { + if (!skip_uptodate_checks) { uint32_t float_constant_index = (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; if (float_constant_index >= 256) { @@ -2002,12 +2156,30 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( 6] & (1ull << (float_constant_index & 63))) { cbuffer_binding_float_vertex_.up_to_date = false; + if (!cbuffer_pixel_uptodate) { + skip_uptodate_checks = true; + } } } } goto write_done; } } + XE_MSVC_ASSUME(index >= register_lower_bound && + index < register_upper_bound); + if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, + XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) { + if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 && + index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) { + cbuffer_binding_fetch_.up_to_date = false; + // texture cache is never nullptr + texture_cache_->TextureFetchConstantWritten( + (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); + + goto write_done; + } + } + XE_MSVC_ASSUME(index >= register_lower_bound && index < register_upper_bound); if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031, diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 75f23cf03..0fcd76593 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -211,12 +211,21 @@ class D3D12CommandProcessor final : public CommandProcessor { protected: bool SetupContext() override; void ShutdownContext() override; - + XE_FORCEINLINE + void WriteRegisterForceinline(uint32_t index, uint32_t value); void WriteRegister(uint32_t index, uint32_t value) override; XE_FORCEINLINE virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base, uint32_t num_registers) override; - + //SHADER_CONSTANT_blah_XWYZ + void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); + void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); + void WriteFetchFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); + void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base, + uint32_t num_registers) ; template XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound( uint32_t start_index, uint32_t* base, uint32_t num_registers); diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index af977d4d5..98d2802ee 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -551,30 +551,60 @@ void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args, 
viewport_info_out.ndc_offset[i] = ndc_offset[i]; } } -void GetScissor(const RegisterFile& regs, Scissor& scissor_out, - bool clamp_to_surface_pitch) { +template +XE_NOINLINE static void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs, + Scissor& XE_RESTRICT scissor_out) { auto pa_sc_window_scissor_tl = regs.Get(); - int32_t tl_x = int32_t(pa_sc_window_scissor_tl.tl_x); - int32_t tl_y = int32_t(pa_sc_window_scissor_tl.tl_y); auto pa_sc_window_scissor_br = regs.Get(); - int32_t br_x = int32_t(pa_sc_window_scissor_br.br_x); - int32_t br_y = int32_t(pa_sc_window_scissor_br.br_y); - if (!pa_sc_window_scissor_tl.window_offset_disable) { - auto pa_sc_window_offset = regs.Get(); - tl_x += pa_sc_window_offset.window_x_offset; - tl_y += pa_sc_window_offset.window_y_offset; - br_x += pa_sc_window_offset.window_x_offset; - br_y += pa_sc_window_offset.window_y_offset; + auto pa_sc_window_offset = regs.Get(); + auto pa_sc_screen_scissor_tl = regs.Get(); + auto pa_sc_screen_scissor_br = regs.Get(); + uint32_t surface_pitch = 0; + if constexpr (clamp_to_surface_pitch) { + surface_pitch = regs.Get().surface_pitch; } + uint32_t pa_sc_window_scissor_tl_tl_x = pa_sc_window_scissor_tl.tl_x, + pa_sc_window_scissor_tl_tl_y = pa_sc_window_scissor_tl.tl_y, + pa_sc_window_scissor_br_br_x = pa_sc_window_scissor_br.br_x, + pa_sc_window_scissor_br_br_y = pa_sc_window_scissor_br.br_y, + pa_sc_window_offset_window_x_offset = + pa_sc_window_offset.window_x_offset, + pa_sc_window_offset_window_y_offset = + pa_sc_window_offset.window_y_offset, + pa_sc_screen_scissor_tl_tl_x = pa_sc_screen_scissor_tl.tl_x, + pa_sc_screen_scissor_tl_tl_y = pa_sc_screen_scissor_tl.tl_y, + pa_sc_screen_scissor_br_br_x = pa_sc_screen_scissor_br.br_x, + pa_sc_screen_scissor_br_br_y = pa_sc_screen_scissor_br.br_y; + + int32_t tl_x = int32_t(pa_sc_window_scissor_tl_tl_x); + int32_t tl_y = int32_t(pa_sc_window_scissor_tl_tl_y); + + int32_t br_x = int32_t(pa_sc_window_scissor_br_br_x); + int32_t br_y = int32_t(pa_sc_window_scissor_br_br_y); + + // chrispy: put this here to make it clear that the shift by 31 is extracting + // this field + XE_MAYBE_UNUSED + uint32_t window_offset_disable_reference = + pa_sc_window_scissor_tl.window_offset_disable; + int32_t window_offset_disable_mask = + ~(static_cast(pa_sc_window_scissor_tl.value) >> 31); + // if (!pa_sc_window_scissor_tl.window_offset_disable) { + + tl_x += pa_sc_window_offset_window_x_offset & window_offset_disable_mask; + tl_y += pa_sc_window_offset_window_y_offset & window_offset_disable_mask; + br_x += pa_sc_window_offset_window_x_offset & window_offset_disable_mask; + br_y += pa_sc_window_offset_window_y_offset & window_offset_disable_mask; + //} // Screen scissor is not used by Direct3D 9 (always 0, 0 to 8192, 8192), but // still handled here for completeness. 
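+  // As an aside on window_offset_disable_mask above: window_offset_disable is
+  // bit 31 of PA_SC_WINDOW_SCISSOR_TL, so arithmetic-shifting the signed
+  // register value right by 31 smears that bit across the whole word, and
+  // complementing it yields a branchless select mask:
+  //   disable == 1: ~0xFFFFFFFF == 0x00000000 -> offsets are masked to zero
+  //   disable == 0: ~0x00000000 == 0xFFFFFFFF -> offsets pass through intact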
- auto pa_sc_screen_scissor_tl = regs.Get(); - tl_x = std::max(tl_x, int32_t(pa_sc_screen_scissor_tl.tl_x)); - tl_y = std::max(tl_y, int32_t(pa_sc_screen_scissor_tl.tl_y)); - auto pa_sc_screen_scissor_br = regs.Get(); - br_x = std::min(br_x, int32_t(pa_sc_screen_scissor_br.br_x)); - br_y = std::min(br_y, int32_t(pa_sc_screen_scissor_br.br_y)); - if (clamp_to_surface_pitch) { + + tl_x = std::max(tl_x, int32_t(pa_sc_screen_scissor_tl_tl_x)); + tl_y = std::max(tl_y, int32_t(pa_sc_screen_scissor_tl_tl_y)); + + br_x = std::min(br_x, int32_t(pa_sc_screen_scissor_br_br_x)); + br_y = std::min(br_y, int32_t(pa_sc_screen_scissor_br_br_y)); + if constexpr (clamp_to_surface_pitch) { // Clamp the horizontal scissor to surface_pitch for safety, in case that's // not done by the guest for some reason (it's not when doing draws without // clipping in Direct3D 9, for instance), to prevent overflow - this is @@ -582,7 +612,7 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out, // rasterization without render target width at all (pixel shader // interlock-based custom RB implementations) and using conventional render // targets, but padded to EDRAM tiles. - uint32_t surface_pitch = regs.Get().surface_pitch; + tl_x = std::min(tl_x, int32_t(surface_pitch)); br_x = std::min(br_x, int32_t(surface_pitch)); } @@ -601,6 +631,15 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out, scissor_out.extent[1] = uint32_t(br_y - tl_y); } +void GetScissor(const RegisterFile& XE_RESTRICT regs, + Scissor& XE_RESTRICT scissor_out, bool clamp_to_surface_pitch) { + if (clamp_to_surface_pitch) { + return GetScissorTmpl(regs, scissor_out); + } else { + return GetScissorTmpl(regs, scissor_out); + } +} + uint32_t GetNormalizedColorMask(const RegisterFile& regs, uint32_t pixel_shader_writes_color_targets) { if (regs.Get().edram_mode != @@ -863,7 +902,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, y1 = y0 + int32_t(xenos::kMaxResolveSize); } // fails in forza horizon 1 - //x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100 + // x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100 assert_true(x0 <= x1 && y0 <= y1); if (x0 >= x1 || y0 >= y1) { XELOGE("Resolve region is empty"); @@ -1103,7 +1142,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32; info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32; info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32; - #if 0 +#if 0 XELOGD( "Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially " "modified memory range 0x{:08X} to 0x{:08X})", @@ -1114,7 +1153,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, xenos::ColorRenderTargetFormat(color_edram_info.format)), FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start, copy_dest_extent_end); - #endif +#endif return true; } XE_MSVC_OPTIMIZE_REVERT() diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index ececbaac9..8196830b8 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -433,13 +433,15 @@ struct GetViewportInfoArgs { void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args, ViewportInfo& viewport_info_out); -struct Scissor { +struct alignas(16) Scissor { // Offset from render target UV = 0 to +UV. uint32_t offset[2]; // Extent can be zero. 
uint32_t extent[2]; }; -void GetScissor(const RegisterFile& regs, Scissor& scissor_out, + +void GetScissor(const RegisterFile& XE_RESTRICT regs, + Scissor& XE_RESTRICT scissor_out, bool clamp_to_surface_pitch = true); // Returns the color component write mask for the draw command taking into diff --git a/src/xenia/gpu/pm4_command_processor_declare.h b/src/xenia/gpu/pm4_command_processor_declare.h index da0888f21..de6b66f6f 100644 --- a/src/xenia/gpu/pm4_command_processor_declare.h +++ b/src/xenia/gpu/pm4_command_processor_declare.h @@ -109,7 +109,7 @@ XE_NOINLINE XE_COLD bool HitUnimplementedOpcode(uint32_t opcode, uint32_t count) XE_RESTRICT; -XE_NOINLINE +XE_FORCEINLINE XE_NOALIAS uint32_t GetCurrentRingReadCount(); diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h index 2fc63cef2..21b9553d4 100644 --- a/src/xenia/gpu/pm4_command_processor_implement.h +++ b/src/xenia/gpu/pm4_command_processor_implement.h @@ -4,7 +4,6 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT { SCOPE_profile_cpu_f("gpu"); - trace_writer_.WriteIndirectBufferStart(ptr, count * sizeof(uint32_t)); if (count != 0) { RingBuffer old_reader = reader_; @@ -32,10 +31,9 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr, trace_writer_.WriteIndirectBufferEnd(); reader_ = old_reader; } else { - //rare, but i've seen it happen! (and then a division by 0 occurs) + // rare, but i've seen it happen! (and then a division by 0 occurs) return; } - } bool COMMAND_PROCESSOR::ExecutePacket() { @@ -88,8 +86,9 @@ bool COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(uint32_t count) { count * sizeof(uint32_t)); return false; } - /* - Todo: optimize this function this one along with execute packet type III are the most frequently called functions for PM4 +/* + Todo: optimize this function this one along with execute packet type III are + the most frequently called functions for PM4 */ XE_NOINLINE bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT { @@ -99,7 +98,6 @@ bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT { uint32_t count = ((packet >> 16) & 0x3FFF) + 1; - if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >= count * sizeof(uint32_t)) { trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count); @@ -143,7 +141,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType2(uint32_t packet) XE_RESTRICT { trace_writer_.WritePacketEnd(); return true; } -XE_NOINLINE +XE_FORCEINLINE XE_NOALIAS uint32_t COMMAND_PROCESSOR::GetCurrentRingReadCount() { return reader_.read_count(); @@ -446,41 +444,46 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER( return true; } -XE_NOINLINE +/* + chrispy: this is fine to inline, as a noinline function it compiled down + to 54 bytes +*/ static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) { - /* - Todo: should subtract values from each other twice with the sides inverted and then create a mask from the sign bits - then use the wait_info value in order to select the bits that correctly implement the condition - If neither subtraction has the signbit set then that means the value is equal - */ - bool matched = false; - switch (wait_info & 0x7) { - case 0x0: // Never. - matched = false; - break; - case 0x1: // Less than reference. - matched = value < ref; - break; - case 0x2: // Less than or equal to reference. - matched = value <= ref; - break; - case 0x3: // Equal to reference. 
-      matched = value == ref;
-      break;
-    case 0x4:  // Not equal to reference.
-      matched = value != ref;
-      break;
-    case 0x5:  // Greater than or equal to reference.
-      matched = value >= ref;
-      break;
-    case 0x6:  // Greater than reference.
-      matched = value > ref;
-      break;
-    case 0x7:  // Always
-      matched = true;
-      break;
-  }
-  return matched;
+// Smaller code is generated here than for the #else path, although whether
+// it is faster is unverified. Games do not appear to issue an enormous
+// number of conditional writes, so we have picked the path with the smaller
+// codegen. We do technically execute more instructions than the switch-based
+// method, but there are no mispredicts, and most of the instructions used
+// have a reciprocal throughput of 0.25/0.3 cycles.
+// (sign-of-difference trick; exact for differences below 2^31.)
+#if 1
+  uint32_t value_minus_ref =
+      static_cast<uint32_t>(static_cast<int32_t>(value - ref) >> 31);
+  uint32_t ref_minus_value =
+      static_cast<uint32_t>(static_cast<int32_t>(ref - value) >> 31);
+  uint32_t eqmask = ~(value_minus_ref | ref_minus_value);
+  uint32_t nemask = (value_minus_ref | ref_minus_value);
+
+  uint32_t value_lt_mask = value_minus_ref;
+  uint32_t value_gt_mask = ref_minus_value;
+  uint32_t value_lte_mask = value_lt_mask | eqmask;
+  uint32_t value_gte_mask = value_gt_mask | eqmask;
+
+  uint32_t bits_for_selecting =
+      (value_lt_mask & (1 << 1)) | (value_lte_mask & (1 << 2)) |
+      (eqmask & (1 << 3)) | (nemask & (1 << 4)) | (value_gte_mask & (1 << 5)) |
+      (value_gt_mask & (1 << 6)) | (1 << 7);
+
+  return (bits_for_selecting >> (wait_info & 7)) & 1;
+
+#else
+
+  return ((((value < ref) << 1) | ((value <= ref) << 2) |
+           ((value == ref) << 3) | ((value != ref) << 4) |
+           ((value >= ref) << 5) | ((value > ref) << 6) | (1 << 7)) >>
+          (wait_info & 7)) &
+         1;
+#endif
 }
 XE_NOINLINE
 bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
@@ -1128,7 +1131,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY(
 }
 
 uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index,
-                                                  uint32_t write_index) {
+                                                 uint32_t write_index) {
   SCOPE_profile_cpu_f("gpu");
 #if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
   // If we have a pending trace stream open it now. That way we ensure we get
diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h
index ff29fcb38..606367b66 100644
--- a/src/xenia/gpu/texture_cache.h
+++ b/src/xenia/gpu/texture_cache.h
@@ -104,6 +104,17 @@ class TextureCache {
   void TextureFetchConstantWritten(uint32_t index) {
     texture_bindings_in_sync_ &= ~(UINT32_C(1) << index);
   }
+  void TextureFetchConstantsWritten(uint32_t first_index, uint32_t last_index) {
+    // Generate a mask of all bits below first_index and XOR it with a mask of
+    // all bits up to and including last_index, producing a mask that covers
+    // exactly the bits [first_index, last_index]. last_index is inclusive;
+    // callers deriving it from a register count may overshoot by one constant
+    // when a write ends exactly on a fetch-constant boundary, which merely
+    // over-invalidates (missing an update would not be safe).
+    uint32_t res = ((1U << first_index) - 1) ^
+                   static_cast<uint32_t>((1ULL << (last_index + 1)) - 1);
+    texture_bindings_in_sync_ &= ~res;
+  }
   virtual void RequestTextures(uint32_t used_texture_mask);
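
As a sanity check on the two bit tricks in this patch, here is a minimal standalone sketch (not part of the patch; names are local to it and assume only the C++ standard library). It mirrors the fast path of MatchValueAndRef and the inclusive-bound fetch-constant range mask, and exercises them with small known cases; the comparison trick is only exact while the unsigned difference fits in 31 bits, so the checks stay inside that range.

#include <cassert>
#include <cstdint>

// Mirrors the #if 1 path of MatchValueAndRef.
static bool MatchSketch(uint32_t value, uint32_t ref, uint32_t wait_info) {
  uint32_t vmr = static_cast<uint32_t>(static_cast<int32_t>(value - ref) >> 31);
  uint32_t rmv = static_cast<uint32_t>(static_cast<int32_t>(ref - value) >> 31);
  uint32_t eq = ~(vmr | rmv), ne = vmr | rmv;
  uint32_t bits = (vmr & (1 << 1)) | ((vmr | eq) & (1 << 2)) |
                  (eq & (1 << 3)) | (ne & (1 << 4)) |
                  ((rmv | eq) & (1 << 5)) | (rmv & (1 << 6)) | (1 << 7);
  return (bits >> (wait_info & 7)) & 1;
}

// Mirrors TextureFetchConstantsWritten: mask of bits [first, last], inclusive.
static uint32_t FetchRangeMask(uint32_t first, uint32_t last) {
  return ((1U << first) - 1) ^ static_cast<uint32_t>((1ULL << (last + 1)) - 1);
}

int main() {
  // wait_info & 7 selects: 0 never, 1 <, 2 <=, 3 ==, 4 !=, 5 >=, 6 >, 7 always.
  assert(!MatchSketch(5, 9, 0) && MatchSketch(5, 9, 1) && MatchSketch(9, 9, 3));
  assert(MatchSketch(9, 5, 6) && !MatchSketch(9, 5, 1) && MatchSketch(9, 5, 7));
  // Fetch constants are 6 dwords each: writing 13 registers starting at the
  // first fetch register touches constants 0..2, so all three bits must be
  // set. (13 / 6 == 2, the last constant actually touched.)
  assert(FetchRangeMask(0, 13 / 6) == 0b111u);
  // Aligned case: 12 registers touch constants 0 and 1; last overshoots to 2,
  // so bit 2 is also cleared from the in-sync mask - harmless over-invalidation.
  assert(FetchRangeMask(0, 12 / 6) == 0b111u);
  return 0;
}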