From 79465708aa85cfc61a9fcd2b73e1bd0c79dd6bfc Mon Sep 17 00:00:00 2001 From: disjtqz Date: Sat, 30 Sep 2023 14:59:56 -0400 Subject: [PATCH] implement bit-perfect vrsqrtefp --- src/xenia/cpu/backend/x64/x64_backend.cc | 319 +++++++++++++++++- src/xenia/cpu/backend/x64/x64_backend.h | 21 +- src/xenia/cpu/backend/x64/x64_emitter.cc | 47 ++- src/xenia/cpu/backend/x64/x64_emitter.h | 6 +- src/xenia/cpu/backend/x64/x64_seq_vector.cc | 11 + src/xenia/cpu/backend/x64/x64_sequences.cc | 42 ++- .../compiler/passes/simplification_pass.cc | 32 ++ src/xenia/cpu/hir/value.cc | 80 +++++ src/xenia/cpu/hir/value.h | 10 +- 9 files changed, 540 insertions(+), 28 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index 1e1f3fedb..e9927dc09 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -73,6 +73,9 @@ class X64HelperEmitter : public X64Emitter { void* EmitTryAcquireReservationHelper(); void* EmitReservedStoreHelper(bool bit64 = false); + void* EmitScalarVRsqrteHelper(); + void* EmitVectorVRsqrteHelper(void* scalar_helper); + private: void* EmitCurrentForOffsets(const _code_offsets& offsets, size_t stack_size = 0); @@ -207,6 +210,8 @@ bool X64Backend::Initialize(Processor* processor) { if (!code_cache_->Initialize()) { return false; } + // Allocate emitter constant data. + emitter_data_ = X64Emitter::PlaceConstData(); // Generate thunks used to transition between jitted code and host code. XbyakAllocator allocator; @@ -233,7 +238,8 @@ bool X64Backend::Initialize(Processor* processor) { thunk_emitter.EmitTryAcquireReservationHelper(); reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false); reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true); - + vrsqrtefp_scalar_helper = thunk_emitter.EmitScalarVRsqrteHelper(); + vrsqrtefp_vector_helper = thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper); // Set the code cache to use the ResolveFunction thunk for default // indirections. assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull); @@ -243,9 +249,6 @@ bool X64Backend::Initialize(Processor* processor) { // Allocate some special indirections. code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF); - // Allocate emitter constant data. 
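PlaceConstData() now has to run before the thunk emitter: both vrsqrte helpers resolve constant addresses through LookupXMMConstantAddress*(), which reads emitter_data_. A minimal sketch of that ordering requirement, with an illustrative guard that is not part of the backend:

    // Constant data must exist before any helper that indexes it is emitted.
    emitter_data_ = X64Emitter::PlaceConstData();
    assert_not_zero(emitter_data_);  // illustrative only
    vrsqrtefp_scalar_helper = thunk_emitter.EmitScalarVRsqrteHelper();
    vrsqrtefp_vector_helper =
        thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper);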
- emitter_data_ = X64Emitter::PlaceConstData(); - // Setup exception callback ExceptionHandler::Install(&ExceptionCallbackThunk, this); if (cvars::record_mmio_access_exceptions) { @@ -844,7 +847,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk( _code_offsets code_offsets = {}; code_offsets.prolog = getSize(); pop(r8); // return address - + switch (stack_element_size) { case 4: mov(r11d, ptr[r8]); @@ -865,6 +868,300 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk( return EmitCurrentForOffsets(code_offsets); } +void* X64HelperEmitter::EmitScalarVRsqrteHelper() { + _code_offsets code_offsets = {}; + + Xbyak::Label L18, L2, L35, L4, L9, L8, L10, L11, L12, L13, L1; + Xbyak::Label LC1, _LCPI3_1; + Xbyak::Label handle_denormal_input; + Xbyak::Label specialcheck_1, convert_to_signed_inf_and_ret, handle_oddball_denormal; + + auto emulate_lzcnt_helper_unary_reg = [this](auto& reg, auto& scratch_reg) { + inLocalLabel(); + Xbyak::Label end_lzcnt; + bsr(scratch_reg, reg); + mov(reg, 0x20); + jz(end_lzcnt); + xor_(scratch_reg, 0x1F); + mov(reg, scratch_reg); + L(end_lzcnt); + outLocalLabel(); + }; + + vmovd(r8d, xmm0); + vmovaps(xmm1, xmm0); + mov(ecx, r8d); + //extract mantissa + and_(ecx, 0x7fffff); + mov(edx, ecx); + cmp(r8d, 0xff800000); + jz(specialcheck_1, CodeGenerator::T_NEAR); + //is exponent zero? + test(r8d, 0x7f800000); + jne(L18); + test(ecx, ecx); + jne(L2); + + L(L18); + //extract biased exponent and unbias + mov(r9d, r8d); + shr(r9d, 23); + movzx(r9d, r9b); + lea(eax, ptr[r9 - 127]); + cmp(r9d, 255); + jne(L4); + jmp(L35); + + L(L2); + + bt(GetBackendFlagsPtr(), kX64BackendNJMOn); + jnc(handle_denormal_input, CodeGenerator::T_NEAR); + + // handle denormal input with NJM on + // denorms get converted to zero w/ input sign, jump to our label + // that handles inputs of 0 for this + + jmp(convert_to_signed_inf_and_ret); + L(L35); + + vxorps(xmm0, xmm0, xmm0); + mov(eax, 128); + vcomiss(xmm1, xmm0); + jb(L4); + test(ecx, ecx); + jne(L8); + ret(); + + L(L4); + cmp(eax, 128); + jne(L9); + vxorps(xmm0, xmm0, xmm0); + vcomiss(xmm0, xmm1); + jbe(L9); + vmovss(xmm2, ptr[rip+LC1]); + vandps(xmm1, GetXmmConstPtr(XMMSignMaskF32)); + + test(edx, edx); + jne(L8); + vorps(xmm0, xmm2, xmm2); + ret(); + + L(L9); + test(edx, edx); + je(L10); + cmp(eax, 128); + jne(L11); + L(L8); + or_(r8d, 0x400000); + vmovd(xmm0, r8d); + ret(); + L(L10); + test(r9d, r9d); + jne(L11); + L(convert_to_signed_inf_and_ret); + not_(r8d); + shr(r8d, 31); + + lea(rdx, ptr[rip + _LCPI3_1]); + shl(r8d, 2); + vmovss(xmm0, ptr[r8 + rdx]); + ret(); + + L(L11); + vxorps(xmm2, xmm2, xmm2); + vmovss(xmm0, ptr[rip+LC1]); + vcomiss(xmm2, xmm1); + ja(L1, CodeGenerator::T_NEAR); + mov(ecx, 127); + sal(eax, 4); + sub(ecx, r9d); + mov(r9d, edx); + and_(eax, 16); + shr(edx, 9); + shr(r9d, 19); + and_(edx, 1023); + sar(ecx, 1); + or_(eax, r9d); + xor_(eax, 16); + mov(r9d, ptr[backend()->LookupXMMConstantAddress32(XMMVRsqrteTableStart) + + rax * 4]); + mov(eax, r9d); + shr(r9d, 16); + imul(edx, r9d); + sal(eax, 10); + and_(eax, 0x3fffc00); + sub(eax, edx); + bt(eax, 25); + jc(L12); + mov(edx, eax); + add(ecx, 6); + and_(edx, 0x1ffffff); + + if (IsFeatureEnabled(kX64EmitLZCNT)) { + lzcnt(edx, edx); + } else { + emulate_lzcnt_helper_unary_reg(edx, r9d); + } + + lea(r9d, ptr[rdx - 6]); + sub(ecx, edx); + if (IsFeatureEnabled(kX64EmitBMI2)) { + shlx(eax, eax, r9d); + } else { + xchg(ecx, r9d); + shl(eax, cl); + xchg(ecx, r9d); + } + + L(L12); + test(al, 5); + je(L13); + test(al, 2); + je(L13); + 
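Where LZCNT is unavailable, emulate_lzcnt_helper_unary_reg above falls back to BSR, which sets ZF for a zero input and otherwise yields the index of the highest set bit. A minimal C model of the value the fallback produces for 32-bit operands; the function name is illustrative:

    // Sketch only: matches lzcnt semantics, including lzcnt(0) == 32.
    static inline uint32_t lzcnt32_model(uint32_t value) {
      if (value == 0) {
        return 32;  // bsr leaves its destination undefined; the helper preloads 0x20
      }
      uint32_t count = 0;
      while (!(value & 0x80000000u)) {
        value <<= 1;  // shift until the highest set bit reaches bit 31
        ++count;
      }
      return count;  // 31 - bsr index, i.e. the xor with 0x1F in the helper
    }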
add(eax, 4); + + L(L13); + sal(ecx, 23); + and_(r8d, 0x80000000); + shr(eax, 2); + add(ecx, 0x3f800000); + and_(eax, 0x7fffff); + vxorps(xmm1, xmm1); + or_(ecx, r8d); + or_(ecx, eax); + vmovd(xmm0, ecx); + vaddss(xmm0, xmm1);//apply DAZ behavior to output + + L(L1); + ret(); + + + L(handle_denormal_input); + mov(r9d, r8d); + and_(r9d, 0x7FFFFFFF); + cmp(r9d, 0x400000); + jz(handle_oddball_denormal); + if (IsFeatureEnabled(kX64EmitLZCNT)) { + lzcnt(ecx, ecx); + } else { + emulate_lzcnt_helper_unary_reg(ecx, r9d); + } + + mov(r9d, 9); + mov(eax, -118); + lea(edx, ptr[rcx - 8]); + sub(r9d, ecx); + sub(eax, ecx); + if (IsFeatureEnabled(kX64EmitBMI2)) { + shlx(edx, r8d, edx); + } else { + xchg(ecx, edx); + // esi is just the value of xmm0's low word, so we can restore it from there + shl(r8d, cl); + mov(ecx, edx); // restore ecx, dont xchg because we're going to spoil edx anyway + mov(edx, r8d); + vmovd(r8d, xmm0); + } + and_(edx, 0x7ffffe); + jmp(L4); + + L(specialcheck_1); + //should be extremely rare + vmovss(xmm0, ptr[rip+LC1]); + ret(); + + L(handle_oddball_denormal); + not_(r8d); + lea(r9, ptr[rip + LC1]); + + shr(r8d, 31); + movss(xmm0, ptr[r9 + r8 * 4]); + ret(); + + L(_LCPI3_1); + dd(0xFF800000); + dd(0x7F800000); + L(LC1); + //the position of 7FC00000 here matters, this address will be indexed in handle_oddball_denormal + dd(0x7FC00000); + dd(0x5F34FD00); + + + code_offsets.prolog_stack_alloc = getSize(); + code_offsets.body = getSize(); + code_offsets.prolog = getSize(); + code_offsets.epilog = getSize(); + code_offsets.tail = getSize(); + return EmitCurrentForOffsets(code_offsets); +} + +void* X64HelperEmitter::EmitVectorVRsqrteHelper(void* scalar_helper) { + _code_offsets code_offsets = {}; + Xbyak::Label check_scalar_operation_in_vmx, actual_vector_version; + auto result_ptr = + GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_xmms[0])); + auto counter_ptr = GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_u64s[2])); + counter_ptr.setBit(64); + + //shuffle and xor to check whether all lanes are equal + //sadly has to leave the float pipeline for the vptest, which is moderate yikes + vmovhlps(xmm2, xmm0, xmm0); + vmovsldup(xmm1, xmm0); + vxorps(xmm1, xmm1, xmm0); + vxorps(xmm2, xmm2, xmm0); + vorps(xmm2, xmm1, xmm2); + vptest(xmm2, xmm2); + jnz(check_scalar_operation_in_vmx); + //jmp(scalar_helper, CodeGenerator::T_NEAR); + call(scalar_helper); + vshufps(xmm0, xmm0, xmm0, 0); + ret(); + + L(check_scalar_operation_in_vmx); + + vptest(xmm0, ptr[backend()->LookupXMMConstantAddress(XMMThreeFloatMask)]); + jnz(actual_vector_version); + vshufps(xmm0, xmm0,xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + call(scalar_helper); + // this->DebugBreak(); + vinsertps(xmm0, xmm0, (3 << 4) | (0 << 6)); + + vblendps(xmm0, xmm0, ptr[backend()->LookupXMMConstantAddress(XMMFloatInf)], + 0b0111); + + ret(); + + + L(actual_vector_version); + + + xor_(ecx, ecx); + vmovaps(result_ptr, xmm0); + + mov(counter_ptr, rcx); + Xbyak::Label loop; + + L(loop); + lea(rax, result_ptr); + vmovss(xmm0, ptr[rax+rcx*4]); + call(scalar_helper); + mov(rcx, counter_ptr); + lea(rax, result_ptr); + vmovss(ptr[rax+rcx*4], xmm0); + inc(ecx); + cmp(ecx, 4); + mov(counter_ptr, rcx); + jl(loop); + vmovaps(xmm0, result_ptr); + ret(); + code_offsets.prolog_stack_alloc = getSize(); + code_offsets.body = getSize(); + code_offsets.epilog = getSize(); + code_offsets.tail = getSize(); + code_offsets.prolog = getSize(); + return EmitCurrentForOffsets(code_offsets); +} + void* X64HelperEmitter::EmitTryAcquireReservationHelper() { 
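EmitVectorVRsqrteHelper above takes one of three paths depending on the lanes of the input. A hedged C-level model of that dispatch; ScalarVRsqrteModel is an assumed stand-in for the scalar helper, and INFINITY comes from <cmath>:

    // Sketch only: mirrors the lane checks done with vptest above.
    static vec128_t VRsqrteVectorModel(const vec128_t& v) {
      vec128_t out;
      if (v.u32[0] == v.u32[1] && v.u32[0] == v.u32[2] && v.u32[0] == v.u32[3]) {
        // All four lanes identical: one scalar call, result splatted.
        float r = ScalarVRsqrteModel(v.f32[0]);
        out.f32[0] = out.f32[1] = out.f32[2] = out.f32[3] = r;
        return out;
      }
      if (v.u32[0] == 0 && v.u32[1] == 0 && v.u32[2] == 0) {
        // x, y and z are bit-pattern +0.0f: vrsqrtefp(+0.0f) is +inf, so only
        // the w lane needs the scalar routine.
        out.f32[0] = out.f32[1] = out.f32[2] = INFINITY;
        out.f32[3] = ScalarVRsqrteModel(v.f32[3]);
        return out;
      }
      // General case: spill to scratch and run the scalar helper per lane.
      for (unsigned i = 0; i < 4; ++i) {
        out.f32[i] = ScalarVRsqrteModel(v.f32[i]);
      }
      return out;
    }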
_code_offsets code_offsets = {}; code_offsets.prolog = getSize(); @@ -872,7 +1169,7 @@ void* X64HelperEmitter::EmitTryAcquireReservationHelper() { Xbyak::Label already_has_a_reservation; Xbyak::Label acquire_new_reservation; - btr(GetBackendFlagsPtr(), 1); + btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit); mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_))); jc(already_has_a_reservation); @@ -888,7 +1185,7 @@ void* X64HelperEmitter::EmitTryAcquireReservationHelper() { // set flag on local backend context for thread to indicate our previous // attempt to get the reservation succeeded setnc(r9b); // success = bitmap did not have a set bit at the idx - shl(r9b, 1); + shl(r9b, kX64BackendHasReserveBit); mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)), rdx); @@ -917,7 +1214,7 @@ void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) { Xbyak::Label somehow_double_cleared; // carry must be set + zero flag must be set - btr(GetBackendFlagsPtr(), 1); + btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit); jnc(done); @@ -1097,7 +1394,7 @@ void X64Backend::InitializeBackendContext(void* ctx) { : nullptr; bctx->current_stackpoint_depth = 0; bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR; - bctx->flags = 0; + bctx->flags = (1U << kX64BackendNJMOn); // NJM on by default // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png bctx->Ox1000 = 0x1000; bctx->guest_tick_count = Clock::GetGuestTickCountPointer(); @@ -1128,7 +1425,9 @@ void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) { uint32_t control = mode & 7; _mm_setcsr(mxcsr_table[control]); bctx->mxcsr_fpu = mxcsr_table[control]; - ((ppc::PPCContext*)ctx)->fpscr.bits.rn = control; + auto ppc_context = ((ppc::PPCContext*)ctx); + ppc_context->fpscr.bits.rn = control; + ppc_context->fpscr.bits.ni = control >> 2; } bool X64Backend::PopulatePseudoStacktrace(GuestPseudoStackTrace* st) { diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h index 97b37a692..a7676ec2d 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.h +++ b/src/xenia/cpu/backend/x64/x64_backend.h @@ -61,11 +61,22 @@ struct X64BackendStackpoint { // use unsigned guest_return_address_; }; +enum : uint32_t { + kX64BackendMXCSRModeBit = 0, + kX64BackendHasReserveBit = 1, + kX64BackendNJMOn = 2, //non-java mode bit is currently set. for use in software fp routines + kX64BackendNonIEEEMode = 3, //non-ieee mode is currently enabled for scalar fpu. 
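  // How these bits are consumed (per the rest of this change):
  //   kX64BackendMXCSRModeBit  - toggled via bts/btr in ChangeMxcsrMode.
  //   kX64BackendHasReserveBit - set/cleared by the reservation helpers.
  //   kX64BackendNJMOn         - on by default (InitializeBackendContext),
  //                              toggled by SET_NJM_I8, tested by the scalar
  //                              vrsqrte helper to pick the denormal path.
  //   kX64BackendNonIEEEMode   - mirrors FPSCR.NI, tracked by
  //                              SET_ROUNDING_MODE_I32.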
+}; // located prior to the ctx register // some things it would be nice to have be per-emulator instance instead of per // context (somehow placing a global X64BackendCtx prior to membase, so we can // negatively index the membase reg) struct X64BackendContext { + union { + __m128 helper_scratch_xmms[4]; + uint64_t helper_scratch_u64s[8]; + uint32_t helper_scratch_u32s[16]; + }; ReserveHelper* reserve_helper_; uint64_t cached_reserve_value_; // guest_tick_count is used if inline_loadclock is used @@ -147,6 +158,13 @@ class X64Backend : public Backend { virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override; virtual bool PopulatePseudoStacktrace(GuestPseudoStackTrace* st) override; void RecordMMIOExceptionForGuestInstruction(void* host_address); + + uint32_t LookupXMMConstantAddress32(unsigned index) { + return static_cast(emitter_data() + sizeof(vec128_t) * index); + } + void* LookupXMMConstantAddress(unsigned index) { + return reinterpret_cast(emitter_data() + sizeof(vec128_t) * index); + } #if XE_X64_PROFILER_AVAILABLE == 1 uint64_t* GetProfilerRecordForFunction(uint32_t guest_address); #endif @@ -173,7 +191,8 @@ class X64Backend : public Backend { void* try_acquire_reservation_helper_ = nullptr; void* reserved_store_32_helper = nullptr; void* reserved_store_64_helper = nullptr; - + void* vrsqrtefp_vector_helper = nullptr; + void* vrsqrtefp_scalar_helper = nullptr; private: #if XE_X64_PROFILER_AVAILABLE == 1 GuestProfilerData profiler_data_; diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index da9816a35..239c14b75 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -982,6 +982,16 @@ static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1, return result; } +static inline vec128_t v128_setr_words(uint32_t v0, uint32_t v1, uint32_t v2, + uint32_t v3) { + vec128_t result; + result.u32[0] = v0; + result.u32[1] = v1; + result.u32[2] = v2; + result.u32[3] = v3; + return result; +} + static const vec128_t xmm_consts[] = { /* XMMZero */ vec128f(0.0f), /* XMMByteSwapMask */ @@ -1151,7 +1161,19 @@ static const vec128_t xmm_consts[] = { vec128b((uint8_t)0x83), /*XMMVSRShlByteshuf*/ v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80), // XMMVSRMask - vec128b(1)}; + vec128b(1), + //XMMVRsqrteTableStart + v128_setr_words(0x568B4FD, 0x4F3AF97, 0x48DAAA5, 0x435A618), + v128_setr_words(0x3E7A1E4, 0x3A29DFE, 0x3659A5C, 0x32E96F8), + v128_setr_words(0x2FC93CA, 0x2D090CE, 0x2A88DFE, 0x2838B57), + v128_setr_words(0x26188D4, 0x2438673, 0x2268431, 0x20B820B), + v128_setr_words(0x3D27FFA, 0x3807C29, 0x33878AA, 0x2F97572), + v128_setr_words(0x2C27279, 0x2926FB7, 0x2666D26, 0x23F6AC0), + v128_setr_words(0x21D6881, 0x1FD6665, 0x1E16468, 0x1C76287), + v128_setr_words(0x1AF60C1, 0x1995F12, 0x1855D79, 0x1735BF4), + //XMMVRsqrteTableBase + vec128i(0) //filled in later +}; void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { for (auto& vec : xmm_consts) { @@ -1223,7 +1245,17 @@ uintptr_t X64Emitter::PlaceConstData() { // The pointer must not be greater than 31 bits. 
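Because the whole constant block sits below 2^31 (asserted just below), the eight XMMVRsqrteTableStart vectors added above can be addressed with a plain 32-bit displacement and read as one flat 32-entry table. A hedged sketch of that view; 'backend' and 'index' are assumed inputs, and the scalar helper derives its index from the operand's exponent parity plus the top four mantissa bits:

    // Sketch only: the vec128 constants double as a uint32_t[32] lookup table.
    static uint32_t VRsqrteTableEntryModel(X64Backend* backend, uint32_t index) {
      const uint32_t* table = reinterpret_cast<const uint32_t*>(
          backend->LookupXMMConstantAddress(XMMVRsqrteTableStart));
      return table[index & 31];
    }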
assert_zero(reinterpret_cast(mem) & ~0x7FFFFFFF); + std::memcpy(mem, xmm_consts, sizeof(xmm_consts)); + /* + set each 32-bit element of the constant XMMVRsqrteTableBase to be the address of the start of the constant XMMVRsqrteTableStart + this + */ + vec128_t* deferred_constants = reinterpret_cast(mem); + vec128_t* vrsqrte_table_base = &deferred_constants[XMMVRsqrteTableBase]; + uint32_t ptr_to_vrsqrte_table32 = static_cast(reinterpret_cast(&deferred_constants[XMMVRsqrteTableStart])); + *vrsqrte_table_base = vec128i(ptr_to_vrsqrte_table32); + memory::Protect(mem, kConstDataSize, memory::PageAccess::kReadOnly, nullptr); return reinterpret_cast(mem); @@ -1237,8 +1269,9 @@ void X64Emitter::FreeConstData(uintptr_t data) { Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) { // Load through fixed constant table setup by PlaceConstData. // It's important that the pointer is not signed, as it will be sign-extended. - return ptr[reinterpret_cast(backend_->emitter_data() + - sizeof(vec128_t) * id)]; + void* emitter_data_ptr = backend_->LookupXMMConstantAddress(static_cast(id)); + xenia_assert(reinterpret_cast(emitter_data_ptr) < (1ULL << 31));//must not have signbit set + return ptr[emitter_data_ptr]; } // Implies possible StashXmm(0, ...)! void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) { @@ -1634,9 +1667,9 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) { } else { // even if already set, we still need to update flags to reflect // our mode if (new_mode == MXCSRMode::Fpu) { - btr(GetBackendFlagsPtr(), 0); + btr(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit); } else if (new_mode == MXCSRMode::Vmx) { - bts(GetBackendFlagsPtr(), 0); + bts(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit); } else { assert_unhandled_case(new_mode); } @@ -1646,11 +1679,11 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) { if (!already_set) { if (new_mode == MXCSRMode::Fpu) { LoadFpuMxcsrDirect(); - btr(GetBackendFlagsPtr(), 0); + btr(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit); return true; } else if (new_mode == MXCSRMode::Vmx) { LoadVmxMxcsrDirect(); - bts(GetBackendFlagsPtr(), 0); + bts(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit); return true; } else { assert_unhandled_case(new_mode); diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 4fdeab4a4..d6ca1e028 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -174,7 +174,9 @@ enum XmmConst { XMMSTVLShuffle, XMMSTVRSwapMask, // swapwordmask with bit 7 set XMMVSRShlByteshuf, - XMMVSRMask + XMMVSRMask, + XMMVRsqrteTableStart, + XMMVRsqrteTableBase = XMMVRsqrteTableStart + (32 / 4), //32 4-byte elements in table, 4 4-byte elements fit in each xmm }; using amdfx::xopcompare_e; @@ -308,7 +310,7 @@ class X64Emitter : public Xbyak::CodeGenerator { size_t stack_size() const { return stack_size_; } SimdDomain DeduceSimdDomain(const hir::Value* for_value); - + void ForgetMxcsrMode() { mxcsr_mode_ = MXCSRMode::Unknown; } /* returns true if had to load mxcsr. 
DOT_PRODUCT can use this to skip diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 791b9a87d..e97040f17 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -3376,17 +3376,28 @@ struct SET_NJM_I8 : Sequence> { auto addr_vmx = e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_vmx)); addr_vmx.setBit(32); + auto flags_ptr = e.GetBackendFlagsPtr(); if (i.src1.is_constant) { if (i.src1.constant() == 0) { // turn off daz/flush2z e.mov(addr_vmx, _MM_MASK_MASK); + e.btr(flags_ptr, kX64BackendNJMOn); } else { e.mov(addr_vmx, DEFAULT_VMX_MXCSR); + e.bts(flags_ptr, kX64BackendNJMOn); } } else { + e.mov(e.eax, flags_ptr); + e.mov(e.edx, 1U << kX64BackendNJMOn); + e.mov(e.ecx, e.edx); + e.not_(e.ecx); + e.and_(e.ecx, e.eax); + e.or_(e.edx, e.eax); e.test(i.src1, i.src1); + e.cmove(e.edx, e.ecx); + e.mov(flags_ptr, e.edx); e.mov(e.edx, DEFAULT_VMX_MXCSR); e.mov(e.eax, _MM_MASK_MASK); diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 5f428ad6c..e60e6c33b 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -2123,12 +2123,19 @@ struct RSQRT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { e.ChangeMxcsrMode(MXCSRMode::Vmx); Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3); - if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) { - e.vrsqrt14ps(i.dest, src1); + /* + the vast majority of inputs to vrsqrte come from vmsum3 or vmsum4 as part + of a vector normalization sequence. in fact, its difficult to find uses of vrsqrte in titles + that have inputs which do not come from vmsum. + */ + if (i.src1.value && i.src1.value->AllFloatVectorLanesSameValue()) { + e.vmovss(e.xmm0, src1); + e.call(e.backend()->vrsqrtefp_scalar_helper); + e.vshufps(i.dest, e.xmm0, e.xmm0, 0); } else { - e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMOne)); - e.vsqrtps(e.xmm1, src1); - e.vdivps(i.dest, e.xmm0, e.xmm1); + e.vmovaps(e.xmm0, src1); + e.call(e.backend()->vrsqrtefp_vector_helper); + e.vmovaps(i.dest, e.xmm0); } } }; @@ -3183,16 +3190,37 @@ struct SET_ROUNDING_MODE_I32 // removed the And with 7 and hoisted that and into the InstrEmit_'s that // generate OPCODE_SET_ROUNDING_MODE so that it can be constant folded and // backends dont have to worry about it + auto flags_ptr = e.GetBackendFlagsPtr(); if (i.src1.is_constant) { - e.mov(e.eax, mxcsr_table[i.src1.constant()]); + unsigned constant_value = i.src1.constant(); + e.mov(e.eax, mxcsr_table[constant_value]); + + if (constant_value & 4) { + e.or_(flags_ptr, 1U << kX64BackendNonIEEEMode); + } + else { + e.btr(flags_ptr, kX64BackendNonIEEEMode); + } e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH], e.eax); e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.eax); e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH]); } else { - e.mov(e.ecx, i.src1); + //can andnot, but this is a very infrequently used opcode + e.mov(e.eax, 1U << kX64BackendNonIEEEMode); + e.mov(e.edx, e.eax); + e.not_(e.edx); + e.mov(e.ecx, flags_ptr); + //edx = flags w/ non ieee cleared + e.and_(e.edx, e.ecx); + //eax = flags w/ non ieee set + e.or_(e.eax, e.ecx); + e.bt(i.src1, 2); + e.mov(e.ecx, i.src1); + e.cmovc(e.edx, e.eax); e.mov(e.rax, uintptr_t(mxcsr_table)); + e.mov(flags_ptr, e.edx); e.mov(e.edx, e.ptr[e.rax + e.rcx * 4]); // this was not here e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.edx); diff --git 
a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index da6f0dfe1..466bf21a5 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -1370,6 +1370,38 @@ bool SimplificationPass::SimplifyVectorOps(hir::Instr* i, } } } + + /* + splatting a 32-bit value extracted from a vector where all 4 32-bit values are the same should be eliminated and + instead use the vector extracted from, which will be identical + have seen this happen, some games vmsum and then splat the low float to all 4 floats, even though it already is there + */ + if (opc == OPCODE_SPLAT) { + if (i->dest->type == VEC128_TYPE) { + auto splatted_value = i->src1.value; + auto splat_type = splatted_value->type; + if (splat_type == FLOAT32_TYPE || splat_type == INT32_TYPE) { + //its a splat of a fourbyte value, check the definition + auto splat_input_definition = splatted_value->GetDefSkipAssigns(); + if (splat_input_definition) { + auto defining_opcode = splat_input_definition->GetOpcodeNum(); + if (defining_opcode == OPCODE_EXTRACT) { + auto value_extracted_from = splat_input_definition->src1.value; + if (value_extracted_from->type == VEC128_TYPE) { + + xenia_assert(splat_input_definition->dest->type == splat_type); + + if (value_extracted_from->AllFloatVectorLanesSameValue()) { + i->Replace(&OPCODE_ASSIGN_info,0); + i->set_src1(value_extracted_from); + return true; + } + } + } + } + } + } + } return false; } bool SimplificationPass::SimplifyVectorOps(hir::HIRBuilder* builder) { diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index 84b28a8f2..95b945fa9 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -1805,6 +1805,86 @@ bool Value::AllUsesByOneInsn() const { } return true; } +bool Value::AllFloatVectorLanesSameValue(const hir::Value* for_value, + uint32_t current_depth) { + // limit recursion, otherwise this function will slow down emission + if (current_depth == 16) { + return false; + } + using namespace hir; + hir::Instr* definition; + Opcode definition_opcode_number; +re_enter: + definition = for_value->def; + if (!definition) { + xenia_assert(for_value->IsConstant()); + + auto&& constant_value = for_value->constant.v128; + for (unsigned constant_lane_index = 1; constant_lane_index < 4; ++constant_lane_index) { + if (constant_value.u32[0] != constant_value.u32[constant_lane_index]) { + return false; + } + } + return true; + } + definition_opcode_number = definition->GetOpcodeNum(); + + if (definition_opcode_number == OPCODE_ASSIGN) { + for_value = definition->src1.value; + goto re_enter; + } + + if (definition_opcode_number == OPCODE_VECTOR_DENORMFLUSH) { + for_value = definition->src1.value; + goto re_enter; + } + /* + vmsum propagates its result to every lane + */ + if (definition_opcode_number == OPCODE_DOT_PRODUCT_4 || + definition_opcode_number == OPCODE_DOT_PRODUCT_3) { + return true; + } + //if splat of 32-bit value type, return true + //technically a splat of int16 or int8 would also produce the same "float" in all lanes + //but i think its best to keep this function focused on specifically float data + if (definition_opcode_number == OPCODE_SPLAT) { + if (definition->dest->type == VEC128_TYPE) { + auto splat_src_value_type = definition->src1.value->type; + if (splat_src_value_type == INT32_TYPE || + splat_src_value_type == FLOAT32_TYPE) { + return true; + } + } + } + + switch (definition_opcode_number) { + //all of these opcodes produce the 
same value for the same input + case OPCODE_RSQRT: + case OPCODE_RECIP: + case OPCODE_POW2: + case OPCODE_LOG2: + for_value = definition->src1.value; + goto re_enter; + + //binary opcodes + case OPCODE_ADD: + case OPCODE_SUB: + case OPCODE_MUL: + if (!AllFloatVectorLanesSameValue(definition->src1.value, + current_depth + 1)) { + return false; + } + for_value = definition->src2.value; + goto re_enter; + default: + break; + } + + return false; +} + + } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index 3a1cd442e..2eec39e00 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -618,8 +618,16 @@ class Value { bool MaybeFloaty() const { return type == FLOAT32_TYPE || type == FLOAT64_TYPE || type == VEC128_TYPE; } - + bool AllFloatVectorLanesSameValue() const { + return Value::AllFloatVectorLanesSameValue(this); + } private: + /* +returns true if for_value (which must be VEC128_TYPE) has the same value in +every float +*/ + static bool AllFloatVectorLanesSameValue(const hir::Value* for_value, + uint32_t current_depth = 0); static bool CompareInt8(Opcode opcode, Value* a, Value* b); static bool CompareInt16(Opcode opcode, Value* a, Value* b); static bool CompareInt32(Opcode opcode, Value* a, Value* b);
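Taken together, the new analysis and the simplification fold the common pattern where a title splats the already-broadcast lane of a dot product. A schematic illustration, with the HIR written informally rather than in the dumper's exact syntax:

    // v1 = dot_product_3 v_a, v_b   ; result already lives in every float lane
    // v2 = extract v1, 0, float32
    // v3 = splat v2                 ; redundant once v1's lanes are known equal
    //
    // after SimplifyVectorOps:
    // v3 = assign v1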