Merge pull request #149 from chrisps/canary_experimental

reimplement reserved load/store
chrisps 2023-04-15 17:23:22 -04:00 committed by GitHub
commit 26dc48f695
12 changed files with 355 additions and 187 deletions
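Summary: lwarx/ldarx and stwcx./stdcx. are no longer lowered through LoadReserved/StoreReserved plus AtomicCompareExchange; they now go through new RESERVED_LOAD/RESERVED_STORE opcodes backed by a ReserveHelper bitmap holding one reservation bit per 64 KiB block of the 4 GiB guest address space, with per-context cached_reserve_offset/bit/value fields in X64BackendContext. A no_reserved_ops cvar is also added to fall back to plain loads/stores for testing. A minimal C-level sketch of the address-to-bit mapping used by the new helper thunks follows; the names in the sketch are illustrative, only the constants and struct fields come from the diff.

#include <cstdint>

// Sketch only: mirrors the shr/and arithmetic in EmitTryAcquireReservationHelper.
constexpr uint32_t kReserveBlockShift = 16;  // RESERVE_BLOCK_SHIFT: one bit per 64 KiB block

struct ReservationSlot {
  uint32_t word;  // index into ReserveHelper::blocks (the "shr edx, 6")
  uint32_t bit;   // bit within that uint64_t (the "and ecx, 64 - 1")
};

inline ReservationSlot SlotForGuestAddress(uint32_t guest_address) {
  uint32_t block = guest_address >> kReserveBlockShift;
  return {block >> 6, block & 63};
}

// Acquiring a reservation is a lock bts on blocks[word]: it succeeds iff the bit
// was previously clear. On success the helper records the slot in
// cached_reserve_offset / cached_reserve_bit and sets bit 1 ("got reserve") of
// X64BackendContext::flags.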


@@ -70,6 +70,9 @@ class X64HelperEmitter : public X64Emitter {
void* EmitGuestAndHostSynchronizeStackSizeLoadThunk(
void* sync_func, unsigned stack_element_size);
void* EmitTryAcquireReservationHelper();
void* EmitReservedStoreHelper(bool bit64 = false);
private:
void* EmitCurrentForOffsets(const _code_offsets& offsets,
size_t stack_size = 0);
@@ -226,6 +229,10 @@ bool X64Backend::Initialize(Processor* processor) {
thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
synchronize_guest_and_host_stack_helper_, 4);
}
try_acquire_reservation_helper_ =
thunk_emitter.EmitTryAcquireReservationHelper();
reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false);
reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);
// Set the code cache to use the ResolveFunction thunk for default
// indirections.
@@ -799,7 +806,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() {
inc(ecx);
jmp(checkbp, T_NEAR);
L(we_good);
// we're popping this return address, so go down by one
sub(edx, sizeof(X64BackendStackpoint));
dec(ecx);
L(checkbp);
@@ -857,6 +864,125 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
Xbyak::Label already_has_a_reservation;
Xbyak::Label acquire_new_reservation;
btr(GetBackendFlagsPtr(), 1);
mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
jc(already_has_a_reservation);
shr(ecx, RESERVE_BLOCK_SHIFT);
xor_(r9d, r9d);
mov(edx, ecx);
shr(edx, 6); // divide by 64
lea(rdx, ptr[r8 + rdx * 8]);
and_(ecx, 64 - 1);
lock();
bts(qword[rdx], rcx);
// set flag on local backend context for thread to indicate our previous
// attempt to get the reservation succeeded
setnc(r9b); // success = bitmap did not have a set bit at the idx
shl(r9b, 1);
mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
rdx);
mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx);
or_(GetBackendCtxPtr(offsetof(X64BackendContext, flags)), r9d);
ret();
L(already_has_a_reservation);
DebugBreak();
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
// ecx=guest addr
// r9 = host addr
// r8 = value
// if ZF is set and CF is set, we succeeded
void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) {
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
Xbyak::Label done;
Xbyak::Label reservation_isnt_for_our_addr;
Xbyak::Label somehow_double_cleared;
// carry must be set + zero flag must be set
btr(GetBackendFlagsPtr(), 1);
jnc(done);
mov(rax, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
shr(ecx, RESERVE_BLOCK_SHIFT);
mov(edx, ecx);
shr(edx, 6); // divide by 64
lea(rdx, ptr[rax + rdx * 8]);
// begin acquiring exclusive access to cacheline containing our bit
prefetchw(ptr[rdx]);
cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
rdx);
jnz(reservation_isnt_for_our_addr);
mov(rax,
GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)));
// we need modulo bitsize, it turns out bittests' modulus behavior for the
// bitoffset only applies for register operands, for memory ones we bug out
// todo: actually, the above note may not be true, double check it
and_(ecx, 64 - 1);
cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx);
jnz(reservation_isnt_for_our_addr);
// was our memory modified by kernel code or something?
lock();
if (bit64) {
cmpxchg(ptr[r9], r8);
} else {
cmpxchg(ptr[r9], r8d);
}
// the ZF flag is unaffected by BTR! we exploit this for the retval
// cancel our lock on the 65k block
lock();
btr(qword[rdx], rcx);
jnc(somehow_double_cleared);
L(done);
// i don't care that there's a dependency on the prev value of rax atm
// sadly there's no CF&ZF condition code
setz(al);
setc(ah);
cmp(ax, 0x0101);
ret();
// could be the same label, but otherwise we don't know where we came from
// when one gets triggered
L(reservation_isnt_for_our_addr);
DebugBreak();
L(somehow_double_cleared); // somehow, something else cleared our reserve??
DebugBreak();
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
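A rough C-level equivalent of the store helper above, for orientation only (std::atomic stands in for the lock-prefixed bts/btr/cmpxchg; the real helper also DebugBreak()s when the cached reservation does not match or the bit was already cleared):

#include <atomic>
#include <cstdint>

// Sketch: a reserved store succeeds when (a) the memory still holds the value
// captured by the reserved load (cached_reserve_value_) and (b) our reservation
// bit for the 64 KiB block is still set when we release it.
inline bool ReservedStoreSketch(std::atomic<uint64_t>* bitmap_word, uint32_t bit,
                                std::atomic<uint64_t>* host_location,
                                uint64_t cached_value, uint64_t new_value) {
  uint64_t expected = cached_value;
  // lock cmpxchg [r9], r8 with rax = cached_reserve_value_
  bool exchanged = host_location->compare_exchange_strong(expected, new_value);
  // lock btr [rdx], rcx -- always drop the reservation bit afterwards
  uint64_t old_word = bitmap_word->fetch_and(~(1ull << bit));
  bool still_held = (old_word & (1ull << bit)) != 0;
  return exchanged && still_held;  // encoded by setz(al); setc(ah); cmp(ax, 0x0101)
}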
void X64HelperEmitter::EmitSaveVolatileRegs() {
// Save off volatile registers.
// mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax);
@@ -975,6 +1101,7 @@ void X64Backend::InitializeBackendContext(void* ctx) {
// https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
bctx->Ox1000 = 0x1000;
bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
bctx->reserve_helper_ = &reserve_helper_;
}
void X64Backend::DeinitializeBackendContext(void* ctx) {
X64BackendContext* bctx = BackendContextForGuestContext(ctx);


@@ -42,6 +42,17 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();
#define RESERVE_BLOCK_SHIFT 16
#define RESERVE_NUM_ENTRIES \
((1024ULL * 1024ULL * 1024ULL * 4ULL) >> RESERVE_BLOCK_SHIFT)
// https://codalogic.com/blog/2022/12/06/Exploring-PowerPCs-read-modify-write-operations
struct ReserveHelper {
uint64_t blocks[RESERVE_NUM_ENTRIES / 64];
ReserveHelper() { memset(blocks, 0, sizeof(blocks)); }
};
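For reference, the footprint this definition implies (illustrative static_asserts, not part of the commit):

#include <cstdint>

// 4 GiB of guest address space split into 64 KiB reservation granules:
static_assert(((1024ull * 1024ull * 1024ull * 4ull) >> 16) == 65536,
              "RESERVE_NUM_ENTRIES");
// One bit per granule, packed 64 to a uint64_t:
static_assert(65536 / 64 == 1024, "qwords in ReserveHelper::blocks");
// So the whole bitmap is 8 KiB, kept cacheline-aligned inside X64Backend:
static_assert(1024 * sizeof(uint64_t) == 8192, "bitmap footprint in bytes");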
struct X64BackendStackpoint {
uint64_t host_stack_;
unsigned guest_stack_;
@@ -55,16 +66,21 @@ struct X64BackendStackpoint {
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
ReserveHelper* reserve_helper_;
uint64_t cached_reserve_value_;
// guest_tick_count is used if inline_loadclock is used
uint64_t* guest_tick_count;
// records mapping of host_stack to guest_stack
X64BackendStackpoint* stackpoints;
uint64_t cached_reserve_offset;
uint32_t cached_reserve_bit;
unsigned int current_stackpoint_depth;
unsigned int mxcsr_fpu; // currently, the way we implement rounding mode
// affects both vmx and the fpu
unsigned int mxcsr_vmx;
// bit 0 = 0 if mxcsr is fpu, else it is vmx
// bit 1 = got reserve
unsigned int flags;
unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted
// add of it by... 2 bytes lol
};
@@ -152,9 +168,18 @@ class X64Backend : public Backend {
void* synchronize_guest_and_host_stack_helper_size8_ = nullptr;
void* synchronize_guest_and_host_stack_helper_size16_ = nullptr;
void* synchronize_guest_and_host_stack_helper_size32_ = nullptr;
public:
void* try_acquire_reservation_helper_ = nullptr;
void* reserved_store_32_helper = nullptr;
void* reserved_store_64_helper = nullptr;
private:
#if XE_X64_PROFILER_AVAILABLE == 1
GuestProfilerData profiler_data_;
#endif
alignas(64) ReserveHelper reserve_helper_;
};
} // namespace x64


@@ -387,7 +387,6 @@ struct LVL_V128 : Sequence<LVL_V128, I<OPCODE_LVL, V128Op, I64Op>> {
};
EMITTER_OPCODE_TABLE(OPCODE_LVL, LVL_V128);
struct LVR_V128 : Sequence<LVR_V128, I<OPCODE_LVR, V128Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xbyak::Label endpoint{};
@@ -483,6 +482,84 @@ struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
}
};
EMITTER_OPCODE_TABLE(OPCODE_STVR, STVR_V128);
struct RESERVED_LOAD_INT32
: Sequence<RESERVED_LOAD_INT32, I<OPCODE_RESERVED_LOAD, I32Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// should use phys addrs, not virtual addrs!
// try_acquire_reservation_helper_ doesn't spoil rax
e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]);
// begin acquiring exclusive access to the location
// we will do a load first, but we'll need exclusive access once we do our
// atomic op in the store
e.prefetchw(e.ptr[e.rax]);
e.mov(e.ecx, i.src1.reg().cvt32());
e.call(e.backend()->try_acquire_reservation_helper_);
e.mov(i.dest, e.dword[e.rax]);
e.mov(
e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)),
i.dest.reg().cvt64());
}
};
struct RESERVED_LOAD_INT64
: Sequence<RESERVED_LOAD_INT64, I<OPCODE_RESERVED_LOAD, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// try_acquire_reservation_helper_ doesn't spoil rax
e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]);
e.mov(e.ecx, i.src1.reg().cvt32());
// begin acquiring exclusive access to the location
// we will do a load first, but we'll need exclusive access once we do our
// atomic op in the store
e.prefetchw(e.ptr[e.rax]);
e.call(e.backend()->try_acquire_reservation_helper_);
e.mov(i.dest, e.qword[ComputeMemoryAddress(e, i.src1)]);
e.mov(
e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)),
i.dest.reg());
}
};
EMITTER_OPCODE_TABLE(OPCODE_RESERVED_LOAD, RESERVED_LOAD_INT32,
RESERVED_LOAD_INT64);
// address, value
struct RESERVED_STORE_INT32
: Sequence<RESERVED_STORE_INT32,
I<OPCODE_RESERVED_STORE, I8Op, I64Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// ecx=guest addr
// r9 = host addr
// r8 = value
// if ZF is set and CF is set, we succeeded
e.mov(e.ecx, i.src1.reg().cvt32());
e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]);
e.mov(e.r8d, i.src2);
e.call(e.backend()->reserved_store_32_helper);
e.setz(i.dest);
}
};
struct RESERVED_STORE_INT64
: Sequence<RESERVED_STORE_INT64,
I<OPCODE_RESERVED_STORE, I8Op, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.mov(e.ecx, i.src1.reg().cvt32());
e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]);
e.mov(e.r8, i.src2);
e.call(e.backend()->reserved_store_64_helper);
e.setz(i.dest);
}
};
EMITTER_OPCODE_TABLE(OPCODE_RESERVED_STORE, RESERVED_STORE_INT32,
RESERVED_STORE_INT64);
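A note on the calling convention used here: the sequences pass the guest address in ecx, the host address in r9, and the value in r8/r8d, then read the result with setz(i.dest). That works because the helper ends with setz(al); setc(ah); cmp(ax, 0x0101), so ZF is set on return exactly when both flags were set. A sketch of that predicate, with illustrative names:

#include <cstdint>

inline bool StoreConditionalSucceeded(bool cmpxchg_matched,           // ZF after lock cmpxchg
                                      bool reservation_still_held) {  // CF after lock btr
  // setz(al); setc(ah) pack the two flags into ax; cmp(ax, 0x0101) sets ZF
  // only when both are 1, and the sequence's setz(i.dest) captures that ZF.
  uint16_t ax = static_cast<uint16_t>((reservation_still_held ? 0x0100 : 0) |
                                      (cmpxchg_matched ? 0x0001 : 0));
  return ax == 0x0101;
}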
// ============================================================================
// OPCODE_ATOMIC_COMPARE_EXCHANGE
// ============================================================================


@@ -1018,8 +1018,7 @@ struct COMPARE_EQ_F32
e.ChangeMxcsrMode(MXCSRMode::Fpu);
if (!HasPrecedingCmpOfSameValues(i.instr)) {
EmitCommutativeBinaryXmmOp(
e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
e.vcomiss(src1, src2);
});
}
@@ -1032,8 +1031,7 @@ struct COMPARE_EQ_F64
e.ChangeMxcsrMode(MXCSRMode::Fpu);
if (!HasPrecedingCmpOfSameValues(i.instr)) {
EmitCommutativeBinaryXmmOp(
e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
e.vcomisd(src1, src2);
});
}
@@ -1935,53 +1933,6 @@ struct MUL_ADD_V128
};
EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
struct NEGATED_MUL_ADD_F64
: Sequence<NEGATED_MUL_ADD_F64,
I<OPCODE_NEGATED_MUL_ADD, F64Op, F64Op, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Fpu);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovapd(e.xmm3, src1);
e.vfmadd213sd(e.xmm3, src2, src3);
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
} else {
// todo: might need to use x87 in this case...
e.vmulsd(e.xmm3, src1, src2);
e.vaddsd(i.dest, e.xmm3, src3);
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
}
}
};
struct NEGATED_MUL_ADD_V128
: Sequence<NEGATED_MUL_ADD_V128,
I<OPCODE_NEGATED_MUL_ADD, V128Op, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovaps(e.xmm3, src1);
e.vfmadd213ps(e.xmm3, src2, src3);
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
} else {
// todo: might need to use x87 in this case...
e.vmulps(e.xmm3, src1, src2);
e.vaddps(i.dest, e.xmm3, src3);
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_ADD, NEGATED_MUL_ADD_F64,
NEGATED_MUL_ADD_V128);
// ============================================================================
// OPCODE_MUL_SUB
// ============================================================================
@@ -2038,53 +1989,6 @@ struct MUL_SUB_V128
};
EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F64, MUL_SUB_V128);
struct NEGATED_MUL_SUB_F64
: Sequence<NEGATED_MUL_SUB_F64,
I<OPCODE_NEGATED_MUL_SUB, F64Op, F64Op, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Fpu);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovapd(e.xmm3, src1);
e.vfmsub213sd(e.xmm3, src2, src3);
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
} else {
// todo: might need to use x87 in this case...
e.vmulsd(e.xmm3, src1, src2);
e.vsubsd(i.dest, e.xmm3, src3);
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
}
}
};
struct NEGATED_MUL_SUB_V128
: Sequence<NEGATED_MUL_SUB_V128,
I<OPCODE_NEGATED_MUL_SUB, V128Op, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovaps(e.xmm3, src1);
e.vfmsub213ps(e.xmm3, src2, src3);
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
} else {
// todo: might need to use x87 in this case...
e.vmulps(e.xmm3, src1, src2);
e.vsubps(i.dest, e.xmm3, src3);
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_SUB, NEGATED_MUL_SUB_F64,
NEGATED_MUL_SUB_V128);
// ============================================================================
// OPCODE_NEG
// ============================================================================
@@ -2641,7 +2545,8 @@ void EmitAndNotXX(X64Emitter& e, const ARGS& i) {
// src1 constant.
// `and` instruction only supports up to 32-bit immediate constants
// 64-bit constants will need a temp register
// only possible with 64 bit inputs, andc is the only instruction that
// generates this
auto temp = GetTempReg<typename decltype(i.src1)::reg_type>(e);
e.mov(temp, i.src1.constant());


@@ -1281,6 +1281,25 @@ Value* HIRBuilder::Load(Value* address, TypeName type, uint32_t load_flags) {
return i->dest;
}
Value* HIRBuilder::LoadWithReserve(Value* address, TypeName type) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_RESERVED_LOAD_info, 0, AllocValue(type));
i->set_src1(address);
i->src2.value = i->src3.value = NULL;
return i->dest;
}
Value* HIRBuilder::StoreWithReserve(Value* address, Value* value,
TypeName type) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_RESERVED_STORE_info, 0, AllocValue(INT8_TYPE));
i->set_src1(address);
i->set_src2(value);
i->src3.value = NULL;
return i->dest;
}
void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_STORE_info, store_flags);
@@ -1739,30 +1758,6 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
return i->dest;
}
Value* HIRBuilder::NegatedMulAdd(Value* value1, Value* value2, Value* value3) {
ASSERT_TYPES_EQUAL(value1, value2);
ASSERT_TYPES_EQUAL(value1, value3);
Instr* i =
AppendInstr(OPCODE_NEGATED_MUL_ADD_info, 0, AllocValue(value1->type));
i->set_src1(value1);
i->set_src2(value2);
i->set_src3(value3);
return i->dest;
}
Value* HIRBuilder::NegatedMulSub(Value* value1, Value* value2, Value* value3) {
ASSERT_TYPES_EQUAL(value1, value2);
ASSERT_TYPES_EQUAL(value1, value3);
Instr* i =
AppendInstr(OPCODE_NEGATED_MUL_SUB_info, 0, AllocValue(value1->type));
i->set_src1(value1);
i->set_src2(value2);
i->set_src3(value3);
return i->dest;
}
Value* HIRBuilder::Neg(Value* value) {
Instr* i = AppendInstr(OPCODE_NEG_info, 0, AllocValue(value->type));
i->set_src1(value);


@@ -189,6 +189,9 @@ class HIRBuilder {
uint32_t store_flags = 0);
Value* Load(Value* address, TypeName type, uint32_t load_flags = 0);
// create a reservation on an address
Value* LoadWithReserve(Value* address, TypeName type);
Value* StoreWithReserve(Value* address, Value* value, TypeName type);
Value* LoadVectorLeft(Value* address);
Value* LoadVectorRight(Value* address);
@@ -242,10 +245,7 @@ class HIRBuilder {
Value* Div(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
Value* MulAdd(Value* value1, Value* value2, Value* value3); // (1 * 2) + 3
Value* MulSub(Value* value1, Value* value2, Value* value3); // (1 * 2) - 3
Value* NegatedMulAdd(Value* value1, Value* value2,
Value* value3); // -((1 * 2) + 3)
Value* NegatedMulSub(Value* value1, Value* value2,
Value* value3); // -((1 * 2) - 3)
Value* Neg(Value* value);
Value* Abs(Value* value);
Value* Sqrt(Value* value);


@@ -248,9 +248,7 @@ enum Opcode {
OPCODE_MUL_HI, // TODO(benvanik): remove this and add INT128 type.
OPCODE_DIV,
OPCODE_MUL_ADD,
OPCODE_NEGATED_MUL_ADD,
OPCODE_MUL_SUB,
OPCODE_NEGATED_MUL_SUB,
OPCODE_NEG,
OPCODE_ABS,
OPCODE_SQRT,
@@ -292,7 +290,10 @@ enum Opcode {
// as we already have OPCODE_ROUND. round double to float (
// ppc "single" fpu instruction result rounding behavior )
OPCODE_SET_NJM,
OPCODE_DELAY_EXECUTION, // for db16cyc
OPCODE_RESERVED_LOAD,
OPCODE_RESERVED_STORE,
__OPCODE_MAX_VALUE, // Keep at end.
};


@@ -218,7 +218,12 @@ DEFINE_OPCODE(
"context_barrier",
OPCODE_SIG_X,
0)
DEFINE_OPCODE(
OPCODE_DELAY_EXECUTION,
"delay_execution",
OPCODE_SIG_X,
0)
DEFINE_OPCODE(
OPCODE_LOAD_MMIO,
"load_mmio",
@@ -453,19 +458,6 @@ DEFINE_OPCODE(
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEGATED_MUL_ADD,
"negated_mul_add",
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEGATED_MUL_SUB,
"negated_mul_sub",
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEG,
"neg",
@@ -719,3 +711,15 @@ DEFINE_OPCODE(
"storev_right",
OPCODE_SIG_X_V_V,
OPCODE_FLAG_MEMORY)
DEFINE_OPCODE(
OPCODE_RESERVED_LOAD,
"reserved_load",
OPCODE_SIG_V_V,
OPCODE_FLAG_MEMORY)
DEFINE_OPCODE(
OPCODE_RESERVED_STORE,
"reserved_store",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_MEMORY)


@@ -185,7 +185,7 @@ bool MMIOHandler::TryDecodeLoadStore(const uint8_t* p,
uint8_t rex_b = rex & 0b0001;
uint8_t rex_x = rex & 0b0010;
uint8_t rex_r = rex & 0b0100;
// uint8_t rex_w = rex & 0b1000;
// http://www.sandpile.org/x86/opc_rm.htm
// http://www.sandpile.org/x86/opc_sib.htm
@@ -448,6 +448,7 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
if (cur_access != memory::PageAccess::kNoAccess &&
(!is_write || cur_access != memory::PageAccess::kReadOnly)) {
// Another thread has cleared this watch. Abort.
XELOGD("Race condition on watch, was already cleared by another thread!");
return true;
}
// The address is not found within any range, so either a write watch or an


@@ -1143,7 +1143,7 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
Value* c = f.VectorDenormFlush(f.LoadVR(vc));
Value* v = f.Neg(f.MulSub(a, c, b));
f.StoreVR(vd, v);
return 0;
}


@@ -195,8 +195,8 @@ int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] + frB)
Value* v = f.Neg(
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@@ -204,8 +204,8 @@ int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] + frB)
Value* v = f.Neg(
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
@@ -214,8 +214,8 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] - frB)
Value* v = f.Neg(
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@@ -223,8 +223,8 @@ int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] - frB)
Value* v = f.Neg(
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
@@ -444,13 +444,12 @@ int InstrEmit_fabsx(PPCHIRBuilder& f, const InstrData& i) {
f.StoreFPR(i.X.RT, v);
/*
The contents of frB with bit 0 cleared are placed into frD.
Note that the fabs instruction treats NaNs just like any other kind of value.
That is, the sign bit of a NaN may be altered by fabs. This instruction does not
alter the FPSCR. Other registers altered: Condition Register (CR1 field):
Affected: FX, FEX, VX, OX (if Rc = 1)
*/
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
@@ -469,9 +468,9 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- !abs(frB)
Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB)));
f.StoreFPR(i.X.RT, v);
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
return 0;
}
@@ -480,9 +479,9 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- ¬ frB[0] || frB[1-63]
Value* v = f.Neg(f.LoadFPR(i.X.RB));
f.StoreFPR(i.X.RT, v);
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
return 0;
}


@@ -22,6 +22,12 @@ DEFINE_bool(
"instructions were written with the Xbox 360's cache in mind, and modern "
"processors do their own automatic prefetching.",
"CPU");
DEFINE_bool(no_reserved_ops, false,
"For testing whether a game may have races with a broken reserved "
"load/store impl",
"CPU");
namespace xe {
namespace cpu {
namespace ppc {
@@ -772,12 +778,17 @@ int InstrEmit_ldarx(PPCHIRBuilder& f, const InstrData& i) {
// already, but I haven't see anything but interrupt callbacks (which are
// always under a global lock) do that yet.
// We issue a memory barrier here to make sure that we get good values.
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
if (cvars::no_reserved_ops) {
f.StoreGPR(i.X.RT, f.ByteSwap(f.Load(ea, INT64_TYPE)));
} else {
f.MemoryBarrier();
Value* rt = f.ByteSwap(f.LoadWithReserve(ea, INT64_TYPE));
f.StoreGPR(i.X.RT, rt);
}
return 0;
}
@@ -797,12 +808,19 @@ int InstrEmit_lwarx(PPCHIRBuilder& f, const InstrData& i) {
// already, but I haven't see anything but interrupt callbacks (which are
// always under a global lock) do that yet.
// We issue a memory barrier here to make sure that we get good values.
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
if (cvars::no_reserved_ops) {
f.StoreGPR(i.X.RT,
f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE));
} else {
f.MemoryBarrier();
Value* rt =
f.ZeroExtend(f.ByteSwap(f.LoadWithReserve(ea, INT32_TYPE)), INT64_TYPE);
f.StoreGPR(i.X.RT, rt);
}
return 0;
}
@@ -826,17 +844,24 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) {
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.LoadGPR(i.X.RT));
if (cvars::no_reserved_ops) {
f.Store(ea, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
} else {
Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
}
f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());
// Issue memory barrier for when we go out of lock and want others to see our
// updates.
if (!cvars::no_reserved_ops) {
f.MemoryBarrier();
}
return 0;
}
@@ -859,20 +884,29 @@ int InstrEmit_stwcx(PPCHIRBuilder& f, const InstrData& i) {
// This will always succeed if under the global lock, however.
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE));
if (cvars::no_reserved_ops) {
f.Store(ea, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
} else {
Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
}
f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());
// Issue memory barrier for when we go out of lock and want others to see our
// updates.
if (!cvars::no_reserved_ops) {
f.MemoryBarrier();
}
return 0;
}
// Floating-point load (A-19)
int InstrEmit_lfd(PPCHIRBuilder& f, const InstrData& i) {