Merge pull request #149 from chrisps/canary_experimental

reimplement reserved load/store
chrisps 2023-04-15 17:23:22 -04:00 committed by GitHub
commit 26dc48f695
12 changed files with 355 additions and 187 deletions
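Summary: lwarx/ldarx and stwcx./stdcx. are no longer lowered through LoadReserved/StoreReserved plus AtomicCompareExchange; they now go through new RESERVED_LOAD/RESERVED_STORE opcodes backed by a ReserveHelper bitmap holding one reservation bit per 64 KiB block of the 4 GiB guest address space, with per-context cached_reserve_offset/bit/value fields in X64BackendContext. A no_reserved_ops cvar is also added to fall back to plain loads/stores for testing. A minimal C-level sketch of the address-to-bit mapping used by the new helper thunks follows; the names in the sketch are illustrative, only the constants and struct fields come from the diff.

#include <cstdint>

// Sketch only: mirrors the shr/and arithmetic in EmitTryAcquireReservationHelper.
constexpr uint32_t kReserveBlockShift = 16;  // RESERVE_BLOCK_SHIFT: one bit per 64 KiB block

struct ReservationSlot {
  uint32_t word;  // index into ReserveHelper::blocks (the "shr edx, 6")
  uint32_t bit;   // bit within that uint64_t (the "and ecx, 64 - 1")
};

inline ReservationSlot SlotForGuestAddress(uint32_t guest_address) {
  uint32_t block = guest_address >> kReserveBlockShift;
  return {block >> 6, block & 63};
}

// Acquiring a reservation is a lock bts on blocks[word]: it succeeds iff the bit
// was previously clear. On success the helper records the slot in
// cached_reserve_offset / cached_reserve_bit and sets bit 1 ("got reserve") of
// X64BackendContext::flags.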


@@ -70,6 +70,9 @@ class X64HelperEmitter : public X64Emitter {
void* EmitGuestAndHostSynchronizeStackSizeLoadThunk(
void* sync_func, unsigned stack_element_size);
void* EmitTryAcquireReservationHelper();
void* EmitReservedStoreHelper(bool bit64 = false);
private:
void* EmitCurrentForOffsets(const _code_offsets& offsets,
size_t stack_size = 0);
@@ -226,6 +229,10 @@ bool X64Backend::Initialize(Processor* processor) {
thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
synchronize_guest_and_host_stack_helper_, 4);
}
try_acquire_reservation_helper_ =
thunk_emitter.EmitTryAcquireReservationHelper();
reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false);
reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);
// Set the code cache to use the ResolveFunction thunk for default
// indirections.
@@ -799,7 +806,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() {
inc(ecx);
jmp(checkbp, T_NEAR);
L(we_good);
// we're popping this return address, so go down by one
sub(edx, sizeof(X64BackendStackpoint));
dec(ecx);
L(checkbp);
@@ -857,6 +864,125 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
Xbyak::Label already_has_a_reservation;
Xbyak::Label acquire_new_reservation;
btr(GetBackendFlagsPtr(), 1);
mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
jc(already_has_a_reservation);
shr(ecx, RESERVE_BLOCK_SHIFT);
xor_(r9d, r9d);
mov(edx, ecx);
shr(edx, 6); // divide by 64
lea(rdx, ptr[r8 + rdx * 8]);
and_(ecx, 64 - 1);
lock();
bts(qword[rdx], rcx);
// set flag on local backend context for thread to indicate our previous
// attempt to get the reservation succeeded
setnc(r9b); // success = bitmap did not have a set bit at the idx
shl(r9b, 1);
mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
rdx);
mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx);
or_(GetBackendCtxPtr(offsetof(X64BackendContext, flags)), r9d);
ret();
L(already_has_a_reservation);
DebugBreak();
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
// ecx=guest addr
// r9 = host addr
// r8 = value
// if ZF is set and CF is set, we succeeded
void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) {
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
Xbyak::Label done;
Xbyak::Label reservation_isnt_for_our_addr;
Xbyak::Label somehow_double_cleared;
// carry must be set + zero flag must be set
btr(GetBackendFlagsPtr(), 1);
jnc(done);
mov(rax, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
shr(ecx, RESERVE_BLOCK_SHIFT);
mov(edx, ecx);
shr(edx, 6); // divide by 64
lea(rdx, ptr[rax + rdx * 8]);
// begin acquiring exclusive access to cacheline containing our bit
prefetchw(ptr[rdx]);
cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
rdx);
jnz(reservation_isnt_for_our_addr);
mov(rax,
GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)));
// we need modulo bitsize, it turns out bittests' modulus behavior for the
// bitoffset only applies for register operands, for memory ones we bug out
// todo: actually, the above note may not be true, double check it
and_(ecx, 64 - 1);
cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx);
jnz(reservation_isnt_for_our_addr);
// was our memory modified by kernel code or something?
lock();
if (bit64) {
cmpxchg(ptr[r9], r8);
} else {
cmpxchg(ptr[r9], r8d);
}
// the ZF flag is unaffected by BTR! we exploit this for the retval
// cancel our lock on the 65k block
lock();
btr(qword[rdx], rcx);
jnc(somehow_double_cleared);
L(done);
// i don't care that there's a dependency on the prev value of rax atm
// sadly there's no CF&ZF condition code
setz(al);
setc(ah);
cmp(ax, 0x0101);
ret();
// could be the same label, but otherwise we don't know where we came from
// when one gets triggered
L(reservation_isnt_for_our_addr);
DebugBreak();
L(somehow_double_cleared); // somehow, something else cleared our reserve??
DebugBreak();
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
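A rough C-level equivalent of the store helper above, for orientation only (std::atomic stands in for the lock-prefixed bts/btr/cmpxchg; the real helper also DebugBreak()s when the cached reservation does not match or the bit was already cleared):

#include <atomic>
#include <cstdint>

// Sketch: a reserved store succeeds when (a) the memory still holds the value
// captured by the reserved load (cached_reserve_value_) and (b) our reservation
// bit for the 64 KiB block is still set when we release it.
inline bool ReservedStoreSketch(std::atomic<uint64_t>* bitmap_word, uint32_t bit,
                                std::atomic<uint64_t>* host_location,
                                uint64_t cached_value, uint64_t new_value) {
  uint64_t expected = cached_value;
  // lock cmpxchg [r9], r8 with rax = cached_reserve_value_
  bool exchanged = host_location->compare_exchange_strong(expected, new_value);
  // lock btr [rdx], rcx -- always drop the reservation bit afterwards
  uint64_t old_word = bitmap_word->fetch_and(~(1ull << bit));
  bool still_held = (old_word & (1ull << bit)) != 0;
  return exchanged && still_held;  // encoded by setz(al); setc(ah); cmp(ax, 0x0101)
}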
void X64HelperEmitter::EmitSaveVolatileRegs() {
// Save off volatile registers.
// mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax);
@@ -975,6 +1101,7 @@ void X64Backend::InitializeBackendContext(void* ctx) {
// https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
bctx->Ox1000 = 0x1000;
bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
bctx->reserve_helper_ = &reserve_helper_;
}
void X64Backend::DeinitializeBackendContext(void* ctx) {
X64BackendContext* bctx = BackendContextForGuestContext(ctx);


@@ -42,6 +42,17 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();
#define RESERVE_BLOCK_SHIFT 16
#define RESERVE_NUM_ENTRIES \
((1024ULL * 1024ULL * 1024ULL * 4ULL) >> RESERVE_BLOCK_SHIFT)
// https://codalogic.com/blog/2022/12/06/Exploring-PowerPCs-read-modify-write-operations
struct ReserveHelper {
uint64_t blocks[RESERVE_NUM_ENTRIES / 64];
ReserveHelper() { memset(blocks, 0, sizeof(blocks)); }
};
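For reference, the footprint this definition implies (illustrative static_asserts, not part of the commit):

#include <cstdint>

// 4 GiB of guest address space split into 64 KiB reservation granules:
static_assert(((1024ull * 1024ull * 1024ull * 4ull) >> 16) == 65536,
              "RESERVE_NUM_ENTRIES");
// One bit per granule, packed 64 to a uint64_t:
static_assert(65536 / 64 == 1024, "qwords in ReserveHelper::blocks");
// So the whole bitmap is 8 KiB, kept cacheline-aligned inside X64Backend:
static_assert(1024 * sizeof(uint64_t) == 8192, "bitmap footprint in bytes");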
struct X64BackendStackpoint {
uint64_t host_stack_;
unsigned guest_stack_;
@@ -55,16 +66,21 @@ struct X64BackendStackpoint {
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
ReserveHelper* reserve_helper_;
uint64_t cached_reserve_value_;
// guest_tick_count is used if inline_loadclock is used
uint64_t* guest_tick_count;
// records mapping of host_stack to guest_stack
X64BackendStackpoint* stackpoints;
uint64_t cached_reserve_offset;
uint32_t cached_reserve_bit;
unsigned int current_stackpoint_depth;
unsigned int mxcsr_fpu; // currently, the way we implement rounding mode
// affects both vmx and the fpu
unsigned int mxcsr_vmx;
// bit 0 = 0 if mxcsr is fpu, else it is vmx
// bit 1 = got reserve
unsigned int flags;
unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted
// add of it by... 2 bytes lol
};
@@ -152,9 +168,18 @@ class X64Backend : public Backend {
void* synchronize_guest_and_host_stack_helper_size8_ = nullptr;
void* synchronize_guest_and_host_stack_helper_size16_ = nullptr;
void* synchronize_guest_and_host_stack_helper_size32_ = nullptr;
public:
void* try_acquire_reservation_helper_ = nullptr;
void* reserved_store_32_helper = nullptr;
void* reserved_store_64_helper = nullptr;
private:
#if XE_X64_PROFILER_AVAILABLE == 1
GuestProfilerData profiler_data_;
#endif
alignas(64) ReserveHelper reserve_helper_;
};
} // namespace x64


@@ -387,7 +387,6 @@ struct LVL_V128 : Sequence<LVL_V128, I<OPCODE_LVL, V128Op, I64Op>> {
};
EMITTER_OPCODE_TABLE(OPCODE_LVL, LVL_V128);
struct LVR_V128 : Sequence<LVR_V128, I<OPCODE_LVR, V128Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xbyak::Label endpoint{};
@@ -483,6 +482,84 @@ struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
}
};
EMITTER_OPCODE_TABLE(OPCODE_STVR, STVR_V128);
struct RESERVED_LOAD_INT32
: Sequence<RESERVED_LOAD_INT32, I<OPCODE_RESERVED_LOAD, I32Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// should use phys addrs, not virtual addrs!
// try_acquire_reservation_helper_ doesn't spoil rax
e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]);
// begin acquiring exclusive access to the location
// we will do a load first, but we'll need exclusive access once we do our
// atomic op in the store
e.prefetchw(e.ptr[e.rax]);
e.mov(e.ecx, i.src1.reg().cvt32());
e.call(e.backend()->try_acquire_reservation_helper_);
e.mov(i.dest, e.dword[e.rax]);
e.mov(
e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)),
i.dest.reg().cvt64());
}
};
struct RESERVED_LOAD_INT64
: Sequence<RESERVED_LOAD_INT64, I<OPCODE_RESERVED_LOAD, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// try_acquire_reservation_helper_ doesn't spoil rax
e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]);
e.mov(e.ecx, i.src1.reg().cvt32());
// begin acquiring exclusive access to the location
// we will do a load first, but we'll need exclusive access once we do our
// atomic op in the store
e.prefetchw(e.ptr[e.rax]);
e.call(e.backend()->try_acquire_reservation_helper_);
e.mov(i.dest, e.qword[ComputeMemoryAddress(e, i.src1)]);
e.mov(
e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)),
i.dest.reg());
}
};
EMITTER_OPCODE_TABLE(OPCODE_RESERVED_LOAD, RESERVED_LOAD_INT32,
RESERVED_LOAD_INT64);
// address, value
struct RESERVED_STORE_INT32
: Sequence<RESERVED_STORE_INT32,
I<OPCODE_RESERVED_STORE, I8Op, I64Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// ecx=guest addr
// r9 = host addr
// r8 = value
// if ZF is set and CF is set, we succeeded
e.mov(e.ecx, i.src1.reg().cvt32());
e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]);
e.mov(e.r8d, i.src2);
e.call(e.backend()->reserved_store_32_helper);
e.setz(i.dest);
}
};
struct RESERVED_STORE_INT64
: Sequence<RESERVED_STORE_INT64,
I<OPCODE_RESERVED_STORE, I8Op, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.mov(e.ecx, i.src1.reg().cvt32());
e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]);
e.mov(e.r8, i.src2);
e.call(e.backend()->reserved_store_64_helper);
e.setz(i.dest);
}
};
EMITTER_OPCODE_TABLE(OPCODE_RESERVED_STORE, RESERVED_STORE_INT32,
RESERVED_STORE_INT64);
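A note on the calling convention used here: the sequences pass the guest address in ecx, the host address in r9, and the value in r8/r8d, then read the result with setz(i.dest). That works because the helper ends with setz(al); setc(ah); cmp(ax, 0x0101), so ZF is set on return exactly when both flags were set. A sketch of that predicate, with illustrative names:

#include <cstdint>

inline bool StoreConditionalSucceeded(bool cmpxchg_matched,           // ZF after lock cmpxchg
                                      bool reservation_still_held) {  // CF after lock btr
  // setz(al); setc(ah) pack the two flags into ax; cmp(ax, 0x0101) sets ZF
  // only when both are 1, and the sequence's setz(i.dest) captures that ZF.
  uint16_t ax = static_cast<uint16_t>((reservation_still_held ? 0x0100 : 0) |
                                      (cmpxchg_matched ? 0x0001 : 0));
  return ax == 0x0101;
}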
// ============================================================================
// OPCODE_ATOMIC_COMPARE_EXCHANGE
// ============================================================================


@@ -1018,8 +1018,7 @@ struct COMPARE_EQ_F32
e.ChangeMxcsrMode(MXCSRMode::Fpu);
if (!HasPrecedingCmpOfSameValues(i.instr)) {
EmitCommutativeBinaryXmmOp(
e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
e.vcomiss(src1, src2);
});
}
@@ -1032,8 +1031,7 @@ struct COMPARE_EQ_F64
e.ChangeMxcsrMode(MXCSRMode::Fpu);
if (!HasPrecedingCmpOfSameValues(i.instr)) {
EmitCommutativeBinaryXmmOp(
e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
e.vcomisd(src1, src2);
});
}
@@ -1935,53 +1933,6 @@ struct MUL_ADD_V128
};
EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
struct NEGATED_MUL_ADD_F64
: Sequence<NEGATED_MUL_ADD_F64,
I<OPCODE_NEGATED_MUL_ADD, F64Op, F64Op, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Fpu);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovapd(e.xmm3, src1);
e.vfmadd213sd(e.xmm3, src2, src3);
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
} else {
// todo: might need to use x87 in this case...
e.vmulsd(e.xmm3, src1, src2);
e.vaddsd(i.dest, e.xmm3, src3);
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
}
}
};
struct NEGATED_MUL_ADD_V128
: Sequence<NEGATED_MUL_ADD_V128,
I<OPCODE_NEGATED_MUL_ADD, V128Op, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovaps(e.xmm3, src1);
e.vfmadd213ps(e.xmm3, src2, src3);
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
} else {
// todo: might need to use x87 in this case...
e.vmulps(e.xmm3, src1, src2);
e.vaddps(i.dest, e.xmm3, src3);
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_ADD, NEGATED_MUL_ADD_F64,
NEGATED_MUL_ADD_V128);
// ============================================================================
// OPCODE_MUL_SUB
// ============================================================================
@@ -2038,53 +1989,6 @@ struct MUL_SUB_V128
};
EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F64, MUL_SUB_V128);
struct NEGATED_MUL_SUB_F64
: Sequence<NEGATED_MUL_SUB_F64,
I<OPCODE_NEGATED_MUL_SUB, F64Op, F64Op, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Fpu);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovapd(e.xmm3, src1);
e.vfmsub213sd(e.xmm3, src2, src3);
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
} else {
// todo: might need to use x87 in this case...
e.vmulsd(e.xmm3, src1, src2);
e.vsubsd(i.dest, e.xmm3, src3);
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
}
}
};
struct NEGATED_MUL_SUB_V128
: Sequence<NEGATED_MUL_SUB_V128,
I<OPCODE_NEGATED_MUL_SUB, V128Op, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovaps(e.xmm3, src1);
e.vfmsub213ps(e.xmm3, src2, src3);
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
} else {
// todo: might need to use x87 in this case...
e.vmulps(e.xmm3, src1, src2);
e.vsubps(i.dest, e.xmm3, src3);
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_SUB, NEGATED_MUL_SUB_F64,
NEGATED_MUL_SUB_V128);
// ============================================================================
// OPCODE_NEG
// ============================================================================
@@ -2641,7 +2545,8 @@ void EmitAndNotXX(X64Emitter& e, const ARGS& i) {
// src1 constant.
// `and` instruction only supports up to 32-bit immediate constants
// 64-bit constants will need a temp register
// only possible with 64 bit inputs, andc is the only instruction that
// generates this
auto temp = GetTempReg<typename decltype(i.src1)::reg_type>(e);
e.mov(temp, i.src1.constant());


@@ -1281,6 +1281,25 @@ Value* HIRBuilder::Load(Value* address, TypeName type, uint32_t load_flags) {
return i->dest;
}
Value* HIRBuilder::LoadWithReserve(Value* address, TypeName type) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_RESERVED_LOAD_info, 0, AllocValue(type));
i->set_src1(address);
i->src2.value = i->src3.value = NULL;
return i->dest;
}
Value* HIRBuilder::StoreWithReserve(Value* address, Value* value,
TypeName type) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_RESERVED_STORE_info, 0, AllocValue(INT8_TYPE));
i->set_src1(address);
i->set_src2(value);
i->src3.value = NULL;
return i->dest;
}
void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_STORE_info, store_flags);
@@ -1739,30 +1758,6 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
return i->dest;
}
Value* HIRBuilder::NegatedMulAdd(Value* value1, Value* value2, Value* value3) {
ASSERT_TYPES_EQUAL(value1, value2);
ASSERT_TYPES_EQUAL(value1, value3);
Instr* i =
AppendInstr(OPCODE_NEGATED_MUL_ADD_info, 0, AllocValue(value1->type));
i->set_src1(value1);
i->set_src2(value2);
i->set_src3(value3);
return i->dest;
}
Value* HIRBuilder::NegatedMulSub(Value* value1, Value* value2, Value* value3) {
ASSERT_TYPES_EQUAL(value1, value2);
ASSERT_TYPES_EQUAL(value1, value3);
Instr* i =
AppendInstr(OPCODE_NEGATED_MUL_SUB_info, 0, AllocValue(value1->type));
i->set_src1(value1);
i->set_src2(value2);
i->set_src3(value3);
return i->dest;
}
Value* HIRBuilder::Neg(Value* value) {
Instr* i = AppendInstr(OPCODE_NEG_info, 0, AllocValue(value->type));
i->set_src1(value);


@@ -189,6 +189,9 @@ class HIRBuilder {
uint32_t store_flags = 0);
Value* Load(Value* address, TypeName type, uint32_t load_flags = 0);
// create a reservation on an address
Value* LoadWithReserve(Value* address, TypeName type);
Value* StoreWithReserve(Value* address, Value* value, TypeName type);
Value* LoadVectorLeft(Value* address);
Value* LoadVectorRight(Value* address);
@@ -242,10 +245,7 @@ class HIRBuilder {
Value* Div(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
Value* MulAdd(Value* value1, Value* value2, Value* value3); // (1 * 2) + 3
Value* MulSub(Value* value1, Value* value2, Value* value3); // (1 * 2) - 3
Value* NegatedMulAdd(Value* value1, Value* value2,
Value* value3); // -((1 * 2) + 3)
Value* NegatedMulSub(Value* value1, Value* value2,
Value* value3); // -((1 * 2) - 3)
Value* Neg(Value* value);
Value* Abs(Value* value);
Value* Sqrt(Value* value);


@@ -248,9 +248,7 @@ enum Opcode {
OPCODE_MUL_HI, // TODO(benvanik): remove this and add INT128 type.
OPCODE_DIV,
OPCODE_MUL_ADD,
OPCODE_NEGATED_MUL_ADD,
OPCODE_MUL_SUB,
OPCODE_NEGATED_MUL_SUB,
OPCODE_NEG,
OPCODE_ABS,
OPCODE_SQRT,
@@ -292,7 +290,10 @@ enum Opcode {
// as we already have OPCODE_ROUND. round double to float (
// ppc "single" fpu instruction result rounding behavior )
OPCODE_SET_NJM,
OPCODE_DELAY_EXECUTION, // for db16cyc
OPCODE_RESERVED_LOAD,
OPCODE_RESERVED_STORE,
__OPCODE_MAX_VALUE, // Keep at end.
};


@@ -218,7 +218,12 @@ DEFINE_OPCODE(
"context_barrier",
OPCODE_SIG_X,
0)
DEFINE_OPCODE(
OPCODE_DELAY_EXECUTION,
"delay_execution",
OPCODE_SIG_X,
0)
DEFINE_OPCODE(
OPCODE_LOAD_MMIO,
"load_mmio",
@@ -453,19 +458,6 @@ DEFINE_OPCODE(
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEGATED_MUL_ADD,
"negated_mul_add",
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEGATED_MUL_SUB,
"negated_mul_sub",
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEG,
"neg",
@@ -719,3 +711,15 @@ DEFINE_OPCODE(
"storev_right",
OPCODE_SIG_X_V_V,
OPCODE_FLAG_MEMORY)
DEFINE_OPCODE(
OPCODE_RESERVED_LOAD,
"reserved_load",
OPCODE_SIG_V_V,
OPCODE_FLAG_MEMORY)
DEFINE_OPCODE(
OPCODE_RESERVED_STORE,
"reserved_store",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_MEMORY)


@@ -185,7 +185,7 @@ bool MMIOHandler::TryDecodeLoadStore(const uint8_t* p,
uint8_t rex_b = rex & 0b0001;
uint8_t rex_x = rex & 0b0010;
uint8_t rex_r = rex & 0b0100;
// uint8_t rex_w = rex & 0b1000;
// http://www.sandpile.org/x86/opc_rm.htm
// http://www.sandpile.org/x86/opc_sib.htm
@@ -448,6 +448,7 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
if (cur_access != memory::PageAccess::kNoAccess &&
(!is_write || cur_access != memory::PageAccess::kReadOnly)) {
// Another thread has cleared this watch. Abort.
XELOGD("Race condition on watch, was already cleared by another thread!");
return true;
}
// The address is not found within any range, so either a write watch or an


@@ -1143,7 +1143,7 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
Value* c = f.VectorDenormFlush(f.LoadVR(vc));
Value* v = f.Neg(f.MulSub(a, c, b));
f.StoreVR(vd, v);
return 0;
}


@@ -195,8 +195,8 @@ int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] + frB)
Value* v = f.Neg(
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@@ -204,8 +204,8 @@ int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] + frB)
Value* v = f.Neg(
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
@@ -214,8 +214,8 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] - frB)
Value* v = f.Neg(
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@@ -223,8 +223,8 @@ int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] - frB)
Value* v = f.Neg(
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
@@ -444,13 +444,12 @@ int InstrEmit_fabsx(PPCHIRBuilder& f, const InstrData& i) {
f.StoreFPR(i.X.RT, v);
/*
The contents of frB with bit 0 cleared are placed into frD.
Note that the fabs instruction treats NaNs just like any other kind of value.
That is, the sign bit of a NaN may be altered by fabs. This instruction does not
alter the FPSCR. Other registers altered: Condition Register (CR1 field):
Affected: FX, FEX, VX, OX (if Rc = 1)
*/
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
@@ -469,9 +468,9 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- !abs(frB)
Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB)));
f.StoreFPR(i.X.RT, v);
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
return 0;
}
@@ -480,9 +479,9 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- ¬ frB[0] || frB[1-63]
Value* v = f.Neg(f.LoadFPR(i.X.RB));
f.StoreFPR(i.X.RT, v);
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
return 0;
}


@@ -22,6 +22,12 @@ DEFINE_bool(
"instructions were written with the Xbox 360's cache in mind, and modern "
"processors do their own automatic prefetching.",
"CPU");
DEFINE_bool(no_reserved_ops, false,
"For testing whether a game may have races with a broken reserved "
"load/store impl",
"CPU");
namespace xe {
namespace cpu {
namespace ppc {
@@ -772,12 +778,17 @@ int InstrEmit_ldarx(PPCHIRBuilder& f, const InstrData& i) {
// already, but I haven't see anything but interrupt callbacks (which are
// always under a global lock) do that yet.
// We issue a memory barrier here to make sure that we get good values.
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
if (cvars::no_reserved_ops) {
f.StoreGPR(i.X.RT, f.ByteSwap(f.Load(ea, INT64_TYPE)));
} else {
f.MemoryBarrier();
Value* rt = f.ByteSwap(f.LoadWithReserve(ea, INT64_TYPE));
f.StoreGPR(i.X.RT, rt);
}
return 0;
}
@@ -797,12 +808,19 @@ int InstrEmit_lwarx(PPCHIRBuilder& f, const InstrData& i) {
// already, but I haven't see anything but interrupt callbacks (which are
// always under a global lock) do that yet.
// We issue a memory barrier here to make sure that we get good values.
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
if (cvars::no_reserved_ops) {
f.StoreGPR(i.X.RT,
f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE));
} else {
f.MemoryBarrier();
Value* rt =
f.ZeroExtend(f.ByteSwap(f.LoadWithReserve(ea, INT32_TYPE)), INT64_TYPE);
f.StoreGPR(i.X.RT, rt);
}
return 0;
}
@@ -826,17 +844,24 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) {
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.LoadGPR(i.X.RT));
if (cvars::no_reserved_ops) {
f.Store(ea, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
} else {
Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
}
f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());
// Issue memory barrier for when we go out of lock and want others to see our
// updates.
if (!cvars::no_reserved_ops) {
f.MemoryBarrier();
}
return 0;
}
@@ -859,20 +884,29 @@ int InstrEmit_stwcx(PPCHIRBuilder& f, const InstrData& i) {
// This will always succeed if under the global lock, however.
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE));
if (cvars::no_reserved_ops) {
f.Store(ea, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
} else {
Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
}
f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());
// Issue memory barrier for when we go out of lock and want others to see our
// updates.
if (!cvars::no_reserved_ops) {
f.MemoryBarrier();
}
return 0;
}
// Floating-point load (A-19)
int InstrEmit_lfd(PPCHIRBuilder& f, const InstrData& i) {