diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index 7cb278e5d..77448f15c 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -688,7 +688,12 @@ void X64ThunkEmitter::EmitLoadNonvolatileRegs() { vmovaps(xmm15, qword[rsp + offsetof(StackLayout::Thunk, xmm[9])]); #endif } - +void X64Backend::InitializeBackendContext(void* ctx) { + X64BackendContext* bctx = reinterpret_cast( + reinterpret_cast(ctx) - sizeof(X64BackendContext)); + bctx->ResolveFunction_Ptr = reinterpret_cast(&ResolveFunction); + bctx->Ox1000 = 0x1000; +} } // namespace x64 } // namespace backend } // namespace cpu diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h index 470988806..332cbc196 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.h +++ b/src/xenia/cpu/backend/x64/x64_backend.h @@ -31,6 +31,16 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1); typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1); typedef void (*ResolveFunctionThunk)(); +// located prior to the ctx register +// some things it would be nice to have be per-emulator instance instead of per +// context (somehow placing a global X64BackendCtx prior to membase, so we can +// negatively index the membase reg) +struct X64BackendContext { + void* ResolveFunction_Ptr; // cached pointer to resolvefunction + unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted + // add of it by... 2 bytes lol +}; + class X64Backend : public Backend { public: static const uint32_t kForceReturnAddress = 0x9FFF0000u; @@ -65,6 +75,7 @@ class X64Backend : public Backend { void InstallBreakpoint(Breakpoint* breakpoint) override; void InstallBreakpoint(Breakpoint* breakpoint, Function* fn) override; void UninstallBreakpoint(Breakpoint* breakpoint) override; + virtual void InitializeBackendContext(void* ctx) override; private: static bool ExceptionCallbackThunk(Exception* ex, void* data); diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 6d5690c2f..a212d5fe6 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -105,6 +105,7 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW); TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ); TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI); + TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW); #undef TEST_EMIT_FEATURE /* fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in @@ -121,6 +122,10 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) bool is_zennish = cpu_.displayFamily >= 0x17; if (is_zennish) { + // ik that i heard somewhere that this is the case for zen, but i need to + // verify. cant find my original source for that. + // todo: ask agner? 
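  // What the flag gates: when the CPU renames the individual arithmetic flags,
  // inc/dec (which leave CF untouched) do not inherit a false dependency on the
  // previous EFLAGS writer, so the add/sub sequences later in this change emit
  // inc/dec for constant +/-1 whenever kX64FlagsIndependentVars is set.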
+ feature_flags_ |= kX64FlagsIndependentVars; feature_flags_ |= kX64FastJrcx; if (cpu_.displayFamily > 0x17) { @@ -132,6 +137,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) // for my cpu, which is ripper90 } } + may_use_membase32_as_zero_reg_ = + static_cast(reinterpret_cast( + processor()->memory()->virtual_membase())) == 0; } X64Emitter::~X64Emitter() = default; @@ -210,6 +218,11 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, top_ = old_address; reset(); call_sites_.clear(); + tail_code_.clear(); + for (auto&& cached_label : label_cache_) { + delete cached_label; + } + label_cache_.clear(); return new_execute_address; } @@ -261,13 +274,14 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { code_offsets.prolog_stack_alloc = getSize(); code_offsets.body = getSize(); - + xor_(eax, eax); /* * chrispy: removed this, it serves no purpose mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg()); */ mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx); - mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0); + + mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax); // 0 // Safe now to do some tracing. if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctions) { @@ -343,6 +357,13 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { add(rsp, (uint32_t)stack_size); ret(); + // todo: do some kind of sorting by alignment? + for (auto&& tail_item : tail_code_) { + if (tail_item.alignment) { + align(tail_item.alignment); + } + tail_item.func(*this, tail_item.label); + } code_offsets.tail = getSize(); @@ -605,12 +626,10 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) { // rdx = arg0 // r8 = arg1 // r9 = arg2 - auto thunk = backend()->guest_to_host_thunk(); - mov(rax, reinterpret_cast(thunk)); mov(rcx, reinterpret_cast(builtin_function->handler())); mov(rdx, reinterpret_cast(builtin_function->arg0())); mov(r8, reinterpret_cast(builtin_function->arg1())); - call(rax); + call(backend()->guest_to_host_thunk()); // rax = host return } } else if (function->behavior() == Function::Behavior::kExtern) { @@ -621,12 +640,10 @@ void X64Emitter::CallExtern(const hir::Instr* instr, const Function* function) { // rdx = arg0 // r8 = arg1 // r9 = arg2 - auto thunk = backend()->guest_to_host_thunk(); - mov(rax, reinterpret_cast(thunk)); mov(rcx, reinterpret_cast(extern_function->extern_handler())); mov(rdx, qword[GetContextReg() + offsetof(ppc::PPCContext, kernel_state)]); - call(rax); + call(backend()->guest_to_host_thunk()); // rax = host return } } @@ -656,10 +673,8 @@ void X64Emitter::CallNativeSafe(void* fn) { // rdx = arg0 // r8 = arg1 // r9 = arg2 - auto thunk = backend()->guest_to_host_thunk(); - mov(rax, reinterpret_cast(thunk)); mov(rcx, reinterpret_cast(fn)); - call(rax); + call(backend()->guest_to_host_thunk()); // rax = host return } @@ -715,24 +730,50 @@ bool X64Emitter::ConstantFitsIn32Reg(uint64_t v) { } return false; } - +/* + WARNING: do not use any regs here, addr is often produced by + ComputeAddressOffset, which may use rax/rdx/rcx in its addr expression +*/ void X64Emitter::MovMem64(const Xbyak::RegExp& addr, uint64_t v) { - if ((v & ~0x7FFFFFFF) == 0) { + uint32_t lowpart = static_cast(v); + uint32_t highpart = static_cast(v >> 32); + // check whether the constant coincidentally collides with our membase + if (v == (uintptr_t)processor()->memory()->virtual_membase()) { + mov(qword[addr], GetMembaseReg()); + } else if ((v & ~0x7FFFFFFF) == 0) { // Fits under 
31 bits, so just load using normal mov. + mov(qword[addr], v); } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { // Negative number that fits in 32bits. mov(qword[addr], v); - } else if (!(v >> 32)) { + } else if (!highpart) { // All high bits are zero. It'd be nice if we had a way to load a 32bit // immediate without sign extending! // TODO(benvanik): this is super common, find a better way. - mov(dword[addr], static_cast(v)); - mov(dword[addr + 4], 0); + if (lowpart == 0 && CanUseMembaseLow32As0()) { + mov(dword[addr], GetMembaseReg().cvt32()); + } else { + mov(dword[addr], static_cast(v)); + } + if (CanUseMembaseLow32As0()) { + mov(dword[addr + 4], GetMembaseReg().cvt32()); + } else { + mov(dword[addr + 4], 0); + } } else { // 64bit number that needs double movs. - mov(dword[addr], static_cast(v)); - mov(dword[addr + 4], static_cast(v >> 32)); + + if (lowpart == 0 && CanUseMembaseLow32As0()) { + mov(dword[addr], GetMembaseReg().cvt32()); + } else { + mov(dword[addr], lowpart); + } + if (highpart == 0 && CanUseMembaseLow32As0()) { + mov(dword[addr + 4], GetMembaseReg().cvt32()); + } else { + mov(dword[addr + 4], highpart); + } } } static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1, @@ -893,7 +934,13 @@ static const vec128_t xmm_consts[] = { /* XMMThreeFloatMask */ vec128i(~0U, ~0U, ~0U, 0U), /*XMMXenosF16ExtRangeStart*/ - vec128f(65504)}; + vec128f(65504), + /*XMMVSRShlByteshuf*/ + v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80), + // XMMVSRMask + vec128b(1) + +}; void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { for (auto& vec : xmm_consts) { @@ -1300,6 +1347,27 @@ SimdDomain X64Emitter::DeduceSimdDomain(const hir::Value* for_value) { return SimdDomain::DONTCARE; } +Xbyak::Address X64Emitter::GetBackendCtxPtr(int offset_in_x64backendctx) { + /* + index context ptr negatively to get to backend ctx field + */ + ptrdiff_t delta = (-static_cast(sizeof(X64BackendContext))) + + offset_in_x64backendctx; + return ptr[GetContextReg() + static_cast(delta)]; +} +Xbyak::Label& X64Emitter::AddToTail(TailEmitCallback callback, + uint32_t alignment) { + TailEmitter emitter{}; + emitter.func = std::move(callback); + emitter.alignment = alignment; + tail_code_.push_back(std::move(emitter)); + return tail_code_.back().label; +} +Xbyak::Label& X64Emitter::NewCachedLabel() { + Xbyak::Label* tmp = new Xbyak::Label; + label_cache_.push_back(tmp); + return *tmp; +} } // namespace x64 } // namespace backend } // namespace cpu diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 519bc629a..528326088 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -155,7 +155,15 @@ enum XmmConst { XMMLVSRTableBase, XMMSingleDenormalMask, XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3 - XMMXenosF16ExtRangeStart + XMMXenosF16ExtRangeStart, + XMMVSRShlByteshuf, + XMMVSRMask +}; +// X64Backend specific Instr->runtime_flags +enum : uint32_t { + INSTR_X64_FLAGS_ELIMINATED = + 1, // another sequence marked this instruction as not needing codegen, + // meaning they likely already handled it }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. 
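How the GetBackendCtxPtr helper added above finds the X64BackendContext declared in x64_backend.h: InitializeBackendContext receives a pointer just past the header (the guest context itself) and steps back by sizeof(X64BackendContext), so at runtime the backend fields sit at a small negative displacement from the context register. A minimal sketch, using only types from this change (BackendFieldSketch is illustrative, not part of the patch):

// Sketch: equivalent addressing math to the GetBackendCtxPtr added above.
Xbyak::Address BackendFieldSketch(X64Emitter& e, int offset_in_backend_ctx) {
  // step back over the whole header, then forward to the requested member
  ptrdiff_t delta = offset_in_backend_ctx -
                    static_cast<ptrdiff_t>(sizeof(X64BackendContext));
  return e.ptr[e.GetContextReg() + static_cast<int>(delta)];
}
// e.g. the tail-emitted +0x1000 adds read their constant through
// BackendFieldSketch(e, offsetof(X64BackendContext, Ox1000)).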
@@ -185,7 +193,13 @@ enum X64EmitterFeatureFlags { kX64FastJrcx = 1 << 12, // jrcxz is as fast as any other jump ( >= Zen1) kX64FastLoop = 1 << 13, // loop/loope/loopne is as fast as any other jump ( >= Zen2) - kX64EmitAVX512VBMI = 1 << 14 + kX64EmitAVX512VBMI = 1 << 14, + kX64FlagsIndependentVars = + 1 << 15, // if true, instructions that only modify some flags (like + // inc/dec) do not introduce false dependencies on EFLAGS + // because the individual flags are treated as different vars by + // the processor. (this applies to zen) + kX64EmitPrefetchW = 1 << 16 }; class ResolvableGuestCall { public: @@ -194,6 +208,13 @@ class ResolvableGuestCall { // rgcid unsigned offset_; }; +class X64Emitter; +using TailEmitCallback = std::function; +struct TailEmitter { + Xbyak::Label label; + uint32_t alignment; + TailEmitCallback func; +}; class X64Emitter : public Xbyak::CodeGenerator { public: @@ -264,7 +285,7 @@ class X64Emitter : public Xbyak::CodeGenerator { Xbyak::Reg64 GetContextReg(); Xbyak::Reg64 GetMembaseReg(); - + bool CanUseMembaseLow32As0() const { return may_use_membase32_as_zero_reg_; } void ReloadMembase(); void nop(size_t length = 1); @@ -274,6 +295,8 @@ class X64Emitter : public Xbyak::CodeGenerator { void MovMem64(const Xbyak::RegExp& addr, uint64_t v); Xbyak::Address GetXmmConstPtr(XmmConst id); + Xbyak::Address GetBackendCtxPtr(int offset_in_x64backendctx); + void LoadConstantXmm(Xbyak::Xmm dest, float v); void LoadConstantXmm(Xbyak::Xmm dest, double v); void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v); @@ -289,6 +312,8 @@ class X64Emitter : public Xbyak::CodeGenerator { return (feature_flags_ & feature_flag) == feature_flag; } + Xbyak::Label& AddToTail(TailEmitCallback callback, uint32_t alignment = 0); + Xbyak::Label& NewCachedLabel(); FunctionDebugInfo* debug_info() const { return debug_info_; } size_t stack_size() const { return stack_size_; } @@ -324,6 +349,16 @@ class X64Emitter : public Xbyak::CodeGenerator { static const uint32_t xmm_reg_map_[XMM_COUNT]; uint32_t current_rgc_id_ = 0xEEDDF00F; std::vector call_sites_; + /* + set to true if the low 32 bits of membase == 0. + only really advantageous if you are storing 32 bit 0 to a displaced address, + which would have to represent 0 as 4 bytes + */ + bool may_use_membase32_as_zero_reg_; + std::vector tail_code_; + std::vector + label_cache_; // for creating labels that need to be referenced much + // later by tail emitters }; } // namespace x64 diff --git a/src/xenia/cpu/backend/x64/x64_seq_control.cc b/src/xenia/cpu/backend/x64/x64_seq_control.cc index 715614753..0df9d3255 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_control.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_control.cc @@ -109,7 +109,6 @@ struct DEBUG_BREAK_TRUE_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (e.IsFeatureEnabled(kX64FastJrcx)) { e.mov(e.ecx, i.src1); Xbyak::Label skip; @@ -187,77 +186,48 @@ EMITTER_OPCODE_TABLE(OPCODE_TRAP, TRAP); struct TRAP_TRUE_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + Xbyak::Label& after = e.NewCachedLabel(); + unsigned flags = i.instr->flags; + Xbyak::Label& dotrap = + e.AddToTail([flags, &after](X64Emitter& e, Xbyak::Label& me) { + e.L(me); + e.Trap(flags); + // does Trap actually return control to the guest? 
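        // Either way, this tail block is only reached on the rare taken trap:
        // the inline path is just a test plus a normally-not-taken jnz out to
        // here, and the jmp that follows resumes at the 'after' label emitted
        // back at the inline site.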
+ e.jmp(after, X64Emitter::T_NEAR); + }); e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); + e.jnz(dotrap, X64Emitter::T_NEAR); + e.L(after); } }; struct TRAP_TRUE_I16 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); + assert_impossible_sequence(TRAP_TRUE_I16); } }; struct TRAP_TRUE_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (e.IsFeatureEnabled(kX64FastJrcx)) { - e.mov(e.ecx, i.src1); - Xbyak::Label skip; - e.jrcxz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } else { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } + assert_impossible_sequence(TRAP_TRUE_I32); } }; struct TRAP_TRUE_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (e.IsFeatureEnabled(kX64FastJrcx)) { - e.mov(e.rcx, i.src1); - Xbyak::Label skip; - e.jrcxz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } else { - e.test(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); - } + assert_impossible_sequence(TRAP_TRUE_I64); } }; struct TRAP_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); + assert_impossible_sequence(TRAP_TRUE_F32); } }; struct TRAP_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Trap(i.instr->flags); - e.L(skip); + assert_impossible_sequence(TRAP_TRUE_F64); } }; EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE, TRAP_TRUE_I8, TRAP_TRUE_I16, @@ -333,6 +303,7 @@ struct CALL_TRUE_F32 e.L(skip); } }; + struct CALL_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { @@ -388,7 +359,6 @@ struct CALL_INDIRECT_TRUE_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (e.IsFeatureEnabled(kX64FastJrcx)) { e.mov(e.ecx, i.src1); Xbyak::Label skip; diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index 33919d466..7bd306ad0 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -14,6 +14,7 @@ #include "xenia/base/cvar.h" #include "xenia/base/memory.h" +#include "xenia/cpu/backend/x64/x64_backend.h" #include "xenia/cpu/backend/x64/x64_op.h" #include "xenia/cpu/backend/x64/x64_tracers.h" #include "xenia/cpu/ppc/ppc_context.h" @@ -28,8 +29,127 @@ namespace cpu { namespace backend { namespace x64 { +struct LoadModStore { + const hir::Instr* load; + hir::Instr* modify; + hir::Instr* store; + + bool is_constant[3]; + void Consume(); +}; +void LoadModStore::Consume() { + modify->backend_flags |= INSTR_X64_FLAGS_ELIMINATED; + store->backend_flags |= INSTR_X64_FLAGS_ELIMINATED; +} +static bool GetLoadModStore(const hir::Instr* loadinsn, LoadModStore* out) { + if (IsTracingData()) { + return false; + } + // if (!loadinsn->dest->HasSingleUse()) { + // allow the value to be used multiple times, as long as it is by the same + // instruction + if (!loadinsn->dest->AllUsesByOneInsn()) { + return false; + } + hir::Instr* use = loadinsn->dest->use_head->instr; + + if (!use->dest || !use->dest->HasSingleUse() || + use->GetNonFakePrev() != loadinsn) { + return false; + } + + hir::Instr* shouldbstore = use->dest->use_head->instr; + + if 
(shouldbstore->dest || shouldbstore->GetNonFakePrev() != use) { + return false; // store insns have no destination + } + use->VisitValueOperands([out](Value* v, uint32_t idx) { + out->is_constant[idx] = v->IsConstant(); + }); + out->load = loadinsn; + out->modify = use; + out->store = shouldbstore; + return true; +} +struct LoadModStoreContext : public LoadModStore { + uint64_t offset; // ctx offset + TypeName type; + Opcode op; + bool is_commutative; + bool is_unary; + bool is_binary; + bool + binary_uses_twice; // true if binary_other == our value. (for instance, + // add r11, r10, r10, which can be gen'ed for r10 * 2) + hir::Value* binary_other; + + hir::Value::ConstantValue* other_const; + uint32_t other_index; +}; +static bool GetLoadModStoreContext(const hir::Instr* loadinsn, + LoadModStoreContext* out) { + if (!GetLoadModStore(loadinsn, out)) { + return false; + } + + if (out->load->opcode->num != OPCODE_LOAD_CONTEXT || + out->store->opcode->num != OPCODE_STORE_CONTEXT) { + return false; + } + + if (out->modify->opcode->flags & + (OPCODE_FLAG_VOLATILE | OPCODE_FLAG_MEMORY)) { + return false; + } + uint64_t offs = out->load->src1.offset; + + if (offs != out->store->src1.offset) { + return false; + } + + TypeName typ = out->load->dest->type; + // can happen if op is a conversion + if (typ != out->store->src2.value->type) { + return false; + } + /* + set up a whole bunch of convenience fields for the caller + */ + out->offset = offs; + out->type = typ; + const OpcodeInfo& opinf = *out->modify->opcode; + out->op = opinf.num; + out->is_commutative = opinf.flags & OPCODE_FLAG_COMMUNATIVE; + out->is_unary = IsOpcodeUnaryValue(opinf.signature); + out->is_binary = IsOpcodeBinaryValue(opinf.signature); + out->binary_uses_twice = false; + out->binary_other = nullptr; + out->other_const = nullptr; + out->other_index = ~0U; + if (out->is_binary) { + if (out->modify->src1.value == out->load->dest) { + out->binary_other = out->modify->src2.value; + out->other_index = 1; + } else { + out->binary_other = out->modify->src1.value; + out->other_index = 0; + } + if (out->binary_other && out->is_constant[out->other_index]) { + out->other_const = &out->binary_other->constant; + } + if (out->binary_other == out->load->dest) { + out->binary_uses_twice = true; + } + } + return true; +} volatile int anchor_memory = 0; +static void Do0x1000Add(X64Emitter& e, Reg32 reg) { + e.add(reg, e.GetBackendCtxPtr(offsetof(X64BackendContext, Ox1000))); + // e.add(reg, 0x1000); +} + // Note: all types are always aligned in the context. RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) { return e.GetContextReg() + offset.value; @@ -58,51 +178,6 @@ static bool is_definitely_not_eo(const T& v) { return is_eo_def(v.value); } -template -RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, - const T& offset) { - assert_true(offset.is_constant); - int32_t offset_const = static_cast(offset.constant()); - - if (guest.is_constant) { - uint32_t address = static_cast(guest.constant()); - address += offset_const; - if (address < 0x80000000) { - return e.GetMembaseReg() + address; - } else { - if (address >= 0xE0000000 && - xe::memory::allocation_granularity() > 0x1000) { - e.mov(e.eax, address + 0x1000); - } else { - e.mov(e.eax, address); - } - return e.GetMembaseReg() + e.rax; - } - } else { - if (xe::memory::allocation_granularity() > 0x1000 && - !is_definitely_not_eo(guest)) { - // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do - // it via memory mapping. 
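          // Note: this removed copy of ComputeMemoryAddressOffset reappears
          // below ComputeMemoryAddress with the new scheme: instead of always
          // materializing the +0x1000 via setae/shl 12, the adjustment is
          // branched out to a tail block, so below-threshold addresses cost
          // only a cmp and a not-taken jae on the inline path.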
- - // todo: do branching or use an alt membase and cmov - e.xor_(e.eax, e.eax); - e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]); - - e.cmp(e.edx, e.GetContextReg().cvt32()); - e.setae(e.al); - e.shl(e.eax, 12); - e.add(e.eax, e.edx); - return e.GetMembaseReg() + e.rax; - - } else { - // Clear the top 32 bits, as they are likely garbage. - // TODO(benvanik): find a way to avoid doing this. - - e.mov(e.eax, guest.reg().cvt32()); - } - return e.GetMembaseReg() + e.rax + offset_const; - } -} // Note: most *should* be aligned, but needs to be checked! template RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { @@ -127,11 +202,23 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { !is_definitely_not_eo(guest)) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. - e.xor_(e.eax, e.eax); + Xbyak::Label& jmpback = e.NewCachedLabel(); + + e.mov(e.eax, guest.reg().cvt32()); + e.cmp(guest.reg().cvt32(), e.GetContextReg().cvt32()); - e.setae(e.al); - e.shl(e.eax, 12); - e.add(e.eax, guest.reg().cvt32()); + + Xbyak::Label& fixup_label = + e.AddToTail([&jmpback](X64Emitter& e, Xbyak::Label& our_tail_label) { + e.L(our_tail_label); + Do0x1000Add(e, e.eax); + e.jmp(jmpback, e.T_NEAR); + }); + e.jae(fixup_label, e.T_NEAR); + + e.L(jmpback); + return e.GetMembaseReg() + e.rax; + } else { // Clear the top 32 bits, as they are likely garbage. // TODO(benvanik): find a way to avoid doing this. @@ -140,6 +227,64 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { return e.GetMembaseReg() + e.rax; } } +template +RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest, + const T& offset) { + assert_true(offset.is_constant); + int32_t offset_const = static_cast(offset.constant()); + if (offset_const == 0) { + return ComputeMemoryAddress(e, guest); + } + if (guest.is_constant) { + uint32_t address = static_cast(guest.constant()); + address += offset_const; + if (address < 0x80000000) { + return e.GetMembaseReg() + address; + } else { + if (address >= 0xE0000000 && + xe::memory::allocation_granularity() > 0x1000) { + e.mov(e.eax, address + 0x1000); + } else { + e.mov(e.eax, address); + } + return e.GetMembaseReg() + e.rax; + } + } else { + if (xe::memory::allocation_granularity() > 0x1000 && + !is_definitely_not_eo(guest)) { + // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do + // it via memory mapping. + + // todo: do branching or use an alt membase and cmov + + Xbyak::Label& tmplbl = e.NewCachedLabel(); + + e.lea(e.edx, e.ptr[guest.reg().cvt32() + offset_const]); + + e.cmp(e.edx, e.GetContextReg().cvt32()); + + Xbyak::Label& fixup_label = + e.AddToTail([&tmplbl](X64Emitter& e, Xbyak::Label& our_tail_label) { + e.L(our_tail_label); + + Do0x1000Add(e, e.edx); + + e.jmp(tmplbl, e.T_NEAR); + }); + e.jae(fixup_label, e.T_NEAR); + + e.L(tmplbl); + return e.GetMembaseReg() + e.rdx; + + } else { + // Clear the top 32 bits, as they are likely garbage. + // TODO(benvanik): find a way to avoid doing this. + + e.mov(e.eax, guest.reg().cvt32()); + } + return e.GetMembaseReg() + e.rax + offset_const; + } +} // ============================================================================ // OPCODE_ATOMIC_EXCHANGE @@ -214,11 +359,20 @@ struct ATOMIC_COMPARE_EXCHANGE_I32 if (xe::memory::allocation_granularity() > 0x1000) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. 
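The cmp-plus-tail fixup introduced in ComputeMemoryAddress/ComputeMemoryAddressOffset above is repeated inline here and again in CACHE_CONTROL further down. A condensed sketch of the pattern, assuming the emitter API from this change (EmitEAFixupSketch itself is illustrative, not a helper in the patch):

// Sketch: leave eax = guest offset, adding 0x1000 only for addresses at or
// above the context-reg threshold (the 0xE0000000 alias). The add lives in a
// tail block, so the common path is a cmp plus a not-taken jae.
static void EmitEAFixupSketch(X64Emitter& e, const Xbyak::Reg32& guest_addr) {
  Xbyak::Label& resume = e.NewCachedLabel();
  Xbyak::Label& fixup =
      e.AddToTail([&resume](X64Emitter& e, Xbyak::Label& tail_label) {
        e.L(tail_label);
        // same as Do0x1000Add: reading the constant from the backend context
        // encodes two bytes smaller than an add with an 0x1000 immediate
        e.add(e.eax, e.GetBackendCtxPtr(offsetof(X64BackendContext, Ox1000)));
        e.jmp(resume, e.T_NEAR);
      });
  e.mov(e.eax, guest_addr);
  e.cmp(guest_addr, e.GetContextReg().cvt32());
  e.jae(fixup, e.T_NEAR);
  e.L(resume);
  // callers then address guest memory as e.GetMembaseReg() + e.rax
}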
+ e.mov(e.ecx, i.src1.reg().cvt32()); e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32()); - e.setae(e.cl); - e.movzx(e.ecx, e.cl); - e.shl(e.ecx, 12); - e.add(e.ecx, i.src1.reg().cvt32()); + Xbyak::Label& backtous = e.NewCachedLabel(); + + Xbyak::Label& fixup_label = + e.AddToTail([&backtous](X64Emitter& e, Xbyak::Label& our_tail_label) { + e.L(our_tail_label); + + Do0x1000Add(e, e.ecx); + + e.jmp(backtous, e.T_NEAR); + }); + e.jae(fixup_label, e.T_NEAR); + e.L(backtous); } else { e.mov(e.ecx, i.src1.reg().cvt32()); } @@ -235,11 +389,20 @@ struct ATOMIC_COMPARE_EXCHANGE_I64 if (xe::memory::allocation_granularity() > 0x1000) { // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do // it via memory mapping. + e.mov(e.ecx, i.src1.reg().cvt32()); e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32()); - e.setae(e.cl); - e.movzx(e.ecx, e.cl); - e.shl(e.ecx, 12); - e.add(e.ecx, i.src1.reg().cvt32()); + Xbyak::Label& backtous = e.NewCachedLabel(); + + Xbyak::Label& fixup_label = + e.AddToTail([&backtous](X64Emitter& e, Xbyak::Label& our_tail_label) { + e.L(our_tail_label); + + Do0x1000Add(e, e.ecx); + + e.jmp(backtous, e.T_NEAR); + }); + e.jae(fixup_label, e.T_NEAR); + e.L(backtous); } else { e.mov(e.ecx, i.src1.reg().cvt32()); } @@ -319,25 +482,44 @@ struct STORE_LOCAL_I8 e.mov(e.byte[e.rsp + i.src1.constant()], i.src2); } }; + +template +static bool LocalStoreMayUseMembaseLow(X64Emitter& e, const T& i) { + return i.src2.is_constant && i.src2.constant() == 0 && + e.CanUseMembaseLow32As0(); +} struct STORE_LOCAL_I16 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2); - e.mov(e.word[e.rsp + i.src1.constant()], i.src2); + if (LocalStoreMayUseMembaseLow(e, i)) { + e.mov(e.word[e.rsp + i.src1.constant()], e.GetMembaseReg().cvt16()); + } else { + e.mov(e.word[e.rsp + i.src1.constant()], i.src2); + } } }; struct STORE_LOCAL_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2); - e.mov(e.dword[e.rsp + i.src1.constant()], i.src2); + if (LocalStoreMayUseMembaseLow(e, i)) { + e.mov(e.dword[e.rsp + i.src1.constant()], e.GetMembaseReg().cvt32()); + } else { + e.mov(e.dword[e.rsp + i.src1.constant()], i.src2); + } } }; struct STORE_LOCAL_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2); - e.mov(e.qword[e.rsp + i.src1.constant()], i.src2); + if (i.src2.is_constant && i.src2.constant() == 0) { + e.xor_(e.eax, e.eax); + e.mov(e.qword[e.rsp + i.src1.constant()], e.rax); + } else { + e.mov(e.qword[e.rsp + i.src1.constant()], i.src2); + } } }; struct STORE_LOCAL_F32 @@ -404,10 +586,133 @@ struct LOAD_CONTEXT_I32 } } }; +template +static bool HandleLMS64Binary(X64Emitter& e, const EmitArgType& i, + LoadModStoreContext& lms, Xbyak::RegExp& addr) { + uint64_t other_const_val = 0; + bool const_fits_in_insn = false; + if (lms.other_const) { + other_const_val = lms.other_const->u64; + const_fits_in_insn = e.ConstantFitsIn32Reg(other_const_val); + } + + /* + this check is here because we currently cannot handle other variables + with this + */ + if (!lms.other_const && !lms.binary_uses_twice) { + return false; + } + + if (lms.op == OPCODE_ADD) { + if (lms.other_const) { + if (const_fits_in_insn) { + if (other_const_val == 1 && + e.IsFeatureEnabled(kX64FlagsIndependentVars)) { + e.inc(e.qword[addr]); + } else { + e.add(e.qword[addr], (uint32_t)other_const_val); + } + + } 
else { + e.mov(e.rax, other_const_val); + e.add(e.qword[addr], e.rax); + } + return true; + } else if (lms.binary_uses_twice) { + // we're being added to ourselves, we are a multiply by 2 + + e.shl(e.qword[addr], 1); + return true; + } else if (lms.binary_other) { + return false; // cannot handle other variables right now. + } + } else if (lms.op == OPCODE_SUB) { + if (lms.other_index != 1) { + return false; // if we are the second operand, we cant combine memory + // access and operation + } + + if (lms.other_const) { + if (const_fits_in_insn) { + if (other_const_val == 1 && + e.IsFeatureEnabled(kX64FlagsIndependentVars)) { + e.dec(e.qword[addr]); + } else { + e.sub(e.qword[addr], (uint32_t)other_const_val); + } + + } else { + e.mov(e.rax, other_const_val); + e.sub(e.qword[addr], e.rax); + } + return true; + } + + } else if (lms.op == OPCODE_AND) { + if (lms.other_const) { + if (const_fits_in_insn) { + e.and_(e.qword[addr], (uint32_t)other_const_val); + } else { + e.mov(e.rax, other_const_val); + e.and_(e.qword[addr], e.rax); + } + return true; + } + } else if (lms.op == OPCODE_OR) { + if (lms.other_const) { + if (const_fits_in_insn) { + e.or_(e.qword[addr], (uint32_t)other_const_val); + } else { + e.mov(e.rax, other_const_val); + e.or_(e.qword[addr], e.rax); + } + return true; + } + } else if (lms.op == OPCODE_XOR) { + if (lms.other_const) { + if (const_fits_in_insn) { + e.xor_(e.qword[addr], (uint32_t)other_const_val); + } else { + e.mov(e.rax, other_const_val); + e.xor_(e.qword[addr], e.rax); + } + return true; + } + } + + return false; +} +template +static bool HandleLMS64Unary(X64Emitter& e, const EmitArgType& i, + LoadModStoreContext& lms, Xbyak::RegExp& addr) { + Opcode op = lms.op; + + if (op == OPCODE_NOT) { + e.not_(e.qword[addr]); + return true; + } else if (op == OPCODE_NEG) { + e.neg(e.qword[addr]); + return true; + } + + return false; +} struct LOAD_CONTEXT_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { auto addr = ComputeContextAddress(e, i.src1); + LoadModStoreContext lms{}; + if (GetLoadModStoreContext(i.instr, &lms)) { + if (lms.is_binary && HandleLMS64Binary(e, i, lms, addr)) { + lms.Consume(); + return; + } else if (lms.is_unary && HandleLMS64Unary(e, i, lms, addr)) { + lms.Consume(); + return; + } + } + e.mov(i.dest, e.qword[addr]); if (IsTracingData()) { e.mov(e.GetNativeParam(1), e.qword[addr]); @@ -483,7 +788,11 @@ struct STORE_CONTEXT_I16 static void Emit(X64Emitter& e, const EmitArgType& i) { auto addr = ComputeContextAddress(e, i.src1); if (i.src2.is_constant) { - e.mov(e.word[addr], i.src2.constant()); + if (i.src2.constant() == 0 && e.CanUseMembaseLow32As0()) { + e.mov(e.word[addr], e.GetMembaseReg().cvt16()); + } else { + e.mov(e.word[addr], i.src2.constant()); + } } else { e.mov(e.word[addr], i.src2); } @@ -500,7 +809,11 @@ struct STORE_CONTEXT_I32 static void Emit(X64Emitter& e, const EmitArgType& i) { auto addr = ComputeContextAddress(e, i.src1); if (i.src2.is_constant) { - e.mov(e.dword[addr], i.src2.constant()); + if (i.src2.constant() == 0 && e.CanUseMembaseLow32As0()) { + e.mov(e.dword[addr], e.GetMembaseReg().cvt32()); + } else { + e.mov(e.dword[addr], i.src2.constant()); + } } else { e.mov(e.dword[addr], i.src2); } @@ -569,9 +882,14 @@ struct STORE_CONTEXT_V128 auto addr = ComputeContextAddress(e, i.src1); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.vmovaps(e.ptr[addr], e.xmm0); + e.vmovdqa(e.ptr[addr], e.xmm0); } else { - e.vmovaps(e.ptr[addr], i.src2); + SimdDomain domain = 
e.DeduceSimdDomain(i.src2.value); + if (domain == SimdDomain::FLOATING) { + e.vmovaps(e.ptr[addr], i.src2); + } else { + e.vmovdqa(e.ptr[addr], i.src2); + } } if (IsTracingData()) { e.lea(e.GetNativeParam(1), e.ptr[addr]); @@ -735,7 +1053,11 @@ struct STORE_OFFSET_I16 } } else { if (i.src3.is_constant) { - e.mov(e.word[addr], i.src3.constant()); + if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) { + e.mov(e.word[addr], e.GetMembaseReg().cvt16()); + } else { + e.mov(e.word[addr], i.src3.constant()); + } } else { e.mov(e.word[addr], i.src3); } @@ -757,7 +1079,11 @@ struct STORE_OFFSET_I32 } } else { if (i.src3.is_constant) { - e.mov(e.dword[addr], i.src3.constant()); + if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) { + e.mov(e.dword[addr], e.GetMembaseReg().cvt32()); + } else { + e.mov(e.dword[addr], i.src3.constant()); + } } else { e.mov(e.dword[addr], i.src3); } @@ -895,7 +1221,7 @@ struct LOAD_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { auto addr = ComputeMemoryAddress(e, i.src1); // TODO(benvanik): we should try to stick to movaps if possible. - e.vmovups(i.dest, e.ptr[addr]); + e.vmovdqa(i.dest, e.ptr[addr]); if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { // TODO(benvanik): find a way to do this without the memory load. e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteSwapMask)); @@ -1054,13 +1380,15 @@ struct STORE_V128 if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { assert_false(i.src2.is_constant); e.vpshufb(e.xmm0, i.src2, e.GetXmmConstPtr(XMMByteSwapMask)); - e.vmovaps(e.ptr[addr], e.xmm0); + // changed from vmovaps, the penalty on the vpshufb is unavoidable but + // we dont need to incur another here too + e.vmovdqa(e.ptr[addr], e.xmm0); } else { if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.vmovaps(e.ptr[addr], e.xmm0); + e.vmovdqa(e.ptr[addr], e.xmm0); } else { - e.vmovaps(e.ptr[addr], i.src2); + e.vmovdqa(e.ptr[addr], i.src2); } } if (IsTracingData()) { @@ -1081,10 +1409,12 @@ struct CACHE_CONTROL : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - bool is_clflush = false, is_prefetch = false; + bool is_clflush = false, is_prefetch = false, is_prefetchw = false; switch (CacheControlType(i.instr->flags)) { - case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH: case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH_FOR_STORE: + is_prefetchw = true; + break; + case CacheControlType::CACHE_CONTROL_TYPE_DATA_TOUCH: is_prefetch = true; break; case CacheControlType::CACHE_CONTROL_TYPE_DATA_STORE: @@ -1095,6 +1425,11 @@ struct CACHE_CONTROL assert_unhandled_case(CacheControlType(i.instr->flags)); return; } + if (is_prefetchw && !e.IsFeatureEnabled(kX64EmitPrefetchW)) { + is_prefetchw = false; + is_prefetch = true; // cant prefetchw, cpu doesnt have it (unlikely to + // happen). just prefetcht0 + } size_t cache_line_size = i.src2.value; RegExp addr; @@ -1117,13 +1452,24 @@ struct CACHE_CONTROL } } else { if (xe::memory::allocation_granularity() > 0x1000) { - // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do - // it via memory mapping. + // Emulate the 4 KB physical address offset in 0xE0000000+ when can't + // do it via memory mapping. 
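            // Same cmp plus tail-block +0x1000 fixup as in ComputeMemoryAddress
            // above; once we fall through, eax holds the possibly adjusted
            // guest offset that the clflush/prefetch below reach through membase.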
+ e.mov(e.eax, i.src1.reg().cvt32()); + e.cmp(i.src1.reg().cvt32(), e.GetContextReg().cvt32()); - e.setae(e.al); - e.movzx(e.eax, e.al); - e.shl(e.eax, 12); - e.add(e.eax, i.src1.reg().cvt32()); + + Xbyak::Label& tmplbl = e.NewCachedLabel(); + + Xbyak::Label& fixup_label = + e.AddToTail([&tmplbl](X64Emitter& e, Xbyak::Label& our_tail_label) { + e.L(our_tail_label); + + Do0x1000Add(e, e.eax); + + e.jmp(tmplbl, e.T_NEAR); + }); + e.jae(fixup_label, e.T_NEAR); + e.L(tmplbl); } else { // Clear the top 32 bits, as they are likely garbage. // TODO(benvanik): find a way to avoid doing this. @@ -1131,12 +1477,17 @@ struct CACHE_CONTROL } addr = e.GetMembaseReg() + e.rax; } + // todo: use clflushopt + sfence on cpus that support it if (is_clflush) { e.clflush(e.ptr[addr]); } + if (is_prefetch) { e.prefetcht0(e.ptr[addr]); } + if (is_prefetchw) { + e.prefetchw(e.ptr[addr]); + } if (cache_line_size >= 128) { // Prefetch the other 64 bytes of the 128-byte cache line. @@ -1151,6 +1502,9 @@ struct CACHE_CONTROL if (is_prefetch) { e.prefetcht0(e.ptr[addr]); } + if (is_prefetchw) { + e.prefetchw(e.ptr[addr]); + } assert_true(cache_line_size == 128); } } @@ -1178,20 +1532,24 @@ struct MEMSET_I64_I8_I64 assert_true(i.src2.constant() == 0); e.vpxor(e.xmm0, e.xmm0); auto addr = ComputeMemoryAddress(e, i.src1); + /* + chrispy: changed to vmovdqa, the mismatch between vpxor and vmovaps + was causing a 1 cycle stall before the first store + */ switch (i.src3.constant()) { case 32: - e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0); + + e.vmovdqa(e.ptr[addr], e.ymm0); break; case 128: - e.vmovaps(e.ptr[addr + 0 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 1 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 2 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 3 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 4 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 5 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 6 * 16], e.xmm0); - e.vmovaps(e.ptr[addr + 7 * 16], e.xmm0); + // probably should lea the address beforehand + e.vmovdqa(e.ptr[addr + 0 * 16], e.ymm0); + + e.vmovdqa(e.ptr[addr + 2 * 16], e.ymm0); + + e.vmovdqa(e.ptr[addr + 4 * 16], e.ymm0); + + e.vmovdqa(e.ptr[addr + 6 * 16], e.ymm0); break; default: assert_unhandled_case(i.src3.constant()); diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 73e2d646b..bbd5f6d21 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -25,12 +25,12 @@ #include "xenia/cpu/backend/x64/x64_sequences.h" #include +#include #include #include "xenia/base/assert.h" #include "xenia/base/clock.h" #include "xenia/base/logging.h" -#include "xenia/base/string.h" #include "xenia/base/threading.h" #include "xenia/cpu/backend/x64/x64_emitter.h" #include "xenia/cpu/backend/x64/x64_op.h" @@ -45,6 +45,11 @@ DEFINE_bool(use_fast_dot_product, false, "treating inf as overflow instead of using mcxsr" "four insn dotprod", "CPU"); + +DEFINE_bool(no_round_to_single, false, + "Not for users, breaks games. Skip rounding double values to " + "single precision and back", + "CPU"); namespace xe { namespace cpu { namespace backend { @@ -70,7 +75,7 @@ struct COMMENT : Sequence> { auto str = reinterpret_cast(i.src1.value); // TODO(benvanik): pass through. // TODO(benvanik): don't just leak this memory. 
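    // The switch from xe_strdup to plain strdup below pairs with dropping the
    // xenia/base/string.h include at the top of this file; the leak called out
    // in the TODO above is unchanged.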
- auto str_copy = xe_strdup(str); + auto str_copy = strdup(str); e.mov(e.rdx, reinterpret_cast(str_copy)); e.CallNative(reinterpret_cast(TraceString)); } @@ -372,6 +377,27 @@ EMITTER_OPCODE_TABLE(OPCODE_CONVERT, CONVERT_I32_F32, CONVERT_I32_F64, CONVERT_I64_F64, CONVERT_F32_I32, CONVERT_F32_F64, CONVERT_F64_I64, CONVERT_F64_F32); +struct TOSINGLE_F64_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + /* todo: + manually round, honestly might be faster than this. this sequence takes > + 6 cycles on zen 2 we can also get closer to the correct behavior by + manually rounding: + https://randomascii.wordpress.com/2019/03/20/exercises-in-emulation-xbox-360s-fma-instruction/ + + */ + if (cvars::no_round_to_single) { + if (i.dest != i.src1) { + e.vmovapd(i.dest, i.src1); + } + } else { + e.vcvtsd2ss(e.xmm0, i.src1); + e.vcvtss2sd(i.dest, e.xmm0); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_TO_SINGLE, TOSINGLE_F64_F64); // ============================================================================ // OPCODE_ROUND // ============================================================================ @@ -779,7 +805,8 @@ struct SELECT_V128_V128 static void Emit(X64Emitter& e, const EmitArgType& i) { Xmm src1 = i.src1.is_constant ? e.xmm0 : i.src1; PermittedBlend mayblend = GetPermittedBlendForSelectV128(i.src1.value); - //todo: detect whether src1 is only 0 or FFFF and use blends if so. currently we only detect cmps + // todo: detect whether src1 is only 0 or FFFF and use blends if so. + // currently we only detect cmps if (i.src1.is_constant) { e.LoadConstantXmm(src1, i.src1.constant()); } @@ -810,100 +837,128 @@ EMITTER_OPCODE_TABLE(OPCODE_SELECT, SELECT_I8, SELECT_I16, SELECT_I32, SELECT_I64, SELECT_F32, SELECT_F64, SELECT_V128_I8, SELECT_V128_V128); -// ============================================================================ -// OPCODE_IS_TRUE -// ============================================================================ -struct IS_TRUE_I8 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.setnz(i.dest); +static const hir::Instr* GetFirstPrecedingInstrWithPossibleFlagEffects( + const hir::Instr* i) { + Opcode iop; + +go_further: + i = i->GetNonFakePrev(); + if (!i) { + return false; } -}; -struct IS_TRUE_I16 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.setnz(i.dest); + iop = i->opcode->num; + // context/local loads are just movs from mem. 
we know they will not spoil the + // flags + switch (iop) { + case OPCODE_LOAD_CONTEXT: + case OPCODE_STORE_CONTEXT: + case OPCODE_LOAD_LOCAL: + case OPCODE_STORE_LOCAL: + case OPCODE_ASSIGN: + goto go_further; + default: + return i; } -}; -struct IS_TRUE_I32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.setnz(i.dest); +} + +static bool HasPrecedingCmpOfSameValues(const hir::Instr* i) { + if (IsTracingData()) { + return false; // no cmp elim if tracing } -}; -struct IS_TRUE_I64 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.setnz(i.dest); + auto prev = GetFirstPrecedingInstrWithPossibleFlagEffects(i); + + if (prev == nullptr) { + return false; } -}; -struct IS_TRUE_F32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.setnz(i.dest); + + Opcode num = prev->opcode->num; + + if (num < OPCODE_COMPARE_EQ || num > OPCODE_COMPARE_UGE) { + return false; } -}; -struct IS_TRUE_F64 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.setnz(i.dest); + + return prev->src1.value->IsEqual(i->src1.value) && + prev->src2.value->IsEqual(i->src2.value); +} +static bool MayCombineSetxWithFollowingCtxStore(const hir::Instr* setx_insn, + unsigned& out_offset) { + if (IsTracingData()) { + return false; } -}; -struct IS_TRUE_V128 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.setnz(i.dest); + hir::Value* defed = setx_insn->dest; + + if (!defed->HasSingleUse()) { + return false; } -}; + hir::Value::Use* single_use = defed->use_head; + + hir::Instr* shouldbestore = single_use->instr; + + if (!shouldbestore) { + return false; // probs impossible + } + + if (shouldbestore->opcode->num == OPCODE_STORE_CONTEXT) { + if (shouldbestore->GetNonFakePrev() == setx_insn) { + out_offset = static_cast(shouldbestore->src1.offset); + shouldbestore->backend_flags |= + INSTR_X64_FLAGS_ELIMINATED; // eliminate store + return true; + } + } + return false; +} +#define EMITTER_IS_TRUE(typ, tester) \ + struct IS_TRUE_##typ \ + : Sequence> { \ + static void Emit(X64Emitter& e, const EmitArgType& i) { \ + e.tester(i.src1, i.src1); \ + unsigned ctxoffset = 0; \ + if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \ + e.setnz(e.byte[e.GetContextReg() + ctxoffset]); \ + } else { \ + e.setnz(i.dest); \ + } \ + } \ + } + +#define EMITTER_IS_TRUE_INT(typ) EMITTER_IS_TRUE(typ, test) + +EMITTER_IS_TRUE_INT(I8); +EMITTER_IS_TRUE_INT(I16); +EMITTER_IS_TRUE_INT(I32); +EMITTER_IS_TRUE_INT(I64); +EMITTER_IS_TRUE(F32, vtestps); +EMITTER_IS_TRUE(F64, vtestpd); + +EMITTER_IS_TRUE(V128, vptest); + EMITTER_OPCODE_TABLE(OPCODE_IS_TRUE, IS_TRUE_I8, IS_TRUE_I16, IS_TRUE_I32, IS_TRUE_I64, IS_TRUE_F32, IS_TRUE_F64, IS_TRUE_V128); -// ============================================================================ -// OPCODE_IS_FALSE -// ============================================================================ -struct IS_FALSE_I8 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.setz(i.dest); +#define EMITTER_IS_FALSE(typ, tester) \ + struct IS_FALSE_##typ \ + : Sequence> { \ + static void Emit(X64Emitter& e, const EmitArgType& i) { \ + e.tester(i.src1, i.src1); \ + unsigned ctxoffset = 0; \ + if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \ + e.setz(e.byte[e.GetContextReg() + ctxoffset]); \ + } else { \ + 
e.setz(i.dest); \ + } \ + } \ } -}; -struct IS_FALSE_I16 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.setz(i.dest); - } -}; -struct IS_FALSE_I32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.setz(i.dest); - } -}; -struct IS_FALSE_I64 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.test(i.src1, i.src1); - e.setz(i.dest); - } -}; -struct IS_FALSE_F32 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.setz(i.dest); - } -}; -struct IS_FALSE_F64 : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.setz(i.dest); - } -}; -struct IS_FALSE_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.setz(i.dest); - } -}; +#define EMITTER_IS_FALSE_INT(typ) EMITTER_IS_FALSE(typ, test) +EMITTER_IS_FALSE_INT(I8); +EMITTER_IS_FALSE_INT(I16); +EMITTER_IS_FALSE_INT(I32); +EMITTER_IS_FALSE_INT(I64); +EMITTER_IS_FALSE(F32, vtestps); +EMITTER_IS_FALSE(F64, vtestpd); + +EMITTER_IS_FALSE(V128, vptest); + EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE, IS_FALSE_I8, IS_FALSE_I16, IS_FALSE_I32, IS_FALSE_I64, IS_FALSE_F32, IS_FALSE_F64, IS_FALSE_V128); @@ -925,208 +980,268 @@ struct IS_NAN_F64 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_IS_NAN, IS_NAN_F32, IS_NAN_F64); +template +static void CompareEqDoSete(X64Emitter& e, const Instr* instr, + const dest& dst) { + unsigned ctxoffset = 0; + if (MayCombineSetxWithFollowingCtxStore(instr, ctxoffset)) { + e.sete(e.byte[e.GetContextReg() + ctxoffset]); + } else { + e.sete(dst); + } +} + // ============================================================================ // OPCODE_COMPARE_EQ // ============================================================================ struct COMPARE_EQ_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeCompareOp( - e, i, - [](X64Emitter& e, const Reg8& src1, const Reg8& src2) { - e.cmp(src1, src2); - }, - [](X64Emitter& e, const Reg8& src1, int32_t constant) { - if (constant == 0) { - e.test(src1, src1); - } else - e.cmp(src1, constant); - }); - e.sete(i.dest); + // x86 flags already set? 
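    // HasPrecedingCmpOfSameValues walks backwards past instructions known not
    // to touch EFLAGS (context/local loads and stores, assigns) and answers
    // yes only when it lands on a compare of the exact same operand pair; in
    // that case the flags from that compare are still live and the cmp/test
    // here can be skipped.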
+ if (!HasPrecedingCmpOfSameValues(i.instr)) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg8& src1, const Reg8& src2) { + e.cmp(src1, src2); + }, + [](X64Emitter& e, const Reg8& src1, int32_t constant) { + if (constant == 0) { + e.test(src1, src1); + } else + e.cmp(src1, constant); + }); + } + CompareEqDoSete(e, i.instr, i.dest); } }; struct COMPARE_EQ_I16 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeCompareOp( - e, i, - [](X64Emitter& e, const Reg16& src1, const Reg16& src2) { - e.cmp(src1, src2); - }, - [](X64Emitter& e, const Reg16& src1, int32_t constant) { - if (constant == 0) { - e.test(src1, src1); - } else - e.cmp(src1, constant); - }); - e.sete(i.dest); + if (!HasPrecedingCmpOfSameValues(i.instr)) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg16& src1, const Reg16& src2) { + e.cmp(src1, src2); + }, + [](X64Emitter& e, const Reg16& src1, int32_t constant) { + if (constant == 0) { + e.test(src1, src1); + } else + e.cmp(src1, constant); + }); + } + CompareEqDoSete(e, i.instr, i.dest); } }; struct COMPARE_EQ_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeCompareOp( - e, i, - [](X64Emitter& e, const Reg32& src1, const Reg32& src2) { - e.cmp(src1, src2); - }, - [](X64Emitter& e, const Reg32& src1, int32_t constant) { - if (constant == 0) { - e.test(src1, src1); - } else - e.cmp(src1, constant); - }); - e.sete(i.dest); + if (!HasPrecedingCmpOfSameValues(i.instr)) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg32& src1, const Reg32& src2) { + e.cmp(src1, src2); + }, + [](X64Emitter& e, const Reg32& src1, int32_t constant) { + if (constant == 0) { + e.test(src1, src1); + } else + e.cmp(src1, constant); + }); + } + CompareEqDoSete(e, i.instr, i.dest); } }; struct COMPARE_EQ_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeCompareOp( - e, i, - [](X64Emitter& e, const Reg64& src1, const Reg64& src2) { - e.cmp(src1, src2); - }, - [](X64Emitter& e, const Reg64& src1, int32_t constant) { - if (constant == 0) { - e.test(src1, src1); - } else - e.cmp(src1, constant); - }); - e.sete(i.dest); + if (!HasPrecedingCmpOfSameValues(i.instr)) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg64& src1, const Reg64& src2) { + e.cmp(src1, src2); + }, + [](X64Emitter& e, const Reg64& src1, int32_t constant) { + if (constant == 0) { + e.test(src1, src1); + } else + e.cmp(src1, constant); + }); + } + CompareEqDoSete(e, i.instr, i.dest); } }; struct COMPARE_EQ_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { - e.vcomiss(src1, src2); - }); - e.sete(i.dest); + if (!HasPrecedingCmpOfSameValues(i.instr)) { + EmitCommutativeBinaryXmmOp( + e, i, + [&i](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { + e.vcomiss(src1, src2); + }); + } + CompareEqDoSete(e, i.instr, i.dest); } }; struct COMPARE_EQ_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { - e.vcomisd(src1, src2); - }); - e.sete(i.dest); + if (!HasPrecedingCmpOfSameValues(i.instr)) { + EmitCommutativeBinaryXmmOp( + e, i, + [&i](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { + e.vcomisd(src1, src2); + }); + } + CompareEqDoSete(e, i.instr, i.dest); } }; 
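// When CompareEqDoSete fuses with the following STORE_CONTEXT (see
// MayCombineSetxWithFollowingCtxStore above), that store is tagged
// INSTR_X64_FLAGS_ELIMINATED and its own sequence emits nothing. Roughly, for
// "v = (a == b)" written straight to a context field, the generated code goes
// from (illustrative registers and offset only):
//     cmp  ecx, edx
//     sete al
//     mov  byte [ctx + offset], al
// to:
//     cmp  ecx, edx
//     sete byte [ctx + offset]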
EMITTER_OPCODE_TABLE(OPCODE_COMPARE_EQ, COMPARE_EQ_I8, COMPARE_EQ_I16, COMPARE_EQ_I32, COMPARE_EQ_I64, COMPARE_EQ_F32, COMPARE_EQ_F64); +template +static void CompareNeDoSetne(X64Emitter& e, const Instr* instr, + const dest& dst) { + unsigned ctxoffset = 0; + if (MayCombineSetxWithFollowingCtxStore(instr, ctxoffset)) { + e.setne(e.byte[e.GetContextReg() + ctxoffset]); + } else { + e.setne(dst); + } +} // ============================================================================ // OPCODE_COMPARE_NE // ============================================================================ struct COMPARE_NE_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeCompareOp( - e, i, - [](X64Emitter& e, const Reg8& src1, const Reg8& src2) { - e.cmp(src1, src2); - }, - [](X64Emitter& e, const Reg8& src1, int32_t constant) { - e.cmp(src1, constant); - }); - e.setne(i.dest); + if (!HasPrecedingCmpOfSameValues(i.instr)) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg8& src1, const Reg8& src2) { + e.cmp(src1, src2); + }, + [](X64Emitter& e, const Reg8& src1, int32_t constant) { + e.cmp(src1, constant); + }); + } + CompareNeDoSetne(e, i.instr, i.dest); } }; struct COMPARE_NE_I16 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeCompareOp( - e, i, - [](X64Emitter& e, const Reg16& src1, const Reg16& src2) { - e.cmp(src1, src2); - }, - [](X64Emitter& e, const Reg16& src1, int32_t constant) { - e.cmp(src1, constant); - }); - e.setne(i.dest); + if (!HasPrecedingCmpOfSameValues(i.instr)) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg16& src1, const Reg16& src2) { + e.cmp(src1, src2); + }, + [](X64Emitter& e, const Reg16& src1, int32_t constant) { + e.cmp(src1, constant); + }); + } + CompareNeDoSetne(e, i.instr, i.dest); } }; struct COMPARE_NE_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeCompareOp( - e, i, - [](X64Emitter& e, const Reg32& src1, const Reg32& src2) { - e.cmp(src1, src2); - }, - [](X64Emitter& e, const Reg32& src1, int32_t constant) { - e.cmp(src1, constant); - }); - e.setne(i.dest); + if (!HasPrecedingCmpOfSameValues(i.instr)) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg32& src1, const Reg32& src2) { + e.cmp(src1, src2); + }, + [](X64Emitter& e, const Reg32& src1, int32_t constant) { + e.cmp(src1, constant); + }); + } + CompareNeDoSetne(e, i.instr, i.dest); } }; struct COMPARE_NE_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitCommutativeCompareOp( - e, i, - [](X64Emitter& e, const Reg64& src1, const Reg64& src2) { - e.cmp(src1, src2); - }, - [](X64Emitter& e, const Reg64& src1, int32_t constant) { - e.cmp(src1, constant); - }); - e.setne(i.dest); + if (!HasPrecedingCmpOfSameValues(i.instr)) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg64& src1, const Reg64& src2) { + e.cmp(src1, src2); + }, + [](X64Emitter& e, const Reg64& src1, int32_t constant) { + e.cmp(src1, constant); + }); + } + CompareNeDoSetne(e, i.instr, i.dest); } }; struct COMPARE_NE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vcomiss(i.src1, i.src2); - e.setne(i.dest); + if (!HasPrecedingCmpOfSameValues(i.instr)) { + e.vcomiss(i.src1, i.src2); + } + CompareNeDoSetne(e, i.instr, i.dest); } }; struct COMPARE_NE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vcomisd(i.src1, i.src2); - e.setne(i.dest); + if (!HasPrecedingCmpOfSameValues(i.instr)) { 
+ e.vcomisd(i.src1, i.src2); + } + CompareNeDoSetne(e, i.instr, i.dest); } }; EMITTER_OPCODE_TABLE(OPCODE_COMPARE_NE, COMPARE_NE_I8, COMPARE_NE_I16, COMPARE_NE_I32, COMPARE_NE_I64, COMPARE_NE_F32, COMPARE_NE_F64); +#define EMITTER_ASSOCIATE_CMP_INT_DO_SET(emit_instr, inverse_instr) \ + unsigned ctxoffset = 0; \ + if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \ + auto addr = e.byte[e.GetContextReg() + ctxoffset]; \ + if (!inverse) { \ + e.emit_instr(addr); \ + } else { \ + e.inverse_instr(addr); \ + } \ + } else { \ + if (!inverse) { \ + e.emit_instr(dest); \ + } else { \ + e.inverse_instr(dest); \ + } \ + } // ============================================================================ // OPCODE_COMPARE_* // ============================================================================ -#define EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, type, \ - reg_type) \ - struct COMPARE_##op##_##type \ - : Sequence> { \ - static void Emit(X64Emitter& e, const EmitArgType& i) { \ - EmitAssociativeCompareOp( \ - e, i, \ - [](X64Emitter& e, const Reg8& dest, const reg_type& src1, \ - const reg_type& src2, bool inverse) { \ - e.cmp(src1, src2); \ - if (!inverse) { \ - e.instr(dest); \ - } else { \ - e.inverse_instr(dest); \ - } \ - }, \ - [](X64Emitter& e, const Reg8& dest, const reg_type& src1, \ - int32_t constant, bool inverse) { \ - e.cmp(src1, constant); \ - if (!inverse) { \ - e.instr(dest); \ - } else { \ - e.inverse_instr(dest); \ - } \ - }); \ - } \ +#define EMITTER_ASSOCIATIVE_COMPARE_INT(op, emit_instr, inverse_instr, type, \ + reg_type) \ + struct COMPARE_##op##_##type \ + : Sequence> { \ + static void Emit(X64Emitter& e, const EmitArgType& i) { \ + EmitAssociativeCompareOp( \ + e, i, \ + [&i](X64Emitter& e, const Reg8& dest, const reg_type& src1, \ + const reg_type& src2, bool inverse) { \ + if (!HasPrecedingCmpOfSameValues(i.instr)) { \ + e.cmp(src1, src2); \ + } \ + EMITTER_ASSOCIATE_CMP_INT_DO_SET(emit_instr, inverse_instr) \ + }, \ + [&i](X64Emitter& e, const Reg8& dest, const reg_type& src1, \ + int32_t constant, bool inverse) { \ + if (!HasPrecedingCmpOfSameValues(i.instr)) { \ + e.cmp(src1, constant); \ + } \ + EMITTER_ASSOCIATE_CMP_INT_DO_SET(emit_instr, inverse_instr) \ + }); \ + } \ }; #define EMITTER_ASSOCIATIVE_COMPARE_XX(op, instr, inverse_instr) \ EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, I8Op, Reg8); \ @@ -1147,29 +1262,43 @@ EMITTER_ASSOCIATIVE_COMPARE_XX(UGE, setae, setbe); // https://web.archive.org/web/20171129015931/https://x86.renejeschke.de/html/file_module_x86_id_288.html // Original link: https://x86.renejeschke.de/html/file_module_x86_id_288.html -#define EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(op, instr) \ +#define EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(op, emit_instr) \ struct COMPARE_##op##_F32 \ : Sequence> { \ static void Emit(X64Emitter& e, const EmitArgType& i) { \ - e.vcomiss(i.src1, i.src2); \ - e.instr(i.dest); \ + if (!HasPrecedingCmpOfSameValues(i.instr)) { \ + e.vcomiss(i.src1, i.src2); \ + } \ + unsigned ctxoffset = 0; \ + if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \ + e.emit_instr(e.byte[e.GetContextReg() + ctxoffset]); \ + } else { \ + e.emit_instr(i.dest); \ + } \ } \ }; \ struct COMPARE_##op##_F64 \ : Sequence> { \ static void Emit(X64Emitter& e, const EmitArgType& i) { \ - if (i.src1.is_constant) { \ - e.LoadConstantXmm(e.xmm0, i.src1.constant()); \ - e.vcomisd(e.xmm0, i.src2); \ - } else if (i.src2.is_constant) { \ - e.LoadConstantXmm(e.xmm0, i.src2.constant()); \ - e.vcomisd(i.src1, e.xmm0); 
\ - } else { \ - e.vcomisd(i.src1, i.src2); \ + if (!HasPrecedingCmpOfSameValues(i.instr)) { \ + if (i.src1.is_constant) { \ + e.LoadConstantXmm(e.xmm0, i.src1.constant()); \ + e.vcomisd(e.xmm0, i.src2); \ + } else if (i.src2.is_constant) { \ + e.LoadConstantXmm(e.xmm0, i.src2.constant()); \ + e.vcomisd(i.src1, e.xmm0); \ + } else { \ + e.vcomisd(i.src1, i.src2); \ + } \ + } \ + unsigned ctxoffset = 0; \ + if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \ + e.emit_instr(e.byte[e.GetContextReg() + ctxoffset]); \ + } else { \ + e.emit_instr(i.dest); \ } \ - e.instr(i.dest); \ } \ }; \ EMITTER_OPCODE_TABLE(OPCODE_COMPARE_##op##_FLT, COMPARE_##op##_F32, \ @@ -1207,7 +1336,11 @@ void EmitAddXX(X64Emitter& e, const ARGS& i) { e.add(dest_src, src); }, [](X64Emitter& e, const REG& dest_src, int32_t constant) { - e.add(dest_src, constant); + if (constant == 1 && e.IsFeatureEnabled(kX64FlagsIndependentVars)) { + e.inc(dest_src); + } else { + e.add(dest_src, constant); + } }); } struct ADD_I8 : Sequence> { @@ -1322,7 +1455,11 @@ void EmitSubXX(X64Emitter& e, const ARGS& i) { e.sub(dest_src, src); }, [](X64Emitter& e, const REG& dest_src, int32_t constant) { - e.sub(dest_src, constant); + if (constant == 1 && e.IsFeatureEnabled(kX64FlagsIndependentVars)) { + e.dec(dest_src); + } else { + e.sub(dest_src, constant); + } }); } struct SUB_I8 : Sequence> { @@ -1645,89 +1782,13 @@ EMITTER_OPCODE_TABLE(OPCODE_MUL, MUL_I8, MUL_I16, MUL_I32, MUL_I64, MUL_F32, // ============================================================================ struct MUL_HI_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - // mulx: $1:$2 = EDX * $3 - - if (e.IsFeatureEnabled(kX64EmitBMI2)) { - // TODO(benvanik): place src1 in eax? still need to sign extend - e.movzx(e.edx, i.src1); - e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32()); - } else { - // x86 mul instruction - // AH:AL = AL * $1; - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); // can't multiply 2 constants - e.mov(e.al, i.src1.constant()); - e.mul(i.src2); - e.mov(i.dest, e.ah); - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); // can't multiply 2 constants - e.mov(e.al, i.src2.constant()); - e.mul(i.src1); - e.mov(i.dest, e.ah); - } else { - e.mov(e.al, i.src1); - e.mul(i.src2); - e.mov(i.dest, e.ah); - } - } - } else { - if (i.src1.is_constant) { - e.mov(e.al, i.src1.constant()); - } else { - e.mov(e.al, i.src1); - } - if (i.src2.is_constant) { - e.mov(e.al, i.src2.constant()); - e.imul(e.al); - } else { - e.imul(i.src2); - } - e.mov(i.dest, e.ah); - } + assert_impossible_sequence(MUL_HI_I8); } }; struct MUL_HI_I16 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - if (e.IsFeatureEnabled(kX64EmitBMI2)) { - // TODO(benvanik): place src1 in eax? 
still need to sign extend - e.movzx(e.edx, i.src1); - e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32()); - } else { - // x86 mul instruction - // DX:AX = AX * $1; - if (i.src1.is_constant) { - assert_true(!i.src2.is_constant); // can't multiply 2 constants - e.mov(e.ax, i.src1.constant()); - e.mul(i.src2); - e.mov(i.dest, e.dx); - } else if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); // can't multiply 2 constants - e.mov(e.ax, i.src2.constant()); - e.mul(i.src1); - e.mov(i.dest, e.dx); - } else { - e.mov(e.ax, i.src1); - e.mul(i.src2); - e.mov(i.dest, e.dx); - } - } - } else { - if (i.src1.is_constant) { - e.mov(e.ax, i.src1.constant()); - } else { - e.mov(e.ax, i.src1); - } - if (i.src2.is_constant) { - e.mov(e.dx, i.src2.constant()); - e.imul(e.dx); - } else { - e.imul(i.src2); - } - e.mov(i.dest, e.dx); - } + assert_impossible_sequence(MUL_HI_I8); } }; struct MUL_HI_I32 @@ -1836,92 +1897,12 @@ EMITTER_OPCODE_TABLE(OPCODE_MUL_HI, MUL_HI_I8, MUL_HI_I16, MUL_HI_I32, // TODO(benvanik): simplify code! struct DIV_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - Xbyak::Label skip; - e.inLocalLabel(); - - if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); - e.mov(e.cl, i.src2.constant()); - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - e.movzx(e.ax, i.src1); - e.div(e.cl); - } else { - e.movsx(e.ax, i.src1); - e.idiv(e.cl); - } - } else { - // Skip if src2 is zero. - e.test(i.src2, i.src2); - e.jz(skip, CodeGenerator::T_SHORT); - - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - if (i.src1.is_constant) { - e.mov(e.ax, static_cast(i.src1.constant())); - } else { - e.movzx(e.ax, i.src1); - } - e.div(i.src2); - } else { - if (i.src1.is_constant) { - e.mov(e.ax, static_cast(i.src1.constant())); - } else { - e.movsx(e.ax, i.src1); - } - e.idiv(i.src2); - } - } - - e.L(skip); - e.outLocalLabel(); - e.mov(i.dest, e.al); + assert_impossible_sequence(DIV_I8); } }; struct DIV_I16 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - Xbyak::Label skip; - e.inLocalLabel(); - - if (i.src2.is_constant) { - assert_true(!i.src1.is_constant); - e.mov(e.cx, i.src2.constant()); - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - e.mov(e.ax, i.src1); - // Zero upper bits. - e.xor_(e.dx, e.dx); - e.div(e.cx); - } else { - e.mov(e.ax, i.src1); - e.cwd(); // dx:ax = sign-extend ax - e.idiv(e.cx); - } - } else { - // Skip if src2 is zero. - e.test(i.src2, i.src2); - e.jz(skip, CodeGenerator::T_SHORT); - - if (i.instr->flags & ARITHMETIC_UNSIGNED) { - if (i.src1.is_constant) { - e.mov(e.ax, i.src1.constant()); - } else { - e.mov(e.ax, i.src1); - } - // Zero upper bits. 
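// ---- editor's note on the inc/dec substitution above (illustrative) ---------
// EmitAddXX/EmitSubXX now emit inc/dec for a +/-1 constant, but only when
// kX64FlagsIndependentVars is set. The trade-off, stated as an assumption about
// the encodings involved:
//   add eax, 1   -> 83 C0 01   (3 bytes, writes CF along with the other flags)
//   inc eax      -> FF C0      (2 bytes, leaves CF untouched)
// inc/dec are a byte shorter but only update a subset of the flags; on cores
// that rename the carry flag independently (the assumption this change makes
// for Zen-family CPUs), that partial write costs nothing, so the shorter
// encoding wins. A standalone restatement of the guard:
template <typename Emitter, typename Reg>
static void EmitAddSmallConstSketch(Emitter& e, const Reg& dest_src,
                                    int32_t constant) {
  if (constant == 1 && e.IsFeatureEnabled(kX64FlagsIndependentVars)) {
    e.inc(dest_src);            // 2-byte encoding, partial flag write
  } else {
    e.add(dest_src, constant);  // generic path, writes the full flag set
  }
}
// -----------------------------------------------------------------------------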
- e.xor_(e.dx, e.dx); - e.div(i.src2); - } else { - if (i.src1.is_constant) { - e.mov(e.ax, i.src1.constant()); - } else { - e.mov(e.ax, i.src1); - } - e.cwd(); // dx:ax = sign-extend ax - e.idiv(i.src2); - } - } - - e.L(skip); - e.outLocalLabel(); - e.mov(i.dest, e.ax); + assert_impossible_sequence(DIV_I16); } }; struct DIV_I32 : Sequence> { @@ -2040,13 +2021,7 @@ struct DIV_F64 : Sequence> { }; struct DIV_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(!i.instr->flags); - EmitAssociativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - // e.vrcpps(e.xmm0, src2); - // e.vmulps(dest, src1, e.xmm0); - e.vdivps(dest, src1, src2); - }); + assert_impossible_sequence(DIV_V128); } }; EMITTER_OPCODE_TABLE(OPCODE_DIV, DIV_I8, DIV_I16, DIV_I32, DIV_I64, DIV_F32, @@ -2064,50 +2039,9 @@ EMITTER_OPCODE_TABLE(OPCODE_DIV, DIV_I8, DIV_I16, DIV_I32, DIV_I64, DIV_F32, struct MUL_ADD_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // FMA extension - if (e.IsFeatureEnabled(kX64EmitFMA)) { - EmitCommutativeBinaryXmmOp(e, i, - [&i](X64Emitter& e, const Xmm& dest, - const Xmm& src1, const Xmm& src2) { - Xmm src3 = - i.src3.is_constant ? e.xmm1 : i.src3; - if (i.src3.is_constant) { - e.LoadConstantXmm(src3, i.src3.constant()); - } - if (i.dest == src1) { - e.vfmadd213ss(i.dest, src2, src3); - } else if (i.dest == src2) { - e.vfmadd213ss(i.dest, src1, src3); - } else if (i.dest == i.src3) { - e.vfmadd231ss(i.dest, src1, src2); - } else { - // Dest not equal to anything - e.vmovss(i.dest, src1); - e.vfmadd213ss(i.dest, src2, src3); - } - }); - } else { - Xmm src3; - if (i.src3.is_constant) { - src3 = e.xmm1; - e.LoadConstantXmm(src3, i.src3.constant()); - } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - src3 = i.src3; - if (i.dest == i.src3) { - e.vmovss(e.xmm1, i.src3); - src3 = e.xmm1; - } - } - - // Multiply operation is commutative. - EmitCommutativeBinaryXmmOp( - e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vmulss(dest, src1, src2); // $0 = $1 * $2 - }); - - e.vaddss(i.dest, i.dest, src3); // $0 = $1 + $2 - } + assert_impossible_sequence( + MUL_ADD_F32); // this can never happen, there are very few actual + // float32 instructions } }; struct MUL_ADD_F64 @@ -2167,7 +2101,7 @@ struct MUL_ADD_V128 // than vmul+vadd and it'd be nice to know why. Until we know, it's // disabled so tests pass. // chrispy: reenabled, i have added the DAZ behavior that was missing - if (true && e.IsFeatureEnabled(kX64EmitFMA)) { + if (e.IsFeatureEnabled(kX64EmitFMA)) { EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { @@ -2682,11 +2616,12 @@ struct DOT_PRODUCT_3_V128 detecting overflow would be to just compare with inf. todo: test whether cmp with inf can replace */ - e.vstmxcsr(mxcsr_storage); - + if (!cvars::use_fast_dot_product) { + e.vstmxcsr(mxcsr_storage); + e.mov(e.eax, 8); + } e.vmovaps(e.xmm2, e.GetXmmConstPtr(XMMThreeFloatMask)); - - e.mov(e.eax, 8); + bool is_lensqr = i.instr->src1.value == i.instr->src2.value; auto src1v = e.xmm0; auto src2v = e.xmm1; @@ -2702,43 +2637,74 @@ struct DOT_PRODUCT_3_V128 } else { src2v = i.src2.reg(); } - e.not_(e.eax); + if (!cvars::use_fast_dot_product) { + e.not_(e.eax); + } // todo: maybe the top element should be cleared by the InstrEmit_ function // so that in the future this could be optimized away if the top is known to // be zero. 
Right now im not sure that happens often though and its // currently not worth it also, maybe pre-and if constant - e.vandps(e.xmm3, src1v, e.xmm2); - e.vandps(e.xmm2, src2v, e.xmm2); + if (!is_lensqr) { + e.vandps(e.xmm3, src1v, e.xmm2); - e.and_(mxcsr_storage, e.eax); - e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good to - // go + e.vandps(e.xmm2, src2v, e.xmm2); - e.vcvtps2pd(e.ymm0, e.xmm3); - e.vcvtps2pd(e.ymm1, e.xmm2); - /* - ymm0 = src1 as doubles, ele 3 cleared - ymm1 = src2 as doubles, ele 3 cleared - */ - e.vmulpd(e.ymm3, e.ymm0, e.ymm1); + if (!cvars::use_fast_dot_product) { + e.and_(mxcsr_storage, e.eax); + e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good + // to go + } + e.vcvtps2pd(e.ymm0, e.xmm3); + e.vcvtps2pd(e.ymm1, e.xmm2); + + /* + ymm0 = src1 as doubles, ele 3 cleared + ymm1 = src2 as doubles, ele 3 cleared + */ + e.vmulpd(e.ymm3, e.ymm0, e.ymm1); + } else { + e.vandps(e.xmm3, src1v, e.xmm2); + if (!cvars::use_fast_dot_product) { + e.and_(mxcsr_storage, e.eax); + e.vldmxcsr(mxcsr_storage); // overflow flag is cleared, now we're good + // to go + } + e.vcvtps2pd(e.ymm0, e.xmm3); + e.vmulpd(e.ymm3, e.ymm0, e.ymm0); + } e.vextractf128(e.xmm2, e.ymm3, 1); e.vunpckhpd(e.xmm0, e.xmm3, e.xmm3); // get element [1] in xmm3 e.vaddsd(e.xmm3, e.xmm3, e.xmm2); - e.not_(e.eax); + if (!cvars::use_fast_dot_product) { + e.not_(e.eax); + } e.vaddsd(e.xmm2, e.xmm3, e.xmm0); e.vcvtsd2ss(e.xmm1, e.xmm2); - // this is awful - e.vstmxcsr(mxcsr_storage); - e.test(mxcsr_storage, e.eax); - Xbyak::Label ret_qnan; - Xbyak::Label done; - e.jnz(ret_qnan); - e.vshufps(i.dest, e.xmm1, e.xmm1, 0); // broadcast - e.jmp(done); - e.L(ret_qnan); - e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); - e.L(done); + if (!cvars::use_fast_dot_product) { + e.vstmxcsr(mxcsr_storage); + + e.test(mxcsr_storage, e.eax); + + Xbyak::Label& done = e.NewCachedLabel(); + Xbyak::Label& ret_qnan = + e.AddToTail([i, &done](X64Emitter& e, Xbyak::Label& me) { + e.L(me); + e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); + e.jmp(done, X64Emitter::T_NEAR); + }); + + e.jnz(ret_qnan, X64Emitter::T_NEAR); // reorder these jmps later, just + // want to get this fix in + e.vshufps(i.dest, e.xmm1, e.xmm1, 0); + e.L(done); + } else { + e.vandps(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMAbsMaskPS)); + + e.vcmpgeps(e.xmm2, e.xmm0, e.GetXmmConstPtr(XMMFloatInf)); + e.vblendvps(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMQNaN), e.xmm2); + e.vshufps(i.dest, e.xmm1, e.xmm1, 0); + } } }; EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3, DOT_PRODUCT_3_V128); @@ -2754,9 +2720,7 @@ struct DOT_PRODUCT_4_V128 // using mxcsr auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64]; - e.vstmxcsr(mxcsr_storage); - - e.mov(e.eax, 8); + bool is_lensqr = i.instr->src1.value == i.instr->src2.value; auto src1v = e.xmm3; auto src2v = e.xmm2; @@ -2772,36 +2736,59 @@ struct DOT_PRODUCT_4_V128 } else { src2v = i.src2.reg(); } - e.not_(e.eax); + if (!cvars::use_fast_dot_product) { + e.vstmxcsr(mxcsr_storage); - e.and_(mxcsr_storage, e.eax); - e.vldmxcsr(mxcsr_storage); + e.mov(e.eax, 8); + e.not_(e.eax); - e.vcvtps2pd(e.ymm0, src1v); - e.vcvtps2pd(e.ymm1, src2v); + e.and_(mxcsr_storage, e.eax); + e.vldmxcsr(mxcsr_storage); + } + if (is_lensqr) { + e.vcvtps2pd(e.ymm0, src1v); - e.vmulpd(e.ymm3, e.ymm0, e.ymm1); + e.vmulpd(e.ymm3, e.ymm0, e.ymm0); + } else { + e.vcvtps2pd(e.ymm0, src1v); + e.vcvtps2pd(e.ymm1, src2v); + + e.vmulpd(e.ymm3, e.ymm0, e.ymm1); + } e.vextractf128(e.xmm2, e.ymm3, 1); e.vaddpd(e.xmm3, e.xmm3, e.xmm2); 
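// ---- editor's note on the out-of-line error path used above (illustrative) --
// In the slow (MXCSR-checking) path the QNaN fixup is no longer emitted inline:
// NewCachedLabel() hands out a label owned by the emitter, so it outlives this
// local scope, and AddToTail() queues a lambda whose code appears to be emitted
// after the end of the function body, keeping the rarely taken "return QNaN"
// branch out of the hot sequence. Generic shape of the pattern, using only
// names from this change:
//   Xbyak::Label& resume = e.NewCachedLabel();
//   Xbyak::Label& cold =
//       e.AddToTail([i, &resume](X64Emitter& e, Xbyak::Label& me) {
//         e.L(me);                                     // cold path, out of line
//         e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN));
//         e.jmp(resume, X64Emitter::T_NEAR);           // rejoin the hot path
//       });
//   e.jnz(cold, X64Emitter::T_NEAR);                   // hot path: rarely taken
//   ...fast-path code...
//   e.L(resume);
// -----------------------------------------------------------------------------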
e.vunpckhpd(e.xmm0, e.xmm3, e.xmm3); - e.not_(e.eax); + if (!cvars::use_fast_dot_product) { + e.not_(e.eax); + } e.vaddsd(e.xmm2, e.xmm3, e.xmm0); e.vcvtsd2ss(e.xmm1, e.xmm2); - e.vstmxcsr(mxcsr_storage); + if (!cvars::use_fast_dot_product) { + e.vstmxcsr(mxcsr_storage); - e.test(mxcsr_storage, e.eax); + e.test(mxcsr_storage, e.eax); - Xbyak::Label ret_qnan; - Xbyak::Label done; - e.jnz(ret_qnan); // reorder these jmps later, just want to get this fix in - e.vshufps(i.dest, e.xmm1, e.xmm1, 0); - e.jmp(done); - e.L(ret_qnan); - e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); - e.L(done); - // e.DebugBreak(); + Xbyak::Label& done = e.NewCachedLabel(); + Xbyak::Label& ret_qnan = + e.AddToTail([i, &done](X64Emitter& e, Xbyak::Label& me) { + e.L(me); + e.vmovaps(i.dest, e.GetXmmConstPtr(XMMQNaN)); + e.jmp(done, X64Emitter::T_NEAR); + }); + + e.jnz(ret_qnan, X64Emitter::T_NEAR); // reorder these jmps later, just + // want to get this fix in + e.vshufps(i.dest, e.xmm1, e.xmm1, 0); + e.L(done); + } else { + e.vandps(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMAbsMaskPS)); + + e.vcmpgeps(e.xmm2, e.xmm0, e.GetXmmConstPtr(XMMFloatInf)); + e.vblendvps(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMQNaN), e.xmm2); + e.vshufps(i.dest, e.xmm1, e.xmm1, 0); + } } }; EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_4, DOT_PRODUCT_4_V128); @@ -3136,9 +3123,7 @@ struct NOT_I64 : Sequence> { }; struct NOT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - - SimdDomain domain = - e.DeduceSimdDomain(i.src1.value); + SimdDomain domain = e.DeduceSimdDomain(i.src1.value); if (domain == SimdDomain::FLOATING) { e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMFFFF /* FF... */)); } else { @@ -3274,15 +3259,51 @@ struct SHR_I64 : Sequence> { }; struct SHR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): native version (with shift magic). 
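// ---- editor's sketch of the fast dot-product path above (illustrative) ------
// With use_fast_dot_product the MXCSR save/clear/test dance is skipped
// entirely: the product is accumulated in double precision, converted back to
// single, and overflow is detected afterwards by comparing the magnitude
// against +inf, blending in QNaN when it hits. A scalar restatement of what the
// DOT_PRODUCT_3 lanes compute (assumption: this mirrors the vector code; the
// real sequence then broadcasts the result to all four lanes):
#include <cmath>
#include <limits>
static float FastDot3Sketch(const float a[4], const float b[4]) {
  double sum = static_cast<double>(a[0]) * b[0] +
               static_cast<double>(a[1]) * b[1] +
               static_cast<double>(a[2]) * b[2];  // element 3 is masked off
  float result = static_cast<float>(sum);         // vcvtsd2ss
  if (std::fabs(result) >= std::numeric_limits<float>::infinity()) {
    result = std::numeric_limits<float>::quiet_NaN();  // vcmpgeps + vblendvps
  }
  return result;
}
// -----------------------------------------------------------------------------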
- if (i.src2.is_constant) { - e.mov(e.GetNativeParam(1), i.src2.constant()); + /* + godbolt link: + https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGIMwAcpK4AMngMmAByPgBGmMQgAGwaAJykAA6oCoRODB7evv5BmdmOAmER0SxxCclpdpgOuUIETMQE%2BT5%2BgbaY9mUMLW0EFVGx8Umptq3tnYU9CjMj4WPVE3UAlLaoXsTI7BzmAMzhyN5YANQmR26qAYnhBMThAHQI19gmGgCCx6fnmCuNxihA%2BX1%2BZhODDOXku1zci3wgjeYJ%2B4L%2BVAuGnRkLwmKwNAi6AgAH0SQBxSJyNxkjbgi4XAgAT3SmAJDI5nIutAEwG5vO5tGuVh%2BDOZrPZXgY2WARP5RnlfK8tCFRxF3wZxwJKwuZMeiUkisV9KukO1EV1JMeRzMF0eJq1mEJgL1gi4iQuCgQJAIDrNTp1roIAQZyAQbT9R3NgIAst8ANLYEIhCAMHwbC5plimo7HC7JyPRi4AMRjABUSQbTWYVeYzDijn08Rdo8SSTGhDSAGrYABKdNFjJZbKdXK5QartbVJvFI8xUplconhuVqvVmv9zouccTydT6czPhzebwBsLAYtpYrVbrAEkz2Z62jIU38Re2RdSSSLAB5Xshb5IgAERpEkBw1IcJVHMcOWXQVhRnYdJWlPBZQ/ODVwQwdHS3HckxTLMMyzY9ITtM9sM3HUr0rQ06xCOsGz6JRI3iYgSGrKUAGsGFQAB3BgLjQFh0joeIGOfRsGHwKhwVnZDFw/R4Li8e1px%2BOTRwXVC5TDNplN04gsO%2BDT5xQtD0E9b12mUr0fSMkzlLMuUeQVZVeSM2SkOgmDBPDYgOUeAJ7K8zEGQUiyDI5bJBCCtTjJCxzwt8vSGRUmLgqg0KfNs6y7TdRIMrnKLtI/HKCDCx53UK%2BSSossrUsqgq4ocny8vKgLBBtarvKSpTis6%2BtmoSrTzLazk0oILqhsywVWq5fVJG6zEVTmzlooIM9pqK1dVoawRNvVcEAHojouZRhjwMRaCZFt3ws2cFBeC4ywQTAbraQEvCUCzeNegSCFe26hJE%2Bh/PQVBMAUTNUHK7i%2BOO07DCZAHwj5JgYh2cqAcBWcLkwVR9nScrCCh7IAC9MBeBsi2/ABNMtsD24NqffXUAHU/yApmqokmmgI53suYmrredZkkAEUBaFhaG2bMAwFbUkO27PtwJwwMQh/SJyU17XLUqwJGKkvF0R%2BE6LkiAQAFpFkMdA2gsjHPEwQxIMhp6Xrei4Ppsj9fsYRlAawYHRP80QGB48qvswBHA8BW2pId6snaFR83YuOJRGji5UExbHPTwCmLhYPBFhYJgCDDDOvCxwGSmyGJ6FjgA3MQvEh73iEBARrqxb2pIuLgnqETBATEBRUAuMAOF/H8Qmn9O4h5XiqfUhLAt1WeQi4Ja2vdTefznwb1Qc61bW/Q%2BQkWrb2QWg%2B59iw6JLxKTRxJNnb2An82aEElPJmjeFh6afBvqORqFwpa7zPhcfmnMoEDXzFrck8Dypb2FDBc2Xh0isj2EwJQFwt52ihl9LwV0bqGhiMjSGRtpL/yKnfSWcC4oYlfpiMkyB0jeAUJwr6dDb6CAzqgTw6Cxzm14oCXihgsaT2zinPKOddgXDcBcdIbFgDEFYAoGhJszanSEKgNggkBDN0YHgRg%2Bxi5MGQGxKGRBLGcUBOkC6YhvbIH2AoJQUMGB4H2IZUWW4AJCArJ/ICEBVAZGGCSWcGYOQQHJpgXOYSNhHXiYkpx7QonDgzFbQeatcRvmdG2OmDMSSc1VqaAqZgPRkiASUspvYgRAWuFzGpt5yQkmwMBW8gEGwMiLJrNmJIQlhIiRk6JHJAnBOAiM9JBBMmsjyUcPprMAASbSVlDOmeE2Z8zMAxOxBJJiMcJLLK3Gs8kGzhnbMieM/M3wgmbNCdcsZWTem3QCd/R5MyblZI5AciEklaH%2BJ1LU7ADARmZhiZ%2BAAVFAYp2BoV0iqUk6wDANiLKLFLcF4TIWxNhaSKWiLzCJBRZYNFGLWawMFti0guKYVwqpUBIlyLVBIosOS02AL%2Bk/lBUkhkoKaUDK%2BeE%2BF6KWYfKlnyiBnNBWfKuaQd%2BnMxXAotJrRlfLGWysGfKkkjLlVctWbeXlrL%2BXAJpecy5WyFWgv1erC0azJUmuldSkZFrhUKqlrayi9rbzqpNZq116z3W6s9RSrcoKuBSoIWaiFuTWrm0oQQQEXBPxoClI4BUVA2LZg0GGkFwCzBRoFbGsweaLSgqOEWmNOKLhHDLYCUFkgq0MxpQySQ9bo0MwAKzNrBbGrtHbQUkqdZ2vtNbEiDuAQAdl7a2i4U7J0MwCLO2NARF3YBSCumtKR11cA0FK4tOK927sjU6w9tKuBcF3YWs91aL2lvFfmhmXBK23pbRCl9u6m1vrHRe9tj7y3AK4D2n9rbgMdqlqeqFWLY1XoA4CKWN7oMypLVC0Rp0UbEB%2BiQCyuc445xiNoRoBBaUjSJPB51QFX3IZdTWutFGpbfpo0BOd/6VUIc5iB5jc6B0Mc5sO7jsaJ18cFjOkdMGa0Ls5ebHivEC6jXLtYrIn584KFYICGINcLi8UIAgeTAl8ZJpQgIDtQhz10vpRAQKzKBOoq9VGVmQgJO0rRXiqAjUbOkvZfZosQgA04tc5Zs%2BnnWV2bVuxi4QhNbGpiWZu9Qr5WBR845gZnMpVOZQ%2BEhLVrGrJa3FFn8fqMx%2Bec9lp55ABp5Z1EINZMWGRxffeEt1iWYpVYtDV28jrYvOeazl/KbXAQdaK5F/zpBevlbPgNyLEao0Nd/QyODEW5tIY5HNudD6lsVtm%2BZ2tpnG3bbvW2vbwCuOrZ27xzbwCBNncOxcYTl2GZiahWt2NUmHvYGXSOl7Na10Ubm5ur7O2d1/Yjfup132L25pB0BqD9XzOXuO8%2Blb03DtcA2wa/LEbqNw9R/R97Uh0vw7Yxj6rEbTso8axei7JP2uQdm85hbpnEP08y7Si46O7WDaltj%2BrDPdt/cYyz2jbPiec8i1Lcn4vWcMmp2LjLgtru8%2Bl3dpnnMnurb52934uiLjkkYPECuY8VFMDwP5PDqAcF20epFz0rQpJQ34P5ae4Vp4UbJEIZQ3xby9ndGSCACBUIIFpcvGJUArP9YZP7wPGZ4TwgZGuq4U7lEQAmgniAIeO3u8997m0fuA/ACD/yXiof3OVcj/nhAMebhx/dDHpPn4Jq1/T3xKbWeve9gNHnwPweW%2BR9Lxtdt5fo9AjcHHm0dfk/C1Lc34vmeSQe/b2jgIXeC89%2BL5%2BfvS%2BMxR4L1X0fNw
7uD5MPXlPC0Ngz9bySbPPvEgr8LyH2JUBG8Ts/BXvfceLgJ%2BP5PpLn4M9u6v3b1zxJB33v17z71PzL1APfwP1r0Tx/36wvzn2v07xAIrzXyhTDwmgNG3zfxHzH1LXgIb0myQIAOvyXzvwwIgMb0CHPzwNjwPxwKIMgIH3P3/zRB%2BC%2BlRmN2QAcXQCiXxhJDQBwwUCiUaUSlqjag8h%2BEWGIC8AcAuBMWQDMDEOP3XA5CoB5ArguBxSZA8inSaWYWfkxB3h%2BHBi8EbkBGDgMXSC/Dvi7gUGVBIxbB2EsO9l%2BzRCnXUKUmbmPguHNmIBSBNCDH3mblzDVH8NOmIEvRNB8OvgsEiIuGICCkHB8K7XQQCL3WCKtHykUKagSMyNMIgnMLcJiCsU4iEPDHCAyNOhMC7QsG4WsA0HeC7S5jqIsCtj3RaKaUHBKPoEUMfkSPaMaMsDMGaLqLaPqOsC6ImM5QZGbhDGaXcKMgZGbAgHcMaSWI0BeA0AuHAk1C8JNHmNtC2JWMTx6IgiOQdEOMHHmKWSWIdTSyYF%2Bzik5DWM/EeMFggGeJjyqSxFUCnWLGLFzU2KOC5n%2BTHGJWJQ32blojBIuDWXVR%2BNpWbi7XELVUlWRI%2BN9UxK/z%2BI0FUCBKJIzHli2In2/0QSRLXQzH2I5DUKOI5F8PEM6I0DMB3lePmkxHWIgBmx%2BIqX%2BOPVBPBL2IZIOPULHHBlFLpJuIgh8lhIuGhSWOPilM5ERMlQWKry5lhLOJ8neNRJHz7lpJ8npNuNanlO/yWK4C8B1NajVLSw1PEO1I5ONIMJVMZPuPhM%2BNCQ1JtLHH1MVPhOVNNLHCtitl8N9OlIuJ8l%2BlEk/E/BmwdOhIJMFOaS2JFOdK5AxPtK/3hNRIjOPyjK5Gbg9CWLCP5IJKBOwGLAjK9IgETNzPyKlPeOeINO2N2KNK5FrPrK1JLPrwJICA0EHIjLKN4MqJNwElLMfilNrJHIqN0nCE1IRM62zN%2BI9H7MHOLCIIJKOGLGwGxAzILIZKuNNJNNlI5FnLHPCHEOeJrOXK%2BIvPnNcweLvNCT5KTLuA3K3NUB3L3IjKZKWKfycnQhyIICb1rLfPxIBKBJBLBCOEZkHxyT3UfBtMPNNJbKWIfKqIYDONQoglhRDU5gVI2AcKcMdKDIgi7gIF2AEhvOYVdMOWNhkh%2BD6kcJiBJACDMHqMSBSCOH3RCI9GhSYC4FpSUiYDMBEoESYCOAkvKiYGkDGiYC7Rku9kSGUqYC11PNWIEQWJqKSKyNSIERItoF9AiICNzAMvKmbiyNMqiPMogh8JiJsqSKOCKK0ssrR10uIF4tiO0pfU8rMCCIssUKkH8pSO1wggWOvJ3nqPrFaLOPeO%2BJOIP0TJPykoNKEvaNzFaIn0/DkvSuiosEWmyuYMUqBEZgyvqPSOKopO%2BJLLgu9gKoKmqtSqnTKoavaKnQmJpOuPFLtC5O9iSuUUisQpFODIuGABhkngAgsCTBJACptKst2KWIqosCysMN6ubI9KGr3QuBGo7IZAmrsWmtmvmozKspTWWoKqKvWoZObLhO2pTT2qlMOqmu%2BBmpCDmqCLOrRyivaKqputNObLRI%2BGCpOMsEHlGrPIOsmruXes%2BoWpfV%2Bq4tmIzObLqsZisvuPBpTX2vGphuOo%2BtOt6qsrhK5hWs6ritRv6vUrapJtNASJxuevxrepOq%2Bo2upqSoxpDFxrEsdKnMBupo9Ixp2p5o9LJoKrWptLutpsvUhrHDysuvaOuqlupuBuSp%2Bp5rVvFr%2BpRvZrjKYDqu2qWR5rqu1uRspr1vWJprjzpp5tasVvqIpoBrPIStKpyTEozGhVyo9HdrrQVNytavdoQs/DppyXOs9pDsRrDsFK9rrJ%2BrDpzMZltA7NPIJs%2BpAClKMvKlOIPPorPOPLPM0q5BWtiraPhLEojKzvEPZN6oZBWrWuvPCNrvavqOuuvPiKlJWv%2BuvPSIzLrsaomOvMbLGvJsHuWs6tzsLNwoZALrFIZKrstJwrzoZEouoqSMhicP0IuPBDkQUA0R0xTyAoskwBiBNSLs1E4KPsUPEJPtUH5KgAuur1gOyVySMjkVPsdKnjABzvVDIw/GiK4AsiWNvrKpBuTJkjilXuIAEg/uhIAfQC3uk0VMvqShcj5DQaSNoB0wYBYEPtQYFAwfCHSBrlIyvrYmcL/osnLgUE4jr16soaNAwbOBdi7iAfhMAt6kEA2CIZrjpCMLPOYY%2BiAaTyWMEdYYYLcCmifnCs5CgYEjJE0zoAzRJHIcN3oCoAHwgDEbZFpXIaRUSGoc4kQY4O%2BAYaUiTUWEjQYYwcIYYGIYIDofBDkcwewdwZ4ecOTKYAvT4cOjzuhSOk5TMYEXBkICujwckKUhxQMi9teiZE03QHQCZEcdNLkTDEaE4kIHEKoHHhjiprCQNJ3UT2btDFel4MyaWKeHbgjIZBOmcaJoZOno5GhVificSfELSbKZMukbeKoC0dKYyYcaKbdIUZUmMvCBJClC7isXDEsIgB8YaeXo5Gcd4rZvRDzttkcGQAuE0NQG0PDFoCoDMB2YrnCdGh0OSbPOcZJBYBYCEObgIA8RJCoBtFJBubuYIHSAQBJHSAUFedueQHueyFz2yCXw4Yia4dZTFToouI2Z8QkNGnqmOYIDMH2d6aRe2YufnswCougdtwrh8SELwRMrcAYbKg%2BD%2BfeZBcCBJCBcLWuf%2BfuZ%2Be%2BYQApaUDmQ8WoC2F1BjBjBJGLF7B/DkGAhJEiB/A6QAA1lEKT2xeX%2BXBXhWywxWAAtPsH8dFeZzwi49F1QRgAQEkVFo5rQggU5uUc5oZkx8aK%2BhQWUcQu%2B98gcoc/hy1pKfGdIcQvlaEiAFy/A2AqHKmz8eWV1i5nyZxxEEAEAEEOZUQRYeEJF8l61icp%2B5RWvFCvO26gRZAYSQxm%2B1QN11M9hlyiRlMnc3alNEALEG0k6V13agCzN9IbNz19k/fNwY9DVs8hh9etAaURwKipNNhrmO%2BnJLRrNvBTie%2Bpt%2BEVt6Qs89FlgJwvAThMxfyJY2F5AQlmNm4ONuCqAOthtv4wgmqiAAAP13dHdXNyTbbmI0KNcEnuaiWEnEINaRYgC7i7cWEIBrjZDbdkexbXoBbmQIAfa9rnbGcXbEn4bWe3pfEYs5QxArZMfpDJBYCXx0N1YYH1bEEOefaQ6XzwDNfPv8OQaho5DfbtweYZP8YZJw8CDw9RZJFdYEEYDmQbfhPpZpexa4Ho5EndC9a4DkB9eTb9d6uo4CDw8XFEJY5aTebZc48wG45JXXMHLkG/dgmudw7xlzcY8EBJGDi7jYG08k91Gk44647wB48fubcHiE6o7U5o%2B2aNfo804iG09OD06Y%2BudHfELY5k9M5eb44E/Z3oyddU%2BQ7s4pjYi7j5CWO8%2BxfC9QBpbM8CDmZtJE7w9dahmi7ebtgS9BZxTo4Y%2Bc%2B
Y7PZS9s9E9uh4mIHLloAk888y/%2BeEkwAAEdTOeP0vaU4vIvL2LWQv1OsAKuquauaGSQbQvO3mpQnFeCeQWuSU%2BuSABuPOaHaVZvKvx4FvaGajmwKyiTaVM4DFAQtuiSiSnoIgRwLJHcM56ApJm5uRDc/EZGOQq3c2hTa0LOa8ISeu7Ou5GuVIHDHO5kNEjBMA1vRv6vWRmu5OzOSU2uLh8unP3PDGVOGRUuNP0gdPGg3ODPR2Pw6uSRsuqXl0vufvIY/uVHDBgAgfDHaUCv3PdPMB9OiuaHEfLRQuyudMAZHPMhCvcf0BtAvo%2B2QeaWLDpuIVXW0frE6f4esf0AmeROjEGATF2g2QSQ2fvQa4SeAfyfufefFgBf/3PnvnfmVeOetO5kmAee%2Bev3p3ORkfxOBe8fEvPtaVxO1uuuzybftIFARucfxvyipuIeeOOvMBgAnePefHIjy4HELhtfypxOr2kfSu8PCfTdifDNSfAevfWOxu7Hff4v/eSVA/g/17vvk/RDU%2BNeKfiuMzkey%2ByegekWcjxf6e7epIcukuk/fua/0/xL7Pdm5lqeXPoQMeGf1vgv4%2BWe8Ok1iBnhLCVHMBHh4hVFsWhDjF4h%2Becfo2PnRD8etGV/Ff%2BCVfMZ1fa%2Bte%2Bew%2BTpvRe4vRR5iZBA%2BhaBS5GB8E8Abm2RLok1L%2BvBgByfFgPwUYoZtC2OcfZnup0n7T96ATzXPP3zmQ0BVAWCO3gk186FpQBeAGfl3Hn7EBF%2BUbXfn2yp6qB/ux/evq5wl5cNp2NncfpYiwBo9%2BuW/WqCSDi4dwcelhK7qZ2XTICZ%2BzzFQlAKeZ4BYB6QEPrKE97d9lu83QxsNwfSj916OLeRm8w3748DeEAcuJQOEE0DzIdA%2BIBDAUCu9YOedI6ER05BEBwY5bZ/qolQAmJA4pccEJR1FDI8dWPEdDkixRaYcvwLPDFua3u5j91O9fdGINzHY48fOEPF5gSUBKHcgSXgV3tbwT76ZRBPzAXv4JEiBCPyg5JIdZ1NLI9TB8QHZrxDT7k9YhJnAIYWgJKSBghe5QcmEJK7kCpQWAYgJkOyGAg/BeQ%2BIQUO/IBAHWSQsoVX0iHpDqhy8Wfo4WMq5DQKwvIIUSXaHCdIhi4UQfvHqGDC8%2BEARTikLd6lc8WGSQzK4CV7l8BhnHH5hAESBdou0GgSQDsSoAy9IhovWnvTwz5SdbmcQvAH5344SNAuJw8gaLyIH08phmfa4Q0Mh4QB/ODw49Fbw%2B5lcC%2BAwuLq32XRPD1OkbSOJF114NdweIkZdAX3a7qDOu5QlDlQFx4xBvoTzEgEIQfY49su2wqgLSjY6yCHe8grwViJd7hDAReHDEd4KV7z91ELidfkSyZbb96RWIpXo7jxHpAmeugt0lYiopkJaUaaWgBZGzglwGAR0cuHfXCA/8zeOcTEHW36F2I6wbAYgOTzrB5QJmr8boXxFqFHRxmXQmoeXwo4BNUhkQyODkRJAmiehGw9fsJGMqIDqAmI7EUyI0S0BaUdog0eXwhF2dxmlQjIfaNr6wjOEffRoa6IZH8EPRYgUhlUNNG19/RZXY0Qr0TGA8BeqgXEdv2tGpjgxvo2vjt0IDQig%2BNIjwYECL5E9S%2BeAtYfwQdEfDeRwAOZESLdHcjcRdbJ3ksD%2B61jahZYq4Ah1uLmw0haYnobiKDH6i5MOPbMQ73Dy2iRxBYwHhJV1EJiQxgPPscj3b4p8axr8OscfzQG38MBlFbHg2I37siZxm46sUZiwC7i1xaIuzhiP3GT9MBx4q4eu035gjOWAI8sWV3OFMd4gzffgtvwvHdidxtQ2fugOfHoBcBqPX8ZVi/HAC7OsEvtjiOIBH8MxfgoXvkNdGPiF%2BR4qCUDCH7xB1xSwwwDDFeioSmAvEKgMqHAkHjMBuvIlnIO2FIS2xqEzvuTz7FkD1O%2BzExKIWtEwClecA9foy1EKfNvikccibjyok0ScJh47FiSJ5Z8sBWQrICCKzFbYBJWRBGVkpPlaqTFWJIFVgK35FHRsY%2BCH2JgOZDjUbECgWlEyGoSWjyBwE1YaBPL7vDXxE3TiKX0jFOTtx14sCbJMgnIiIupYu8WVx9G8Q3JRnW5h5K8m3DC0eYicbUKCkkAQpHQioXqMyGRS2OMUl0YGIymrjyeyU1EWlPU7hTfcY4/KXxCylvNpxoLMqapUcgriqpwmCDg5PU6utXO1U25vb1BY%2BSrx6w2vu6GgkN8h%2BKbCQdX1zbcCdW/bKKdz34KzCeJxPfiTwMEl8CUenUlqb1ROiyZPQfQYjJ6FZDIBLotARQm3GOSLDyB3CRoMdJJCtxvAdQhsT1KS71TvR84rIVnSZ5XSjpYgW6WdIF61SkuX0m6XdPbgkisuLfbfqLwEn4SJho7EkLXmInkDoZKEmlodOOkZcGxTA9AM3BYEQAoZK0mGWjJ%2BkgzMAr0/MRFNvwVSmpFMvsc42Rk8igZ48HCpq3BDYFJAYhEDgPiICgs2ZYhC3MJWUgLQ%2BZ6QDnDKXGhCyo4j9LmBAH5nllVAVABWYrIVlM9eZ7ld1hbjBprl5ZSsxWSrIlmMh3WXgFNF7Ssp6z9o7M8qLxDFoqY5ZOs3WTaVVkXBfB7DbOnBRBrj54Jss2CiAi5hSMGSrs6WbLJNlXp6aTsm0s7OlnZ1IKds5WQ7P1kRT3WrskBIzA9kSCNZoDN2fCT9mmkA5DeY2TnBFm/EEiRjDMhHJtneza04g7pks1/a4sZZ6QfORrKLnaYLq4NEudI1wrHBXAOicKg5FkLyFZK9/WUHgh%2BEv4lCFBDLmLMci5A3Ba0SpgoQI4MhtWEZdFnoT7o99tCZMFeTe14iptnac8uQgvKnlvEBoYhHgRGTPhny15zdS%2BcTC3nrzb5doXeZPRtL9yj5zdKfGIS8CqAIyn8qONfKlJ/zlI98j%2BY/K8DPzeqh5DMuiw4FK1daDJPooCCoBngYq8CwGjFDPlBR6iVUC2gyVTxfysFFgHBfvI6igVb8xMHBRYCwUkLuY5C5SJQuoVS1T5xMLqK3TQXtswFrCwquwpPkD4z5KCiwKXTjnmyv5AioRRmTfnZ1j5HIR2cvF/n6yK8e8pes7RNByILcBuIgIZDBLNId%2B3bRQo0AnnQkYgGYUjrPJrlSCi8z3GIC8GXgVJoSFeKxW8EDzKLVFQiE6eoo0SaLSSYJXRTr3HnUstEfxYxXLx14EdnGti7xVzGsW2KLS9eBxZEozhOKC8LiwcP4uXxQwPFFcEgHUWwC%2BLLKBigJUYpMUCBQl0itJTS29jiFoUf/G0kwCSV6YclGFepbUpsV8QrgXaH2YkrkUZlnGTAFxRyXKWBKPQmSzRY0p8WmLBlRSsxQyGqUB4MZXMWZaXHaWMwYgNpZxosq0QckoF6hQZQXI0XZLkUEygpekqmWmKCOuypgFUpqUZk6lDiqEvCWsWKKblrSycVUiaXdLeqvS/pTsuOU0toSIyg5U1FCR
HLkAhioJcGyaZ/8rlcy/kqsp6W1yBIGy75SaF2UArDISePJfotBWFLwVISqRcTV%2BX/9oVFg55XcpEYPLmlzymJeSqiUvK1lCK72MitSWEr/lluUZTSsxWTLcVBHDZcSsnnKI4VnyhlUiq2WuloWWKigl0zmKrsEJZXbACB0NxCAEAxALsKCxt57ABZ%2BCmPnsAPbSLzY3wWgCwCyCDyTp8ou3O4k7jxxwwLAXOQLM0zlQ7BVsUjlIlXhnlzYXoVgMTAy71ENApAJ2ngsChiEPVNq8QnvWUJyyJ6vVMQNpBHlwFylllM6Sl2k6aKgeQEufnRMopvj4QqXAMrkuJQkyuWYauDAeUhCqImAwAcuIJAMCuQGAbEFUCSB5AW4GS53cPMTGrq90EieHMBuglDJ4AIVnIEmS8C8CEKu1HS/4g3QAqDrh17RUdYzG3Jj1M5jMYNYMwtJjVdoPkOsmdKHWEKrMu1ONDkvHULrLOEAEMDkmXVQtIFizeYluunU%2Brllh61oo6RvWEKlqY6%2BdY%2BornLr6VFitjloUAnnj01T4zNRv2zUJ9c1EAfNWdIvXXFt61gyIRY0n5zM3BXEz7hvX6HTC2JCIuYdrIvS0pu%2B0lC4PJSUp3ZaUU6WlEEAuBpArOF6AWWjgvQEapAVOJns43lXKhFVyq1VW3zQ3OEpM0LPhJpCvqRRE0mAJQmITJAVwngKAz9n7jrLEZpxFMCAIaC9qpJemjwdVm2xt4IAvACs0GM4PU5FrSMga7VcoUxYyEdV9yoCEEIWq/KY%2BWmnTUDyzr0kxVBK7Fcvls3abNCqa5VU5poUSrqW7m%2BzTSwQBChDizmvBVfX67RrxC%2B5Xqi2qUhdr4SwGdBAlrnU9qrYeAUgJFvv7WBrA/arkF6A83gCs626mdfevfVl0uYWWvANU0siFavNxAErfUSq0LquYs6w9evLCh2bPNQW2gI1vZRzdo1LWz8Hh0Qox43135JRaaQK2Bbitt6iwK%2BrnUTaP18JZrRIq63gCvQDWubc1uW3CkD1TdBktNu61ehet22gbdlt23/EByNpYTaJts3EBptL42aUdvAEQ9l0BmhUgFOA1EtQN4/cDcShe31boNKTKhHdssjVc7NT27zutqB5va4kGq2lNCi%2B1L8QNNwHNeSwB0w6etwOs8ptse1sNk5K7czWtoh3abjxzbACqes9A6rEZ6nbQKgBWAzS8dkO4RsohO347v1a9aHXVtxn07GdiO5HVGx%2B1o6wNGOqpIDt6FOEcdncn4Mj2bh714ZemuzkWry2hsYdxAeHaCrw2XsDCHALYLQE4BdpeAfgDgFoFICoBOAo%2BSwNYE9A7A9gdQyEDwFIAEBNAeurYJxBACSBEgLwSQCkD3QpANARwAcgFUSA9ADdHASQMbtd3m7OAvABQCAF9Uu7Tdeu0gHAFgBIAQ4oMcgJQEz0TAzgZPKJHIQYCcQ%2BASjeIPHogAxBo9IIZgMQCZCcAndQkenj%2BAYDXRo9WAcuEYHEDJ7SA%2BAV9o4F4nR78YjQT9g3t4Dz9w9Zu%2B/jEA0R16PAWAaPZJpYBj6tgmhctQoC7BmJeIP4VkCbqd38BBAIgMQOwEY0H75ASgNQNHt0DCVq1xgHLZYH0AoD49kALYBbgGDx6OAVsH8Nkx167Vy4ewd4GCTJjy94gtoG2AQAQZglZQ1pa3aMSYBx69pTQZwHjKkhzA/AwlUICsCqA1A9AJQHIAIDQO4Gsg%2BBhgKMGwMTBhKDQJA4MCWCEHKDiBgYEMHaBkHxgCQSg7Qc8BdA9AtsZg1gdYMSAtgE8XYPsD0BPBR4K%2B/QIbqj096LdHAO4IkCthVh89CoCAJU2L3b4rdVgB/RcFwCEB2IxwAWR4GEihwcwV6XgEnq0DopSAr0M3hMDmakAPdXaMwC8CnQvpJARwJw5ICKEcUihkhiPdIbN2yG49Ce53a7q2Bp7EAIATGPY2z079jDoMSIOpk4CqAqwLABQM3C2ZWUUgLwKQJ%2BA/iRBsAGwXgK/00V4B0AegM/UfvECn7ZAigFQOoB73X7SAvEDROkAkPh6jdmW6PbIZ/A1x7GSonQvcEUOGhlDfIVQ0XtoafgjDIMf8QYaKOhHk9Vhj3dsSnQpAu0RQ9w1Ol91skqkRwaQOHsj1dGZDse2wCEYsNu6/DZgAI7wCCMLHLDWwBXtkGcCSAgAA%3D%3D%3D + */ + /* + todo: this is a naive version, we can do far more optimizations for + constant src2 + */ + bool consts2 = false; + + if (i.src1.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src1.constant()); } else { - e.mov(e.GetNativeParam(1), i.src2); + e.vmovdqa(e.xmm0, i.src1); } - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateShrV128)); - e.vmovdqa(i.dest, e.xmm0); + if (i.src2.is_constant) { + consts2 = true; + e.mov(e.r8d, i.src2.constant() & 7); + e.mov(e.eax, 8 - (i.src2.constant() & 7)); + } else { + e.movzx(e.r8d, i.src2); + e.and_(e.r8d, 7); + } + + e.vpshufd(e.xmm1, e.xmm0, 27); + e.vpcmpeqd(e.xmm3, e.xmm3, e.xmm3); + e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMVSRShlByteshuf)); + if (!consts2) { + e.mov(e.eax, 8); + } + e.vmovd(e.xmm2, e.r8d); + if (!consts2) { + e.sub(e.eax, e.r8d); + } + e.vpsrlw(e.xmm1, e.xmm1, e.xmm2); + e.vpsrlw(e.xmm2, e.xmm3, e.xmm2); + e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMVSRMask)); + e.vpand(e.xmm1, e.xmm1, e.xmm2); + e.vmovd(e.xmm2, e.eax); + e.vpsllw(e.xmm0, e.xmm0, e.xmm2); + e.vpsllw(e.xmm2, e.xmm3, e.xmm2); + e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMZero)); + e.vpand(e.xmm0, e.xmm0, e.xmm2); + e.vpor(e.xmm0, e.xmm0, e.xmm1); + e.vpshufd(i.dest, e.xmm0, 27); } static __m128i 
EmulateShrV128(void*, __m128i src1, uint8_t src2) { // Almost all instances are shamt = 1, but non-constant. @@ -3562,16 +3583,23 @@ extern volatile int anchor_vector; static int anchor_vector_dest = anchor_vector; bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) { - const InstrKey key(i); - auto it = sequence_table.find(key); - if (it != sequence_table.end()) { - if (it->second(*e, i)) { - *new_tail = i->next; - return true; + if ((i->backend_flags & INSTR_X64_FLAGS_ELIMINATED) != 0) { + // skip + *new_tail = i->next; + return true; + } else { + const InstrKey key(i); + + auto it = sequence_table.find(key); + if (it != sequence_table.end()) { + if (it->second(*e, i)) { + *new_tail = i->next; + return true; + } } + XELOGE("No sequence match for variant {}", GetOpcodeName(i->opcode)); + return false; } - XELOGE("No sequence match for variant {}", GetOpcodeName(i->opcode)); - return false; } } // namespace x64 diff --git a/src/xenia/cpu/backend/x64/x64_sequences.h b/src/xenia/cpu/backend/x64/x64_sequences.h index 07b264ab2..d83e40e28 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.h +++ b/src/xenia/cpu/backend/x64/x64_sequences.h @@ -13,6 +13,8 @@ #include "xenia/cpu/hir/instr.h" #include +#define assert_impossible_sequence(name) \ + assert_always("impossible sequence hit" #name); namespace xe { namespace cpu { diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index 6a6a56330..b6985b1d8 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -749,7 +749,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { result = true; } break; - + case OPCODE_PERMUTE: { if (i->src1.value->IsConstant() && i->src2.value->IsConstant() && i->src3.value->IsConstant() && @@ -760,17 +760,20 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { result = true; } - else if (i->src2.value->IsConstantZero() && i->src3.value->IsConstantZero() && + else if (i->src2.value->IsConstantZero() && + i->src3.value->IsConstantZero() && i->flags == INT8_TYPE /*probably safe for int16 too*/) { /* - chrispy: hoisted this check here from x64_seq_vector where if src1 is not constant, but src2 and src3 are zero, then we know the result will always be zero + chrispy: hoisted this check here from x64_seq_vector where if + src1 is not constant, but src2 and src3 are zero, then we know + the result will always be zero */ v->set_zero(VEC128_TYPE); i->Remove(); result = true; } - + break; } case OPCODE_INSERT: @@ -930,6 +933,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { result = true; } break; + case OPCODE_TO_SINGLE: + if (i->src1.value->IsConstant()) { + v->set_from(i->src1.value); + v->ToSingle(); + i->Remove(); + result = true; + } + break; default: // Ignored. 
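// ---- editor's note on the OPCODE_TO_SINGLE fold above (illustrative) --------
// Value::ToSingle (added later in this change) performs the PPC "single" result
// rounding: the double is rounded to float precision and widened back, which is
// exactly what the frontend previously expressed as
// Convert(Convert(v, FLOAT32_TYPE), FLOAT64_TYPE). Folding a constant operand
// therefore reduces to:
static double ToSingleConstantSketch(double v) {
  return static_cast<double>(static_cast<float>(v));  // one round-to-single step
}
// Giving the pattern its own opcode lets the backend and this pass treat it as
// a single operation instead of a pair of conversions.
// -----------------------------------------------------------------------------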
break; diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index 038214d88..1b84e417c 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -10,6 +10,7 @@ #include "xenia/cpu/compiler/passes/simplification_pass.h" #include "xenia/base/byte_order.h" +#include "xenia/base/logging.h" #include "xenia/base/profiling.h" namespace xe { namespace cpu { @@ -82,7 +83,7 @@ bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { iter_result |= SimplifyBitArith(builder); iter_result |= EliminateConversions(builder); iter_result |= SimplifyAssignments(builder); - iter_result |= BackpropTruncations(builder); + result |= iter_result; } while (iter_result); return true; @@ -1207,71 +1208,6 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { return result; } -struct TruncateSimplifier { - TypeName type_from, type_to; - uint32_t sizeof_from, sizeof_to; - uint32_t bit_sizeof_from, bit_sizeof_to; - uint64_t typemask_from, typemask_to; - hir::HIRBuilder* builder; - hir::Instr* truncate_instr; - hir::Value* truncated_value; - hir::Instr* truncated_value_def; -}; -bool SimplificationPass::BackpropTruncations(hir::Instr* i, - hir::HIRBuilder* builder) { - if (i->opcode != &OPCODE_TRUNCATE_info) { - return false; - } - TypeName type_from = i->src1.value->type; - TypeName type_to = i->dest->type; - - uint32_t sizeof_from = static_cast(GetTypeSize(type_from)); - uint32_t sizeof_to = static_cast(GetTypeSize(type_to)); - - Instr* input_def = i->src1.value->GetDefSkipAssigns(); - if (!input_def) { - return false; - } - Opcode input_opc = input_def->opcode->num; - - if (input_opc == OPCODE_SHL && input_def->src2.value->IsConstant()) { - uint32_t src2_shift = input_def->src2.value->AsUint32(); - if (src2_shift < (sizeof_to * CHAR_BIT)) { - Value* truncated_preshift = - builder->Truncate(input_def->src1.value, type_to); - - truncated_preshift->def->MoveBefore(i); - i->Replace(&OPCODE_SHL_info, 0); - i->set_src1(truncated_preshift); - i->set_src2(input_def->src2.value); - return true; - } - } - if (input_opc == OPCODE_LOAD_CONTEXT) { - if (sizeof_from == 8 && sizeof_to == 4) { - Value* loadof = builder->LoadContext(input_def->src1.offset, INT32_TYPE); - loadof->def->MoveBefore(input_def); - i->Replace(&OPCODE_ASSIGN_info, 0); - i->set_src1(loadof); - return true; - } - } - - return false; -} -bool SimplificationPass::BackpropTruncations(hir::HIRBuilder* builder) { - bool result = false; - auto block = builder->first_block(); - while (block) { - auto i = block->instr_head; - while (i) { - result |= BackpropTruncations(i, builder); - i = i->next; - } - block = block->next; - } - return result; -} Value* SimplificationPass::CheckValue(Value* value, bool& result) { auto def = value->def; if (def && def->opcode == &OPCODE_ASSIGN_info) { diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h index fe8de8474..3e3fa9c46 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.h +++ b/src/xenia/cpu/compiler/passes/simplification_pass.h @@ -32,8 +32,6 @@ class SimplificationPass : public ConditionalGroupSubpass { bool SimplifyAssignments(hir::HIRBuilder* builder); hir::Value* CheckValue(hir::Value* value, bool& result); bool SimplifyBitArith(hir::HIRBuilder* builder); - bool BackpropTruncations(hir::Instr* i, hir::HIRBuilder* builder); - bool BackpropTruncations(hir::HIRBuilder* builder); // 
handle either or or xor with 0 bool CheckOrXorZero(hir::Instr* i); bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder); diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index 2665842a5..03e73ca1b 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -692,6 +692,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags, instr->block = block; instr->opcode = &opcode_info; instr->flags = flags; + instr->backend_flags = 0; instr->dest = dest; instr->src1.value = instr->src2.value = instr->src3.value = NULL; instr->src1_use = instr->src2_use = instr->src3_use = NULL; @@ -1492,7 +1493,6 @@ Value* HIRBuilder::VectorCompareUGE(Value* value1, Value* value2, part_type); } Value* HIRBuilder::VectorDenormFlush(Value* value1) { - return value1; ASSERT_VECTOR_TYPE(value1); Instr* i = AppendInstr(OPCODE_VECTOR_DENORMFLUSH_info, 0, AllocValue(VEC128_TYPE)); @@ -1501,6 +1501,14 @@ Value* HIRBuilder::VectorDenormFlush(Value* value1) { i->src3.value = nullptr; return i->dest; } +Value* HIRBuilder::ToSingle(Value* value) { + assert_true(value->type == FLOAT64_TYPE); + Instr* i = AppendInstr(OPCODE_TO_SINGLE_info, 0, AllocValue(FLOAT64_TYPE)); + i->set_src1(value); + i->src2.value = nullptr; + i->src3.value = nullptr; + return i->dest; +} Value* HIRBuilder::Add(Value* value1, Value* value2, uint32_t arithmetic_flags) { ASSERT_TYPES_EQUAL(value1, value2); @@ -1720,7 +1728,6 @@ Value* HIRBuilder::Log2(Value* value) { return i->dest; } - Value* HIRBuilder::DotProduct3(Value* value1, Value* value2) { ASSERT_VECTOR_TYPE(value1); ASSERT_VECTOR_TYPE(value2); diff --git a/src/xenia/cpu/hir/hir_builder.h b/src/xenia/cpu/hir/hir_builder.h index 3b29867e9..be08dbc98 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -200,7 +200,7 @@ class HIRBuilder { Value* VectorCompareUGT(Value* value1, Value* value2, TypeName part_type); Value* VectorCompareUGE(Value* value1, Value* value2, TypeName part_type); Value* VectorDenormFlush(Value* value1); - + Value* ToSingle(Value* value); Value* Add(Value* value1, Value* value2, uint32_t arithmetic_flags = 0); Value* AddWithCarry(Value* value1, Value* value2, Value* value3, uint32_t arithmetic_flags = 0); diff --git a/src/xenia/cpu/hir/instr.cc b/src/xenia/cpu/hir/instr.cc index 118895719..92e2848f8 100644 --- a/src/xenia/cpu/hir/instr.cc +++ b/src/xenia/cpu/hir/instr.cc @@ -180,6 +180,26 @@ exit_loop: *tunnel_flags = traversed_types; return current_def; } +bool Instr::IsFake() const { + Opcode num = opcode->num; + switch (num) { + case OPCODE_NOP: + case OPCODE_COMMENT: + case OPCODE_CONTEXT_BARRIER: + case OPCODE_SOURCE_OFFSET: + return true; + } + return false; +} + +const Instr* Instr::GetNonFakePrev() const { + const Instr* curr = prev; + + while (curr && curr->IsFake()) { + curr = curr->prev; + } + return curr; +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h index db3c78922..337622215 100644 --- a/src/xenia/cpu/hir/instr.h +++ b/src/xenia/cpu/hir/instr.h @@ -42,6 +42,7 @@ class Instr { const OpcodeInfo* opcode; uint16_t flags; + uint16_t backend_flags; // backends may do whatever they wish with this uint32_t ordinal; typedef union { @@ -158,6 +159,11 @@ if both are constant, return nullptr, nullptr call_for_values(src3.value, 2); } } + bool IsFake() const; + + // gets previous instr, skipping instrs like COMMENT, OPCODE_CONTEXT_BARRIER, + // OPCODE_SOURCE_OFFSET + const 
hir::Instr* GetNonFakePrev() const; }; } // namespace hir diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index acc61d047..93b3e7e62 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -281,7 +281,10 @@ enum Opcode { OPCODE_ATOMIC_COMPARE_EXCHANGE, OPCODE_SET_ROUNDING_MODE, OPCODE_VECTOR_DENORMFLUSH, // converts denormals to signed zeros in a vector - __OPCODE_MAX_VALUE, // Keep at end. + OPCODE_TO_SINGLE, // i could not find a decent name to assign to this opcode, + // as we already have OPCODE_ROUND. round double to float ( + // ppc "single" fpu instruction result rounding behavior ) + __OPCODE_MAX_VALUE, // Keep at end. }; enum OpcodeFlags { @@ -352,7 +355,9 @@ static bool IsOpcodeBinaryValue(uint32_t signature) { return (signature & ~(0x7)) == ((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6)); } - +static bool IsOpcodeUnaryValue(uint32_t signature) { + return (signature & ~(0x7)) == ((OPCODE_SIG_TYPE_V << 3)); +} static void UnpackOpcodeSig(uint32_t sig, OpcodeSignatureType& dest, OpcodeSignatureType& src1, OpcodeSignatureType& src2, diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index a1ca73f7d..be06171f0 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -679,4 +679,11 @@ DEFINE_OPCODE( "vector_denormflush", OPCODE_SIG_V_V, 0 +) + +DEFINE_OPCODE( + OPCODE_TO_SINGLE, + "to_single", + OPCODE_SIG_V_V, + 0 ) \ No newline at end of file diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index c4ebdeb2c..a1e6fc2ea 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -1643,6 +1643,11 @@ void Value::DenormalFlush() { constant.v128.u32[i] = current_element; } } +void Value::ToSingle() { + assert_true(type == FLOAT64_TYPE); + + constant.f64 = static_cast(static_cast(constant.f64)); +} void Value::CountLeadingZeros(const Value* other) { switch (other->type) { case INT8_TYPE: @@ -1805,6 +1810,25 @@ hir::Instr* Value::GetDefTunnelMovs(unsigned int* tunnel_flags) { return nullptr; } } +// does the value only have one instr that uses it? +bool Value::HasSingleUse() const { + return use_head && use_head->next == nullptr; +} +bool Value::AllUsesByOneInsn() const { + if (!use_head) { + return false; + } + const Use* first_use = use_head; + const Instr* should_match = first_use->instr; + + for (const Use* current_use = first_use->next; current_use; + current_use = current_use->next) { + if (current_use->instr != should_match) { + return false; + } + } + return true; +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index 84d121a26..d878f29cd 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -226,6 +226,15 @@ class Value { return (flags & VALUE_IS_CONSTANT) ? 
nullptr : local_slot; } inline bool IsConstant() const { return !!(flags & VALUE_IS_CONSTANT); } + + inline bool IsEqual(const Value* other) const { + if (this == other) { + return true; + } else if ((this->flags & other->flags) & VALUE_IS_CONSTANT) { + return this->IsConstantEQ(other); + } + return false; + } bool IsConstantTrue() const { if (type == VEC128_TYPE) { assert_always(); @@ -327,7 +336,7 @@ class Value { return false; } } - bool IsConstantEQ(Value* other) const { + bool IsConstantEQ(const Value* other) const { if (type == VEC128_TYPE) { assert_always(); } @@ -594,13 +603,19 @@ class Value { bool saturate); void ByteSwap(); void DenormalFlush(); - + void ToSingle(); void CountLeadingZeros(const Value* other); bool Compare(Opcode opcode, Value* other); hir::Instr* GetDefSkipAssigns(); // tunnel_flags is updated to the kinds we actually traversed hir::Instr* GetDefTunnelMovs(unsigned int* tunnel_flags); + // does the value only have one instr that uses it? + bool HasSingleUse() const; + // returns true if every single use is as an operand to a single instruction + // (add var2, var1, var1) + bool AllUsesByOneInsn() const; + private: static bool CompareInt8(Opcode opcode, Value* a, Value* b); static bool CompareInt16(Opcode opcode, Value* a, Value* b); diff --git a/src/xenia/cpu/ppc/ppc_context.h b/src/xenia/cpu/ppc/ppc_context.h index 777ef568a..a9c0c8ed1 100644 --- a/src/xenia/cpu/ppc/ppc_context.h +++ b/src/xenia/cpu/ppc/ppc_context.h @@ -379,7 +379,7 @@ typedef struct alignas(64) PPCContext_s { uint64_t lr; // 0x10 Link register double f[32]; // 0x120 Floating-point registers vec128_t v[128]; // 0x220 VMX128 vector registers - + vec128_t vscr_vec; // XER register: // Split to make it easier to do individual updates. uint8_t xer_ca; @@ -422,7 +422,7 @@ typedef struct alignas(64) PPCContext_s { // Value of last reserved load uint64_t reserved_val; ThreadState* thread_state; - uint8_t* virtual_membase; + uint8_t* virtual_membase; static std::string GetRegisterName(PPCRegister reg); std::string GetStringFromValue(PPCRegister reg) const; void SetValueFromString(PPCRegister reg, std::string value); @@ -432,6 +432,7 @@ typedef struct alignas(64) PPCContext_s { std::string& result) const; } PPCContext; #pragma pack(pop) +constexpr size_t ppcctx_size = sizeof(PPCContext); static_assert(sizeof(PPCContext) % 64 == 0, "64b padded"); } // namespace ppc diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 37ee10396..40d3f32cd 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -355,13 +355,18 @@ int InstrEmit_stvrxl128(PPCHIRBuilder& f, const InstrData& i) { } int InstrEmit_mfvscr(PPCHIRBuilder& f, const InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + // is this the right format? + + f.StoreVR(i.VX128_1.RB, + f.LoadContext(offsetof(PPCContext, vscr_vec), VEC128_TYPE)); + return 0; } int InstrEmit_mtvscr(PPCHIRBuilder& f, const InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + // is this the right format? + Value* v = f.LoadVR(i.VX128_1.RB); + f.StoreContext(offsetof(PPCContext, vscr_vec), v); + return 0; } int InstrEmit_vaddcuw(PPCHIRBuilder& f, const InstrData& i) { @@ -1105,7 +1110,7 @@ int InstrEmit_vmsum3fp128(PPCHIRBuilder& f, const InstrData& i) { // Dot product XYZ. 
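// ---- editor's sketch of the denormal flush noted below (illustrative) -------
// OPCODE_VECTOR_DENORMFLUSH, applied to the dot-product result a few lines
// down, turns any denormal lane into a signed zero and leaves every other
// encoding (including NaN and infinity) untouched. Per 32-bit lane:
#include <cstdint>
#include <cstring>
static float DenormFlushLaneSketch(float v) {
  uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  if ((bits & 0x7F800000u) == 0) {  // exponent field zero: +/-0 or a denormal
    bits &= 0x80000000u;            // keep only the sign bit
  }
  std::memcpy(&v, &bits, sizeof(v));
  return v;
}
// -----------------------------------------------------------------------------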
// (VD.xyzw) = (VA.x * VB.x) + (VA.y * VB.y) + (VA.z * VB.z) Value* v = f.DotProduct3(f.LoadVR(VX128_VA128), f.LoadVR(VX128_VB128)); - //chrispy: denormal outputs for Dot product are unconditionally made 0 + // chrispy: denormal outputs for Dot product are unconditionally made 0 v = f.VectorDenormFlush(v); f.StoreVR(VX128_VD128, v); return 0; diff --git a/src/xenia/cpu/ppc/ppc_emit_alu.cc b/src/xenia/cpu/ppc/ppc_emit_alu.cc index 8c71e08b5..6c9fd9120 100644 --- a/src/xenia/cpu/ppc/ppc_emit_alu.cc +++ b/src/xenia/cpu/ppc/ppc_emit_alu.cc @@ -336,6 +336,7 @@ int InstrEmit_mulhwx(PPCHIRBuilder& f, const InstrData& i) { XEINSTRNOTIMPLEMENTED(); return 1; } + Value* v = f.SignExtend(f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE)), INT64_TYPE); @@ -353,6 +354,7 @@ int InstrEmit_mulhwux(PPCHIRBuilder& f, const InstrData& i) { XEINSTRNOTIMPLEMENTED(); return 1; } + Value* v = f.ZeroExtend( f.MulHi(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), ARITHMETIC_UNSIGNED), diff --git a/src/xenia/cpu/ppc/ppc_emit_fpu.cc b/src/xenia/cpu/ppc/ppc_emit_fpu.cc index 979ca3aa9..872ee1ff4 100644 --- a/src/xenia/cpu/ppc/ppc_emit_fpu.cc +++ b/src/xenia/cpu/ppc/ppc_emit_fpu.cc @@ -46,7 +46,7 @@ int InstrEmit_faddx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_faddsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- (frA) + (frB) Value* v = f.Add(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB)); - v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE); + v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; @@ -63,7 +63,7 @@ int InstrEmit_fdivx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fdivsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- frA / frB Value* v = f.Div(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB)); - v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE); + v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; @@ -80,7 +80,7 @@ int InstrEmit_fmulx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- (frA) x (frC) Value* v = f.Mul(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC)); - v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE); + v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; @@ -88,9 +88,9 @@ int InstrEmit_fmulsx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fresx(PPCHIRBuilder& f, const InstrData& i) { // frD <- 1.0 / (frB) - Value* v = f.Convert(f.Div(f.LoadConstantFloat32(1.0f), - f.Convert(f.LoadFPR(i.A.FRB), FLOAT32_TYPE)), - FLOAT64_TYPE); + + Value* v = f.Recip(f.LoadFPR(i.A.FRB)); + v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; @@ -116,7 +116,7 @@ int InstrEmit_fsubx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fsubsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- (frA) - (frB) Value* v = f.Sub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRB)); - v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE); + v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; @@ -132,64 +132,63 @@ int InstrEmit_fselx(PPCHIRBuilder& f, const InstrData& i) { f.UpdateFPSCR(v, i.A.Rc); return 0; } - -int InstrEmit_fsqrtx(PPCHIRBuilder& f, const InstrData& i) { - // Double precision: +static int InstrEmit_fsqrt(PPCHIRBuilder& f, const InstrData& i, bool single) { // frD <- sqrt(frB) Value* v = f.Sqrt(f.LoadFPR(i.A.FRB)); + if (single) { + v = f.ToSingle(v); + } f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 
0; } +int InstrEmit_fsqrtx(PPCHIRBuilder& f, const InstrData& i) { + return InstrEmit_fsqrt(f, i, false); +} int InstrEmit_fsqrtsx(PPCHIRBuilder& f, const InstrData& i) { - // Single precision: - // frD <- sqrt(frB) - Value* v = f.Sqrt(f.LoadFPR(i.A.FRB)); - v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE); - f.StoreFPR(i.A.FRT, v); - f.UpdateFPSCR(v, i.A.Rc); - return 0; + return InstrEmit_fsqrt(f, i, true); } // Floating-point multiply-add (A-9) -int InstrEmit_fmaddx(PPCHIRBuilder& f, const InstrData& i) { +static int InstrEmit_fmadd(PPCHIRBuilder& f, const InstrData& i, bool single) { // frD <- (frA x frC) + frB Value* v = f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)); + if (single) { + v = f.ToSingle(v); + } f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; } +int InstrEmit_fmaddx(PPCHIRBuilder& f, const InstrData& i) { + return InstrEmit_fmadd(f, i, false); +} + int InstrEmit_fmaddsx(PPCHIRBuilder& f, const InstrData& i) { - // frD <- (frA x frC) + frB + return InstrEmit_fmadd(f, i, true); +} + +static int InstrEmit_fmsub(PPCHIRBuilder& f, const InstrData& i, bool single) { + // frD <- (frA x frC) - frB Value* v = - f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)); - v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE); + f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)); + if (single) { + v = f.ToSingle(v); + } f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; } - int InstrEmit_fmsubx(PPCHIRBuilder& f, const InstrData& i) { - // frD <- (frA x frC) - frB - Value* v = - f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)); - f.StoreFPR(i.A.FRT, v); - f.UpdateFPSCR(v, i.A.Rc); - return 0; + return InstrEmit_fmsub(f, i, false); } int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) { - // frD <- (frA x frC) - frB - Value* v = - f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)); - v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE); - f.StoreFPR(i.A.FRT, v); - f.UpdateFPSCR(v, i.A.Rc); - return 0; + return InstrEmit_fmsub(f, i, true); } int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) { @@ -205,7 +204,7 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] + frB) Value* v = f.Neg( f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); - v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE); + v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; @@ -224,7 +223,7 @@ int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] - frB) Value* v = f.Neg( f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); - v = f.Convert(f.Convert(v, FLOAT32_TYPE), FLOAT64_TYPE); + v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0;
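// ---- editor's sketch (not part of the diff) ----------------------------------
// The negated multiply-add/sub emitters above are still written out twice; the
// same single-flag consolidation used for fsqrt/fmadd/fmsub would cover them as
// well. Shown only to illustrate the pattern, using helpers this change already
// introduces:
static int InstrEmit_fnmadd_sketch(PPCHIRBuilder& f, const InstrData& i,
                                   bool single) {
  // frD <- -([frA x frC] + frB)
  Value* v = f.Neg(
      f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
  if (single) {
    v = f.ToSingle(v);
  }
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;
}
// -----------------------------------------------------------------------------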