From 79465708aa85cfc61a9fcd2b73e1bd0c79dd6bfc Mon Sep 17 00:00:00 2001 From: disjtqz Date: Sat, 30 Sep 2023 14:59:56 -0400 Subject: [PATCH] implement bit-perfect vrsqrtefp --- src/xenia/cpu/backend/x64/x64_backend.cc | 319 +++++++++++++++++- src/xenia/cpu/backend/x64/x64_backend.h | 21 +- src/xenia/cpu/backend/x64/x64_emitter.cc | 47 ++- src/xenia/cpu/backend/x64/x64_emitter.h | 6 +- src/xenia/cpu/backend/x64/x64_seq_vector.cc | 11 + src/xenia/cpu/backend/x64/x64_sequences.cc | 42 ++- .../compiler/passes/simplification_pass.cc | 32 ++ src/xenia/cpu/hir/value.cc | 80 +++++ src/xenia/cpu/hir/value.h | 10 +- 9 files changed, 540 insertions(+), 28 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index 1e1f3fedb..e9927dc09 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -73,6 +73,9 @@ class X64HelperEmitter : public X64Emitter { void* EmitTryAcquireReservationHelper(); void* EmitReservedStoreHelper(bool bit64 = false); + void* EmitScalarVRsqrteHelper(); + void* EmitVectorVRsqrteHelper(void* scalar_helper); + private: void* EmitCurrentForOffsets(const _code_offsets& offsets, size_t stack_size = 0); @@ -207,6 +210,8 @@ bool X64Backend::Initialize(Processor* processor) { if (!code_cache_->Initialize()) { return false; } + // Allocate emitter constant data. + emitter_data_ = X64Emitter::PlaceConstData(); // Generate thunks used to transition between jitted code and host code. XbyakAllocator allocator; @@ -233,7 +238,8 @@ bool X64Backend::Initialize(Processor* processor) { thunk_emitter.EmitTryAcquireReservationHelper(); reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false); reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true); - + vrsqrtefp_scalar_helper = thunk_emitter.EmitScalarVRsqrteHelper(); + vrsqrtefp_vector_helper = thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper); // Set the code cache to use the ResolveFunction thunk for default // indirections. assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull); @@ -243,9 +249,6 @@ bool X64Backend::Initialize(Processor* processor) { // Allocate some special indirections. code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF); - // Allocate emitter constant data. 
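PlaceConstData() now has to run before the thunk emitter: both vrsqrte helpers resolve constant addresses through LookupXMMConstantAddress*(), which reads emitter_data_. A minimal sketch of that ordering requirement, with an illustrative guard that is not part of the backend:

    // Constant data must exist before any helper that indexes it is emitted.
    emitter_data_ = X64Emitter::PlaceConstData();
    assert_not_zero(emitter_data_);  // illustrative only
    vrsqrtefp_scalar_helper = thunk_emitter.EmitScalarVRsqrteHelper();
    vrsqrtefp_vector_helper =
        thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper);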
- emitter_data_ = X64Emitter::PlaceConstData(); - // Setup exception callback ExceptionHandler::Install(&ExceptionCallbackThunk, this); if (cvars::record_mmio_access_exceptions) { @@ -844,7 +847,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk( _code_offsets code_offsets = {}; code_offsets.prolog = getSize(); pop(r8); // return address - + switch (stack_element_size) { case 4: mov(r11d, ptr[r8]); @@ -865,6 +868,300 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk( return EmitCurrentForOffsets(code_offsets); } +void* X64HelperEmitter::EmitScalarVRsqrteHelper() { + _code_offsets code_offsets = {}; + + Xbyak::Label L18, L2, L35, L4, L9, L8, L10, L11, L12, L13, L1; + Xbyak::Label LC1, _LCPI3_1; + Xbyak::Label handle_denormal_input; + Xbyak::Label specialcheck_1, convert_to_signed_inf_and_ret, handle_oddball_denormal; + + auto emulate_lzcnt_helper_unary_reg = [this](auto& reg, auto& scratch_reg) { + inLocalLabel(); + Xbyak::Label end_lzcnt; + bsr(scratch_reg, reg); + mov(reg, 0x20); + jz(end_lzcnt); + xor_(scratch_reg, 0x1F); + mov(reg, scratch_reg); + L(end_lzcnt); + outLocalLabel(); + }; + + vmovd(r8d, xmm0); + vmovaps(xmm1, xmm0); + mov(ecx, r8d); + //extract mantissa + and_(ecx, 0x7fffff); + mov(edx, ecx); + cmp(r8d, 0xff800000); + jz(specialcheck_1, CodeGenerator::T_NEAR); + //is exponent zero? + test(r8d, 0x7f800000); + jne(L18); + test(ecx, ecx); + jne(L2); + + L(L18); + //extract biased exponent and unbias + mov(r9d, r8d); + shr(r9d, 23); + movzx(r9d, r9b); + lea(eax, ptr[r9 - 127]); + cmp(r9d, 255); + jne(L4); + jmp(L35); + + L(L2); + + bt(GetBackendFlagsPtr(), kX64BackendNJMOn); + jnc(handle_denormal_input, CodeGenerator::T_NEAR); + + // handle denormal input with NJM on + // denorms get converted to zero w/ input sign, jump to our label + // that handles inputs of 0 for this + + jmp(convert_to_signed_inf_and_ret); + L(L35); + + vxorps(xmm0, xmm0, xmm0); + mov(eax, 128); + vcomiss(xmm1, xmm0); + jb(L4); + test(ecx, ecx); + jne(L8); + ret(); + + L(L4); + cmp(eax, 128); + jne(L9); + vxorps(xmm0, xmm0, xmm0); + vcomiss(xmm0, xmm1); + jbe(L9); + vmovss(xmm2, ptr[rip+LC1]); + vandps(xmm1, GetXmmConstPtr(XMMSignMaskF32)); + + test(edx, edx); + jne(L8); + vorps(xmm0, xmm2, xmm2); + ret(); + + L(L9); + test(edx, edx); + je(L10); + cmp(eax, 128); + jne(L11); + L(L8); + or_(r8d, 0x400000); + vmovd(xmm0, r8d); + ret(); + L(L10); + test(r9d, r9d); + jne(L11); + L(convert_to_signed_inf_and_ret); + not_(r8d); + shr(r8d, 31); + + lea(rdx, ptr[rip + _LCPI3_1]); + shl(r8d, 2); + vmovss(xmm0, ptr[r8 + rdx]); + ret(); + + L(L11); + vxorps(xmm2, xmm2, xmm2); + vmovss(xmm0, ptr[rip+LC1]); + vcomiss(xmm2, xmm1); + ja(L1, CodeGenerator::T_NEAR); + mov(ecx, 127); + sal(eax, 4); + sub(ecx, r9d); + mov(r9d, edx); + and_(eax, 16); + shr(edx, 9); + shr(r9d, 19); + and_(edx, 1023); + sar(ecx, 1); + or_(eax, r9d); + xor_(eax, 16); + mov(r9d, ptr[backend()->LookupXMMConstantAddress32(XMMVRsqrteTableStart) + + rax * 4]); + mov(eax, r9d); + shr(r9d, 16); + imul(edx, r9d); + sal(eax, 10); + and_(eax, 0x3fffc00); + sub(eax, edx); + bt(eax, 25); + jc(L12); + mov(edx, eax); + add(ecx, 6); + and_(edx, 0x1ffffff); + + if (IsFeatureEnabled(kX64EmitLZCNT)) { + lzcnt(edx, edx); + } else { + emulate_lzcnt_helper_unary_reg(edx, r9d); + } + + lea(r9d, ptr[rdx - 6]); + sub(ecx, edx); + if (IsFeatureEnabled(kX64EmitBMI2)) { + shlx(eax, eax, r9d); + } else { + xchg(ecx, r9d); + shl(eax, cl); + xchg(ecx, r9d); + } + + L(L12); + test(al, 5); + je(L13); + test(al, 2); + je(L13); + 
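Where LZCNT is unavailable, emulate_lzcnt_helper_unary_reg above falls back to BSR, which sets ZF for a zero input and otherwise yields the index of the highest set bit. A minimal C model of the value the fallback produces for 32-bit operands; the function name is illustrative:

    // Sketch only: matches lzcnt semantics, including lzcnt(0) == 32.
    static inline uint32_t lzcnt32_model(uint32_t value) {
      if (value == 0) {
        return 32;  // bsr leaves its destination undefined; the helper preloads 0x20
      }
      uint32_t count = 0;
      while (!(value & 0x80000000u)) {
        value <<= 1;  // shift until the highest set bit reaches bit 31
        ++count;
      }
      return count;  // 31 - bsr index, i.e. the xor with 0x1F in the helper
    }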
add(eax, 4); + + L(L13); + sal(ecx, 23); + and_(r8d, 0x80000000); + shr(eax, 2); + add(ecx, 0x3f800000); + and_(eax, 0x7fffff); + vxorps(xmm1, xmm1); + or_(ecx, r8d); + or_(ecx, eax); + vmovd(xmm0, ecx); + vaddss(xmm0, xmm1);//apply DAZ behavior to output + + L(L1); + ret(); + + + L(handle_denormal_input); + mov(r9d, r8d); + and_(r9d, 0x7FFFFFFF); + cmp(r9d, 0x400000); + jz(handle_oddball_denormal); + if (IsFeatureEnabled(kX64EmitLZCNT)) { + lzcnt(ecx, ecx); + } else { + emulate_lzcnt_helper_unary_reg(ecx, r9d); + } + + mov(r9d, 9); + mov(eax, -118); + lea(edx, ptr[rcx - 8]); + sub(r9d, ecx); + sub(eax, ecx); + if (IsFeatureEnabled(kX64EmitBMI2)) { + shlx(edx, r8d, edx); + } else { + xchg(ecx, edx); + // esi is just the value of xmm0's low word, so we can restore it from there + shl(r8d, cl); + mov(ecx, edx); // restore ecx, dont xchg because we're going to spoil edx anyway + mov(edx, r8d); + vmovd(r8d, xmm0); + } + and_(edx, 0x7ffffe); + jmp(L4); + + L(specialcheck_1); + //should be extremely rare + vmovss(xmm0, ptr[rip+LC1]); + ret(); + + L(handle_oddball_denormal); + not_(r8d); + lea(r9, ptr[rip + LC1]); + + shr(r8d, 31); + movss(xmm0, ptr[r9 + r8 * 4]); + ret(); + + L(_LCPI3_1); + dd(0xFF800000); + dd(0x7F800000); + L(LC1); + //the position of 7FC00000 here matters, this address will be indexed in handle_oddball_denormal + dd(0x7FC00000); + dd(0x5F34FD00); + + + code_offsets.prolog_stack_alloc = getSize(); + code_offsets.body = getSize(); + code_offsets.prolog = getSize(); + code_offsets.epilog = getSize(); + code_offsets.tail = getSize(); + return EmitCurrentForOffsets(code_offsets); +} + +void* X64HelperEmitter::EmitVectorVRsqrteHelper(void* scalar_helper) { + _code_offsets code_offsets = {}; + Xbyak::Label check_scalar_operation_in_vmx, actual_vector_version; + auto result_ptr = + GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_xmms[0])); + auto counter_ptr = GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_u64s[2])); + counter_ptr.setBit(64); + + //shuffle and xor to check whether all lanes are equal + //sadly has to leave the float pipeline for the vptest, which is moderate yikes + vmovhlps(xmm2, xmm0, xmm0); + vmovsldup(xmm1, xmm0); + vxorps(xmm1, xmm1, xmm0); + vxorps(xmm2, xmm2, xmm0); + vorps(xmm2, xmm1, xmm2); + vptest(xmm2, xmm2); + jnz(check_scalar_operation_in_vmx); + //jmp(scalar_helper, CodeGenerator::T_NEAR); + call(scalar_helper); + vshufps(xmm0, xmm0, xmm0, 0); + ret(); + + L(check_scalar_operation_in_vmx); + + vptest(xmm0, ptr[backend()->LookupXMMConstantAddress(XMMThreeFloatMask)]); + jnz(actual_vector_version); + vshufps(xmm0, xmm0,xmm0, _MM_SHUFFLE(3, 3, 3, 3)); + call(scalar_helper); + // this->DebugBreak(); + vinsertps(xmm0, xmm0, (3 << 4) | (0 << 6)); + + vblendps(xmm0, xmm0, ptr[backend()->LookupXMMConstantAddress(XMMFloatInf)], + 0b0111); + + ret(); + + + L(actual_vector_version); + + + xor_(ecx, ecx); + vmovaps(result_ptr, xmm0); + + mov(counter_ptr, rcx); + Xbyak::Label loop; + + L(loop); + lea(rax, result_ptr); + vmovss(xmm0, ptr[rax+rcx*4]); + call(scalar_helper); + mov(rcx, counter_ptr); + lea(rax, result_ptr); + vmovss(ptr[rax+rcx*4], xmm0); + inc(ecx); + cmp(ecx, 4); + mov(counter_ptr, rcx); + jl(loop); + vmovaps(xmm0, result_ptr); + ret(); + code_offsets.prolog_stack_alloc = getSize(); + code_offsets.body = getSize(); + code_offsets.epilog = getSize(); + code_offsets.tail = getSize(); + code_offsets.prolog = getSize(); + return EmitCurrentForOffsets(code_offsets); +} + void* X64HelperEmitter::EmitTryAcquireReservationHelper() { 
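EmitVectorVRsqrteHelper above takes one of three paths depending on the lanes of the input. A hedged C-level model of that dispatch; ScalarVRsqrteModel is an assumed stand-in for the scalar helper, and INFINITY comes from <cmath>:

    // Sketch only: mirrors the lane checks done with vptest above.
    static vec128_t VRsqrteVectorModel(const vec128_t& v) {
      vec128_t out;
      if (v.u32[0] == v.u32[1] && v.u32[0] == v.u32[2] && v.u32[0] == v.u32[3]) {
        // All four lanes identical: one scalar call, result splatted.
        float r = ScalarVRsqrteModel(v.f32[0]);
        out.f32[0] = out.f32[1] = out.f32[2] = out.f32[3] = r;
        return out;
      }
      if (v.u32[0] == 0 && v.u32[1] == 0 && v.u32[2] == 0) {
        // x, y and z are bit-pattern +0.0f: vrsqrtefp(+0.0f) is +inf, so only
        // the w lane needs the scalar routine.
        out.f32[0] = out.f32[1] = out.f32[2] = INFINITY;
        out.f32[3] = ScalarVRsqrteModel(v.f32[3]);
        return out;
      }
      // General case: spill to scratch and run the scalar helper per lane.
      for (unsigned i = 0; i < 4; ++i) {
        out.f32[i] = ScalarVRsqrteModel(v.f32[i]);
      }
      return out;
    }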
_code_offsets code_offsets = {}; code_offsets.prolog = getSize(); @@ -872,7 +1169,7 @@ void* X64HelperEmitter::EmitTryAcquireReservationHelper() { Xbyak::Label already_has_a_reservation; Xbyak::Label acquire_new_reservation; - btr(GetBackendFlagsPtr(), 1); + btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit); mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_))); jc(already_has_a_reservation); @@ -888,7 +1185,7 @@ void* X64HelperEmitter::EmitTryAcquireReservationHelper() { // set flag on local backend context for thread to indicate our previous // attempt to get the reservation succeeded setnc(r9b); // success = bitmap did not have a set bit at the idx - shl(r9b, 1); + shl(r9b, kX64BackendHasReserveBit); mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)), rdx); @@ -917,7 +1214,7 @@ void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) { Xbyak::Label somehow_double_cleared; // carry must be set + zero flag must be set - btr(GetBackendFlagsPtr(), 1); + btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit); jnc(done); @@ -1097,7 +1394,7 @@ void X64Backend::InitializeBackendContext(void* ctx) { : nullptr; bctx->current_stackpoint_depth = 0; bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR; - bctx->flags = 0; + bctx->flags = (1U << kX64BackendNJMOn); // NJM on by default // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png bctx->Ox1000 = 0x1000; bctx->guest_tick_count = Clock::GetGuestTickCountPointer(); @@ -1128,7 +1425,9 @@ void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) { uint32_t control = mode & 7; _mm_setcsr(mxcsr_table[control]); bctx->mxcsr_fpu = mxcsr_table[control]; - ((ppc::PPCContext*)ctx)->fpscr.bits.rn = control; + auto ppc_context = ((ppc::PPCContext*)ctx); + ppc_context->fpscr.bits.rn = control; + ppc_context->fpscr.bits.ni = control >> 2; } bool X64Backend::PopulatePseudoStacktrace(GuestPseudoStackTrace* st) { diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h index 97b37a692..a7676ec2d 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.h +++ b/src/xenia/cpu/backend/x64/x64_backend.h @@ -61,11 +61,22 @@ struct X64BackendStackpoint { // use unsigned guest_return_address_; }; +enum : uint32_t { + kX64BackendMXCSRModeBit = 0, + kX64BackendHasReserveBit = 1, + kX64BackendNJMOn = 2, //non-java mode bit is currently set. for use in software fp routines + kX64BackendNonIEEEMode = 3, //non-ieee mode is currently enabled for scalar fpu. 
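  // How these bits are consumed (per the rest of this change):
  //   kX64BackendMXCSRModeBit  - toggled via bts/btr in ChangeMxcsrMode.
  //   kX64BackendHasReserveBit - set/cleared by the reservation helpers.
  //   kX64BackendNJMOn         - on by default (InitializeBackendContext),
  //                              toggled by SET_NJM_I8, tested by the scalar
  //                              vrsqrte helper to pick the denormal path.
  //   kX64BackendNonIEEEMode   - mirrors FPSCR.NI, tracked by
  //                              SET_ROUNDING_MODE_I32.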
+}; // located prior to the ctx register // some things it would be nice to have be per-emulator instance instead of per // context (somehow placing a global X64BackendCtx prior to membase, so we can // negatively index the membase reg) struct X64BackendContext { + union { + __m128 helper_scratch_xmms[4]; + uint64_t helper_scratch_u64s[8]; + uint32_t helper_scratch_u32s[16]; + }; ReserveHelper* reserve_helper_; uint64_t cached_reserve_value_; // guest_tick_count is used if inline_loadclock is used @@ -147,6 +158,13 @@ class X64Backend : public Backend { virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override; virtual bool PopulatePseudoStacktrace(GuestPseudoStackTrace* st) override; void RecordMMIOExceptionForGuestInstruction(void* host_address); + + uint32_t LookupXMMConstantAddress32(unsigned index) { + return static_cast(emitter_data() + sizeof(vec128_t) * index); + } + void* LookupXMMConstantAddress(unsigned index) { + return reinterpret_cast(emitter_data() + sizeof(vec128_t) * index); + } #if XE_X64_PROFILER_AVAILABLE == 1 uint64_t* GetProfilerRecordForFunction(uint32_t guest_address); #endif @@ -173,7 +191,8 @@ class X64Backend : public Backend { void* try_acquire_reservation_helper_ = nullptr; void* reserved_store_32_helper = nullptr; void* reserved_store_64_helper = nullptr; - + void* vrsqrtefp_vector_helper = nullptr; + void* vrsqrtefp_scalar_helper = nullptr; private: #if XE_X64_PROFILER_AVAILABLE == 1 GuestProfilerData profiler_data_; diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index da9816a35..239c14b75 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -982,6 +982,16 @@ static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1, return result; } +static inline vec128_t v128_setr_words(uint32_t v0, uint32_t v1, uint32_t v2, + uint32_t v3) { + vec128_t result; + result.u32[0] = v0; + result.u32[1] = v1; + result.u32[2] = v2; + result.u32[3] = v3; + return result; +} + static const vec128_t xmm_consts[] = { /* XMMZero */ vec128f(0.0f), /* XMMByteSwapMask */ @@ -1151,7 +1161,19 @@ static const vec128_t xmm_consts[] = { vec128b((uint8_t)0x83), /*XMMVSRShlByteshuf*/ v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80), // XMMVSRMask - vec128b(1)}; + vec128b(1), + //XMMVRsqrteTableStart + v128_setr_words(0x568B4FD, 0x4F3AF97, 0x48DAAA5, 0x435A618), + v128_setr_words(0x3E7A1E4, 0x3A29DFE, 0x3659A5C, 0x32E96F8), + v128_setr_words(0x2FC93CA, 0x2D090CE, 0x2A88DFE, 0x2838B57), + v128_setr_words(0x26188D4, 0x2438673, 0x2268431, 0x20B820B), + v128_setr_words(0x3D27FFA, 0x3807C29, 0x33878AA, 0x2F97572), + v128_setr_words(0x2C27279, 0x2926FB7, 0x2666D26, 0x23F6AC0), + v128_setr_words(0x21D6881, 0x1FD6665, 0x1E16468, 0x1C76287), + v128_setr_words(0x1AF60C1, 0x1995F12, 0x1855D79, 0x1735BF4), + //XMMVRsqrteTableBase + vec128i(0) //filled in later +}; void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { for (auto& vec : xmm_consts) { @@ -1223,7 +1245,17 @@ uintptr_t X64Emitter::PlaceConstData() { // The pointer must not be greater than 31 bits. 
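Because the whole constant block sits below 2^31 (asserted just below), the eight XMMVRsqrteTableStart vectors added above can be addressed with a plain 32-bit displacement and read as one flat 32-entry table. A hedged sketch of that view; 'backend' and 'index' are assumed inputs, and the scalar helper derives its index from the operand's exponent parity plus the top four mantissa bits:

    // Sketch only: the vec128 constants double as a uint32_t[32] lookup table.
    static uint32_t VRsqrteTableEntryModel(X64Backend* backend, uint32_t index) {
      const uint32_t* table = reinterpret_cast<const uint32_t*>(
          backend->LookupXMMConstantAddress(XMMVRsqrteTableStart));
      return table[index & 31];
    }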
assert_zero(reinterpret_cast(mem) & ~0x7FFFFFFF); + std::memcpy(mem, xmm_consts, sizeof(xmm_consts)); + /* + set each 32-bit element of the constant XMMVRsqrteTableBase to be the address of the start of the constant XMMVRsqrteTableStart + this + */ + vec128_t* deferred_constants = reinterpret_cast(mem); + vec128_t* vrsqrte_table_base = &deferred_constants[XMMVRsqrteTableBase]; + uint32_t ptr_to_vrsqrte_table32 = static_cast(reinterpret_cast(&deferred_constants[XMMVRsqrteTableStart])); + *vrsqrte_table_base = vec128i(ptr_to_vrsqrte_table32); + memory::Protect(mem, kConstDataSize, memory::PageAccess::kReadOnly, nullptr); return reinterpret_cast(mem); @@ -1237,8 +1269,9 @@ void X64Emitter::FreeConstData(uintptr_t data) { Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) { // Load through fixed constant table setup by PlaceConstData. // It's important that the pointer is not signed, as it will be sign-extended. - return ptr[reinterpret_cast(backend_->emitter_data() + - sizeof(vec128_t) * id)]; + void* emitter_data_ptr = backend_->LookupXMMConstantAddress(static_cast(id)); + xenia_assert(reinterpret_cast(emitter_data_ptr) < (1ULL << 31));//must not have signbit set + return ptr[emitter_data_ptr]; } // Implies possible StashXmm(0, ...)! void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) { @@ -1634,9 +1667,9 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) { } else { // even if already set, we still need to update flags to reflect // our mode if (new_mode == MXCSRMode::Fpu) { - btr(GetBackendFlagsPtr(), 0); + btr(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit); } else if (new_mode == MXCSRMode::Vmx) { - bts(GetBackendFlagsPtr(), 0); + bts(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit); } else { assert_unhandled_case(new_mode); } @@ -1646,11 +1679,11 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) { if (!already_set) { if (new_mode == MXCSRMode::Fpu) { LoadFpuMxcsrDirect(); - btr(GetBackendFlagsPtr(), 0); + btr(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit); return true; } else if (new_mode == MXCSRMode::Vmx) { LoadVmxMxcsrDirect(); - bts(GetBackendFlagsPtr(), 0); + bts(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit); return true; } else { assert_unhandled_case(new_mode); diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 4fdeab4a4..d6ca1e028 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -174,7 +174,9 @@ enum XmmConst { XMMSTVLShuffle, XMMSTVRSwapMask, // swapwordmask with bit 7 set XMMVSRShlByteshuf, - XMMVSRMask + XMMVSRMask, + XMMVRsqrteTableStart, + XMMVRsqrteTableBase = XMMVRsqrteTableStart + (32 / 4), //32 4-byte elements in table, 4 4-byte elements fit in each xmm }; using amdfx::xopcompare_e; @@ -308,7 +310,7 @@ class X64Emitter : public Xbyak::CodeGenerator { size_t stack_size() const { return stack_size_; } SimdDomain DeduceSimdDomain(const hir::Value* for_value); - + void ForgetMxcsrMode() { mxcsr_mode_ = MXCSRMode::Unknown; } /* returns true if had to load mxcsr. 
DOT_PRODUCT can use this to skip diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 791b9a87d..e97040f17 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -3376,17 +3376,28 @@ struct SET_NJM_I8 : Sequence> { auto addr_vmx = e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_vmx)); addr_vmx.setBit(32); + auto flags_ptr = e.GetBackendFlagsPtr(); if (i.src1.is_constant) { if (i.src1.constant() == 0) { // turn off daz/flush2z e.mov(addr_vmx, _MM_MASK_MASK); + e.btr(flags_ptr, kX64BackendNJMOn); } else { e.mov(addr_vmx, DEFAULT_VMX_MXCSR); + e.bts(flags_ptr, kX64BackendNJMOn); } } else { + e.mov(e.eax, flags_ptr); + e.mov(e.edx, 1U << kX64BackendNJMOn); + e.mov(e.ecx, e.edx); + e.not_(e.ecx); + e.and_(e.ecx, e.eax); + e.or_(e.edx, e.eax); e.test(i.src1, i.src1); + e.cmove(e.edx, e.ecx); + e.mov(flags_ptr, e.edx); e.mov(e.edx, DEFAULT_VMX_MXCSR); e.mov(e.eax, _MM_MASK_MASK); diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 5f428ad6c..e60e6c33b 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -2123,12 +2123,19 @@ struct RSQRT_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { e.ChangeMxcsrMode(MXCSRMode::Vmx); Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3); - if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) { - e.vrsqrt14ps(i.dest, src1); + /* + the vast majority of inputs to vrsqrte come from vmsum3 or vmsum4 as part + of a vector normalization sequence. in fact, its difficult to find uses of vrsqrte in titles + that have inputs which do not come from vmsum. + */ + if (i.src1.value && i.src1.value->AllFloatVectorLanesSameValue()) { + e.vmovss(e.xmm0, src1); + e.call(e.backend()->vrsqrtefp_scalar_helper); + e.vshufps(i.dest, e.xmm0, e.xmm0, 0); } else { - e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMOne)); - e.vsqrtps(e.xmm1, src1); - e.vdivps(i.dest, e.xmm0, e.xmm1); + e.vmovaps(e.xmm0, src1); + e.call(e.backend()->vrsqrtefp_vector_helper); + e.vmovaps(i.dest, e.xmm0); } } }; @@ -3183,16 +3190,37 @@ struct SET_ROUNDING_MODE_I32 // removed the And with 7 and hoisted that and into the InstrEmit_'s that // generate OPCODE_SET_ROUNDING_MODE so that it can be constant folded and // backends dont have to worry about it + auto flags_ptr = e.GetBackendFlagsPtr(); if (i.src1.is_constant) { - e.mov(e.eax, mxcsr_table[i.src1.constant()]); + unsigned constant_value = i.src1.constant(); + e.mov(e.eax, mxcsr_table[constant_value]); + + if (constant_value & 4) { + e.or_(flags_ptr, 1U << kX64BackendNonIEEEMode); + } + else { + e.btr(flags_ptr, kX64BackendNonIEEEMode); + } e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH], e.eax); e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.eax); e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH]); } else { - e.mov(e.ecx, i.src1); + //can andnot, but this is a very infrequently used opcode + e.mov(e.eax, 1U << kX64BackendNonIEEEMode); + e.mov(e.edx, e.eax); + e.not_(e.edx); + e.mov(e.ecx, flags_ptr); + //edx = flags w/ non ieee cleared + e.and_(e.edx, e.ecx); + //eax = flags w/ non ieee set + e.or_(e.eax, e.ecx); + e.bt(i.src1, 2); + e.mov(e.ecx, i.src1); + e.cmovc(e.edx, e.eax); e.mov(e.rax, uintptr_t(mxcsr_table)); + e.mov(flags_ptr, e.edx); e.mov(e.edx, e.ptr[e.rax + e.rcx * 4]); // this was not here e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.edx); diff --git 
a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index da6f0dfe1..466bf21a5 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -1370,6 +1370,38 @@ bool SimplificationPass::SimplifyVectorOps(hir::Instr* i, } } } + + /* + splatting a 32-bit value extracted from a vector where all 4 32-bit values are the same should be eliminated and + instead use the vector extracted from, which will be identical + have seen this happen, some games vmsum and then splat the low float to all 4 floats, even though it already is there + */ + if (opc == OPCODE_SPLAT) { + if (i->dest->type == VEC128_TYPE) { + auto splatted_value = i->src1.value; + auto splat_type = splatted_value->type; + if (splat_type == FLOAT32_TYPE || splat_type == INT32_TYPE) { + //its a splat of a fourbyte value, check the definition + auto splat_input_definition = splatted_value->GetDefSkipAssigns(); + if (splat_input_definition) { + auto defining_opcode = splat_input_definition->GetOpcodeNum(); + if (defining_opcode == OPCODE_EXTRACT) { + auto value_extracted_from = splat_input_definition->src1.value; + if (value_extracted_from->type == VEC128_TYPE) { + + xenia_assert(splat_input_definition->dest->type == splat_type); + + if (value_extracted_from->AllFloatVectorLanesSameValue()) { + i->Replace(&OPCODE_ASSIGN_info,0); + i->set_src1(value_extracted_from); + return true; + } + } + } + } + } + } + } return false; } bool SimplificationPass::SimplifyVectorOps(hir::HIRBuilder* builder) { diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index 84b28a8f2..95b945fa9 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -1805,6 +1805,86 @@ bool Value::AllUsesByOneInsn() const { } return true; } +bool Value::AllFloatVectorLanesSameValue(const hir::Value* for_value, + uint32_t current_depth) { + // limit recursion, otherwise this function will slow down emission + if (current_depth == 16) { + return false; + } + using namespace hir; + hir::Instr* definition; + Opcode definition_opcode_number; +re_enter: + definition = for_value->def; + if (!definition) { + xenia_assert(for_value->IsConstant()); + + auto&& constant_value = for_value->constant.v128; + for (unsigned constant_lane_index = 1; constant_lane_index < 4; ++constant_lane_index) { + if (constant_value.u32[0] != constant_value.u32[constant_lane_index]) { + return false; + } + } + return true; + } + definition_opcode_number = definition->GetOpcodeNum(); + + if (definition_opcode_number == OPCODE_ASSIGN) { + for_value = definition->src1.value; + goto re_enter; + } + + if (definition_opcode_number == OPCODE_VECTOR_DENORMFLUSH) { + for_value = definition->src1.value; + goto re_enter; + } + /* + vmsum propagates its result to every lane + */ + if (definition_opcode_number == OPCODE_DOT_PRODUCT_4 || + definition_opcode_number == OPCODE_DOT_PRODUCT_3) { + return true; + } + //if splat of 32-bit value type, return true + //technically a splat of int16 or int8 would also produce the same "float" in all lanes + //but i think its best to keep this function focused on specifically float data + if (definition_opcode_number == OPCODE_SPLAT) { + if (definition->dest->type == VEC128_TYPE) { + auto splat_src_value_type = definition->src1.value->type; + if (splat_src_value_type == INT32_TYPE || + splat_src_value_type == FLOAT32_TYPE) { + return true; + } + } + } + + switch (definition_opcode_number) { + //all of these opcodes produce the 
same value for the same input + case OPCODE_RSQRT: + case OPCODE_RECIP: + case OPCODE_POW2: + case OPCODE_LOG2: + for_value = definition->src1.value; + goto re_enter; + + //binary opcodes + case OPCODE_ADD: + case OPCODE_SUB: + case OPCODE_MUL: + if (!AllFloatVectorLanesSameValue(definition->src1.value, + current_depth + 1)) { + return false; + } + for_value = definition->src2.value; + goto re_enter; + default: + break; + } + + return false; +} + + } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index 3a1cd442e..2eec39e00 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -618,8 +618,16 @@ class Value { bool MaybeFloaty() const { return type == FLOAT32_TYPE || type == FLOAT64_TYPE || type == VEC128_TYPE; } - + bool AllFloatVectorLanesSameValue() const { + return Value::AllFloatVectorLanesSameValue(this); + } private: + /* +returns true if for_value (which must be VEC128_TYPE) has the same value in +every float +*/ + static bool AllFloatVectorLanesSameValue(const hir::Value* for_value, + uint32_t current_depth = 0); static bool CompareInt8(Opcode opcode, Value* a, Value* b); static bool CompareInt16(Opcode opcode, Value* a, Value* b); static bool CompareInt32(Opcode opcode, Value* a, Value* b);
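Taken together, the new analysis and the simplification fold the common pattern where a title splats the already-broadcast lane of a dot product. A schematic illustration, with the HIR written informally rather than in the dumper's exact syntax:

    // v1 = dot_product_3 v_a, v_b   ; result already lives in every float lane
    // v2 = extract v1, 0, float32
    // v3 = splat v2                 ; redundant once v1's lanes are known equal
    //
    // after SimplifyVectorOps:
    // v3 = assign v1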