From b5ef3453c76f46ad21a8ac3b802793d59c3be665 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Sun, 21 Aug 2022 12:32:33 -0700
Subject: [PATCH] Disable most XOP code by default; either the manual is wrong
 about the shifts or we are assembling them incorrectly. Will return to it
 later and fix. Comparisons and select done via XOP are fine, though.

---
 src/xenia/cpu/backend/x64/x64_emitter.cc    |   9 +-
 src/xenia/cpu/backend/x64/x64_emitter.h     |  31 +-
 src/xenia/cpu/backend/x64/x64_seq_vector.cc | 304 ++++++++++++++------
 src/xenia/cpu/backend/x64/x64_sequences.cc  |   2 +-
 src/xenia/cpu/ppc/ppc_emit_fpu.cc           |  12 +-
 5 files changed, 255 insertions(+), 103 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index a6c98c955..d1394d202 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -1030,8 +1030,13 @@ static const vec128_t xmm_consts[] = {
     /* XMMF16PackLCPI6 */
-    vec128i(0x8000)
-
+    vec128i(0x8000),
+    /* XMMXOPByteShiftMask */
+    vec128b(7),
+    /* XMMXOPWordShiftMask */
+    vec128s(15),
+    /* XMMXOPDwordShiftMask */
+    vec128i(31)
 };
 
 void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h
index c7773f08f..01027cc0c 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@@ -167,7 +167,11 @@ enum XmmConst {
   XMMF16PackLCPI3,
   XMMF16PackLCPI4,
   XMMF16PackLCPI5,
-  XMMF16PackLCPI6
+  XMMF16PackLCPI6,
+  XMMXOPByteShiftMask,
+  XMMXOPWordShiftMask,
+  XMMXOPDwordShiftMask,
+
 };
 using amdfx::xopcompare_e;
 using Xbyak::Xmm;
@@ -383,7 +387,30 @@ class X64Emitter : public Xbyak::CodeGenerator {
   DEFINECOMPARE(vpcomud);
   DEFINECOMPARE(vpcomq);
   DEFINECOMPARE(vpcomuq);
-  #undef DEFINECOMPARE
+#undef DEFINECOMPARE
+
+#define DEFINESHIFTER(name)                                                   \
+  void name(Xmm dest, Xmm src1, Xmm src2) {                                   \
+    auto xop_bytes =                                                          \
+        amdfx::operations::name(dest.getIdx(), src1.getIdx(), src2.getIdx()); \
+    EmitXOP(xop_bytes);                                                       \
+  }
+
+  DEFINESHIFTER(vprotb)
+  DEFINESHIFTER(vprotw)
+  DEFINESHIFTER(vprotd)
+  DEFINESHIFTER(vprotq)
+
+  DEFINESHIFTER(vpshab)
+  DEFINESHIFTER(vpshaw)
+  DEFINESHIFTER(vpshad)
+  DEFINESHIFTER(vpshaq)
+
+  DEFINESHIFTER(vpshlb)
+  DEFINESHIFTER(vpshlw)
+  DEFINESHIFTER(vpshld)
+  DEFINESHIFTER(vpshlq)
+
  protected:
   void* Emplace(const EmitFunctionInfo& func_info,
                 GuestFunction* function = nullptr);
diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
index 0de48e5c3..46eb285cf 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -19,6 +19,16 @@
 #include "xenia/base/cvar.h"
 #include "xenia/cpu/backend/x64/x64_stack_layout.h"
 
+DEFINE_bool(xop_rotates, false, "rotate via xop", "X64");
+
+DEFINE_bool(xop_left_shifts, false, "shl via xop", "X64");
+
+DEFINE_bool(xop_right_shifts, false, "shr via xop", "X64");
+
+DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "X64");
+
+DEFINE_bool(xop_compares, true, "compare via xop", "X64");
+
 namespace xe {
 namespace cpu {
 namespace backend {
@@ -407,7 +417,7 @@ struct VECTOR_COMPARE_SGE_V128
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     EmitAssociativeBinaryXmmOp(
         e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
-          if (e.IsFeatureEnabled(kX64EmitXOP)) {
+          if (cvars::xop_compares && e.IsFeatureEnabled(kX64EmitXOP)) {
             switch (i.instr->flags) {
               case INT8_TYPE:
                 e.vpcomb(dest, src1, src2, xopcompare_e::GTE);
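[Editor's note] The XMMXOP*ShiftMask constants added above (7, 15, 31) exist because the XOP VPSHL/VPSHA/VPROT family reads each lane of its second source as a signed byte count: positive counts shift left, negative counts shift right. A scalar model of one byte lane of VPSHLB as the sequences below expect it to behave (an illustrative sketch written for this note, not code from the tree):

    #include <cstdint>

    // One byte lane of XOP VPSHLB: the per-lane count is signed; positive
    // shifts left, negative shifts right (logical). The emitted vpand with
    // vec128b(7) keeps the count magnitude in [0, 7] before any negation.
    static uint8_t vpshlb_lane(uint8_t value, int8_t count) {
      if (count >= 0) {
        return static_cast<uint8_t>(value << (count & 7));
      }
      return static_cast<uint8_t>(value >> ((-count) & 7));
    }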
@@ -775,23 +785,52 @@ static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
   // Store result and return it.
   return _mm_load_si128(reinterpret_cast<__m128i*>(value));
 }
-
+static XmmConst GetShiftmaskForType(unsigned typ) {
+  if (typ == INT8_TYPE) {
+    return XMMXOPByteShiftMask;
+  } else if (typ == INT16_TYPE) {
+    return XMMXOPWordShiftMask;
+  } else {
+    return XMMXOPDwordShiftMask;
+  }
+}
 struct VECTOR_SHL_V128
     : Sequence<VECTOR_SHL_V128, I<OPCODE_VECTOR_SHL, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
+    if (cvars::xop_left_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpshlb(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vpshlw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vpshld(i.dest, src1, e.xmm2);
+          break;
+      }
+
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          EmitInt8(e, i);
+          break;
+        case INT16_TYPE:
+          EmitInt16(e, i);
+          break;
+        case INT32_TYPE:
+          EmitInt32(e, i);
+          break;
+        default:
+          assert_always();
+          break;
+      }
     }
   }
@@ -1061,19 +1100,45 @@ static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
 struct VECTOR_SHR_V128
     : Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
+    if (cvars::xop_right_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      e.vpcmpeqb(e.xmm3, e.xmm3);
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpsignb(e.xmm2, e.xmm3);
+          e.vpshlb(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vpsignw(e.xmm2, e.xmm3);
+          e.vpshlw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vpsignd(e.xmm2, e.xmm3);
+          e.vpshld(i.dest, src1, e.xmm2);
+          break;
+      }
+
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          EmitInt8(e, i);
+          break;
+        case INT16_TYPE:
+          EmitInt16(e, i);
+          break;
+        case INT32_TYPE:
+          EmitInt32(e, i);
+          break;
+        default:
+          assert_always();
+          break;
+      }
     }
   }
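[Editor's note] On the count-negation idiom in the VECTOR_SHR (and, below, VECTOR_SHA) XOP paths: vpcmpeqb(xmm3, xmm3) sets every byte of xmm3 to 0xFF (-1), and vpsignb/w/d(xmm2, xmm3) then negates every lane of xmm2, because vpsign negates lanes wherever the second operand is negative. Since VPSHL/VPSHA treat negative counts as right shifts, the masked counts become right-shift counts. A hedged intrinsics model of that count preparation (illustration only; the function name is mine):

    #include <emmintrin.h>  // SSE2
    #include <tmmintrin.h>  // SSSE3: _mm_sign_epi8

    // Mirror of the emitted sequence for byte lanes: mask per-byte counts to
    // [0, 7], then negate them so XOP VPSHLB performs a right shift.
    static __m128i PrepareByteRightShiftCounts(__m128i counts) {
      const __m128i mask = _mm_set1_epi8(7);                    // vpand mask
      const __m128i all_ones = _mm_cmpeq_epi8(counts, counts);  // every byte -1
      const __m128i masked = _mm_and_si128(counts, mask);
      // _mm_sign_epi8(a, b) negates lanes of a where b < 0; with b all -1s
      // this is a plain per-lane negation, matching vpsignb(xmm2, xmm3).
      return _mm_sign_epi8(masked, all_ones);
    }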
@@ -1244,19 +1309,45 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128);
 struct VECTOR_SHA_V128
     : Sequence<VECTOR_SHA_V128, I<OPCODE_VECTOR_SHA, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
+    if (cvars::xop_arithmetic_right_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      e.vpcmpeqb(e.xmm3, e.xmm3);
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpsignb(e.xmm2, e.xmm3);
+          e.vpshab(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vpsignw(e.xmm2, e.xmm3);
+          e.vpshaw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vpsignd(e.xmm2, e.xmm3);
+          e.vpshad(i.dest, src1, e.xmm2);
+          break;
+      }
+
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          EmitInt8(e, i);
+          break;
+        case INT16_TYPE:
+          EmitInt16(e, i);
+          break;
+        case INT32_TYPE:
+          EmitInt32(e, i);
+          break;
+        default:
+          assert_always();
+          break;
+      }
     }
   }
@@ -1432,55 +1523,29 @@ struct VECTOR_ROTATE_LEFT_V128
     : Sequence<VECTOR_ROTATE_LEFT_V128,
                I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        // TODO(benvanik): native version (with shift magic).
-        if (i.src2.is_constant) {
-          e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
-        } else {
-          e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
-        }
-        e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
-        e.CallNativeSafe(
-            reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
-        e.vmovaps(i.dest, e.xmm0);
-        break;
-      case INT16_TYPE:
-        // TODO(benvanik): native version (with shift magic).
-        if (i.src2.is_constant) {
-          e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
-        } else {
-          e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
-        }
-        e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
-        e.CallNativeSafe(
-            reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
-        e.vmovaps(i.dest, e.xmm0);
-        break;
-      case INT32_TYPE: {
-        if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
-          e.vprolvd(i.dest, i.src1, i.src2);
-        } else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
-          Xmm temp = i.dest;
-          if (i.dest == i.src1 || i.dest == i.src2) {
-            temp = e.xmm2;
-          }
-          // Shift left (to get high bits):
-          if (i.src2.is_constant) {
-            e.LoadConstantXmm(temp, i.src2.constant());
-            e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
-          } else {
-            e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
-          }
-          e.vpsllvd(e.xmm1, i.src1, e.xmm0);
-          // Shift right (to get low bits):
-          e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
-          e.vpsubd(temp, e.xmm0);
-          e.vpsrlvd(i.dest, i.src1, temp);
-          // Merge:
-          e.vpor(i.dest, e.xmm1);
-        } else {
-          // TODO(benvanik): non-AVX2 native version.
+    if (cvars::xop_rotates && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vprotb(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vprotw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vprotd(i.dest, src1, e.xmm2);
+          break;
+      }
+
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          // TODO(benvanik): native version (with shift magic).
           if (i.src2.is_constant) {
             e.lea(e.GetNativeParam(1),
                   e.StashConstantXmm(1, i.src2.constant()));
@@ -1489,14 +1554,63 @@ struct VECTOR_ROTATE_LEFT_V128
           } else {
             e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
           }
           e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
           e.CallNativeSafe(
-              reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
+              reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
           e.vmovaps(i.dest, e.xmm0);
+          break;
+        case INT16_TYPE:
+          // TODO(benvanik): native version (with shift magic).
+          if (i.src2.is_constant) {
+            e.lea(e.GetNativeParam(1),
+                  e.StashConstantXmm(1, i.src2.constant()));
+          } else {
+            e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
+          }
+          e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
+          e.CallNativeSafe(
+              reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
+          e.vmovaps(i.dest, e.xmm0);
+          break;
+        case INT32_TYPE: {
+          if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+            e.vprolvd(i.dest, i.src1, i.src2);
+          } else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
+            Xmm temp = i.dest;
+            if (i.dest == i.src1 || i.dest == i.src2) {
+              temp = e.xmm2;
+            }
+            // Shift left (to get high bits):
+            if (i.src2.is_constant) {
+              e.LoadConstantXmm(temp, i.src2.constant());
+              e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
+            } else {
+              e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
+            }
+            e.vpsllvd(e.xmm1, i.src1, e.xmm0);
+            // Shift right (to get low bits):
+            e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
+            e.vpsubd(temp, e.xmm0);
+            e.vpsrlvd(i.dest, i.src1, temp);
+            // Merge:
+            e.vpor(i.dest, e.xmm1);
+          } else {
+            // TODO(benvanik): non-AVX2 native version.
+            if (i.src2.is_constant) {
+              e.lea(e.GetNativeParam(1),
+                    e.StashConstantXmm(1, i.src2.constant()));
+            } else {
+              e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
+            }
+            e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
+            e.CallNativeSafe(
+                reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
+            e.vmovaps(i.dest, e.xmm0);
+          }
+          break;
         }
-        break;
+        default:
+          assert_always();
+          break;
       }
-      default:
-        assert_always();
-        break;
     }
   }
 };
diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index fb8ccba20..bdadc5e74 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -781,7 +781,7 @@ struct SELECT_V128_V128
     } else if (mayblend == PermittedBlend::Ps) {
       e.vblendvps(i.dest, src2, src3, src1);
     } else {
-      if (1 && e.IsFeatureEnabled(kX64EmitXOP)) {
+      if (1 && e.IsFeatureEnabled(kX64EmitXOP)) {
         XELOGCPU("Doing vpcmov!!");
         e.vpcmov(i.dest, src3, src2, src1);
       } else {
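[Editor's note] For reference on the select path above: XOP VPCMOV is a pure bitwise merge, which is why it can implement select for any lane width in one instruction. With the operand order emitted here, vpcmov(dest, src3, src2, src1) computes dest = (src3 & src1) | (src2 & ~src1), bit by bit, with src1 acting as the condition mask. A scalar model per the AMD XOP description (my illustration, not code from the tree):

    #include <cstdint>

    // Bitwise model of VPCMOV as SELECT_V128_V128 emits it: for every bit,
    // take 'a' where the selector bit is 1, else take 'b'.
    static uint64_t vpcmov_bits(uint64_t a, uint64_t b, uint64_t sel) {
      return (a & sel) | (b & ~sel);
    }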
diff --git a/src/xenia/cpu/ppc/ppc_emit_fpu.cc b/src/xenia/cpu/ppc/ppc_emit_fpu.cc
index 421edbdf7..c491ad09a 100644
--- a/src/xenia/cpu/ppc/ppc_emit_fpu.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_fpu.cc
@@ -452,7 +452,7 @@ Affected: FX, FEX, VX, OX (if Rc = 1)
   */
   // f.UpdateFPSCR(v, i.X.Rc);
   if (i.X.Rc) {
-
+    // todo
   }
   return 0;
 }
@@ -469,7 +469,10 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) {
   // frD <- !abs(frB)
   Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB)));
   f.StoreFPR(i.X.RT, v);
-  f.UpdateFPSCR(v, i.X.Rc);
+  // f.UpdateFPSCR(v, i.X.Rc);
+  if (i.X.Rc) {
+    // todo
+  }
   return 0;
 }
 
@@ -477,7 +480,10 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) {
   // frD <- ¬ frB[0] || frB[1-63]
   Value* v = f.Neg(f.LoadFPR(i.X.RB));
   f.StoreFPR(i.X.RT, v);
-  f.UpdateFPSCR(v, i.X.Rc);
+  // f.UpdateFPSCR(v, i.X.Rc);
+  if (i.X.Rc) {
+    // todo
+  }
   return 0;
 }
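[Editor's note] On the fnegx pseudocode quoted above ("frD <- ¬ frB[0] || frB[1-63]"): PowerPC fneg only inverts the sign bit of the source FPR and copies the remaining bits through unchanged. A hedged scalar equivalent of that bit manipulation (illustration only, not the emitter's actual lowering):

    #include <cstdint>
    #include <cstring>

    // fneg semantics: flip only the sign bit (PPC bit 0 of the 64-bit FPR,
    // i.e. bit 63 in little-endian numbering); all other bits pass through.
    static double fneg_model(double frB) {
      uint64_t bits;
      std::memcpy(&bits, &frB, sizeof(bits));
      bits ^= 0x8000000000000000ull;
      std::memcpy(&frB, &bits, sizeof(frB));
      return frB;
    }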