diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index a6c98c955..d1394d202 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -1030,8 +1030,13 @@ static const vec128_t xmm_consts[] = {
     /* XMMF16PackLCPI6 */
-    vec128i(0x8000)
-
+    vec128i(0x8000),
+    /* XMMXOPByteShiftMask */
+    vec128b(7),
+    /* XMMXOPWordShiftMask */
+    vec128s(15),
+    /* XMMXOPDwordShiftMask */
+    vec128i(31)
 };
 
 void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h
index c7773f08f..01027cc0c 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@@ -167,7 +167,11 @@ enum XmmConst {
   XMMF16PackLCPI3,
   XMMF16PackLCPI4,
   XMMF16PackLCPI5,
-  XMMF16PackLCPI6
+  XMMF16PackLCPI6,
+  XMMXOPByteShiftMask,
+  XMMXOPWordShiftMask,
+  XMMXOPDwordShiftMask,
+
 };
 using amdfx::xopcompare_e;
 using Xbyak::Xmm;
@@ -383,7 +387,30 @@ class X64Emitter : public Xbyak::CodeGenerator {
   DEFINECOMPARE(vpcomud);
   DEFINECOMPARE(vpcomq);
   DEFINECOMPARE(vpcomuq);
-  #undef DEFINECOMPARE
+#undef DEFINECOMPARE
+
+#define DEFINESHIFTER(name)                                                   \
+  void name(Xmm dest, Xmm src1, Xmm src2) {                                   \
+    auto xop_bytes =                                                          \
+        amdfx::operations::name(dest.getIdx(), src1.getIdx(), src2.getIdx()); \
+    EmitXOP(xop_bytes);                                                       \
+  }
+
+  DEFINESHIFTER(vprotb)
+  DEFINESHIFTER(vprotw)
+  DEFINESHIFTER(vprotd)
+  DEFINESHIFTER(vprotq)
+
+  DEFINESHIFTER(vpshab)
+  DEFINESHIFTER(vpshaw)
+  DEFINESHIFTER(vpshad)
+  DEFINESHIFTER(vpshaq)
+
+  DEFINESHIFTER(vpshlb)
+  DEFINESHIFTER(vpshlw)
+  DEFINESHIFTER(vpshld)
+  DEFINESHIFTER(vpshlq)
+
 protected:
   void* Emplace(const EmitFunctionInfo& func_info,
                 GuestFunction* function = nullptr);
diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
index 0de48e5c3..46eb285cf 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -19,6 +19,16 @@
 #include "xenia/base/cvar.h"
 #include "xenia/cpu/backend/x64/x64_stack_layout.h"
 
+DEFINE_bool(xop_rotates, false, "Emit vector rotates via AMD XOP vprot.", "X64");
+
+DEFINE_bool(xop_left_shifts, false, "Emit vector left shifts via AMD XOP vpshl.", "X64");
+
+DEFINE_bool(xop_right_shifts, false, "Emit vector logical right shifts via AMD XOP vpshl.", "X64");
+
+DEFINE_bool(xop_arithmetic_right_shifts, false, "Emit vector arithmetic right shifts via AMD XOP vpsha.", "X64");
+
+DEFINE_bool(xop_compares, true, "Emit vector compares via AMD XOP vpcom.", "X64");
+
 namespace xe {
 namespace cpu {
 namespace backend {
@@ -407,7 +417,7 @@ struct VECTOR_COMPARE_SGE_V128
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     EmitAssociativeBinaryXmmOp(
         e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
-          if (e.IsFeatureEnabled(kX64EmitXOP)) {
+          if (cvars::xop_compares && e.IsFeatureEnabled(kX64EmitXOP)) {
             switch (i.instr->flags) {
               case INT8_TYPE:
                 e.vpcomb(dest, src1, src2, xopcompare_e::GTE);
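Note on the `DEFINESHIFTER` macro added in `x64_emitter.h` above: each instantiation expands to a thin wrapper that asks `amdfx::operations` for the encoded XOP byte sequence and hands it to `EmitXOP`. As a reading aid, `DEFINESHIFTER(vprotb)` expands to roughly the following (a sketch of the preprocessor output, not additional patch content):

```cpp
// Expansion of DEFINESHIFTER(vprotb): encode the three XMM register
// indices into a vprotb XOP byte sequence, then emit it verbatim.
void vprotb(Xmm dest, Xmm src1, Xmm src2) {
  auto xop_bytes =
      amdfx::operations::vprotb(dest.getIdx(), src1.getIdx(), src2.getIdx());
  EmitXOP(xop_bytes);
}
```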
@@ -775,23 +785,52 @@ static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
   // Store result and return it.
   return _mm_load_si128(reinterpret_cast<__m128i*>(value));
 }
-
+static XmmConst GetShiftmaskForType(unsigned typ) {
+  if (typ == INT8_TYPE) {
+    return XMMXOPByteShiftMask;
+  } else if (typ == INT16_TYPE) {
+    return XMMXOPWordShiftMask;
+  } else {
+    return XMMXOPDwordShiftMask;
+  }
+}
 struct VECTOR_SHL_V128
     : Sequence<VECTOR_SHL_V128, I<OPCODE_VECTOR_SHL, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
+    if (cvars::xop_left_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpshlb(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vpshlw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vpshld(i.dest, src1, e.xmm2);
+          break;
+      }
+
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          EmitInt8(e, i);
+          break;
+        case INT16_TYPE:
+          EmitInt16(e, i);
+          break;
+        case INT32_TYPE:
+          EmitInt32(e, i);
+          break;
+        default:
+          assert_always();
+          break;
+      }
     }
   }
 
@@ -1061,19 +1100,45 @@ static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
 struct VECTOR_SHR_V128
     : Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
+    if (cvars::xop_right_shifts && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      e.vpcmpeqb(e.xmm3, e.xmm3);
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpsignb(e.xmm2, e.xmm3);
+          e.vpshlb(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vpsignw(e.xmm2, e.xmm3);
+          e.vpshlw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vpsignd(e.xmm2, e.xmm3);
+          e.vpshld(i.dest, src1, e.xmm2);
+          break;
+      }
+
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          EmitInt8(e, i);
+          break;
+        case INT16_TYPE:
+          EmitInt16(e, i);
+          break;
+        case INT32_TYPE:
+          EmitInt32(e, i);
+          break;
+        default:
+          assert_always();
+          break;
+      }
     }
   }
 
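For context on the `VECTOR_SHR_V128` path above: XOP's `vpshl*` instructions take a signed per-element shift count, where a positive count shifts left and a negative count shifts right logically. `vpcmpeqb(xmm3, xmm3)` materializes all-ones (every byte reads as -1), and `vpsign*(xmm2, xmm3)` multiplies each masked count by that -1, i.e. negates it. A minimal scalar sketch of the INT8 semantics this relies on (the helper name is illustrative, not part of the patch):

```cpp
#include <cstdint>

// Scalar model of one byte lane of the XOP VECTOR_SHR path: mask the count
// to 0..7 (vpand with XMMXOPByteShiftMask), negate it (vpcmpeqb + vpsignb),
// then apply vpshlb's signed-count rule: >= 0 shifts left, < 0 shifts right.
static uint8_t ModelShrByteLane(uint8_t value, uint8_t raw_count) {
  int8_t count = -static_cast<int8_t>(raw_count & 7);
  return count >= 0 ? static_cast<uint8_t>(value << count)
                    : static_cast<uint8_t>(value >> -count);
}
```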
@@ -1244,19 +1309,45 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128);
 struct VECTOR_SHA_V128
     : Sequence<VECTOR_SHA_V128, I<OPCODE_VECTOR_SHA, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
+    if (cvars::xop_arithmetic_right_shifts &&
+        e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      e.vpcmpeqb(e.xmm3, e.xmm3);
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vpsignb(e.xmm2, e.xmm3);
+          e.vpshab(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vpsignw(e.xmm2, e.xmm3);
+          e.vpshaw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vpsignd(e.xmm2, e.xmm3);
+          e.vpshad(i.dest, src1, e.xmm2);
+          break;
+      }
+
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          EmitInt8(e, i);
+          break;
+        case INT16_TYPE:
+          EmitInt16(e, i);
+          break;
+        case INT32_TYPE:
+          EmitInt32(e, i);
+          break;
+        default:
+          assert_always();
+          break;
+      }
     }
   }
 
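The `VECTOR_SHA_V128` sequence above is the same shape, except `vpsha*` performs an arithmetic (sign-extending) shift when the count is negative. A scalar sketch of one INT8 lane, with the same caveats as before (illustrative helper; assumes `>>` on a negative signed value is arithmetic, which holds on the compilers Xenia builds with):

```cpp
#include <cstdint>

// Scalar model of one byte lane of the XOP VECTOR_SHA path: the masked and
// negated count is fed to vpshab, which sign-extends on right shifts.
static int8_t ModelShaByteLane(int8_t value, uint8_t raw_count) {
  int8_t count = -static_cast<int8_t>(raw_count & 7);
  return count >= 0
             ? static_cast<int8_t>(static_cast<uint8_t>(value) << count)
             : static_cast<int8_t>(value >> -count);  // arithmetic shift
}
```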
@@ -1432,55 +1523,29 @@ struct VECTOR_ROTATE_LEFT_V128
     : Sequence<VECTOR_ROTATE_LEFT_V128,
                I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        // TODO(benvanik): native version (with shift magic).
-        if (i.src2.is_constant) {
-          e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
-        } else {
-          e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
-        }
-        e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
-        e.CallNativeSafe(
-            reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
-        e.vmovaps(i.dest, e.xmm0);
-        break;
-      case INT16_TYPE:
-        // TODO(benvanik): native version (with shift magic).
-        if (i.src2.is_constant) {
-          e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
-        } else {
-          e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
-        }
-        e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
-        e.CallNativeSafe(
-            reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
-        e.vmovaps(i.dest, e.xmm0);
-        break;
-      case INT32_TYPE: {
-        if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
-          e.vprolvd(i.dest, i.src1, i.src2);
-        } else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
-          Xmm temp = i.dest;
-          if (i.dest == i.src1 || i.dest == i.src2) {
-            temp = e.xmm2;
-          }
-          // Shift left (to get high bits):
-          if (i.src2.is_constant) {
-            e.LoadConstantXmm(temp, i.src2.constant());
-            e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
-          } else {
-            e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
-          }
-          e.vpsllvd(e.xmm1, i.src1, e.xmm0);
-          // Shift right (to get low bits):
-          e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
-          e.vpsubd(temp, e.xmm0);
-          e.vpsrlvd(i.dest, i.src1, temp);
-          // Merge:
-          e.vpor(i.dest, e.xmm1);
-        } else {
-          // TODO(benvanik): non-AVX2 native version.
+    if (cvars::xop_rotates && e.IsFeatureEnabled(kX64EmitXOP)) {
+      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
+      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
+
+      e.vpand(e.xmm2, src2,
+              e.GetXmmConstPtr(GetShiftmaskForType(i.instr->flags)));
+
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          e.vprotb(i.dest, src1, e.xmm2);
+          break;
+        case INT16_TYPE:
+          e.vprotw(i.dest, src1, e.xmm2);
+          break;
+        case INT32_TYPE:
+          e.vprotd(i.dest, src1, e.xmm2);
+          break;
+      }
+
+    } else {
+      switch (i.instr->flags) {
+        case INT8_TYPE:
+          // TODO(benvanik): native version (with shift magic).
           if (i.src2.is_constant) {
             e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
@@ -1489,14 +1554,63 @@ struct VECTOR_ROTATE_LEFT_V128
           }
           e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
           e.CallNativeSafe(
-              reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
+              reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
           e.vmovaps(i.dest, e.xmm0);
+          break;
+        case INT16_TYPE:
+          // TODO(benvanik): native version (with shift magic).
+          if (i.src2.is_constant) {
+            e.lea(e.GetNativeParam(1),
+                  e.StashConstantXmm(1, i.src2.constant()));
+          } else {
+            e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
+          }
+          e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
+          e.CallNativeSafe(
+              reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
+          e.vmovaps(i.dest, e.xmm0);
+          break;
+        case INT32_TYPE: {
+          if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
+            e.vprolvd(i.dest, i.src1, i.src2);
+          } else if (e.IsFeatureEnabled(kX64EmitAVX2)) {
+            Xmm temp = i.dest;
+            if (i.dest == i.src1 || i.dest == i.src2) {
+              temp = e.xmm2;
+            }
+            // Shift left (to get high bits):
+            if (i.src2.is_constant) {
+              e.LoadConstantXmm(temp, i.src2.constant());
+              e.vpand(e.xmm0, temp, e.GetXmmConstPtr(XMMShiftMaskPS));
+            } else {
+              e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
+            }
+            e.vpsllvd(e.xmm1, i.src1, e.xmm0);
+            // Shift right (to get low bits):
+            e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
+            e.vpsubd(temp, e.xmm0);
+            e.vpsrlvd(i.dest, i.src1, temp);
+            // Merge:
+            e.vpor(i.dest, e.xmm1);
+          } else {
+            // TODO(benvanik): non-AVX2 native version.
+            if (i.src2.is_constant) {
+              e.lea(e.GetNativeParam(1),
+                    e.StashConstantXmm(1, i.src2.constant()));
+            } else {
+              e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
+            }
+            e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1));
+            e.CallNativeSafe(
+                reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
+            e.vmovaps(i.dest, e.xmm0);
+          }
+          break;
         }
-        break;
+        default:
+          assert_always();
+          break;
       }
-      default:
-        assert_always();
-        break;
     }
   }
 };
diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index fb8ccba20..bdadc5e74 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -781,7 +781,7 @@ struct SELECT_V128_V128
     } else if (mayblend == PermittedBlend::Ps) {
       e.vblendvps(i.dest, src2, src3, src1);
     } else {
-      if (1 && e.IsFeatureEnabled(kX64EmitXOP)) {
+      if (1 && e.IsFeatureEnabled(kX64EmitXOP)) {
         XELOGCPU("Doing vpcmov!!");
         e.vpcmov(i.dest, src3, src2, src1);
       } else {
diff --git a/src/xenia/cpu/ppc/ppc_emit_fpu.cc b/src/xenia/cpu/ppc/ppc_emit_fpu.cc
index 421edbdf7..c491ad09a 100644
--- a/src/xenia/cpu/ppc/ppc_emit_fpu.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_fpu.cc
@@ -452,7 +452,7 @@ Affected: FX, FEX, VX, OX (if Rc = 1)
 */
   // f.UpdateFPSCR(v, i.X.Rc);
   if (i.X.Rc) {
-
+    // todo
   }
   return 0;
 }
@@ -469,7 +469,10 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) {
   // frD <- !abs(frB)
   Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB)));
   f.StoreFPR(i.X.RT, v);
-  f.UpdateFPSCR(v, i.X.Rc);
+  // f.UpdateFPSCR(v, i.X.Rc);
+  if (i.X.Rc) {
+    // todo
+  }
   return 0;
 }
 
@@ -477,7 +480,10 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) {
   // frD <- ¬ frB[0] || frB[1-63]
   Value* v = f.Neg(f.LoadFPR(i.X.RB));
   f.StoreFPR(i.X.RT, v);
-  f.UpdateFPSCR(v, i.X.Rc);
+  // f.UpdateFPSCR(v, i.X.Rc);
+  if (i.X.Rc) {
+    // todo
+  }
   return 0;
 }
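One reference note on the `SELECT_V128_V128` hunk above: `vpcmov` is a bit-granular select. Per AMD's XOP documentation, each result bit is taken from the first source where the selector bit is set and from the second source where it is clear, which is what lets a single instruction replace the usual and/andnot/or triple. A one-lane scalar model (names are illustrative):

```cpp
#include <cstdint>

// One 64-bit lane of XOP vpcmov: selector bits choose between 'a' (bit set)
// and 'b' (bit clear), a single-instruction bitwise select.
static uint64_t ModelVpcmov(uint64_t a, uint64_t b, uint64_t selector) {
  return (a & selector) | (b & ~selector);
}
```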