From b26c6ee1b81da32fffff5a3d1b9b06339d4f331a Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Sun, 21 Aug 2022 10:27:54 -0700
Subject: [PATCH] Fix some more constant folding

fabsx does NOT set the FPSCR.
Turns out that our vector unsigned compare instructions are a bit weird?
---
 src/xenia/cpu/backend/x64/x64_emitter.cc    |   6 ++
 src/xenia/cpu/backend/x64/x64_seq_vector.cc | 108 +++++++++-----------
 src/xenia/cpu/backend/x64/x64_sequences.cc  |   5 +-
 src/xenia/cpu/hir/hir_builder.cc            |  24 ++---
 src/xenia/cpu/ppc/ppc_emit_fpu.cc           |  13 ++-
 5 files changed, 75 insertions(+), 81 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index ccd6e969a..a6c98c955 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -143,6 +143,12 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
       feature_flags_ |= kX64EmitTBM;
     }
   }
+  if (amd_flags & (1U << 11)) {
+    if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
+      feature_flags_ |= kX64EmitXOP;
+      XELOGCPU("CPU supports XOP!");
+    }
+  }
   if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
     bool is_zennish = cpu_.displayFamily >= 0x17;
     /*
diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
index 385b8e741..0de48e5c3 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -143,6 +143,7 @@ struct VECTOR_DENORMFLUSH
     e.vandps(e.xmm0, i.src1,
              e.GetXmmConstPtr(XMMSingleDenormalMask));  // 0.25 P0123
     e.vcmpneqps(e.xmm2, e.xmm0, e.xmm1);                // 0.5 P01
+    // todo: xop vpcmov here
     e.vandps(e.xmm1, i.src1,
              e.GetXmmConstPtr(XMMSignMaskF32));  // 0.5 P0123 take signs, zeros
                                                  // must keep their signs
@@ -457,68 +458,52 @@ struct VECTOR_COMPARE_UGT_V128
     : Sequence<VECTOR_COMPARE_UGT_V128,
                I<OPCODE_VECTOR_COMPARE_UGT, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    if (i.instr->flags != FLOAT32_TYPE && e.IsFeatureEnabled(kX64EmitXOP)) {
-      Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
-      Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
-      switch (i.instr->flags) {
-        case INT8_TYPE:
-          e.vpcomub(i.dest, src1, src2, xopcompare_e::GT);
-          break;
-        case INT16_TYPE:
-          e.vpcomuw(i.dest, src1, src2, xopcompare_e::GT);
-          break;
-        case INT32_TYPE:
-          e.vpcomud(i.dest, src1, src2, xopcompare_e::GT);
-          break;
-      }
+    Xbyak::Address sign_addr = e.ptr[e.rax];  // dummy
+    switch (i.instr->flags) {
+      case INT8_TYPE:
+        sign_addr = e.GetXmmConstPtr(XMMSignMaskI8);
+        break;
+      case INT16_TYPE:
+        sign_addr = e.GetXmmConstPtr(XMMSignMaskI16);
+        break;
+      case INT32_TYPE:
+        sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
+        break;
+      case FLOAT32_TYPE:
+        e.ChangeMxcsrMode(MXCSRMode::Vmx);
+        sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
+        break;
+      default:
+        assert_always();
+        break;
+    }
+    if (i.src1.is_constant) {
+      // TODO(benvanik): make this constant.
+      e.LoadConstantXmm(e.xmm0, i.src1.constant());
+      e.vpxor(e.xmm0, sign_addr);
    } else {
-      Xbyak::Address sign_addr = e.ptr[e.rax];  // dummy
-      switch (i.instr->flags) {
-        case INT8_TYPE:
-          sign_addr = e.GetXmmConstPtr(XMMSignMaskI8);
-          break;
-        case INT16_TYPE:
-          sign_addr = e.GetXmmConstPtr(XMMSignMaskI16);
-          break;
-        case INT32_TYPE:
-          sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
-          break;
-        case FLOAT32_TYPE:
-          e.ChangeMxcsrMode(MXCSRMode::Vmx);
-          sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
-          break;
-        default:
-          assert_always();
-          break;
-      }
-      if (i.src1.is_constant) {
-        // TODO(benvanik): make this constant.
-        e.LoadConstantXmm(e.xmm0, i.src1.constant());
-        e.vpxor(e.xmm0, sign_addr);
-      } else {
-        e.vpxor(e.xmm0, i.src1, sign_addr);
-      }
-      if (i.src2.is_constant) {
-        // TODO(benvanik): make this constant.
-        e.LoadConstantXmm(e.xmm1, i.src2.constant());
-        e.vpxor(e.xmm1, sign_addr);
-      } else {
-        e.vpxor(e.xmm1, i.src2, sign_addr);
-      }
-      switch (i.instr->flags) {
-        case INT8_TYPE:
-          e.vpcmpgtb(i.dest, e.xmm0, e.xmm1);
-          break;
-        case INT16_TYPE:
-          e.vpcmpgtw(i.dest, e.xmm0, e.xmm1);
-          break;
-        case INT32_TYPE:
-          e.vpcmpgtd(i.dest, e.xmm0, e.xmm1);
-          break;
-        case FLOAT32_TYPE:
-          e.vcmpgtps(i.dest, e.xmm0, e.xmm1);
-          break;
-      }
-    }
+      e.vpxor(e.xmm0, i.src1, sign_addr);
+    }
+    if (i.src2.is_constant) {
+      // TODO(benvanik): make this constant.
+      e.LoadConstantXmm(e.xmm1, i.src2.constant());
+      e.vpxor(e.xmm1, sign_addr);
+    } else {
+      e.vpxor(e.xmm1, i.src2, sign_addr);
+    }
+    switch (i.instr->flags) {
+      case INT8_TYPE:
+        e.vpcmpgtb(i.dest, e.xmm0, e.xmm1);
+        break;
+      case INT16_TYPE:
+        e.vpcmpgtw(i.dest, e.xmm0, e.xmm1);
+        break;
+      case INT32_TYPE:
+        e.vpcmpgtd(i.dest, e.xmm0, e.xmm1);
+        break;
+      case FLOAT32_TYPE:
+        e.vcmpgtps(i.dest, e.xmm0, e.xmm1);
+        break;
+    }
   }
 };
@@ -634,6 +619,7 @@ struct VECTOR_ADD
           // overflowed (only need to check one input)
           // if (src1 > res) then overflowed
           // http://locklessinc.com/articles/sat_arithmetic/
+          // chrispy: todo - add xop stuff here
           e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32));
           e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32));
           e.vpcmpgtd(e.xmm0, e.xmm2, e.xmm0);
diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index 8952ca771..fb8ccba20 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -781,11 +781,12 @@ struct SELECT_V128_V128
     } else if (mayblend == PermittedBlend::Ps) {
       e.vblendvps(i.dest, src2, src3, src1);
     } else {
-      if (e.IsFeatureEnabled(kX64EmitXOP)) {
+      if (1 && e.IsFeatureEnabled(kX64EmitXOP)) {
         XELOGCPU("Doing vpcmov!!");
-        e.vpcmov(i.dest, src2, src3, src1);
+        e.vpcmov(i.dest, src3, src2, src1);
      } else {  // src1 ? src2 : src3;
+        e.vpandn(e.xmm3, src1, src2);
         e.vpand(i.dest, src1, src3);
         e.vpor(i.dest, i.dest, e.xmm3);
       }
     }
diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc
index df5b72375..fda6812b4 100644
--- a/src/xenia/cpu/hir/hir_builder.cc
+++ b/src/xenia/cpu/hir/hir_builder.cc
@@ -1023,13 +1023,6 @@ Value* HIRBuilder::Truncate(Value* value, TypeName target_type) {
 
 Value* HIRBuilder::Convert(Value* value, TypeName target_type,
                            RoundMode round_mode) {
-  if (value->type == target_type) {
-    return value;
-  } else if (value->IsConstant()) {
-    Value* dest = CloneValue(value);
-    dest->Convert(target_type, round_mode);
-    return dest;
-  }
   Instr* i = AppendInstr(OPCODE_CONVERT_info, round_mode,
                          AllocValue(target_type));
 
@@ -1041,11 +1034,6 @@ Value* HIRBuilder::Convert(Value* value, TypeName target_type,
 
 Value* HIRBuilder::Round(Value* value, RoundMode round_mode) {
   ASSERT_FLOAT_OR_VECTOR_TYPE(value);
-  if (value->IsConstant()) {
-    Value* dest = CloneValue(value);
-    dest->Round(round_mode);
-    return dest;
-  }
   Instr* i = AppendInstr(OPCODE_ROUND_info, round_mode,
                          AllocValue(value->type));
 
@@ -1295,7 +1283,7 @@ void HIRBuilder::SetNJM(Value* value) {
 Value* HIRBuilder::Max(Value* value1, Value* value2) {
   ASSERT_TYPES_EQUAL(value1, value2);
 
-  if (value1->type != VEC128_TYPE && value1->IsConstant() &&
+  if (IsScalarIntegralType(value1->type) && value1->IsConstant() &&
       value2->IsConstant()) {
     return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value2 : value1;
   }
@@ -1323,7 +1311,7 @@ Value* HIRBuilder::VectorMax(Value* value1, Value* value2, TypeName part_type,
 Value* HIRBuilder::Min(Value* value1, Value* value2) {
   ASSERT_TYPES_EQUAL(value1, value2);
 
-  if (value1->type != VEC128_TYPE && value1->IsConstant() &&
+  if (IsScalarIntegralType(value1->type) && value1->IsConstant() &&
       value2->IsConstant()) {
     return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value1 : value2;
   }
@@ -1351,8 +1339,9 @@ Value* HIRBuilder::VectorMin(Value* value1, Value* value2, TypeName part_type,
 Value* HIRBuilder::Select(Value* cond, Value* value1, Value* value2) {
   assert_true(cond->type == INT8_TYPE || cond->type == VEC128_TYPE);  // for now
   ASSERT_TYPES_EQUAL(value1, value2);
-
-  if (cond->IsConstant()) {
+  // chrispy: this was also being done for V128 conditions, which broke things,
+  // because a vector select must be performed element by element.
+  if (cond->IsConstant() && IsScalarIntegralType(cond->type)) {
     return cond->IsConstantTrue() ? value1 : value2;
   }
 
@@ -1518,7 +1507,8 @@ Value* HIRBuilder::Add(Value* value1, Value* value2,
   ASSERT_TYPES_EQUAL(value1, value2);
 
   // TODO(benvanik): optimize when flags set.
-  if (!arithmetic_flags) {
+
+  if (!arithmetic_flags && IsScalarIntegralType(value1->type)) {
     if (value1->IsConstantZero()) {
       return value2;
     } else if (value2->IsConstantZero()) {
diff --git a/src/xenia/cpu/ppc/ppc_emit_fpu.cc b/src/xenia/cpu/ppc/ppc_emit_fpu.cc
index 71d323f2e..421edbdf7 100644
--- a/src/xenia/cpu/ppc/ppc_emit_fpu.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_fpu.cc
@@ -442,7 +442,18 @@ int InstrEmit_fabsx(PPCHIRBuilder& f, const InstrData& i) {
   // frD <- abs(frB)
   Value* v = f.Abs(f.LoadFPR(i.X.RB));
   f.StoreFPR(i.X.RT, v);
-  f.UpdateFPSCR(v, i.X.Rc);
+  /*
+    The contents of frB with bit 0 cleared are placed into frD. Note that
+    fabs treats NaNs just like any other kind of value; the sign bit of a
+    NaN may be altered by fabs. This instruction does not alter the FPSCR.
+    Other registers altered:
+    • Condition Register (CR1 field):
+      Affected: FX, FEX, VX, OX (if Rc = 1)
+  */
+  // f.UpdateFPSCR(v, i.X.Rc);
+  if (i.X.Rc) {
+    // TODO: copy FX, FEX, VX, OX from the FPSCR into CR1 here; fabs itself
+    // must not touch the FPSCR.
+  }
   return 0;
 }
 
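
Note on the unsigned compare path: before AVX-512, x86 has no packed unsigned
greater-than compare (vpcmpgtb/w/d are signed-only), which is why the sequence
above XORs both operands with the per-element sign mask and then uses the
signed compare. A minimal standalone sketch of the same trick with SSE2
intrinsics (cmpgt_epu32 is an illustrative name, not a function in the Xenia
codebase):

  #include <emmintrin.h>  // SSE2

  // Unsigned per-lane a > b built from the signed compare: XOR with
  // 0x80000000 maps [0, 2^32) onto [-2^31, 2^31) while preserving order,
  // so the signed result equals the unsigned one.
  static inline __m128i cmpgt_epu32(__m128i a, __m128i b) {
    const __m128i sign = _mm_set1_epi32((int)0x80000000u);  // XMMSignMaskI32
    return _mm_cmpgt_epi32(_mm_xor_si128(a, sign), _mm_xor_si128(b, sign));
  }

This is also why the removed XOP fast path was attractive: vpcomub/vpcomuw/
vpcomud compare unsigned directly and skip both bias XORs.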
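
Note on the vpcmov operand swap in SELECT_V128_V128: as far as I can tell from
the XOP reference, VPCMOV computes dest = (src1 & selector) | (src2 & ~selector),
i.e. the selector picks bits from its first source. The AVX fallback emitted
above computes dest = (src1 & src3) | (~src1 & src2), so matching it requires
vpcmov(i.dest, src3, src2, src1). A scalar model of what both paths compute
(bit_select is an illustrative name):

  #include <cstdint>

  // Bitwise select: where a mask bit is 1, take the bit from on_true,
  // otherwise from on_false. Models vpcmov(dest, on_true, on_false, mask)
  // and the vpandn/vpand/vpor fallback alike.
  static inline uint64_t bit_select(uint64_t mask, uint64_t on_true,
                                    uint64_t on_false) {
    return (mask & on_true) | (~mask & on_false);
  }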
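
Note on the constant-folding guards in hir_builder.cc: a VEC128 condition is a
per-lane mask, not a single boolean, so folding Select on cond->IsConstantTrue()
is only sound for scalar integral conditions; hence the IsScalarIntegralType
checks. A small sketch of why the old fold was wrong for vectors (values are
illustrative):

  #include <cstdint>
  #include <cstdio>

  int main() {
    // A constant but mixed mask: lanes 0/2 select from a, lanes 1/3 from b.
    uint32_t mask[4] = {0xFFFFFFFFu, 0u, 0xFFFFFFFFu, 0u};
    uint32_t a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8};
    uint32_t r[4];
    for (int lane = 0; lane < 4; ++lane) {
      r[lane] = (mask[lane] & a[lane]) | (~mask[lane] & b[lane]);
    }
    printf("%u %u %u %u\n", r[0], r[1], r[2], r[3]);  // 1 6 3 8
    // Treating the whole mask as one "true" would have produced {1, 2, 3, 4},
    // silently dropping lanes 1 and 3 of b.
    return 0;
  }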