From d5e7bc8dca7a1182942a8519340c04473845df0c Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sun, 5 Mar 2017 17:43:56 -0600 Subject: [PATCH] JIT - vctsxs: Properly saturate signed integers --- src/xenia/cpu/backend/x64/x64_emitter.cc | 4 +- src/xenia/cpu/backend/x64/x64_emitter.h | 4 +- src/xenia/cpu/backend/x64/x64_sequences.cc | 48 ++++++++++--- src/xenia/cpu/hir/opcodes.h | 1 + src/xenia/cpu/ppc/testing/instr_vctsxs.s | 81 ++++++++++++++++------ 5 files changed, 102 insertions(+), 36 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 69ab8b169..683e4a471 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -670,9 +670,9 @@ static const vec128_t xmm_consts[] = { 0x80000000u), /* XMMShortMinPS */ vec128f(SHRT_MIN), /* XMMShortMaxPS */ vec128f(SHRT_MAX), - /* XMMIntMaxPS */ vec128f(float(INT_MAX)), + /* XMMIntMin */ vec128i(INT_MIN), + /* XMMIntMax */ vec128i(INT_MAX), /* XMMIntMaxPD */ vec128d(INT_MAX), - /* XMMInt64MaxPD */ vec128d(double(INT64_MAX)), }; // First location to try and place constants. diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 79f863091..8f952105b 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -89,9 +89,9 @@ enum XmmConst { XMMSignMaskF32, XMMShortMinPS, XMMShortMaxPS, - XMMIntMaxPS, + XMMIntMin, + XMMIntMax, XMMIntMaxPD, - XMMInt64MaxPD, }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 498d8ab3c..9c2567582 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -1429,8 +1429,11 @@ struct CONVERT_I32_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): saturation check? cvtt* (trunc?) 
- e.vminss(e.xmm0, i.src1, e.GetXmmConstPtr(XMMIntMaxPS)); - e.vcvtss2si(i.dest, e.xmm0); + if (i.instr->flags == ROUND_TO_ZERO) { + e.vcvttss2si(i.dest, i.src1); + } else { + e.vcvtss2si(i.dest, i.src1); + } } }; struct CONVERT_I32_F64 @@ -1440,14 +1443,22 @@ struct CONVERT_I32_F64 // PPC saturates the value instead. // So, we can clamp the double value to (double)0x7FFFFFFF. e.vminsd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMIntMaxPD)); - e.vcvttsd2si(i.dest, e.xmm0); + if (i.instr->flags == ROUND_TO_ZERO) { + e.vcvttsd2si(i.dest, e.xmm0); + } else { + e.vcvtsd2si(i.dest, e.xmm0); + } } }; struct CONVERT_I64_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vminsd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMInt64MaxPD)); - e.vcvttsd2si(i.dest, e.xmm0); + // TODO(benvanik): saturation check? cvtt* (trunc?) + if (i.instr->flags == ROUND_TO_ZERO) { + e.vcvttsd2si(i.dest, i.src1); + } else { + e.vcvtsd2si(i.dest, i.src1); + } } }; struct CONVERT_F32_I32 @@ -1568,13 +1579,28 @@ struct VECTOR_CONVERT_F2I : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // flags = ARITHMETIC_UNSIGNED | ARITHMETIC_UNSIGNED - // TODO(benvanik): are these really the same? VC++ thinks so. - e.vcvttps2dq(i.dest, i.src1); - if (i.instr->flags & ARITHMETIC_SATURATE) { - // TODO(benvanik): check saturation. - // In theory cvt throws if it saturates. + Xmm src1 = i.src1; + + // Copy src1 if necessary. + bool copy_src1 = !!(i.instr->flags & ARITHMETIC_SATURATE); + if (copy_src1 && i.dest == i.src1) { + e.vmovdqa(e.xmm1, i.src1); + src1 = e.xmm1; } + + e.vcvttps2dq(i.dest, i.src1); + if (i.instr->flags & ARITHMETIC_SATURATE && + !(i.instr->flags & ARITHMETIC_UNSIGNED)) { + // if dest is indeterminate and i.src1 >= 0 (i.e. !(i.src1 & 0x80000000)) + // i.dest = 0x7FFFFFFF + e.vpcmpeqd(e.xmm0, i.dest, e.GetXmmConstPtr(XMMIntMin)); + e.vpandn(e.xmm0, src1, e.xmm0); + + // (high bit of xmm0 = is ind. 
&& i.src1 >= 0) + e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMIntMax), e.xmm0); + } + + // TODO(DrChat): Unsigned saturation! } }; EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I); diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 8d55f0e0b..04206840e 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -32,6 +32,7 @@ enum RoundMode { ROUND_TO_NEAREST, ROUND_TO_MINUS_INFINITY, ROUND_TO_POSITIVE_INFINITY, + ROUND_DYNAMIC, // Round based on the host's rounding mode. }; enum LoadStoreFlags { diff --git a/src/xenia/cpu/ppc/testing/instr_vctsxs.s b/src/xenia/cpu/ppc/testing/instr_vctsxs.s index c3c35f444..1738de95a 100644 --- a/src/xenia/cpu/ppc/testing/instr_vctsxs.s +++ b/src/xenia/cpu/ppc/testing/instr_vctsxs.s @@ -1,32 +1,71 @@ +# 0 * 2^31 test_vctsxs_1: - #_ REGISTER_IN v3 [3f800000, 3fc00000, 3f8ccccd, 3ff33333] - # 1.0, 1.5, 1.1, 1.9 - vctsxs v3, v3, 0 + #_ REGISTER_IN v0 [00000000, 00000000, 00000000, 00000000] + vctsxs v3, v0, 31 blr + #_ REGISTER_OUT v0 [00000000, 00000000, 00000000, 00000000] + #_ REGISTER_OUT v3 [00000000, 00000000, 00000000, 00000000] + +# -0 * 2^31 +test_vctsxs_2: + #_ REGISTER_IN v0 [80000000, 80000000, 80000000, 80000000] + vctsxs v3, v0, 31 + blr + #_ REGISTER_OUT v0 [80000000, 80000000, 80000000, 80000000] + #_ REGISTER_OUT v3 [00000000, 00000000, 00000000, 00000000] + +# smallest positive subnormal * 2^31 +test_vctsxs_3: + #_ REGISTER_IN v0 [00000001, 00000001, 00000001, 00000001] + vctsxs v3, v0, 31 + blr + #_ REGISTER_OUT v0 [00000001, 00000001, 00000001, 00000001] + #_ REGISTER_OUT v3 [00000000, 00000000, 00000000, 00000000] + +# largest subnormal * 2^31 +test_vctsxs_4: + #_ REGISTER_IN v0 [007FFFFF, 007FFFFF, 007FFFFF, 007FFFFF] + vctsxs v3, v0, 31 + blr + #_ REGISTER_OUT v0 [007FFFFF, 007FFFFF, 007FFFFF, 007FFFFF] + #_ REGISTER_OUT v3 [00000000, 00000000, 00000000, 00000000] + +# +1 * 2^0 +test_vctsxs_5: + #_ REGISTER_IN v0 [3F800000, 3F800000, 3F800000, 
3F800000] + vctsxs v3, v0, 0 + blr + #_ REGISTER_OUT v0 [3F800000, 3F800000, 3F800000, 3F800000] #_ REGISTER_OUT v3 [00000001, 00000001, 00000001, 00000001] -test_vctsxs_2: - #_ REGISTER_IN v3 [3f800000, 3fc00000, 3f8ccccd, 3ff33333] - # 1.0, 1.5, 1.1, 1.9 - vctsxs v3, v3, 1 +# -1 * 2^0 +test_vctsxs_6: + #_ REGISTER_IN v0 [BF800000, BF800000, BF800000, BF800000] + vctsxs v3, v0, 0 blr - #_ REGISTER_OUT v3 [00000002, 00000003, 00000002, 00000003] + #_ REGISTER_OUT v0 [BF800000, BF800000, BF800000, BF800000] + #_ REGISTER_OUT v3 [FFFFFFFF, FFFFFFFF, FFFFFFFF, FFFFFFFF] -test_vctsxs_3: - #_ REGISTER_IN v3 [3f800000, 3fc00000, 3f8ccccd, 3ff33333] - # 1.0, 1.5, 1.1, 1.9 - vctsxs v3, v3, 2 +# 2^31 * 2^0 +test_vctsxs_7: + #_ REGISTER_IN v0 [4F000000, 4F000000, 4F000000, 4F000000] + vctsxs v3, v0, 0 blr - #_ REGISTER_OUT v3 [00000004, 00000006, 00000004, 00000007] + #_ REGISTER_OUT v0 [4F000000, 4F000000, 4F000000, 4F000000] + #_ REGISTER_OUT v3 [7FFFFFFF, 7FFFFFFF, 7FFFFFFF, 7FFFFFFF] -test_vctsxs_4: - #_ REGISTER_IN v3 [42c83333, 43480000, 449a4000, c49a4000] - vctsxs v3, v3, 0 +# +infinity * 2^0 +test_vctsxs_8: + #_ REGISTER_IN v0 [7F800000, 7F800000, 7F800000, 7F800000] + vctsxs v3, v0, 0 blr - #_ REGISTER_OUT v3 [00000064, 000000c8, 000004d2, fffffb2e] + #_ REGISTER_OUT v0 [7F800000, 7F800000, 7F800000, 7F800000] + #_ REGISTER_OUT v3 [7FFFFFFF, 7FFFFFFF, 7FFFFFFF, 7FFFFFFF] -test_vctsxs_5: - #_ REGISTER_IN v3 [42c83333, 43480000, 449a4000, c49a4000] - vctsxs v3, v3, 1 +# -infinity * 2^0 +test_vctsxs_9: + #_ REGISTER_IN v0 [FF800000, FF800000, FF800000, FF800000] + vctsxs v3, v0, 0 blr - #_ REGISTER_OUT v3 [000000c8, 00000190, 000009a4, fffff65c] + #_ REGISTER_OUT v0 [FF800000, FF800000, FF800000, FF800000] + #_ REGISTER_OUT v3 [80000000, 80000000, 80000000, 80000000]