diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h index 81049c6074..9b9804fd39 100644 --- a/rpcs3/Emu/CPU/CPUTranslator.h +++ b/rpcs3/Emu/CPU/CPUTranslator.h @@ -3661,6 +3661,7 @@ public: return result; } +#ifdef ARCH_ARM64 /* ARM64-only section: the wrappers below emit CreateCall to llvm::Intrinsic::aarch64_neon_udot, aarch64_neon_sdot and aarch64_neon_addp; the matching #endif is added just ahead of vpermb. NOTE(review): template argument lists look stripped in this copy of the patch (bare "template", "value_t result", "match]>()") - restore them from the upstream commit before applying. */ template value_t udot(T1 a, T2 b, T3 c) { @@ -3670,11 +3671,6 @@ template const auto data1 = b.eval(m_ir); const auto data2 = c.eval(m_ir); - // ARM hardware requires the multipliers to be treated as 16-byte vectors - //const auto op1 = bitcast(data1, get_type()); - //const auto op2 = bitcast(data2, get_type()); - - // Use the variadic get_intrinsic to resolve the overloaded AArch64 intrinsic result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_udot), {data0, data1, data2}); return result; } @@ -3688,12 +3684,24 @@ template const auto data1 = b.eval(m_ir); const auto data2 = c.eval(m_ir); - //const auto op1 = bitcast(data1, get_type()); - //const auto op2 = bitcast(data2, get_type()); - result.value = m_ir->CreateCall(get_intrinsic(llvm::Intrinsic::aarch64_neon_sdot), {data0, data1, data2}); return result; } + +template + auto addp(T1 a, T2 b) /* addp: evaluates a and b with m_ir, looks up llvm::Intrinsic::aarch64_neon_addp through get_intrinsic and returns the CreateCall result in a value_t. T_vector is declared from is_llvm_expr (its argument is missing here); presumably it parameterizes the intrinsic lookup and the result type - confirm. NOTE(review): ADDP is presumably the NEON pairwise add - confirm against the Arm reference. */ + { + using T_vector = typename is_llvm_expr::type; + const auto data1 = a.eval(m_ir); + const auto data2 = b.eval(m_ir); + + const auto func = get_intrinsic(llvm::Intrinsic::aarch64_neon_addp); + + value_t result; + result.value = m_ir->CreateCall(func, {data1, data2}); + return result; + } +#endif template value_t vpermb(T1 a, T2 b) diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 45b4c83ecf..856a039e5e 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -4759,6 +4759,50 @@ public: } const auto a = get_vr(op.ra); + +#ifdef ARCH_ARM64 + // Use dot product instructions with special values to shift then sum results into the preferred slot + if (m_use_dotprod) /* dot-product path: every route through this block ends in return, so the trunc + bitcast fallback after #endif only runs when m_use_dotprod is false or ARCH_ARM64 is not defined */ + { + if (match_vr(op.ra, [&](auto c, auto MP) + { + using VT = typename decltype(MP)::type; + + if (auto [ok, x] 
= match_expr(c, sext(match]>())); ok) + { /* ra matched a sext(...) expression, so the signed dot product with negated power-of-two weights is used. NOTE(review): presumably each selected byte is then 0 or -1, making every product 0 or a positive bit weight - confirm. */ + const auto zeroes = splat(0); + + const auto es = zshuffle(bitcast(a), 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 4, 8, 12); + + set_vr(op.rt, sdot(zeroes, es, build( + -0x01, -0x02, -0x04, -0x08, + -0x01, -0x02, -0x04, -0x08, + -0x01, -0x02, -0x04, -0x08, + -0x01, -0x02, -0x04, -0x08 + ))); + return true; + } + return false; + })) + { + return; + } + + const auto zeroes = splat(0); + const auto masked = a & 0x01; /* generic route: mask each element down to bit 0, then weight the bits with the unsigned dot product below */ + + const auto es = zshuffle(bitcast(masked), 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 4, 8, 12); /* picks bytes 0, 4, 8, 12 into the last four byte positions; NOTE(review): index 16 presumably yields a zero byte in zshuffle - confirm */ + + set_vr(op.rt, udot(zeroes, es, build( + 0x01, 0x02, 0x04, 0x08, + 0x01, 0x02, 0x04, 0x08, + 0x01, 0x02, 0x04, 0x08, + 0x01, 0x02, 0x04, 0x08 + ))); + return; + } +#endif + const auto m = zext(bitcast(trunc(a))); set_vr(op.rt, insert(splat(0), 3, eval(m))); } @@ -4774,6 +4818,54 @@ public: } const auto a = get_vr(op.ra); + +#ifdef ARCH_ARM64 + // Use dot product instructions with special values to shift then sum results into the preferred slot + if (m_use_dotprod) + { + if (match_vr(op.ra, [&](auto c, auto MP) + { + using VT = typename decltype(MP)::type; + + if (auto [ok, x] = match_expr(c, sext(match]>())); ok) + { + const auto zeroes = splat(0); + + const auto es = zshuffle(bitcast(a), 16, 16, 16, 16, 16, 16, 16, 16, 0, 2, 4, 6, 8, 10, 12, 14); + + const auto extracted = sdot(zeroes, es, build( + -0x01, -0x02, -0x04, -0x08, + -0x10, -0x20, -0x40, -0x80, + -0x01, -0x02, -0x04, -0x08, + -0x10, -0x20, -0x40, -0x80 + )); /* weights 0x01..0x08 and 0x10..0x80 cover two groups of four bytes, so the dot product presumably leaves a low-nibble and a high-nibble partial sum in adjacent lanes; the addp with zeroes below presumably merges them - TODO confirm */ + + set_vr(op.rt, addp(zeroes, bitcast(extracted))); + return true; + } + return false; + })) + { + return; + } + + const auto zeroes = splat(0); + const auto masked = a & 0x01; + + const auto es = zshuffle(bitcast(masked), 16, 16, 16, 16, 16, 16, 16, 16, 0, 2, 4, 6, 8, 10, 12, 14); + + const auto extracted = udot(zeroes, es, build( + 0x01, 0x02, 0x04, 0x08, + 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, + 0x10, 0x20, 0x40, 0x80 + )); + + set_vr(op.rt, addp(zeroes, 
bitcast(extracted))); + return; + } +#endif + const auto m = zext(bitcast(trunc(a))); set_vr(op.rt, insert(splat(0), 3, eval(m))); } @@ -4782,6 +4874,53 @@ public: { const auto a = get_vr(op.ra); +#ifdef ARCH_ARM64 + // Use dot product instructions with special values to shift then sum results into the preferred slot + if (m_use_dotprod) + { + if (match_vr(op.ra, [&](auto c, auto MP) + { + using VT = typename decltype(MP)::type; + + if (auto [ok, x] = match_expr(c, sext(match]>())); ok) + { + const auto zeroes = splat(0); + + const auto extracted = sdot(zeroes, a, build( + -0x01, -0x02, -0x04, -0x08, + -0x10, -0x20, -0x40, -0x80, + -0x01, -0x02, -0x04, -0x08, + -0x10, -0x20, -0x40, -0x80 + )); + + const auto es = zshuffle(bitcast(extracted), 16, 16, 16, 16, 16, 16, 16, 16, 0, 8, 4, 12, 16, 16, 16, 16); + const auto zeroes16 = splat(0); /* NOTE(review): the shuffle above gathers bytes 0, 8, 4, 12 of the dot-product result into byte positions 8..11, and the addp with zeroes16 below presumably sums neighbouring lanes into the final mask - confirm the lane widths, the template arguments are missing here */ + set_vr(op.rt, addp(zeroes16, bitcast(es))); + return true; + } + return false; + })) + { + return; + } + + const auto zeroes = splat(0); + const auto masked = a & 0x01; /* generic route: keep only bit 0 of every element before the unsigned dot product */ + + const auto extracted = udot(zeroes, masked, build( + 0x01, 0x02, 0x04, 0x08, + 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, + 0x10, 0x20, 0x40, 0x80 + )); + + const auto es = zshuffle(bitcast(extracted), 16, 16, 16, 16, 16, 16, 16, 16, 0, 8, 4, 12, 16, 16, 16, 16); + const auto zeroes16 = splat(0); + set_vr(op.rt, addp(zeroes16, bitcast(es))); + return; + } +#endif + if (m_use_gfni) { const auto as = zshuffle(a, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);