diff --git a/rpcs3/Emu/CPU/CPUTranslator.h b/rpcs3/Emu/CPU/CPUTranslator.h
index 99ddafde0a..10750dd45d 100644
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@@ -3936,6 +3936,123 @@ public:
 		});
 	}
 
+#ifdef ARCH_ARM64
+	// Byte lookup of `b` (selectors) into the single 16-byte table `a`.
+	// A constant selector vector is lowered to a shufflevector against a zero vector;
+	// any other selector is emitted as the aarch64_neon_tbl1 intrinsic.
+	template <typename T1, typename T2>
+	value_t<u8[16]> tbl(T1 a, T2 b)
+	{
+		value_t<u8[16]> result;
+		const auto data0 = a.eval(m_ir);
+		const auto index = b.eval(m_ir);
+		const auto zeros = llvm::ConstantAggregateZero::get(get_type<u8[16]>());
+
+		if (auto c = llvm::dyn_cast<llvm::Constant>(index))
+		{
+			v128 mask{};
+			const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
+
+			if (cv)
+			{
+				for (u32 i = 0; i < 16; i++)
+				{
+					// Selectors >= 16 are redirected to shuffle lane 16, i.e. the first lane of `zeros`
+					const u64 b_val = cv->getElementAsInteger(i);
+					mask._u8[i] = (b_val < 16) ? static_cast<u8>(b_val) : static_cast<u8>(16);
+				}
+			}
+
+			// An all-zero selector is not a ConstantDataVector; the default (all lane 0) mask already describes it
+			if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
+			{
+				result.value = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
+				result.value = m_ir->CreateZExt(result.value, get_type<u32[16]>());
+				result.value = m_ir->CreateShuffleVector(data0, zeros, result.value);
+				return result;
+			}
+		}
+
+		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index });
+		return result;
+	}
+
+	// Byte lookup of `indices` into the 32-byte table formed by `a` (selectors 0-15) and `b` (selectors 16-31).
+	// A constant selector vector is lowered to a shufflevector of a and b, with out-of-range lanes cleared;
+	// any other selector is emitted as the aarch64_neon_tbl2 intrinsic.
+	template <typename T1, typename T2, typename T3>
+	value_t<u8[16]> tbl2(T1 a, T2 b, T3 indices)
+	{
+		value_t<u8[16]> result;
+		const auto data0 = a.eval(m_ir);
+		const auto data1 = b.eval(m_ir);
+		const auto index = indices.eval(m_ir);
+
+		if (auto c = llvm::dyn_cast<llvm::Constant>(index))
+		{
+			v128 mask{};
+			v128 bitmask{};
+			const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
+
+			if (cv)
+			{
+				for (u32 i = 0; i < 16; i++)
+				{
+					// Selectors >= 32 shuffle from lane 0 and are then cleared through `bitmask`
+					const u64 b_val = cv->getElementAsInteger(i);
+					mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
+					bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
+				}
+			}
+
+			if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
+			{
+				auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
+				auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
+				auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
+
+				auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
+				// `bitmask` is only filled from a ConstantDataVector. An all-zero selector picks lane 0 of
+				// data0 for every byte and is always in range, so it must not be ANDed with the empty bitmask.
+				result.value = cv ? m_ir->CreateAnd(lookup, z_mask) : lookup;
+				return result;
+			}
+		}
+
+		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
+		return result;
+	}
+
+	// Emits the aarch64_neon_tbx1 intrinsic with operands (fallback, table `a`, indices).
+	// No constant folding is attempted here.
+	template <typename T1, typename T2, typename T3>
+	value_t<u8[16]> tbx(T1 fallback, T2 a, T3 indices)
+	{
+		value_t<u8[16]> result;
+		const auto v_fallback = fallback.eval(m_ir);
+		const auto data0 = a.eval(m_ir);
+		const auto index = indices.eval(m_ir);
+
+		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { v_fallback, data0, index });
+		return result;
+	}
+
+	// Emits the aarch64_neon_tbx2 intrinsic with operands (fallback, tables `a` and `b`, indices).
+	// No constant folding is attempted here.
+	template <typename T1, typename T2, typename T3, typename T4>
+	value_t<u8[16]> tbx2(T1 fallback, T2 a, T3 b, T4 indices)
+	{
+		value_t<u8[16]> result;
+		const auto v_fallback = fallback.eval(m_ir);
+		const auto data0 = a.eval(m_ir);
+		const auto data1 = b.eval(m_ir);
+		const auto index = indices.eval(m_ir);
+
+		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
+		return result;
+	}
+#endif
+
 	// (m << 3) >= 0 ? a : b
 	template <typename T, typename U, typename V>
 	static auto select_by_bit4(T&& m, U&& a, V&& b)
diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index 70d34aa775..8cef760792 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -1641,6 +1641,19 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
 {
 	const auto [a, b, c] = get_vrs<u8[16]>(op.va, op.vb, op.vc);
 
+#ifdef ARCH_ARM64
+
+	if (op.ra == op.rb)
+	{
+		set_vr(op.vd, tbl(a, (~c & 0xf)));
+		return;
+	}
+
+	// Two-source permute: after the inversion, selectors 0-15 read from b and 16-31 from a
+	set_vr(op.vd, tbl2(b, a, (~c & 0x1f)));
+}
+#else
+
 	if (op.ra == op.rb)
 	{
 		set_vr(op.vd, pshufb(a, ~c & 0xf));
@@ -1657,6 +1670,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
 	const auto i = eval(~c & 0x1f);
 	set_vr(op.vd, select(noncast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
 }
+#endif
 
 void PPUTranslator::VPKPX(ppu_opcode_t op)
 {
diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
index eb44289320..653ebcaaa2 100644
--- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
@@ -5918,6 +5918,78 @@ public:
 		const auto a = get_vr<u8[16]>(op.ra);
 		const auto b = get_vr<u8[16]>(op.rb);
 
+#ifdef ARCH_ARM64
+
+		// Both operands are byteswapped values: look up in the un-swapped data with unmodified selectors
+		if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
+		{
+			if (auto [ok, bs] = match_expr(b, byteswap(match<u8[16]>())); ok)
+			{
+				if (op.ra == op.rb)
+				{
+					if (perm_only)
+					{
+						const auto cm = eval(c & 0x0f);
+						set_vr(op.rt4, tbl(as, cm));
+						return;
+					}
+
+					const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
+					const auto cm = eval(c & 0x8f);
+					set_vr(op.rt4, tbx(x, as, cm));
+					return;
+				}
+
+				if (perm_only)
+				{
+					const auto cm = eval(c & 0x1f);
+					set_vr(op.rt4, tbl2(as, bs, cm));
+					return;
+				}
+
+				const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
+				const auto cm = eval(c & 0x9f);
+				set_vr(op.rt4, tbx2(x, as, bs, cm));
+				return;
+			}
+
+		}
+
+		// Operands are used as-is below, so the low four selector bits are flipped (^ 0x0f) before the lookup
+		if (op.ra == op.rb && !m_interp_magn)
+		{
+			if (perm_only)
+			{
+				const auto cm = eval(c & 0x0f);
+				const auto cr = eval(cm ^ 0x0f);
+				set_vr(op.rt4, tbl(a, cr));
+				return;
+			}
+
+			const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
+			const auto cm = eval(c & 0x8f);
+			const auto cr = eval(cm ^ 0x0f);
+			set_vr(op.rt4, tbx(x, a, cr));
+			return;
+		}
+
+		if (perm_only)
+		{
+			const auto cm = eval(c & 0x9f);
+			const auto cr = eval(cm ^ 0x0f);
+			set_vr(op.rt4, tbl2(a, b, cr));
+			return;
+		}
+
+		const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
+		// AND should be before XOR so that llvm can combine them into BCAX
+		// Though for some reason it doesn't seem to be doing that.
+		const auto cm = eval(c & ~0x60);
+		const auto cr = eval(cm ^ 0x0f);
+		set_vr(op.rt4, tbx2(x, a, b, cr));
+	}
+#else
+
 		// Data with swapped endian from a load instruction
 		if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
 		{
@@ -6062,6 +6134,7 @@ public:
 		else
 			set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x);
 	}
+#endif
 
 	void MPYA(spu_opcode_t op)
 	{