PPU/SPU LLVM: Use arm shuffles in recompilers instead of emulating x86 pshufb

> - SHUFB from 9 instructions down to 5 > - Though it should be 4 if LLVM would just emit BCAX...
2026-05-07 13:37:46 +00:00 · 2026-01-14 04:45:18 +00:00 · 2026-01-14 04:45:18 +00:00 · 7e54a0b3bb
commit 7e54a0b3bb
parent 0e2584fc9f
3 changed files with 188 additions and 0 deletions
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@ -3936,6 +3936,109 @@ public:
 		});
 	}

+#ifdef ARCH_ARM64
+	template <typename T1, typename T2>
+	value_t<u8[16]> tbl(T1 a, T2 b)
+	{
+ 		value_t<u8[16]> result;
+		const auto data0 = a.eval(m_ir);
+		const auto index = b.eval(m_ir);
+		const auto zeros = llvm::ConstantAggregateZero::get(get_type<u8[16]>());
+
+		if (auto c = llvm::dyn_cast<llvm::Constant>(index))
+		{
+			v128 mask{};
+			const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
+
+			if (cv)
+			{
+				for (u32 i = 0; i < 16; i++)
+				{
+					const u64 b_val = cv->getElementAsInteger(i);
+					mask._u8[i] = (b_val < 16) ? static_cast<u8>(b_val) : static_cast<u8>(16);
+				}
+			}
+		
+
+			if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
+			{
+				result.value = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
+				result.value = m_ir->CreateZExt(result.value, get_type<u32[16]>());
+				result.value = m_ir->CreateShuffleVector(data0, zeros, result.value);
+				return result;
+			}
+		}
+
+	result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index });
+	return result;
+	}
+
+	template <typename T1, typename T2, typename T3>
+	value_t<u8[16]> tbl2(T1 a, T2 b, T3 indices)
+	{
+		value_t<u8[16]> result;
+		const auto data0 = a.eval(m_ir);
+		const auto data1 = b.eval(m_ir);
+		const auto index = indices.eval(m_ir);
+
+		if (auto c = llvm::dyn_cast<llvm::Constant>(index))
+		{
+			v128 mask{};
+			v128 bitmask{};
+			const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
+
+			if (cv)
+			{
+				for (u32 i = 0; i < 16; i++)
+				{
+					const u64 b_val = cv->getElementAsInteger(i);
+					mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
+					bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
+				}
+			}
+
+			if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
+			{
+				auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
+				auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
+				auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
+
+				auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
+				result.value = m_ir->CreateAnd(lookup, z_mask);
+				return result;
+			}
+		}
+
+		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
+		return result;
+	}
+
+	template <typename T1, typename T2, typename T3>
+	value_t<u8[16]> tbx(T1 fallback, T2 a, T3 indices)
+	{
+		value_t<u8[16]> result;
+		const auto v_fallback = fallback.eval(m_ir);
+		const auto data0 = a.eval(m_ir);
+		const auto index = indices.eval(m_ir);
+
+		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { v_fallback, data0, index });
+		return result;
+	}
+
+	template <typename T1, typename T2, typename T3, typename T4>
+	value_t<u8[16]> tbx2(T1 fallback, T2 a, T3 b, T4 indices)
+	{
+		value_t<u8[16]> result;
+		const auto v_fallback = fallback.eval(m_ir);
+		const auto data0 = a.eval(m_ir);
+		const auto data1 = b.eval(m_ir);
+		const auto index = indices.eval(m_ir);
+
+		result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
+		return result;
+	}
+#endif
+
 	// (m << 3) >= 0 ? a : b
 	template <typename T, typename U, typename V>
 	static auto select_by_bit4(T&& m, U&& a, V&& b)
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@ -1641,6 +1641,18 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
 {
 	const auto [a, b, c] = get_vrs<u8[16]>(op.va, op.vb, op.vc);

+#ifdef ARCH_ARM64
+
+	if (op.ra == op.rb)
+	{
+		set_vr(op.vd, tbl(a, (~c & 0xf)));
+		return;
+	}
+
+	set_vr(op.vd, tbl2(b, a, (~c & 0x1f)));
+}
+#else
+
 	if (op.ra == op.rb)
 	{
 		set_vr(op.vd, pshufb(a, ~c & 0xf));
@ -1657,6 +1669,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
 	const auto i = eval(~c & 0x1f);
 	set_vr(op.vd, select(noncast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
 }
+#endif

 void PPUTranslator::VPKPX(ppu_opcode_t op)
 {
--- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
@ -5918,6 +5918,77 @@ public:
 		const auto a = get_vr<u8[16]>(op.ra);
 		const auto b = get_vr<u8[16]>(op.rb);

+#ifdef ARCH_ARM64
+
+		if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
+		{
+			if (auto [ok, bs] = match_expr(b, byteswap(match<u8[16]>())); ok)
+			{
+				if (op.ra == op.rb)
+				{
+					if (perm_only)
+					{
+						const auto cm = eval(c & 0x0f);
+						set_vr(op.rt4, tbl(as, cm));
+						return;
+					}
+				
+					const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));                        
+					const auto cm = eval(c & 0x8f);                                                                                                               
+ 					set_vr(op.rt4, tbx(x, as, cm));
+					return;
+				}
+
+				if (perm_only)
+				{
+					const auto cm = eval(c & 0x1f);
+					set_vr(op.rt4, tbl2(as, bs, cm));
+					return;
+				}
+
+				const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
+				const auto cm = eval(c & 0x9f);
+				set_vr(op.rt4, tbx2(x, as, bs, cm));
+				return;
+			}
+
+		}
+
+
+		if (op.ra == op.rb && !m_interp_magn)
+		{
+			if (perm_only)
+			{
+				const auto cm = eval(c & 0x0f);
+				const auto cr = eval(cm ^ 0x0f);
+				set_vr(op.rt4, tbl(a, cr));
+				return;
+			}
+
+			const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
+			const auto cm = eval(c & 0x8f);
+			const auto cr = eval(cm ^ 0x0f);
+			set_vr(op.rt4, tbx(x, a, cr));
+			return;
+		}
+
+		if (perm_only)
+		{
+			const auto cm = eval(c & 0x9f);
+			const auto cr = eval(cm ^ 0x0f);
+			set_vr(op.rt4, tbl2(a, b, cr));
+			return;
+		}
+
+		const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4)); 
+		// AND should be before XOR so that llvm can combine them into BCAX
+		// Though for some reason it doesn't seem to be doing that.
+		const auto cm = eval(c & ~0x60);
+		const auto cr = eval(cm ^ 0x0f);
+		set_vr(op.rt4, tbx2(x, a, b, cr));
+	}                                                                                              
+#else
+
 		// Data with swapped endian from a load instruction
 		if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
 		{
@ -6062,6 +6133,7 @@ public:
 		else
 			set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x);
 	}
+#endif

 	void MPYA(spu_opcode_t op)
 	{