PPU/SPU LLVM: Allow Zen 4 CPUs to use VPERMI2B/VPERMT2B instead of the vperm2b256to128 path

- Zen 4-based CPUs can execute VPERMI2B/VPERMT2B as a single uop, unlike Intel CPUs, where it takes 3 uops.
2026-04-20 22:05:12 +00:00 · 2022-09-30 17:39:24 -04:00 · 2022-09-30 17:39:24 -04:00 · d8897c585d
commit d8897c585d
parent 7d32dc312f
5 changed files with 27 additions and 6 deletions
--- a/rpcs3/Emu/CPU/CPUTranslator.h
+++ b/rpcs3/Emu/CPU/CPUTranslator.h
@ -30,6 +30,7 @@
 #endif

 #include "util/types.hpp"
+#include "util/sysinfo.hpp"
 #include "Utilities/StrFmt.h"
 #include "Utilities/BitField.h"
 #include "Utilities/JIT.h"
@ -3442,6 +3443,11 @@ public:
 	template <typename T1, typename T2, typename T3>
 	value_t<u8[16]> vperm2b(T1 a, T2 b, T3 c)
 	{
+		if (!utils::has_fast_vperm2b())
+		{
+			return vperm2b256to128(a, b, c);
+		}
+
 		value_t<u8[16]> result;

 		const auto data0 = a.eval(m_ir);
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@ -1289,7 +1289,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
 	if (m_use_avx512_icl)
 	{
 		const auto i = eval(~c);
-		set_vr(op.vd, vperm2b256to128(b, a, i));
+		set_vr(op.vd, vperm2b(b, a, i));
 		return;
 	}

--- a/rpcs3/Emu/Cell/SPURecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPURecompiler.cpp
@ -8313,13 +8313,13 @@ public:
 				{
 					if (perm_only)
 					{
-						set_vr(op.rt4, vperm2b256to128(as, bs, c));
+						set_vr(op.rt4, vperm2b(as, bs, c));
 						return;
 					}

 					const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
 					const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
-					const auto ab = vperm2b256to128(as, bs, c);
+					const auto ab = vperm2b(as, bs, c);
 					set_vr(op.rt4, select(noncast<s8[16]>(c) >= 0, ab, mm));
 					return;
 				}
@ -8371,18 +8371,18 @@ public:
 			}
 		}

-		if (m_use_avx512_icl && (op.ra != op.rb))
+		if (m_use_avx512_icl && (op.ra != op.rb || m_interp_magn))
 		{
 			if (perm_only)
 			{
-				set_vr(op.rt4, vperm2b256to128(a, b, eval(c ^ 0xf)));
+				set_vr(op.rt4, vperm2b(a, b, eval(c ^ 0xf)));
 				return;
 			}

 			const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
 			const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
 			const auto cr = eval(c ^ 0xf);
-			const auto ab = vperm2b256to128(a, b, cr);
+			const auto ab = vperm2b(a, b, cr);
 			set_vr(op.rt4, select(noncast<s8[16]>(c) >= 0, ab, mm));
 			return;
 		}