PPU/SPU LLVM: Use native ARM table-lookup shuffles (TBL/TBX) in the recompilers instead of emulating x86 pshufb

> - SHUFB from 9 instructions down to 5
> - Though it should be 4 if LLVM would just emit BCAX...
This commit is contained in:
Malcolm 2026-01-14 04:45:18 +00:00
parent 0e2584fc9f
commit 7e54a0b3bb
3 changed files with 188 additions and 0 deletions

View file

@ -3936,6 +3936,109 @@ public:
});
}
#ifdef ARCH_ARM64
// Byte table lookup over a single 16-byte table.
// a: table operand, b: per-lane byte selectors (both are evaluated through m_ir).
// When the selectors are a compile-time constant, the lookup is emitted as a generic
// shufflevector against a zero vector: selectors >= 16 are redirected to lane 16
// (the first lane of the zero vector) and therefore produce 0.
// Otherwise the aarch64.neon.tbl1 intrinsic is called.
template <typename T1, typename T2>
value_t<u8[16]> tbl(T1 a, T2 b)
{
	const auto table = a.eval(m_ir);
	const auto selector = b.eval(m_ir);

	value_t<u8[16]> out;

	// Only these two constant forms are folded; any other value uses the intrinsic
	const auto const_vec = llvm::dyn_cast<llvm::ConstantDataVector>(selector);
	const bool foldable = const_vec != nullptr || llvm::isa<llvm::ConstantAggregateZero>(selector);

	if (!foldable)
	{
		out.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { table, selector });
		return out;
	}

	// Zero-initialised: for an all-zero selector every lane picks table[0]
	v128 lanes{};

	if (const_vec)
	{
		for (u32 lane = 0; lane < 16; lane++)
		{
			const u64 sel = const_vec->getElementAsInteger(lane);

			if (sel < 16)
			{
				lanes._u8[lane] = static_cast<u8>(sel);
			}
			else
			{
				// Out of range: select from the zero vector (second shuffle operand)
				lanes._u8[lane] = static_cast<u8>(16);
			}
		}
	}

	const auto zero_vec = llvm::ConstantAggregateZero::get(get_type<u8[16]>());
	const auto narrow_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&lanes), 16));
	const auto wide_mask = m_ir->CreateZExt(narrow_mask, get_type<u32[16]>());

	out.value = m_ir->CreateShuffleVector(table, zero_vec, wide_mask);
	return out;
}
// Byte table lookup over a 32-byte table formed by a (lanes 0..15) and b (lanes 16..31).
// a, b: table halves; indices: per-lane byte selectors (all evaluated through m_ir).
// When the selectors are a compile-time constant, the lookup is emitted as a generic
// shufflevector of the two halves followed by an AND with a constant byte mask that
// zeroes every lane whose selector is >= 32.
// Otherwise the aarch64.neon.tbl2 intrinsic is called.
template <typename T1, typename T2, typename T3>
value_t<u8[16]> tbl2(T1 a, T2 b, T3 indices)
{
	value_t<u8[16]> result;
	const auto data0 = a.eval(m_ir);
	const auto data1 = b.eval(m_ir);
	const auto index = indices.eval(m_ir);
	if (auto c = llvm::dyn_cast<llvm::Constant>(index))
	{
		const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
		if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
		{
			v128 mask{};
			v128 bitmask{};
			for (u32 i = 0; i < 16; i++)
			{
				// An all-zero constant is a ConstantAggregateZero, not a ConstantDataVector,
				// so cv is null for it: every selector is then 0, which is in range.
				// The keep-mask must still be filled in, otherwise the AND below would
				// wipe the result instead of broadcasting lane 0 of the table.
				const u64 b_val = cv ? cv->getElementAsInteger(i) : 0;
				const bool in_range = b_val < 32;
				mask._u8[i] = in_range ? static_cast<u8>(b_val) : static_cast<u8>(0);
				bitmask._u8[i] = in_range ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
			}
			auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
			auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
			auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
			// Out-of-range lanes were pointed at lane 0 above; clear them here
			auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
			result.value = m_ir->CreateAnd(lookup, z_mask);
			return result;
		}
	}
	result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
	return result;
}
// Emits a call to the aarch64.neon.tbx1 intrinsic.
// fallback: first intrinsic operand, a: 16-byte table, indices: per-lane byte selectors.
// NOTE(review): presumably lanes with an out-of-range selector keep the value from
// fallback (TBX semantics) - confirm against the Arm documentation.
template <typename T1, typename T2, typename T3>
value_t<u8[16]> tbx(T1 fallback, T2 a, T3 indices)
{
	// Operands are evaluated in the same order as they are passed to the intrinsic
	const auto passthrough = fallback.eval(m_ir);
	const auto table = a.eval(m_ir);
	const auto selector = indices.eval(m_ir);

	const auto callee = get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1);

	value_t<u8[16]> out;
	out.value = m_ir->CreateCall(callee, { passthrough, table, selector });
	return out;
}
// Emits a call to the aarch64.neon.tbx2 intrinsic.
// fallback: first intrinsic operand, a and b: the two 16-byte table halves,
// indices: per-lane byte selectors.
// NOTE(review): presumably lanes with an out-of-range selector keep the value from
// fallback (TBX semantics) - confirm against the Arm documentation.
template <typename T1, typename T2, typename T3, typename T4>
value_t<u8[16]> tbx2(T1 fallback, T2 a, T3 b, T4 indices)
{
	// Operands are evaluated in the same order as they are passed to the intrinsic
	const auto passthrough = fallback.eval(m_ir);
	const auto table_lo = a.eval(m_ir);
	const auto table_hi = b.eval(m_ir);
	const auto selector = indices.eval(m_ir);

	const auto callee = get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2);

	value_t<u8[16]> out;
	out.value = m_ir->CreateCall(callee, { passthrough, table_lo, table_hi, selector });
	return out;
}
#endif
// (m << 3) >= 0 ? a : b
template <typename T, typename U, typename V>
static auto select_by_bit4(T&& m, U&& a, V&& b)

View file

@ -1641,6 +1641,18 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
{
const auto [a, b, c] = get_vrs<u8[16]>(op.va, op.vb, op.vc);
#ifdef ARCH_ARM64
if (op.ra == op.rb)
{
set_vr(op.vd, tbl(a, (~c & 0xf)));
return;
}
set_vr(op.vd, tbl2(b, a, (~c & 0x1f)));
}
#else
if (op.ra == op.rb)
{
set_vr(op.vd, pshufb(a, ~c & 0xf));
@ -1657,6 +1669,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
const auto i = eval(~c & 0x1f);
set_vr(op.vd, select(noncast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
}
#endif
void PPUTranslator::VPKPX(ppu_opcode_t op)
{

View file

@ -5918,6 +5918,77 @@ public:
const auto a = get_vr<u8[16]>(op.ra);
const auto b = get_vr<u8[16]>(op.rb);
#ifdef ARCH_ARM64
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
{
if (auto [ok, bs] = match_expr(b, byteswap(match<u8[16]>())); ok)
{
if (op.ra == op.rb)
{
if (perm_only)
{
const auto cm = eval(c & 0x0f);
set_vr(op.rt4, tbl(as, cm));
return;
}
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
const auto cm = eval(c & 0x8f);
set_vr(op.rt4, tbx(x, as, cm));
return;
}
if (perm_only)
{
const auto cm = eval(c & 0x1f);
set_vr(op.rt4, tbl2(as, bs, cm));
return;
}
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
const auto cm = eval(c & 0x9f);
set_vr(op.rt4, tbx2(x, as, bs, cm));
return;
}
}
if (op.ra == op.rb && !m_interp_magn)
{
if (perm_only)
{
const auto cm = eval(c & 0x0f);
const auto cr = eval(cm ^ 0x0f);
set_vr(op.rt4, tbl(a, cr));
return;
}
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
const auto cm = eval(c & 0x8f);
const auto cr = eval(cm ^ 0x0f);
set_vr(op.rt4, tbx(x, a, cr));
return;
}
if (perm_only)
{
const auto cm = eval(c & 0x9f);
const auto cr = eval(cm ^ 0x0f);
set_vr(op.rt4, tbl2(a, b, cr));
return;
}
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
// AND should be before XOR so that llvm can combine them into BCAX
// Though for some reason it doesn't seem to be doing that.
const auto cm = eval(c & ~0x60);
const auto cr = eval(cm ^ 0x0f);
set_vr(op.rt4, tbx2(x, a, b, cr));
}
#else
// Data with swapped endian from a load instruction
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
{
@ -6062,6 +6133,7 @@ public:
else
set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x);
}
#endif
void MPYA(spu_opcode_t op)
{