SPU LLVM: Add VNNI optimized variant of sumb

- Uses vpdpbusd to horizontally add values; for some reason this is much faster than the normal horizontal add instructions.
This commit is contained in:
Malcolm Jestadt 2021-08-29 00:48:00 -04:00 committed by Ivan
parent a86b278115
commit d304b52391
2 changed files with 24 additions and 0 deletions

View file

@ -7032,6 +7032,18 @@ public:
void SUMB(spu_opcode_t op)
{
// TODO: Some future CPUS will support VNNI but not avx512
if (m_use_avx512_icl)
{
const auto [a, b] = get_vrs<u32[4]>(op.ra, op.rb);
const auto zeroes = splat<u32[4]>(0);
const auto ones = splat<u32[4]>(0x01010101);
const auto ax = bitcast<u16[8]>(vpdpbusd(zeroes, a, ones));
const auto bx = bitcast<u16[8]>(vpdpbusd(zeroes, b, ones));
set_vr(op.rt, shuffle2(ax, bx, 0, 8, 2, 10, 4, 12, 6, 14));
return;
}
const auto [a, b] = get_vrs<u16[8]>(op.ra, op.rb);
const auto ahs = eval((a >> 8) + (a & 0xff));
const auto bhs = eval((b >> 8) + (b & 0xff));