mirror of
https://github.com/RPCS3/rpcs3.git
synced 2026-03-17 18:55:19 +01:00
SPU LLVM: Emulate GBB with udot/sdot
Some checks are pending
Generate Translation Template / Generate Translation Template (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (/rpcs3/.ci/build-linux-aarch64.sh, gcc, rpcs3/rpcs3-ci-jammy-aarch64:1.9, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (/rpcs3/.ci/build-linux.sh, gcc, rpcs3/rpcs3-ci-jammy:1.9, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (a1d35836e8d45bfc6f63c26f0a3e5d46ef622fe1, rpcs3/rpcs3-binaries-linux-arm64, /rpcs3/.ci/build-linux-aarch64.sh, clang, rpcs3/rpcs3-ci-jammy-aarch64:1.9, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (d812f1254a1157c80fd402f94446310560f54e5f, rpcs3/rpcs3-binaries-linux, /rpcs3/.ci/build-linux.sh, clang, rpcs3/rpcs3-ci-jammy:1.9, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / RPCS3 Mac ${{ matrix.name }} (0, 51ae32f468089a8169aaf1567de355ff4a3e0842, rpcs3/rpcs3-binaries-mac, Intel) (push) Waiting to run
Build RPCS3 / RPCS3 Mac ${{ matrix.name }} (1, 8e21bdbc40711a3fccd18fbf17b742348b0f4281, rpcs3/rpcs3-binaries-mac-arm64, Apple Silicon) (push) Waiting to run
Build RPCS3 / RPCS3 Windows (push) Waiting to run
Build RPCS3 / RPCS3 Windows Clang ${{ matrix.arch }} (aarch64, clang, clangarm64, ARM64, windows-11-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Windows Clang ${{ matrix.arch }} (x86_64, clang, clang64, X64, windows-2025) (push) Waiting to run
Build RPCS3 / RPCS3 FreeBSD (push) Waiting to run
Some checks are pending
Generate Translation Template / Generate Translation Template (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (/rpcs3/.ci/build-linux-aarch64.sh, gcc, rpcs3/rpcs3-ci-jammy-aarch64:1.9, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (/rpcs3/.ci/build-linux.sh, gcc, rpcs3/rpcs3-ci-jammy:1.9, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (a1d35836e8d45bfc6f63c26f0a3e5d46ef622fe1, rpcs3/rpcs3-binaries-linux-arm64, /rpcs3/.ci/build-linux-aarch64.sh, clang, rpcs3/rpcs3-ci-jammy-aarch64:1.9, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (d812f1254a1157c80fd402f94446310560f54e5f, rpcs3/rpcs3-binaries-linux, /rpcs3/.ci/build-linux.sh, clang, rpcs3/rpcs3-ci-jammy:1.9, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / RPCS3 Mac ${{ matrix.name }} (0, 51ae32f468089a8169aaf1567de355ff4a3e0842, rpcs3/rpcs3-binaries-mac, Intel) (push) Waiting to run
Build RPCS3 / RPCS3 Mac ${{ matrix.name }} (1, 8e21bdbc40711a3fccd18fbf17b742348b0f4281, rpcs3/rpcs3-binaries-mac-arm64, Apple Silicon) (push) Waiting to run
Build RPCS3 / RPCS3 Windows (push) Waiting to run
Build RPCS3 / RPCS3 Windows Clang ${{ matrix.arch }} (aarch64, clang, clangarm64, ARM64, windows-11-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Windows Clang ${{ matrix.arch }} (x86_64, clang, clang64, X64, windows-2025) (push) Waiting to run
Build RPCS3 / RPCS3 FreeBSD (push) Waiting to run
Greatly simplifies GB, GBH, and GBB. Uses udot/sdot with power-of-two masks to shift the values and sum them in a single step, effectively gathering them.
This commit is contained in:
parent
4542020c86
commit
1627757608
|
|
@ -3661,6 +3661,7 @@ public:
|
|||
return result;
|
||||
}
|
||||
|
||||
#ifdef ARCH_ARM64
|
||||
template <typename T1, typename T2, typename T3>
|
||||
value_t<u32[4]> udot(T1 a, T2 b, T3 c)
|
||||
{
|
||||
|
|
@ -3670,11 +3671,6 @@ template <typename T1, typename T2, typename T3>
|
|||
const auto data1 = b.eval(m_ir);
|
||||
const auto data2 = c.eval(m_ir);
|
||||
|
||||
// ARM hardware requires the multipliers to be treated as 16-byte vectors
|
||||
//const auto op1 = bitcast(data1, get_type<u8[16]>());
|
||||
//const auto op2 = bitcast(data2, get_type<u8[16]>());
|
||||
|
||||
// Use the variadic get_intrinsic to resolve the overloaded AArch64 intrinsic
|
||||
result.value = m_ir->CreateCall(get_intrinsic<u32[4], u8[16]>(llvm::Intrinsic::aarch64_neon_udot), {data0, data1, data2});
|
||||
return result;
|
||||
}
|
||||
|
|
@ -3688,12 +3684,24 @@ template <typename T1, typename T2, typename T3>
|
|||
const auto data1 = b.eval(m_ir);
|
||||
const auto data2 = c.eval(m_ir);
|
||||
|
||||
//const auto op1 = bitcast(data1, get_type<u8[16]>());
|
||||
//const auto op2 = bitcast(data2, get_type<u8[16]>());
|
||||
|
||||
result.value = m_ir->CreateCall(get_intrinsic<u32[4], u8[16]>(llvm::Intrinsic::aarch64_neon_sdot), {data0, data1, data2});
|
||||
return result;
|
||||
}
|
||||
|
||||
// Emits a call to the AArch64 NEON `addp` intrinsic on two LLVM expression operands.
// Both operands are evaluated through m_ir and passed to the intrinsic in (a, b) order;
// the call's result is returned wrapped in a value_t of the first operand's vector type.
// NOTE(review): Arm documents ADDP as a pairwise add over the concatenation of both
// operands - confirm the resulting lane order against each caller's expectations.
// NOTE(review): the result type is derived from T1 only; presumably T2 must have the
// same vector type - verify, since nothing here checks it.
template <typename T1, typename T2>
|
||||
auto addp(T1 a, T2 b)
|
||||
{
|
||||
// Vector type of the first operand; it selects the intrinsic overload and the result type.
using T_vector = typename is_llvm_expr<T1>::type;
|
||||
const auto data1 = a.eval(m_ir);
|
||||
const auto data2 = b.eval(m_ir);
|
||||

|
||||
// Resolve the overloaded intrinsic declaration for this vector type.
const auto func = get_intrinsic<T_vector>(llvm::Intrinsic::aarch64_neon_addp);
|
||||

|
||||
value_t<T_vector> result;
|
||||
result.value = m_ir->CreateCall(func, {data1, data2});
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename T1, typename T2>
|
||||
value_t<u8[16]> vpermb(T1 a, T2 b)
|
||||
|
|
|
|||
|
|
@ -4759,6 +4759,50 @@ public:
|
|||
}
|
||||
|
||||
const auto a = get_vr<s32[4]>(op.ra);
|
||||
|
||||
#ifdef ARCH_ARM64
|
||||
// Use dot product instructions with special values to shift then sum results into the preferred slot
|
||||
if (m_use_dotprod)
|
||||
{
|
||||
if (match_vr<s32[4], s64[2]>(op.ra, [&](auto c, auto MP)
|
||||
{
|
||||
using VT = typename decltype(MP)::type;
|
||||
|
||||
if (auto [ok, x] = match_expr(c, sext<VT>(match<bool[std::extent_v<VT>]>())); ok)
|
||||
{
|
||||
const auto zeroes = splat<u32[4]>(0);
|
||||
|
||||
const auto es = zshuffle(bitcast<u8[16]>(a), 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 4, 8, 12);
|
||||
|
||||
set_vr(op.rt, sdot(zeroes, es, build<u8[16]>(
|
||||
-0x01, -0x02, -0x04, -0x08,
|
||||
-0x01, -0x02, -0x04, -0x08,
|
||||
-0x01, -0x02, -0x04, -0x08,
|
||||
-0x01, -0x02, -0x04, -0x08
|
||||
)));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
const auto zeroes = splat<u32[4]>(0);
|
||||
const auto masked = a & 0x01;
|
||||
|
||||
const auto es = zshuffle(bitcast<u8[16]>(masked), 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 4, 8, 12);
|
||||
|
||||
set_vr(op.rt, udot(zeroes, es, build<u8[16]>(
|
||||
0x01, 0x02, 0x04, 0x08,
|
||||
0x01, 0x02, 0x04, 0x08,
|
||||
0x01, 0x02, 0x04, 0x08,
|
||||
0x01, 0x02, 0x04, 0x08
|
||||
)));
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const auto m = zext<u32>(bitcast<i4>(trunc<bool[4]>(a)));
|
||||
set_vr(op.rt, insert(splat<u32[4]>(0), 3, eval(m)));
|
||||
}
|
||||
|
|
@ -4774,6 +4818,54 @@ public:
|
|||
}
|
||||
|
||||
const auto a = get_vr<s16[8]>(op.ra);
|
||||
|
||||
#ifdef ARCH_ARM64
|
||||
// Use dot product instructions with special values to shift then sum results into the preferred slot
|
||||
if (m_use_dotprod)
|
||||
{
|
||||
if (match_vr<s16[8], s32[4], s64[2]>(op.ra, [&](auto c, auto MP)
|
||||
{
|
||||
using VT = typename decltype(MP)::type;
|
||||
|
||||
if (auto [ok, x] = match_expr(c, sext<VT>(match<bool[std::extent_v<VT>]>())); ok)
|
||||
{
|
||||
const auto zeroes = splat<u32[4]>(0);
|
||||
|
||||
const auto es = zshuffle(bitcast<u8[16]>(a), 16, 16, 16, 16, 16, 16, 16, 16, 0, 2, 4, 6, 8, 10, 12, 14);
|
||||
|
||||
const auto extracted = sdot(zeroes, es, build<u8[16]>(
|
||||
-0x01, -0x02, -0x04, -0x08,
|
||||
-0x10, -0x20, -0x40, -0x80,
|
||||
-0x01, -0x02, -0x04, -0x08,
|
||||
-0x10, -0x20, -0x40, -0x80
|
||||
));
|
||||
|
||||
set_vr(op.rt, addp(zeroes, bitcast<u32[4]>(extracted)));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
const auto zeroes = splat<u32[4]>(0);
|
||||
const auto masked = a & 0x01;
|
||||
|
||||
const auto es = zshuffle(bitcast<u8[16]>(masked), 16, 16, 16, 16, 16, 16, 16, 16, 0, 2, 4, 6, 8, 10, 12, 14);
|
||||
|
||||
const auto extracted = udot(zeroes, es, build<u8[16]>(
|
||||
0x01, 0x02, 0x04, 0x08,
|
||||
0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x04, 0x08,
|
||||
0x10, 0x20, 0x40, 0x80
|
||||
));
|
||||
|
||||
set_vr(op.rt, addp(zeroes, bitcast<u32[4]>(extracted)));
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const auto m = zext<u32>(bitcast<u8>(trunc<bool[8]>(a)));
|
||||
set_vr(op.rt, insert(splat<u32[4]>(0), 3, eval(m)));
|
||||
}
|
||||
|
|
@ -4782,6 +4874,53 @@ public:
|
|||
{
|
||||
const auto a = get_vr<u8[16]>(op.ra);
|
||||
|
||||
#ifdef ARCH_ARM64
|
||||
// Use dot product instructions with special values to shift then sum results into the preferred slot
|
||||
if (m_use_dotprod)
|
||||
{
|
||||
if (match_vr<s8[16], s16[8], s32[4], s64[2]>(op.ra, [&](auto c, auto MP)
|
||||
{
|
||||
using VT = typename decltype(MP)::type;
|
||||
|
||||
if (auto [ok, x] = match_expr(c, sext<VT>(match<bool[std::extent_v<VT>]>())); ok)
|
||||
{
|
||||
const auto zeroes = splat<u32[4]>(0);
|
||||
|
||||
const auto extracted = sdot(zeroes, a, build<u8[16]>(
|
||||
-0x01, -0x02, -0x04, -0x08,
|
||||
-0x10, -0x20, -0x40, -0x80,
|
||||
-0x01, -0x02, -0x04, -0x08,
|
||||
-0x10, -0x20, -0x40, -0x80
|
||||
));
|
||||
|
||||
const auto es = zshuffle(bitcast<u8[16]>(extracted), 16, 16, 16, 16, 16, 16, 16, 16, 0, 8, 4, 12, 16, 16, 16, 16);
|
||||
const auto zeroes16 = splat<u16[8]>(0);
|
||||
set_vr(op.rt, addp(zeroes16, bitcast<u16[8]>(es)));
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
const auto zeroes = splat<u32[4]>(0);
|
||||
const auto masked = a & 0x01;
|
||||
|
||||
const auto extracted = udot(zeroes, masked, build<u8[16]>(
|
||||
0x01, 0x02, 0x04, 0x08,
|
||||
0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x04, 0x08,
|
||||
0x10, 0x20, 0x40, 0x80
|
||||
));
|
||||
|
||||
const auto es = zshuffle(bitcast<u8[16]>(extracted), 16, 16, 16, 16, 16, 16, 16, 16, 0, 8, 4, 12, 16, 16, 16, 16);
|
||||
const auto zeroes16 = splat<u16[8]>(0);
|
||||
set_vr(op.rt, addp(zeroes16, bitcast<u16[8]>(es)));
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (m_use_gfni)
|
||||
{
|
||||
const auto as = zshuffle(a, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
|
||||
|
|
|
|||
Loading…
Reference in a new issue