mirror of
https://github.com/RPCS3/rpcs3.git
synced 2026-03-11 07:56:15 +01:00
Merge 7e54a0b3bb into 0603d24a91
This commit is contained in:
commit
ffe36fc786
|
|
@ -3943,6 +3943,109 @@ public:
|
|||
});
|
||||
}
|
||||
|
||||
#ifdef ARCH_ARM64
|
||||
// Emits a 16-byte table lookup with AArch64 TBL1 semantics: for each lane i,
// result[i] = a[b[i]] when b[i] < 16, otherwise 0.
// When the index vector is a compile-time constant the lookup is lowered to a
// plain shufflevector (which LLVM can constant-fold); otherwise the NEON tbl1
// intrinsic is emitted.
template <typename T1, typename T2>
value_t<u8[16]> tbl(T1 a, T2 b)
{
	value_t<u8[16]> result;
	const auto data0 = a.eval(m_ir);
	const auto index = b.eval(m_ir);
	// Second shuffle operand: any shuffle index >= 16 reads from this
	// zero vector, reproducing tbl's return-zero-for-out-of-range behavior.
	const auto zeros = llvm::ConstantAggregateZero::get(get_type<u8[16]>());

	if (auto c = llvm::dyn_cast<llvm::Constant>(index))
	{
		v128 mask{};
		const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);

		if (cv)
		{
			for (u32 i = 0; i < 16; i++)
			{
				const u64 b_val = cv->getElementAsInteger(i);
				// Out-of-range indices are redirected to lane 0 of `zeros`
				// (shuffle index 16), i.e. the lane becomes 0.
				mask._u8[i] = (b_val < 16) ? static_cast<u8>(b_val) : static_cast<u8>(16);
			}
		}

		// ConstantAggregateZero (all-zero index) also takes the constant
		// path: the zero-initialized mask splats data0's byte 0, which is
		// what tbl with an all-zero index returns.
		if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
		{
			result.value = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
			// Widen the u8 lane indices to u32 for use as the shuffle mask.
			result.value = m_ir->CreateZExt(result.value, get_type<u32[16]>());
			result.value = m_ir->CreateShuffleVector(data0, zeros, result.value);
			return result;
		}
	}

	// Non-constant index: emit the hardware tbl1 intrinsic directly.
	result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl1), { data0, index });
	return result;
}
|
||||
|
||||
template <typename T1, typename T2, typename T3>
|
||||
value_t<u8[16]> tbl2(T1 a, T2 b, T3 indices)
|
||||
{
|
||||
value_t<u8[16]> result;
|
||||
const auto data0 = a.eval(m_ir);
|
||||
const auto data1 = b.eval(m_ir);
|
||||
const auto index = indices.eval(m_ir);
|
||||
|
||||
if (auto c = llvm::dyn_cast<llvm::Constant>(index))
|
||||
{
|
||||
v128 mask{};
|
||||
v128 bitmask{};
|
||||
const auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(c);
|
||||
|
||||
if (cv)
|
||||
{
|
||||
for (u32 i = 0; i < 16; i++)
|
||||
{
|
||||
const u64 b_val = cv->getElementAsInteger(i);
|
||||
mask._u8[i] = (b_val < 32) ? static_cast<u8>(b_val) : static_cast<u8>(0);
|
||||
bitmask._u8[i] = (b_val < 32) ? static_cast<u8>(0xFF) : static_cast<u8>(0x00);
|
||||
}
|
||||
}
|
||||
|
||||
if (cv || llvm::isa<llvm::ConstantAggregateZero>(c))
|
||||
{
|
||||
auto m_val = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&mask), 16));
|
||||
auto m_ext = m_ir->CreateZExt(m_val, get_type<u32[16]>());
|
||||
auto lookup = m_ir->CreateShuffleVector(data0, data1, m_ext);
|
||||
|
||||
auto z_mask = llvm::ConstantDataVector::get(m_context, llvm::ArrayRef(reinterpret_cast<const u8*>(&bitmask), 16));
|
||||
result.value = m_ir->CreateAnd(lookup, z_mask);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbl2), { data0, data1, index });
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename T3>
|
||||
value_t<u8[16]> tbx(T1 fallback, T2 a, T3 indices)
|
||||
{
|
||||
value_t<u8[16]> result;
|
||||
const auto v_fallback = fallback.eval(m_ir);
|
||||
const auto data0 = a.eval(m_ir);
|
||||
const auto index = indices.eval(m_ir);
|
||||
|
||||
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx1), { v_fallback, data0, index });
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename T3, typename T4>
|
||||
value_t<u8[16]> tbx2(T1 fallback, T2 a, T3 b, T4 indices)
|
||||
{
|
||||
value_t<u8[16]> result;
|
||||
const auto v_fallback = fallback.eval(m_ir);
|
||||
const auto data0 = a.eval(m_ir);
|
||||
const auto data1 = b.eval(m_ir);
|
||||
const auto index = indices.eval(m_ir);
|
||||
|
||||
result.value = m_ir->CreateCall(get_intrinsic<u8[16]>(llvm::Intrinsic::aarch64_neon_tbx2), { v_fallback, data0, data1, index });
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
// (m << 3) >= 0 ? a : b
|
||||
template <typename T, typename U, typename V>
|
||||
static auto select_by_bit4(T&& m, U&& a, V&& b)
|
||||
|
|
|
|||
|
|
@ -1642,6 +1642,18 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
|
|||
{
|
||||
const auto [a, b, c] = get_vrs<u8[16]>(op.va, op.vb, op.vc);
|
||||
|
||||
#ifdef ARCH_ARM64
|
||||
|
||||
if (op.ra == op.rb)
|
||||
{
|
||||
set_vr(op.vd, tbl(a, (~c & 0xf)));
|
||||
return;
|
||||
}
|
||||
|
||||
set_vr(op.vd, tbl2(b, a, (~c & 0x1f)));
|
||||
}
|
||||
#else
|
||||
|
||||
if (op.ra == op.rb)
|
||||
{
|
||||
set_vr(op.vd, pshufb(a, ~c & 0xf));
|
||||
|
|
@ -1658,6 +1670,7 @@ void PPUTranslator::VPERM(ppu_opcode_t op)
|
|||
const auto i = eval(~c & 0x1f);
|
||||
set_vr(op.vd, select(noncast<s8[16]>(c << 3) >= 0, pshufb(a, i), pshufb(b, i)));
|
||||
}
|
||||
#endif
|
||||
|
||||
void PPUTranslator::VPKPX(ppu_opcode_t op)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -5920,6 +5920,77 @@ public:
|
|||
const auto a = get_vr<u8[16]>(op.ra);
|
||||
const auto b = get_vr<u8[16]>(op.rb);
|
||||
|
||||
#ifdef ARCH_ARM64
|
||||
|
||||
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
|
||||
{
|
||||
if (auto [ok, bs] = match_expr(b, byteswap(match<u8[16]>())); ok)
|
||||
{
|
||||
if (op.ra == op.rb)
|
||||
{
|
||||
if (perm_only)
|
||||
{
|
||||
const auto cm = eval(c & 0x0f);
|
||||
set_vr(op.rt4, tbl(as, cm));
|
||||
return;
|
||||
}
|
||||
|
||||
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
|
||||
const auto cm = eval(c & 0x8f);
|
||||
set_vr(op.rt4, tbx(x, as, cm));
|
||||
return;
|
||||
}
|
||||
|
||||
if (perm_only)
|
||||
{
|
||||
const auto cm = eval(c & 0x1f);
|
||||
set_vr(op.rt4, tbl2(as, bs, cm));
|
||||
return;
|
||||
}
|
||||
|
||||
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
|
||||
const auto cm = eval(c & 0x9f);
|
||||
set_vr(op.rt4, tbx2(x, as, bs, cm));
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (op.ra == op.rb && !m_interp_magn)
|
||||
{
|
||||
if (perm_only)
|
||||
{
|
||||
const auto cm = eval(c & 0x0f);
|
||||
const auto cr = eval(cm ^ 0x0f);
|
||||
set_vr(op.rt4, tbl(a, cr));
|
||||
return;
|
||||
}
|
||||
|
||||
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
|
||||
const auto cm = eval(c & 0x8f);
|
||||
const auto cr = eval(cm ^ 0x0f);
|
||||
set_vr(op.rt4, tbx(x, a, cr));
|
||||
return;
|
||||
}
|
||||
|
||||
if (perm_only)
|
||||
{
|
||||
const auto cm = eval(c & 0x9f);
|
||||
const auto cr = eval(cm ^ 0x0f);
|
||||
set_vr(op.rt4, tbl2(a, b, cr));
|
||||
return;
|
||||
}
|
||||
|
||||
const auto x = tbl(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
|
||||
// AND should be before XOR so that llvm can combine them into BCAX
|
||||
// Though for some reason it doesn't seem to be doing that.
|
||||
const auto cm = eval(c & ~0x60);
|
||||
const auto cr = eval(cm ^ 0x0f);
|
||||
set_vr(op.rt4, tbx2(x, a, b, cr));
|
||||
}
|
||||
#else
|
||||
|
||||
// Data with swapped endian from a load instruction
|
||||
if (auto [ok, as] = match_expr(a, byteswap(match<u8[16]>())); ok)
|
||||
{
|
||||
|
|
@ -6064,6 +6135,7 @@ public:
|
|||
else
|
||||
set_vr(op.rt4, select_by_bit4(cr, ax, bx) | x);
|
||||
}
|
||||
#endif
|
||||
|
||||
void MPYA(spu_opcode_t op)
|
||||
{
|
||||
|
|
|
|||
Loading…
Reference in a new issue