SPU LLVM: improve xfloat precision

Use doubles for the intermediate representation.
Add an option "Accurate xfloat" to enable it.
This commit is contained in:
Nekotekina 2018-07-27 13:00:05 +03:00
parent d1fd4d5000
commit fdd4f03b93
3 changed files with 519 additions and 49 deletions

View file

@ -926,6 +926,19 @@ public:
template <typename T>
using value_t = llvm_value_t<T>;
template <typename T>
value_t<T> value(llvm::Value* value)
{
	// Wrap a raw LLVM value into a typed value_t<T> wrapper; the value must
	// already carry the exact LLVM type corresponding to T.
	if (value == nullptr || value->getType() != get_type<T>())
	{
		fmt::throw_exception("cpu_translator::value<>(): invalid value type");
	}

	value_t<T> wrapped;
	wrapped.value = value;
	return wrapped;
}
template <typename T>
auto eval(T expr)
{
@ -1169,6 +1182,18 @@ public:
return result;
}
// Opportunistic hardware FMA, can be used if results are identical for all possible input values
template <typename T>
auto fmuladd(T a, T b, T c)
{
value_t<typename T::type> result;
// Evaluate the three operand expressions in order
const auto av = a.eval(m_ir);
const auto bv = b.eval(m_ir);
const auto cv = c.eval(m_ir);
// llvm.fmuladd computes a * b + c and lets the backend decide whether to fuse
result.value = m_ir->CreateCall(get_intrinsic<typename T::type>(llvm::Intrinsic::fmuladd), {av, bv, cv});
return result;
}
template <typename T1, typename T2>
value_t<u8[16]> pshufb(T1 a, T2 b)
{

View file

@ -1732,7 +1732,8 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
{
if (const auto phi = m_blocks[target].phi[i])
{
phi->addIncoming(get_vr(i, get_reg_type(i)), m_block->block_end);
const auto typ = phi->getType() == get_type<f64[4]>() ? get_type<f64[4]>() : get_reg_type(i);
phi->addIncoming(get_vr(i, typ), m_block->block_end);
}
}
}
@ -1821,6 +1822,133 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
return ptr;
}
llvm::Value* double_as_uint64(llvm::Value* val)
{
// Reinterpret f64[4] as u64[4] (bitwise), folding constants without emitting IR
if (llvm::isa<llvm::ConstantAggregateZero>(val))
{
// All-zero doubles have an all-zero bit pattern
return splat<u64[4]>(0).value;
}
if (auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(val))
{
const f64 data[4]
{
cv->getElementAsDouble(0),
cv->getElementAsDouble(1),
cv->getElementAsDouble(2),
cv->getElementAsDouble(3)
};
// NOTE(review): type-puns the f64 storage as u64 via pointer casts; assumes
// identical object representation on supported targets — confirm aliasing is OK
return llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef((const u64*)(const u8*)+data, 4));
}
if (llvm::isa<llvm::Constant>(val))
{
// Any other constant kind is unexpected at this point
fmt::throw_exception("[0x%x] double_as_uint64: bad constant type", m_pos);
}
// Non-constant value: emit a plain bitcast instruction
return m_ir->CreateBitCast(val, get_type<u64[4]>());
}
llvm::Value* uint64_as_double(llvm::Value* val)
{
// Reinterpret u64[4] as f64[4] (bitwise); mirror of double_as_uint64
if (llvm::isa<llvm::ConstantAggregateZero>(val))
{
// All-zero bits correspond to +0.0 in every lane
return fsplat<f64[4]>(0.).value;
}
if (auto cv = llvm::dyn_cast<llvm::ConstantDataVector>(val))
{
const u64 data[4]
{
cv->getElementAsInteger(0),
cv->getElementAsInteger(1),
cv->getElementAsInteger(2),
cv->getElementAsInteger(3)
};
// NOTE(review): type-puns the u64 storage as f64 via pointer casts; assumes
// identical object representation on supported targets — confirm aliasing is OK
return llvm::ConstantDataVector::get(m_context, llvm::makeArrayRef((const f64*)(const u8*)+data, 4));
}
if (llvm::isa<llvm::Constant>(val))
{
// Any other constant kind is unexpected at this point
fmt::throw_exception("[0x%x] uint64_as_double: bad constant type", m_pos);
}
// Non-constant value: emit a plain bitcast instruction
return m_ir->CreateBitCast(val, get_type<f64[4]>());
}
llvm::Value* double_to_xfloat(llvm::Value* val)
{
// Repack the f64[4] intermediate representation into raw 32-bit SPU float
// bit patterns (u32[4]) by narrowing sign/exponent/mantissa fields
verify("double_to_xfloat" HERE), val, val->getType() == get_type<f64[4]>();
// Detect xfloat_to_double to avoid unnecessary ops and prevent zeroed denormals
if (auto _bitcast = llvm::dyn_cast<llvm::CastInst>(val))
{
if (_bitcast->getOpcode() == llvm::Instruction::BitCast)
{
if (auto _select = llvm::dyn_cast<llvm::SelectInst>(_bitcast->getOperand(0)))
{
if (auto _icmp = llvm::dyn_cast<llvm::ICmpInst>(_select->getOperand(0)))
{
if (auto _and = llvm::dyn_cast<llvm::BinaryOperator>(_icmp->getOperand(0)))
{
if (auto _zext = llvm::dyn_cast<llvm::CastInst>(_and->getOperand(0)))
{
// TODO: check all details and return xfloat_to_double() arg
}
}
}
}
}
}
const auto d = double_as_uint64(val);
// Sign: move bit 63 down to bit 31
const auto s = m_ir->CreateAnd(m_ir->CreateLShr(d, 32), 0x80000000);
// Shift exponent/mantissa into the 32-bit layout; flipping the top bit of the
// 8-bit exponent field performs the bias change (1023 -> 127, mod 256)
const auto m = m_ir->CreateXor(m_ir->CreateLShr(d, 29), 0x40000000);
const auto r = m_ir->CreateOr(m_ir->CreateAnd(m, 0x7fffffff), s);
// A zero input must map to an all-zero pattern, not a garbage exponent
return m_ir->CreateTrunc(m_ir->CreateSelect(m_ir->CreateIsNotNull(d), r, splat<u64[4]>(0).value), get_type<u32[4]>());
}
llvm::Value* xfloat_to_double(llvm::Value* val)
{
// Widen raw 32-bit SPU float bits (u32[4]) into the f64[4] representation
verify("xfloat_to_double" HERE), val, val->getType() == get_type<u32[4]>();
const auto x = m_ir->CreateZExt(val, get_type<u64[4]>());
// Sign: bit 31 -> bit 63
const auto s = m_ir->CreateShl(m_ir->CreateAnd(x, 0x80000000), 32);
const auto a = m_ir->CreateAnd(x, 0x7fffffff);
// Rebias the exponent (+896 = 1023-127, i.e. 0x1c0000000 = 896 << 23) while
// still in 32-bit position, then shift into the f64 field layout
const auto m = m_ir->CreateShl(m_ir->CreateAdd(a, splat<u64[4]>(0x1c0000000).value), 29);
// Zero exponent (zero/denormal input): flush the magnitude to zero
const auto r = m_ir->CreateSelect(m_ir->CreateICmpSGT(a, splat<u64[4]>(0x7fffff).value), m, splat<u64[4]>(0).value);
const auto f = m_ir->CreateOr(s, r);
return uint64_as_double(f);
}
// Clamp double values to ±Smax, flush values smaller than ±Smin to positive zero
llvm::Value* xfloat_in_double(llvm::Value* val)
{
verify("xfloat_in_double" HERE), val, val->getType() == get_type<f64[4]>();
// Smax/Smin given as raw f64 bit patterns (largest/smallest magnitudes kept)
const auto smax = uint64_as_double(splat<u64[4]>(0x47ffffffe0000000).value);
const auto smin = uint64_as_double(splat<u64[4]>(0x3810000000000000).value);
const auto d = double_as_uint64(val);
// Separate the sign; drop mantissa bits below xfloat precision (low 29 bits)
const auto s = m_ir->CreateAnd(d, 0x8000000000000000);
const auto a = uint64_as_double(m_ir->CreateAnd(d, 0x7fffffffe0000000));
// Ordered compares: NaN makes both false, so NaN ends up clamped to Smax
const auto n = m_ir->CreateFCmpOLT(a, smax);
const auto z = m_ir->CreateFCmpOLT(a, smin);
// |v| >= Smax -> Smax (sign restored below); |v| < Smin -> +0.0
const auto c = double_as_uint64(m_ir->CreateSelect(n, a, smax));
return m_ir->CreateSelect(z, fsplat<f64[4]>(0.).value, uint64_as_double(m_ir->CreateOr(c, s)));
}
// Expand 32-bit mask for xfloat values to 64-bit, 29 least significant bits are always zero
llvm::Value* conv_xfloat_mask(llvm::Value* val)
{
const auto d = m_ir->CreateZExt(val, get_type<u64[4]>());
// Sign-mask bit: 31 -> 63
const auto s = m_ir->CreateShl(m_ir->CreateAnd(d, 0x80000000), 32);
// Shift bit 30 to the top, arithmetic-shift to replicate it across the widened
// exponent bits, then realign the remaining mask bits with the f64 layout
const auto e = m_ir->CreateLShr(m_ir->CreateAShr(m_ir->CreateShl(d, 33), 4), 1);
return m_ir->CreateOr(s, e);
}
llvm::Value* get_vr(u32 index, llvm::Type* type)
{
auto& reg = m_block->reg.at(index);
@ -1831,6 +1959,67 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
reg = m_ir->CreateLoad(init_vr(index));
}
if (reg->getType() == get_type<f64[4]>())
{
if (type == reg->getType())
{
return reg;
}
const auto res = double_to_xfloat(reg);
if (auto c = llvm::dyn_cast<llvm::Constant>(res))
{
return make_const_vector(get_const_vector(c, m_pos, 1000 + index), type);
}
return m_ir->CreateBitCast(res, type);
}
if (type == get_type<f64[4]>())
{
if (const auto phi = llvm::dyn_cast<llvm::PHINode>(reg))
{
if (phi->getNumUses())
{
LOG_TODO(SPU, "[0x%x] $%u: Phi has uses :(", m_pos, index);
}
else
{
const auto cblock = m_ir->GetInsertBlock();
m_ir->SetInsertPoint(phi);
const auto newphi = m_ir->CreatePHI(get_type<f64[4]>(), phi->getNumIncomingValues());
for (u32 i = 0; i < phi->getNumIncomingValues(); i++)
{
const auto iblock = phi->getIncomingBlock(i);
m_ir->SetInsertPoint(iblock->getTerminator());
const auto ivalue = phi->getIncomingValue(i);
newphi->addIncoming(xfloat_to_double(ivalue), iblock);
}
if (phi->getParent() == m_block->block)
{
m_block->phi[index] = newphi;
}
reg = newphi;
m_ir->SetInsertPoint(cblock);
phi->eraseFromParent();
return reg;
}
}
if (auto c = llvm::dyn_cast<llvm::Constant>(reg))
{
return xfloat_to_double(make_const_vector(get_const_vector(c, m_pos, 2000 + index), get_type<u32[4]>()));
}
return xfloat_to_double(m_ir->CreateBitCast(reg, get_type<u32[4]>()));
}
// Bitcast the constant if necessary
if (auto c = llvm::dyn_cast<llvm::Constant>(reg))
{
@ -1852,13 +2041,19 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
return r;
}
void set_vr(u32 index, llvm::Value* value)
void set_vr(u32 index, llvm::Value* value, bool fixup = true)
{
// Check
verify(HERE), m_regmod[m_pos / 4] == index;
// Test for special case
const bool is_xfloat = value->getType() == get_type<f64[4]>();
// Clamp value if necessary
const auto saved_value = is_xfloat && fixup ? xfloat_in_double(value) : value;
// Set register value
m_block->reg.at(index) = value;
m_block->reg.at(index) = saved_value;
// Get register location
const auto addr = init_vr(index);
@ -1871,13 +2066,13 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
}
// Write register to the context
m_block->store[index] = m_ir->CreateStore(m_ir->CreateBitCast(value, addr->getType()->getPointerElementType()), addr);
m_block->store[index] = m_ir->CreateStore(is_xfloat ? double_to_xfloat(saved_value) : m_ir->CreateBitCast(value, addr->getType()->getPointerElementType()), addr);
}
template <typename T>
void set_vr(u32 index, T expr)
void set_vr(u32 index, T expr, bool fixup = true)
{
set_vr(index, expr.eval(m_ir));
set_vr(index, expr.eval(m_ir), fixup);
}
// Return either basic block addr with single dominating value, or negative number of PHI entries
@ -2374,7 +2569,11 @@ public:
value = m_finfo->reg[i] ? m_finfo->reg[i] : m_ir->CreateLoad(regptr);
}
if (i < 128 && llvm::isa<llvm::Constant>(value))
if (value->getType() == get_type<f64[4]>())
{
value = double_to_xfloat(value);
}
else if (i < 128 && llvm::isa<llvm::Constant>(value))
{
// Bitcast the constant
value = make_const_vector(get_const_vector(llvm::cast<llvm::Constant>(value), baddr, i), _phi->getType());
@ -2546,9 +2745,11 @@ public:
// Basic optimizations
pm.add(createEarlyCSEPass());
pm.add(createAggressiveDCEPass());
pm.add(createCFGSimplificationPass());
pm.add(createNewGVNPass());
pm.add(createDeadStoreEliminationPass());
pm.add(createLoopVersioningLICMPass());
pm.add(createAggressiveDCEPass());
//pm.add(createLintPass()); // Check
for (const auto& func : m_functions)
@ -4448,6 +4649,11 @@ public:
op1 = get_vr<f32[4]>(op.rb).value;
op2 = get_vr<f32[4]>(op.ra).value;
}
else if (op1 && op1->getType() == get_type<f64[4]>() || op2 && op2->getType() == get_type<f64[4]>())
{
op1 = get_vr<f64[4]>(op.rb).value;
op2 = get_vr<f64[4]>(op.ra).value;
}
else
{
op1 = get_vr<u32[4]>(op.rb).value;
@ -4478,6 +4684,22 @@ public:
}
}
const auto op1 = m_block->reg[op.rb];
const auto op2 = m_block->reg[op.ra];
if (op1 && op1->getType() == get_type<f64[4]>() || op2 && op2->getType() == get_type<f64[4]>())
{
// Optimization: keep xfloat values in doubles even if the mask is unpredictable (hard way)
const auto c = get_vr<u32[4]>(op.rc);
const auto b = get_vr<f64[4]>(op.rb);
const auto a = get_vr<f64[4]>(op.ra);
const auto m = conv_xfloat_mask(c.value);
const auto x = m_ir->CreateAnd(double_as_uint64(b.value), m);
const auto y = m_ir->CreateAnd(double_as_uint64(a.value), m_ir->CreateNot(m));
set_vr(op.rt4, uint64_as_double(m_ir->CreateOr(x, y)));
return;
}
set_vr(op.rt4, merge(get_vr(op.rc), get_vr(op.rb), get_vr(op.ra)));
}
@ -4695,121 +4917,343 @@ public:
void FREST(spu_opcode_t op) //
{
set_vr(op.rt, fsplat<f32[4]>(1.0) / get_vr<f32[4]>(op.ra));
// TODO
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt, fsplat<f64[4]>(1.0) / get_vr<f64[4]>(op.ra));
else
set_vr(op.rt, fsplat<f32[4]>(1.0) / get_vr<f32[4]>(op.ra));
}
void FRSQEST(spu_opcode_t op) //
{
set_vr(op.rt, fsplat<f32[4]>(1.0) / sqrt(fabs(get_vr<f32[4]>(op.ra))));
// TODO
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt, fsplat<f64[4]>(1.0) / sqrt(fabs(get_vr<f64[4]>(op.ra))));
else
set_vr(op.rt, fsplat<f32[4]>(1.0) / sqrt(fabs(get_vr<f32[4]>(op.ra))));
}
void FCGT(spu_opcode_t op) //
{
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OGT>(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb))));
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OGT>(get_vr<f64[4]>(op.ra), get_vr<f64[4]>(op.rb))));
else
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OGT>(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb))));
}
void FCMGT(spu_opcode_t op) //
{
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OGT>(fabs(get_vr<f32[4]>(op.ra)), fabs(get_vr<f32[4]>(op.rb)))));
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OGT>(fabs(get_vr<f64[4]>(op.ra)), fabs(get_vr<f64[4]>(op.rb)))));
else
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OGT>(fabs(get_vr<f32[4]>(op.ra)), fabs(get_vr<f32[4]>(op.rb)))));
}
void FA(spu_opcode_t op) //
{
set_vr(op.rt, get_vr<f32[4]>(op.ra) + get_vr<f32[4]>(op.rb));
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt, get_vr<f64[4]>(op.ra) + get_vr<f64[4]>(op.rb));
else
set_vr(op.rt, get_vr<f32[4]>(op.ra) + get_vr<f32[4]>(op.rb));
}
void FS(spu_opcode_t op) //
{
set_vr(op.rt, get_vr<f32[4]>(op.ra) - get_vr<f32[4]>(op.rb));
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt, get_vr<f64[4]>(op.ra) - get_vr<f64[4]>(op.rb));
else
set_vr(op.rt, get_vr<f32[4]>(op.ra) - get_vr<f32[4]>(op.rb));
}
void FM(spu_opcode_t op) //
{
set_vr(op.rt, get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb));
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt, get_vr<f64[4]>(op.ra) * get_vr<f64[4]>(op.rb));
else
set_vr(op.rt, get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb));
}
void FESD(spu_opcode_t op) //
{
value_t<f64[2]> r;
r.value = m_ir->CreateFPExt(shuffle2<f32[2]>(get_vr<f32[4]>(op.ra), fsplat<f32[4]>(0.), 1, 3).value, get_type<f64[2]>());
set_vr(op.rt, r);
if (g_cfg.core.spu_accurate_xfloat)
{
const auto r = shuffle2<f64[2]>(get_vr<f64[4]>(op.ra), fsplat<f64[4]>(0.), 1, 3);
const auto d = bitcast<s64[2]>(r);
const auto a = eval(d & 0x7fffffffffffffff);
const auto s = eval(d & 0x8000000000000000);
const auto i = select(a == 0x47f0000000000000, eval(s | 0x7ff0000000000000), d);
const auto n = select(a > 0x47f0000000000000, splat<s64[2]>(0x7ff8000000000000), i);
set_vr(op.rt, bitcast<f64[2]>(n));
}
else
{
value_t<f64[2]> r;
r.value = m_ir->CreateFPExt(shuffle2<f32[2]>(get_vr<f32[4]>(op.ra), fsplat<f32[4]>(0.), 1, 3).value, get_type<f64[2]>());
set_vr(op.rt, r);
}
}
void FRDS(spu_opcode_t op) //
{
value_t<f32[2]> r;
r.value = m_ir->CreateFPTrunc(get_vr<f64[2]>(op.ra).value, get_type<f32[2]>());
set_vr(op.rt, shuffle2<f32[4]>(r, fsplat<f32[2]>(0.), 2, 0, 3, 1));
if (g_cfg.core.spu_accurate_xfloat)
{
const auto r = get_vr<f64[2]>(op.ra);
const auto d = bitcast<s64[2]>(r);
const auto a = eval(d & 0x7fffffffffffffff);
const auto s = eval(d & 0x8000000000000000);
const auto i = select(a > 0x47f0000000000000, eval(s | 0x47f0000000000000), d);
const auto n = select(a > 0x7ff0000000000000, splat<s64[2]>(0x47f8000000000000), i);
const auto z = select(a < 0x3810000000000000, s, n);
set_vr(op.rt, shuffle2<f64[4]>(bitcast<f64[2]>(z), fsplat<f64[2]>(0.), 2, 0, 3, 1), false);
}
else
{
value_t<f32[2]> r;
r.value = m_ir->CreateFPTrunc(get_vr<f64[2]>(op.ra).value, get_type<f32[2]>());
set_vr(op.rt, shuffle2<f32[4]>(r, fsplat<f32[2]>(0.), 2, 0, 3, 1));
}
}
void FCEQ(spu_opcode_t op) //
{
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OEQ>(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb))));
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OEQ>(get_vr<f64[4]>(op.ra), get_vr<f64[4]>(op.rb))));
else
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OEQ>(get_vr<f32[4]>(op.ra), get_vr<f32[4]>(op.rb))));
}
void FCMEQ(spu_opcode_t op) //
{
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OEQ>(fabs(get_vr<f32[4]>(op.ra)), fabs(get_vr<f32[4]>(op.rb)))));
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OEQ>(fabs(get_vr<f64[4]>(op.ra)), fabs(get_vr<f64[4]>(op.rb)))));
else
set_vr(op.rt, sext<u32[4]>(fcmp<llvm::FCmpInst::FCMP_OEQ>(fabs(get_vr<f32[4]>(op.ra)), fabs(get_vr<f32[4]>(op.rb)))));
}
void FNMS(spu_opcode_t op) //
{
set_vr(op.rt4, get_vr<f32[4]>(op.rc) - get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb));
// See FMA.
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt4, -fmuladd(get_vr<f64[4]>(op.ra), get_vr<f64[4]>(op.rb), eval(-get_vr<f64[4]>(op.rc))));
else
set_vr(op.rt4, get_vr<f32[4]>(op.rc) - get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb));
}
void FMA(spu_opcode_t op) //
{
// NOTE(review): the next line appears to be the pre-change f32 path retained by the diff rendering
set_vr(op.rt4, get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb) + get_vr<f32[4]>(op.rc));
// Hardware FMA produces the same result as multiply + add on the limited double range (xfloat).
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt4, fmuladd(get_vr<f64[4]>(op.ra), get_vr<f64[4]>(op.rb), get_vr<f64[4]>(op.rc)));
else
set_vr(op.rt4, get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb) + get_vr<f32[4]>(op.rc));
}
void FMS(spu_opcode_t op) //
{
set_vr(op.rt4, get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb) - get_vr<f32[4]>(op.rc));
// See FMA.
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt4, fmuladd(get_vr<f64[4]>(op.ra), get_vr<f64[4]>(op.rb), eval(-get_vr<f64[4]>(op.rc))));
else
set_vr(op.rt4, get_vr<f32[4]>(op.ra) * get_vr<f32[4]>(op.rb) - get_vr<f32[4]>(op.rc));
}
void FI(spu_opcode_t op) //
{
set_vr(op.rt, get_vr<f32[4]>(op.rb));
// TODO
if (g_cfg.core.spu_accurate_xfloat)
set_vr(op.rt, get_vr<f64[4]>(op.rb));
else
set_vr(op.rt, get_vr<f32[4]>(op.rb));
}
void CFLTS(spu_opcode_t op) //
{
value_t<f32[4]> a = get_vr<f32[4]>(op.ra);
if (op.i8 != 173)
a = eval(a * fsplat<f32[4]>(std::exp2(static_cast<float>(static_cast<s16>(173 - op.i8)))));
if (g_cfg.core.spu_accurate_xfloat)
{
value_t<f64[4]> a = get_vr<f64[4]>(op.ra);
if (op.i8 != 173)
a = eval(a * fsplat<f64[4]>(std::exp2(static_cast<int>(173 - op.i8))));
value_t<s32[4]> r;
r.value = m_ir->CreateFPToSI(a.value, get_type<s32[4]>());
set_vr(op.rt, r ^ sext<s32[4]>(fcmp<llvm::FCmpInst::FCMP_OGE>(a, fsplat<f32[4]>(std::exp2(31.f)))));
value_t<s32[4]> r;
if (auto ca = llvm::dyn_cast<llvm::ConstantDataVector>(a.value))
{
const f64 data[4]
{
ca->getElementAsDouble(0),
ca->getElementAsDouble(1),
ca->getElementAsDouble(2),
ca->getElementAsDouble(3)
};
v128 result;
for (u32 i = 0; i < 4; i++)
{
if (data[i] >= std::exp2(31.f))
{
result._s32[i] = INT32_MAX;
}
else if (data[i] < std::exp2(-31.f))
{
result._s32[i] = INT32_MIN;
}
else
{
result._s32[i] = static_cast<s32>(data[i]);
}
}
r.value = make_const_vector(result, get_type<s32[4]>());
set_vr(op.rt, r);
return;
}
if (llvm::isa<llvm::ConstantAggregateZero>(a.value))
{
set_vr(op.rt, splat<u32[4]>(0));
return;
}
r.value = m_ir->CreateFPToSI(a.value, get_type<s32[4]>());
set_vr(op.rt, r ^ sext<s32[4]>(fcmp<llvm::FCmpInst::FCMP_OGE>(a, fsplat<f64[4]>(std::exp2(31.f)))));
}
else
{
value_t<f32[4]> a = get_vr<f32[4]>(op.ra);
if (op.i8 != 173)
a = eval(a * fsplat<f32[4]>(std::exp2(static_cast<float>(static_cast<s16>(173 - op.i8)))));
value_t<s32[4]> r;
r.value = m_ir->CreateFPToSI(a.value, get_type<s32[4]>());
set_vr(op.rt, r ^ sext<s32[4]>(fcmp<llvm::FCmpInst::FCMP_OGE>(a, fsplat<f32[4]>(std::exp2(31.f)))));
}
}
void CFLTU(spu_opcode_t op) //
{
value_t<f32[4]> a = get_vr<f32[4]>(op.ra);
if (op.i8 != 173)
a = eval(a * fsplat<f32[4]>(std::exp2(static_cast<float>(static_cast<s16>(173 - op.i8)))));
if (g_cfg.core.spu_accurate_xfloat)
{
value_t<f64[4]> a = get_vr<f64[4]>(op.ra);
if (op.i8 != 173)
a = eval(a * fsplat<f64[4]>(std::exp2(static_cast<int>(173 - op.i8))));
value_t<s32[4]> r;
r.value = m_ir->CreateFPToUI(a.value, get_type<s32[4]>());
set_vr(op.rt, r & ~(bitcast<s32[4]>(a) >> 31));
value_t<s32[4]> r;
if (auto ca = llvm::dyn_cast<llvm::ConstantDataVector>(a.value))
{
const f64 data[4]
{
ca->getElementAsDouble(0),
ca->getElementAsDouble(1),
ca->getElementAsDouble(2),
ca->getElementAsDouble(3)
};
v128 result;
for (u32 i = 0; i < 4; i++)
{
if (data[i] >= std::exp2(32.f))
{
result._u32[i] = UINT32_MAX;
}
else if (data[i] < 0.)
{
result._u32[i] = 0;
}
else
{
result._u32[i] = static_cast<u32>(data[i]);
}
}
r.value = make_const_vector(result, get_type<s32[4]>());
set_vr(op.rt, r);
return;
}
if (llvm::isa<llvm::ConstantAggregateZero>(a.value))
{
set_vr(op.rt, splat<u32[4]>(0));
return;
}
r.value = m_ir->CreateFPToUI(a.value, get_type<s32[4]>());
set_vr(op.rt, r & sext<s32[4]>(fcmp<llvm::FCmpInst::FCMP_OGE>(a, fsplat<f64[4]>(0.))));
}
else
{
value_t<f32[4]> a = get_vr<f32[4]>(op.ra);
if (op.i8 != 173)
a = eval(a * fsplat<f32[4]>(std::exp2(static_cast<float>(static_cast<s16>(173 - op.i8)))));
value_t<s32[4]> r;
r.value = m_ir->CreateFPToUI(a.value, get_type<s32[4]>());
set_vr(op.rt, r & ~(bitcast<s32[4]>(a) >> 31));
}
}
void CSFLT(spu_opcode_t op) //
{
value_t<f32[4]> r;
r.value = m_ir->CreateSIToFP(get_vr<s32[4]>(op.ra).value, get_type<f32[4]>());
if (op.i8 != 155)
r = eval(r * fsplat<f32[4]>(std::exp2(static_cast<float>(static_cast<s16>(op.i8 - 155)))));
set_vr(op.rt, r);
if (g_cfg.core.spu_accurate_xfloat)
{
value_t<s32[4]> a = get_vr<s32[4]>(op.ra);
value_t<f64[4]> r;
if (auto ca = llvm::dyn_cast<llvm::Constant>(a.value))
{
v128 data = get_const_vector(ca, m_pos, 25971);
r = build<f64[4]>(data._s32[0], data._s32[1], data._s32[2], data._s32[3]);
}
else
{
r.value = m_ir->CreateSIToFP(a.value, get_type<f64[4]>());
}
if (op.i8 != 155)
r = eval(r * fsplat<f64[4]>(std::exp2(static_cast<int>(op.i8 - 155))));
set_vr(op.rt, r);
}
else
{
value_t<f32[4]> r;
r.value = m_ir->CreateSIToFP(get_vr<s32[4]>(op.ra).value, get_type<f32[4]>());
if (op.i8 != 155)
r = eval(r * fsplat<f32[4]>(std::exp2(static_cast<float>(static_cast<s16>(op.i8 - 155)))));
set_vr(op.rt, r);
}
}
void CUFLT(spu_opcode_t op) //
{
value_t<f32[4]> r;
r.value = m_ir->CreateUIToFP(get_vr<s32[4]>(op.ra).value, get_type<f32[4]>());
if (op.i8 != 155)
r = eval(r * fsplat<f32[4]>(std::exp2(static_cast<float>(static_cast<s16>(op.i8 - 155)))));
set_vr(op.rt, r);
if (g_cfg.core.spu_accurate_xfloat)
{
value_t<s32[4]> a = get_vr<s32[4]>(op.ra);
value_t<f64[4]> r;
if (auto ca = llvm::dyn_cast<llvm::Constant>(a.value))
{
v128 data = get_const_vector(ca, m_pos, 20971);
r = build<f64[4]>(data._u32[0], data._u32[1], data._u32[2], data._u32[3]);
}
else
{
r.value = m_ir->CreateUIToFP(a.value, get_type<f64[4]>());
}
if (op.i8 != 155)
r = eval(r * fsplat<f64[4]>(std::exp2(static_cast<int>(op.i8 - 155))));
set_vr(op.rt, r);
}
else
{
value_t<f32[4]> r;
r.value = m_ir->CreateUIToFP(get_vr<s32[4]>(op.ra).value, get_type<f32[4]>());
if (op.i8 != 155)
r = eval(r * fsplat<f32[4]>(std::exp2(static_cast<float>(static_cast<s16>(op.i8 - 155)))));
set_vr(op.rt, r);
}
}
void STQX(spu_opcode_t op) //

View file

@ -359,6 +359,7 @@ struct cfg_root : cfg::node
cfg::_bool spu_verification{this, "SPU Verification", true}; // Should be enabled
cfg::_bool spu_cache{this, "SPU Cache", true};
cfg::_enum<tsx_usage> enable_TSX{this, "Enable TSX", tsx_usage::enabled}; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully
cfg::_bool spu_accurate_xfloat{this, "Accurate xfloat", false};
cfg::_enum<lib_loading_type> lib_loading{this, "Lib Loader", lib_loading_type::liblv2only};
cfg::_bool hook_functions{this, "Hook static functions"};