mirror of
https://github.com/RPCS3/rpcs3.git
synced 2026-05-07 13:37:46 +00:00
Initial optimizations (Squashed to fix submodules)
This commit is contained in:
parent
db96c355ce
commit
2d4af4c88c
13 changed files with 426 additions and 558 deletions
|
|
@ -14,7 +14,10 @@ void shared_mutex::imp_lock_shared(s64 _old)
|
|||
|
||||
for (int i = 0; i < 10; i++)
|
||||
{
|
||||
busy_wait();
|
||||
if (i != 0)
|
||||
{
|
||||
busy_wait();
|
||||
}
|
||||
|
||||
const s64 value = m_value.load();
|
||||
|
||||
|
|
@ -173,7 +176,10 @@ void shared_mutex::imp_lock(s64 _old)
|
|||
|
||||
for (int i = 0; i < 10; i++)
|
||||
{
|
||||
busy_wait();
|
||||
if (i != 0)
|
||||
{
|
||||
busy_wait();
|
||||
}
|
||||
|
||||
const s64 value = m_value.load();
|
||||
|
||||
|
|
@ -236,6 +242,10 @@ void shared_mutex::imp_lock_degrade()
|
|||
|
||||
bool shared_mutex::try_lock_shared()
|
||||
{
|
||||
if (m_value < c_min) // Fast path
|
||||
{
|
||||
return false;
|
||||
}
|
||||
// Conditional decrement
|
||||
return m_value.fetch_op([](s64& value) { if (value >= c_min) value -= c_min; }) >= c_min;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,25 +36,25 @@ spu_recompiler::spu_recompiler()
|
|||
}
|
||||
}
|
||||
|
||||
void spu_recompiler::compile(spu_function_t& f)
|
||||
bool spu_recompiler::compile(std::shared_ptr<spu_function_contents_t> f)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
if (f.compiled)
|
||||
if (f->compiled)
|
||||
{
|
||||
// return if function already compiled
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (f.addr >= 0x40000 || f.addr % 4 || f.size == 0 || f.size > 0x40000 - f.addr || f.size % 4)
|
||||
if (f->addr >= 0x40000 || f->addr % 4 || f->size == 0 || f->size > 0x40000 - f->addr || f->size % 4)
|
||||
{
|
||||
fmt::throw_exception("Invalid SPU function (addr=0x%05x, size=0x%x)" HERE, f.addr, f.size);
|
||||
fmt::throw_exception("Invalid SPU function (addr=0x%05x, size=0x%x)" HERE, f->addr, f->size);
|
||||
}
|
||||
|
||||
using namespace asmjit;
|
||||
|
||||
SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
|
||||
dis_asm.offset = reinterpret_cast<u8*>(f.data.data()) - f.addr;
|
||||
dis_asm.offset = reinterpret_cast<u8*>(f->data.data()) - f->addr;
|
||||
|
||||
StringLogger logger;
|
||||
logger.addOptions(Logger::kOptionBinaryForm);
|
||||
|
|
@ -63,10 +63,10 @@ void spu_recompiler::compile(spu_function_t& f)
|
|||
|
||||
if (g_cfg.core.spu_debug)
|
||||
{
|
||||
fmt::append(log, "========== SPU FUNCTION 0x%05x - 0x%05x ==========\n\n", f.addr, f.addr + f.size);
|
||||
fmt::append(log, "========== SPU FUNCTION 0x%05x - 0x%05x ==========\n\n", f->addr, f->addr + f->size);
|
||||
}
|
||||
|
||||
this->m_func = &f;
|
||||
this->m_func = f;
|
||||
|
||||
asmjit::CodeHolder code;
|
||||
code.init(m_jit->getCodeInfo());
|
||||
|
|
@ -121,13 +121,13 @@ void spu_recompiler::compile(spu_function_t& f)
|
|||
compiler.alloc(vec_vars[5], asmjit::x86::xmm5);
|
||||
|
||||
// Initialize labels
|
||||
std::vector<Label> pos_labels{ 0x10000 };
|
||||
this->labels = pos_labels.data();
|
||||
this->labels = std::unique_ptr<Label[]>(reinterpret_cast<Label*>(new u8[0x10000 * sizeof(Label)]()));
|
||||
auto pos_labels = this->labels.get();
|
||||
|
||||
// Register labels for block entries
|
||||
for (const u32 addr : f.blocks)
|
||||
for (const u32 addr : f->blocks)
|
||||
{
|
||||
if (addr < f.addr || addr >= f.addr + f.size || addr % 4)
|
||||
if (addr < f->addr || addr >= f->addr + f->size || addr % 4)
|
||||
{
|
||||
fmt::throw_exception("Invalid function block entry (0x%05x)" HERE, addr);
|
||||
}
|
||||
|
|
@ -136,15 +136,15 @@ void spu_recompiler::compile(spu_function_t& f)
|
|||
}
|
||||
|
||||
// Register label for post-the-end address
|
||||
pos_labels[(f.addr + f.size) / 4 % 0x10000] = compiler.newLabel();
|
||||
pos_labels[(f->addr + f->size) / 4 % 0x10000] = compiler.newLabel();
|
||||
|
||||
// Register label for jump table resolver
|
||||
Label jt_label = compiler.newLabel();
|
||||
this->jt = &jt_label;
|
||||
|
||||
for (const u32 addr : f.jtable)
|
||||
for (const u32 addr : f->jtable)
|
||||
{
|
||||
if (addr < f.addr || addr >= f.addr + f.size || addr % 4)
|
||||
if (addr < f->addr || addr >= f->addr + f->size || addr % 4)
|
||||
{
|
||||
fmt::throw_exception("Invalid jump table entry (0x%05x)" HERE, addr);
|
||||
}
|
||||
|
|
@ -155,24 +155,16 @@ void spu_recompiler::compile(spu_function_t& f)
|
|||
this->end = &end_label;
|
||||
|
||||
// Start compilation
|
||||
m_pos = f.addr;
|
||||
m_pos = f->addr;
|
||||
|
||||
if (utils::has_avx())
|
||||
{
|
||||
compiler.vzeroupper();
|
||||
//compiler.pxor(asmjit::x86::xmm0, asmjit::x86::xmm0);
|
||||
//compiler.vptest(asmjit::x86::ymm0, asmjit::x86::ymm0);
|
||||
//compiler.jnz(end_label);
|
||||
}
|
||||
|
||||
for (const u32 op : f.data)
|
||||
for (const u32 op : f->data)
|
||||
{
|
||||
// Bind label if initialized
|
||||
if (pos_labels[m_pos / 4].isValid())
|
||||
{
|
||||
compiler.bind(pos_labels[m_pos / 4]);
|
||||
|
||||
if (f.blocks.find(m_pos) != f.blocks.end())
|
||||
if (f->blocks.find(m_pos) != f->blocks.end())
|
||||
{
|
||||
compiler.comment("Block:");
|
||||
}
|
||||
|
|
@ -219,12 +211,12 @@ void spu_recompiler::compile(spu_function_t& f)
|
|||
// Generate jump table resolver (uses addr_var)
|
||||
compiler.bind(jt_label);
|
||||
|
||||
if (f.jtable.size())
|
||||
if (f->jtable.size())
|
||||
{
|
||||
compiler.comment("Jump table resolver:");
|
||||
}
|
||||
|
||||
for (const u32 addr : f.jtable)
|
||||
for (const u32 addr : f->jtable)
|
||||
{
|
||||
if ((addr % 4) == 0 && addr < 0x40000 && pos_labels[addr / 4].isValid())
|
||||
{
|
||||
|
|
@ -253,8 +245,8 @@ void spu_recompiler::compile(spu_function_t& f)
|
|||
Func fn;
|
||||
m_jit->add(&fn, codeHolder);
|
||||
|
||||
f.compiled = asmjit::Internal::ptr_cast<decltype(f.compiled)>(fn);
|
||||
|
||||
f->compiled = asmjit::Internal::ptr_cast<decltype(f->compiled)>(fn);
|
||||
|
||||
if (g_cfg.core.spu_debug)
|
||||
{
|
||||
// Add ASMJIT logs
|
||||
|
|
@ -462,10 +454,19 @@ void spu_recompiler::LNOP(spu_opcode_t op)
|
|||
{
|
||||
}
|
||||
|
||||
// Flag every entry registered in _spu->compiled_functions as dirty.
// Emitted code reaches this helper through the call generated by SYNC().
void invalidate_jit(SPUThread* _spu)
{
	auto& registered = _spu->compiled_functions;

	for (auto it = registered.begin(), last = registered.end(); it != last; ++it)
	{
		(*it)->dirty_bit = true;
	}
}
|
||||
|
||||
// SYNC: emit a host call that marks the thread's compiled functions dirty,
// followed by a full memory fence.
void spu_recompiler::SYNC(spu_opcode_t op)
{
	// invalidate_jit() has the C++ type void(SPUThread*), so the asmjit signature
	// must declare a void return as well; announcing a u32 result would describe
	// a return register the callee never writes.
	asmjit::CCFuncCall* call = c->call(
		asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(SPUThread*)>(invalidate_jit)),
		asmjit::FuncSignature1<void, SPUThread*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *cpu);

	// This instruction must be used following a store instruction that modifies the instruction stream.
	c->mfence();
}
|
||||
|
||||
void spu_recompiler::DSYNC(spu_opcode_t op)
|
||||
|
|
@ -623,16 +624,6 @@ void spu_recompiler::ROT(spu_opcode_t op)
|
|||
return;
|
||||
}
|
||||
|
||||
if (utils::has_xop())
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
c->vprotd(vt, va, vb);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
return;
|
||||
}
|
||||
|
||||
auto body = [](u32* t, const u32* a, const s32* b) noexcept
|
||||
{
|
||||
for (u32 i = 0; i < 4; i++)
|
||||
|
|
@ -672,22 +663,6 @@ void spu_recompiler::ROTM(spu_opcode_t op)
|
|||
return;
|
||||
}
|
||||
|
||||
if (utils::has_xop())
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
c->psubd(vb, XmmConst(_mm_set1_epi32(1)));
|
||||
c->pandn(vb, XmmConst(_mm_set1_epi32(0x3f)));
|
||||
c->pxor(vt, vt);
|
||||
c->psubd(vt, vb);
|
||||
c->pcmpgtd(vb, XmmConst(_mm_set1_epi32(31)));
|
||||
c->vpshld(vt, va, vt);
|
||||
c->vpandn(vt, vb, vt);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
return;
|
||||
}
|
||||
|
||||
auto body = [](u32* t, const u32* a, const u32* b) noexcept
|
||||
{
|
||||
for (u32 i = 0; i < 4; i++)
|
||||
|
|
@ -728,21 +703,6 @@ void spu_recompiler::ROTMA(spu_opcode_t op)
|
|||
return;
|
||||
}
|
||||
|
||||
if (utils::has_xop())
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
c->psubd(vb, XmmConst(_mm_set1_epi32(1)));
|
||||
c->pandn(vb, XmmConst(_mm_set1_epi32(0x3f)));
|
||||
c->pxor(vt, vt);
|
||||
c->pminud(vb, XmmConst(_mm_set1_epi32(31)));
|
||||
c->psubd(vt, vb);
|
||||
c->vpshad(vt, va, vt);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
return;
|
||||
}
|
||||
|
||||
auto body = [](s32* t, const s32* a, const u32* b) noexcept
|
||||
{
|
||||
for (u32 i = 0; i < 4; i++)
|
||||
|
|
@ -782,19 +742,6 @@ void spu_recompiler::SHL(spu_opcode_t op)
|
|||
return;
|
||||
}
|
||||
|
||||
if (utils::has_xop())
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
c->pand(vb, XmmConst(_mm_set1_epi32(0x3f)));
|
||||
c->vpcmpgtd(vt, vb, XmmConst(_mm_set1_epi32(31)));
|
||||
c->vpshld(vb, va, vb);
|
||||
c->pandn(vt, vb);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
return;
|
||||
}
|
||||
|
||||
auto body = [](u32* t, const u32* a, const u32* b) noexcept
|
||||
{
|
||||
for (u32 i = 0; i < 4; i++)
|
||||
|
|
@ -828,24 +775,13 @@ void spu_recompiler::ROTH(spu_opcode_t op) //nf
|
|||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
const XmmLink& v4 = XmmAlloc();
|
||||
c->vmovdqa(v4, XmmConst(_mm_set_epi32(0x0d0c0d0c, 0x09080908, 0x05040504, 0x01000100)));
|
||||
c->vpshufb(vt, va, v4); // duplicate low word
|
||||
c->vpsrld(va, va, 16);
|
||||
c->vpshufb(va, va, v4);
|
||||
c->vpsrld(v4, vb, 16);
|
||||
c->vprolvd(va, va, v4);
|
||||
c->vprolvd(vb, vt, vb);
|
||||
c->vpblendw(vt, vb, va, 0xaa);
|
||||
c->vmovdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
return;
|
||||
}
|
||||
|
||||
if (utils::has_xop())
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
c->vprotw(vt, va, vb);
|
||||
c->movdqa(v4, XmmConst(_mm_set1_epi16(0xf)));
|
||||
c->pand(vb, v4);
|
||||
c->vpsllvw(vt, va, vb);
|
||||
c->psubw(vb, XmmConst(_mm_set1_epi16(1)));
|
||||
c->pandn(vb, v4);
|
||||
c->vpsrlvw(va, va, vb);
|
||||
c->por(vt, va);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
return;
|
||||
}
|
||||
|
|
@ -889,42 +825,6 @@ void spu_recompiler::ROTHM(spu_opcode_t op)
|
|||
return;
|
||||
}
|
||||
|
||||
if (utils::has_avx2())
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
const XmmLink& v4 = XmmAlloc();
|
||||
const XmmLink& v5 = XmmAlloc();
|
||||
c->psubw(vb, XmmConst(_mm_set1_epi16(1)));
|
||||
c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f)));
|
||||
c->movdqa(vt, XmmConst(_mm_set1_epi32(0xffff0000))); // mask: select high words
|
||||
c->vpsrld(v4, vb, 16);
|
||||
c->vpsubusw(v5, vb, vt); // clear high words (using saturation sub for throughput)
|
||||
c->vpandn(vb, vt, va); // clear high words
|
||||
c->vpsrlvd(va, va, v4);
|
||||
c->vpsrlvd(vb, vb, v5);
|
||||
c->vpblendw(vt, vb, va, 0xaa); // can use vpblendvb with 0xffff0000 mask (vt)
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
return;
|
||||
}
|
||||
|
||||
if (utils::has_xop())
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
c->psubw(vb, XmmConst(_mm_set1_epi16(1)));
|
||||
c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f)));
|
||||
c->pxor(vt, vt);
|
||||
c->psubw(vt, vb);
|
||||
c->pcmpgtw(vb, XmmConst(_mm_set1_epi16(15)));
|
||||
c->vpshlw(vt, va, vt);
|
||||
c->vpandn(vt, vb, vt);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
return;
|
||||
}
|
||||
|
||||
auto body = [](u16* t, const u16* a, const u16* b) noexcept
|
||||
{
|
||||
for (u32 i = 0; i < 8; i++)
|
||||
|
|
@ -965,43 +865,6 @@ void spu_recompiler::ROTMAH(spu_opcode_t op)
|
|||
return;
|
||||
}
|
||||
|
||||
if (utils::has_avx2())
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
const XmmLink& v4 = XmmAlloc();
|
||||
const XmmLink& v5 = XmmAlloc();
|
||||
c->psubw(vb, XmmConst(_mm_set1_epi16(1)));
|
||||
c->movdqa(vt, XmmConst(_mm_set1_epi16(0x1f)));
|
||||
c->vpandn(v4, vb, vt);
|
||||
c->vpand(v5, vb, vt);
|
||||
c->movdqa(vt, XmmConst(_mm_set1_epi32(0x2f)));
|
||||
c->vpsrld(v4, v4, 16);
|
||||
c->vpsubusw(v5, vt, v5); // clear high word and add 16 to low word
|
||||
c->vpslld(vb, va, 16);
|
||||
c->vpsravd(va, va, v4);
|
||||
c->vpsravd(vb, vb, v5);
|
||||
c->vpblendw(vt, vb, va, 0xaa);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
return;
|
||||
}
|
||||
|
||||
if (utils::has_xop())
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
c->psubw(vb, XmmConst(_mm_set1_epi16(1)));
|
||||
c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f)));
|
||||
c->pxor(vt, vt);
|
||||
c->pminuw(vb, XmmConst(_mm_set1_epi16(15)));
|
||||
c->psubw(vt, vb);
|
||||
c->vpshaw(vt, va, vt);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
return;
|
||||
}
|
||||
|
||||
auto body = [](s16* t, const s16* a, const u16* b) noexcept
|
||||
{
|
||||
for (u32 i = 0; i < 8; i++)
|
||||
|
|
@ -1041,38 +904,6 @@ void spu_recompiler::SHLH(spu_opcode_t op)
|
|||
return;
|
||||
}
|
||||
|
||||
if (utils::has_avx2())
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
const XmmLink& v4 = XmmAlloc();
|
||||
const XmmLink& v5 = XmmAlloc();
|
||||
c->pand(vb, XmmConst(_mm_set1_epi16(0x1f)));
|
||||
c->movdqa(vt, XmmConst(_mm_set1_epi32(0xffff0000))); // mask: select high words
|
||||
c->vpsrld(v4, vb, 16);
|
||||
c->vpsubusw(v5, vb, vt); // clear high words (using saturation sub for throughput)
|
||||
c->vpand(vb, vt, va); // clear low words
|
||||
c->vpsllvd(va, va, v5);
|
||||
c->vpsllvd(vb, vb, v4);
|
||||
c->vpblendw(vt, vb, va, 0x55);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
return;
|
||||
}
|
||||
|
||||
if (utils::has_xop())
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
|
||||
const XmmLink& vt = XmmAlloc();
|
||||
c->pand(vb, XmmConst(_mm_set1_epi16(0x1f)));
|
||||
c->vpcmpgtw(vt, vb, XmmConst(_mm_set1_epi16(15)));
|
||||
c->vpshlw(vb, va, vb);
|
||||
c->pandn(vt, vb);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
|
||||
return;
|
||||
}
|
||||
|
||||
auto body = [](u16* t, const u16* a, const u16* b) noexcept
|
||||
{
|
||||
for (u32 i = 0; i < 8; i++)
|
||||
|
|
@ -1111,14 +942,6 @@ void spu_recompiler::ROTI(spu_opcode_t op)
|
|||
return;
|
||||
}
|
||||
|
||||
if (utils::has_xop())
|
||||
{
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
c->vprotd(va, va, s);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt), va);
|
||||
return;
|
||||
}
|
||||
|
||||
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
|
||||
const XmmLink& v1 = XmmAlloc();
|
||||
c->movdqa(v1, va);
|
||||
|
|
@ -1716,57 +1539,50 @@ void spu_recompiler::CDX(spu_opcode_t op)
|
|||
|
||||
// ROTQBI: rotate the 128-bit value in ra left by (rb._u32[3] & 7) bits and store it in rt.
// Only the general-purpose-register sequence is emitted: a second (SSE) sequence that
// stored to rt first would make this one re-read an already rotated value when rt == ra.
void spu_recompiler::ROTQBI(spu_opcode_t op)
{
	// Load both 64-bit halves of ra before anything is written back
	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));

	// Keep an untouched copy of half 0: it feeds the bits shifted into half 1
	c->mov(*qw2, *qw0);

	// Bit count comes from word 3 of rb, limited to 0..7
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->and_(*addr, 7);

	// Each half is shifted left and refilled from the top bits of the other half
	c->shld(*qw0, *qw1, *addr);
	c->shld(*qw1, *qw2, *addr);

	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);

	c->unuse(*addr);
	c->unuse(*qw0);
	c->unuse(*qw1);
	c->unuse(*qw2);
}
|
||||
|
||||
// ROTQMBI: shift the 128-bit value in ra right by ((0 - rb._u32[3]) & 7) bits, zero-filling
// from the top, and store it in rt.
// Only the general-purpose-register sequence is emitted: a preceding SSE sequence that
// stored to rt would cause a double shift whenever rt == ra.
void spu_recompiler::ROTQMBI(spu_opcode_t op)
{
	// Load both 64-bit halves of ra before anything is written back
	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));

	// Shift amount is the negated word 3 of rb, limited to 0..7
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->neg(*addr);
	c->and_(*addr, 7);

	// Half 0 receives the low bits of half 1; half 1 is zero-filled
	c->shrd(*qw0, *qw1, *addr);
	c->shr(*qw1, *addr);

	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);

	c->unuse(*addr);
	c->unuse(*qw0);
	c->unuse(*qw1);
}
|
||||
|
||||
// SHLQBI: shift the 128-bit value in ra left by (rb._u32[3] & 7) bits, zero-filling
// from the bottom, and store it in rt.
// Only the general-purpose-register sequence is emitted: a preceding SSE sequence that
// stored to rt would cause a double shift whenever rt == ra.
void spu_recompiler::SHLQBI(spu_opcode_t op)
{
	// Load both 64-bit halves of ra before anything is written back
	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));

	// Bit count comes from word 3 of rb, limited to 0..7
	c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
	c->and_(*addr, 7);

	// Half 1 must be updated first: it needs the unshifted top bits of half 0
	c->shld(*qw1, *qw0, *addr);
	c->shl(*qw0, *addr);

	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);

	c->unuse(*addr);
	c->unuse(*qw0);
	c->unuse(*qw1);
}
|
||||
|
||||
void spu_recompiler::ROTQBY(spu_opcode_t op)
|
||||
|
|
@ -1867,14 +1683,16 @@ void spu_recompiler::SHLQBY(spu_opcode_t op)
|
|||
|
||||
// ORX: OR the four 32-bit words of ra together, store the result in rt._u32[3]
// and clear rt._u32[0..2].
// A single scalar sequence is emitted; an additional SSE sequence producing the
// same register contents would only be overwritten by these stores.
void spu_recompiler::ORX(spu_opcode_t op)
{
	// Accumulate all four words of ra (every read happens before rt is written,
	// so rt == ra is handled correctly)
	c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 0));
	c->or_(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 1));
	c->or_(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 2));
	c->or_(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
	c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), *addr);

	// Zero the remaining three words of rt
	c->xor_(*addr, *addr);
	c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, 0), *addr);
	c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, 1), *addr);
	c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, 2), *addr);

	c->unuse(*addr);
}
|
||||
|
||||
void spu_recompiler::CBD(spu_opcode_t op)
|
||||
|
|
@ -1981,37 +1799,40 @@ void spu_recompiler::CDD(spu_opcode_t op)
|
|||
|
||||
// ROTQBII: rotate the 128-bit value in ra left by the immediate (op.i7 & 7) bits
// and store it in rt.
// Only the general-purpose-register sequence is emitted: a preceding SSE sequence that
// stored to rt would cause a double rotation whenever rt == ra.
void spu_recompiler::ROTQBII(spu_opcode_t op)
{
	// Load both 64-bit halves of ra before anything is written back
	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));

	// Keep an untouched copy of half 0: it feeds the bits shifted into half 1
	c->mov(*qw2, *qw0);

	// Each half is shifted left and refilled from the top bits of the other half
	c->shld(*qw0, *qw1, op.i7 & 0x7);
	c->shld(*qw1, *qw2, op.i7 & 0x7);

	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);

	c->unuse(*qw0);
	c->unuse(*qw1);
	c->unuse(*qw2);
}
|
||||
|
||||
// ROTQMBII: shift the 128-bit value in ra right by the immediate ((0 - op.i7) & 7) bits,
// zero-filling from the top, and store it in rt.
// Only the general-purpose-register sequence is emitted: a preceding SSE sequence that
// stored to rt would cause a double shift whenever rt == ra.
void spu_recompiler::ROTQMBII(spu_opcode_t op)
{
	// Load both 64-bit halves of ra before anything is written back
	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));

	// Parentheses spell out the grouping the unparenthesised form already had:
	// binary '-' binds tighter than '&'.
	c->shrd(*qw0, *qw1, (0 - op.i7) & 0x7);
	c->shr(*qw1, (0 - op.i7) & 0x7);

	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);

	c->unuse(*qw0);
	c->unuse(*qw1);
}
|
||||
|
||||
// SHLQBII: shift the 128-bit value in ra left by the immediate (op.i7 & 7) bits,
// zero-filling from the bottom, and store it in rt.
// Only the general-purpose-register sequence is emitted: a preceding SSE sequence that
// stored to rt would cause a double shift whenever rt == ra.
void spu_recompiler::SHLQBII(spu_opcode_t op)
{
	// Load both 64-bit halves of ra before anything is written back
	c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
	c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));

	// Half 1 must be updated first: it needs the unshifted top bits of half 0
	c->shld(*qw1, *qw0, op.i7 & 0x7);
	c->shl(*qw0, op.i7 & 0x7);

	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
	c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);

	c->unuse(*qw0);
	c->unuse(*qw1);
}
|
||||
|
||||
void spu_recompiler::ROTQBYI(spu_opcode_t op)
|
||||
|
|
@ -3478,13 +3299,6 @@ void spu_recompiler::SELB(spu_opcode_t op)
|
|||
return;
|
||||
}
|
||||
|
||||
if (utils::has_xop())
|
||||
{
|
||||
c->vpcmov(vc, vb, SPU_OFF_128(gpr, op.ra), vc);
|
||||
c->movdqa(SPU_OFF_128(gpr, op.rt4), vc);
|
||||
return;
|
||||
}
|
||||
|
||||
c->pand(vb, vc);
|
||||
c->pandn(vc, SPU_OFF_128(gpr, op.ra));
|
||||
c->por(vb, vc);
|
||||
|
|
@ -3609,10 +3423,6 @@ void spu_recompiler::SHUFB(spu_opcode_t op)
|
|||
{
|
||||
c->vpternlogd(vc, va, vb, 0xca /* A?B:C */);
|
||||
}
|
||||
else if (utils::has_xop())
|
||||
{
|
||||
c->vpcmov(vc, va, vb, vc);
|
||||
}
|
||||
else
|
||||
{
|
||||
c->pand(va, vc);
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ class spu_recompiler : public spu_recompiler_base
|
|||
public:
|
||||
spu_recompiler();
|
||||
|
||||
virtual void compile(spu_function_t& f) override;
|
||||
virtual bool compile(std::shared_ptr<spu_function_contents_t> f) override;
|
||||
|
||||
private:
|
||||
// emitter:
|
||||
|
|
@ -41,7 +41,7 @@ private:
|
|||
std::array<asmjit::X86Xmm*, 6> vec;
|
||||
|
||||
// labels:
|
||||
asmjit::Label* labels; // array[0x10000]
|
||||
std::unique_ptr<asmjit::Label[]> labels; // array[0x10000]
|
||||
asmjit::Label* jt; // jump table resolver (uses *addr)
|
||||
asmjit::Label* end; // function end (return *addr)
|
||||
|
||||
|
|
|
|||
|
|
@ -5,16 +5,18 @@
|
|||
|
||||
const spu_decoder<spu_itype> s_spu_itype;
|
||||
|
||||
spu_function_t* SPUDatabase::find(const be_t<u32>* data, u64 key, u32 max_size)
|
||||
std::shared_ptr<spu_function_contents_t> SPUDatabase::find(const be_t<u32>* data, u64 key, u32 max_size, void* ignore)
|
||||
{
|
||||
for (auto found = m_db.equal_range(key); found.first != found.second; found.first++)
|
||||
{
|
||||
const auto& func = found.first->second;
|
||||
const auto & func = found.first->second;
|
||||
|
||||
// TODO remove code after a while if it hasn't been touched, else there's a big memory bloat here
|
||||
|
||||
// Compare binary data explicitly (TODO: optimize)
|
||||
if (LIKELY(func->size <= max_size) && std::memcmp(func->data.data(), data, func->size) == 0)
|
||||
if (func.get() != ignore && LIKELY(func->size <= max_size) && memcmp(func->data.data(), data, func->size) == 0)
|
||||
{
|
||||
return func.get();
|
||||
return func;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -33,8 +35,9 @@ SPUDatabase::~SPUDatabase()
|
|||
// TODO: serialize database
|
||||
}
|
||||
|
||||
spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_limit)
|
||||
std::shared_ptr<spu_function_contents_t> SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, void* ignore /*=nullptr*/)
|
||||
{
|
||||
const u32 max_limit = 0x40000;
|
||||
// Check arguments (bounds and alignment)
|
||||
if (max_limit > 0x40000 || entry >= max_limit || entry % 4 || max_limit % 4)
|
||||
{
|
||||
|
|
@ -42,7 +45,12 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
|
|||
}
|
||||
|
||||
// Key for multimap
|
||||
const u64 key = entry | u64{ ls[entry / 4] } << 32;
|
||||
u32 xor_base = 0;
|
||||
for (u32 i = 0; i < 10; i++)
|
||||
{
|
||||
xor_base ^= ls[(entry / 4) + i];
|
||||
}
|
||||
const u64 key = entry | u64{ xor_base } << 32;
|
||||
const be_t<u32>* base = ls + entry / 4;
|
||||
const u32 block_sz = max_limit - entry;
|
||||
|
||||
|
|
@ -56,7 +64,7 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
|
|||
}
|
||||
}
|
||||
|
||||
{
|
||||
/*{
|
||||
writer_lock lock(m_mutex);
|
||||
|
||||
// Double-check
|
||||
|
|
@ -64,7 +72,7 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
|
|||
{
|
||||
return func;
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
||||
// Initialize block entries with the function entry point
|
||||
std::set<u32> blocks{ entry };
|
||||
|
|
@ -84,15 +92,21 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
|
|||
// Find preliminary set of possible block entries (first pass), `start` is the current block address
|
||||
for (u32 start = entry, pos = entry; pos < limit; pos += 4)
|
||||
{
|
||||
u32 xor_base = 0;
|
||||
for (u32 i = 0; i < 10; i++)
|
||||
{
|
||||
xor_base ^= ls[(pos / 4) + i];
|
||||
}
|
||||
const spu_opcode_t op{ ls[pos / 4] };
|
||||
|
||||
const auto type = s_spu_itype.decode(op.opcode);
|
||||
|
||||
if (pos != entry)
|
||||
{
|
||||
reader_lock lock(m_mutex);
|
||||
|
||||
// Find existing function
|
||||
if (pos != entry && find(ls + pos / 4, pos | u64{ op.opcode } << 32, limit - pos))
|
||||
if (find(ls + pos / 4, pos | u64{ ls[pos / 4] } << 32, limit - pos))
|
||||
{
|
||||
limit = pos;
|
||||
break;
|
||||
|
|
@ -308,7 +322,7 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
|
|||
}
|
||||
|
||||
// Prepare new function (set addr and size)
|
||||
auto func = std::make_shared<spu_function_t>(entry, limit - entry);
|
||||
auto func = std::make_shared<spu_function_contents_t>(entry, limit - entry);
|
||||
|
||||
// Copy function contents
|
||||
func->data = { ls + entry / 4, ls + limit / 4 };
|
||||
|
|
@ -354,5 +368,5 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
|
|||
|
||||
LOG_NOTICE(SPU, "Function detected [0x%05x-0x%05x] (size=0x%x)", func->addr, func->addr + func->size, func->size);
|
||||
|
||||
return func.get();
|
||||
return func;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,7 +2,9 @@
|
|||
|
||||
#include "Utilities/mutex.h"
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <memory>
|
||||
|
||||
// SPU Instruction Type
|
||||
struct spu_itype
|
||||
|
|
@ -246,8 +248,10 @@ struct spu_itype
|
|||
|
||||
class SPUThread;
|
||||
|
||||
typedef u32(*CompiledFunc)(SPUThread* _spu, be_t<u32>* _ls);
|
||||
|
||||
// SPU basic function information structure
|
||||
struct spu_function_t
|
||||
struct spu_function_contents_t
|
||||
{
|
||||
// Entry point (LS address)
|
||||
const u32 addr;
|
||||
|
|
@ -271,30 +275,48 @@ struct spu_function_t
|
|||
bool does_reset_stack;
|
||||
|
||||
// Pointer to the compiled function
|
||||
u32(*compiled)(SPUThread* _spu, be_t<u32>* _ls) = nullptr;
|
||||
CompiledFunc compiled = nullptr;
|
||||
|
||||
spu_function_t(u32 addr, u32 size)
|
||||
: addr(addr)
|
||||
, size(size)
|
||||
spu_function_contents_t(u32 addr, u32 size)
|
||||
: addr(addr),
|
||||
size(size)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
// A single instance of a compiled function, currently in use
|
||||
union spu_function_t
|
||||
{
|
||||
// The function itself and its data
|
||||
std::shared_ptr<spu_function_contents_t> contents;
|
||||
|
||||
// Whether pages the function is in were written to since its last execution
|
||||
bool dirty_bit : 1;
|
||||
|
||||
operator bool()
|
||||
{
|
||||
return contents != nullptr;
|
||||
}
|
||||
|
||||
spu_function_t() : contents(nullptr) {};
|
||||
~spu_function_t() {dirty_bit = false; contents.reset();};
|
||||
};
|
||||
|
||||
// SPU Function Database (must be global or PS3 process-local)
|
||||
class SPUDatabase final : spu_itype
|
||||
{
|
||||
shared_mutex m_mutex;
|
||||
|
||||
// All registered functions (uses addr and first instruction as a key)
|
||||
std::unordered_multimap<u64, std::shared_ptr<spu_function_t>> m_db;
|
||||
std::unordered_multimap<u64, std::shared_ptr<spu_function_contents_t>> m_db;
|
||||
|
||||
// For internal use
|
||||
spu_function_t* find(const be_t<u32>* data, u64 key, u32 max_size);
|
||||
std::shared_ptr<spu_function_contents_t> find(const be_t<u32>* data, u64 key, u32 max_size, void* ignore = nullptr);
|
||||
|
||||
public:
|
||||
SPUDatabase();
|
||||
~SPUDatabase();
|
||||
|
||||
// Try to retrieve SPU function information
|
||||
spu_function_t* analyse(const be_t<u32>* ls, u32 entry, u32 limit = 0x40000);
|
||||
std::shared_ptr<spu_function_contents_t> analyse(const be_t<u32>* ls, u32 entry, void * ignore=nullptr);
|
||||
};
|
||||
|
|
|
|||
|
|
@ -101,7 +101,8 @@ void spu_interpreter::RDCH(SPUThread& spu, spu_opcode_t op)
|
|||
}
|
||||
else
|
||||
{
|
||||
spu.gpr[op.rt] = v128::from32r(result);
|
||||
memset(&spu.gpr[op.rt], 0, 3*sizeof(u32));
|
||||
spu.gpr[op.rt]._u32[3] = result; v128::from32r(result);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -24,39 +24,48 @@ void spu_recompiler_base::enter(SPUThread& spu)
|
|||
const auto _ls = vm::_ptr<u32>(spu.offset);
|
||||
|
||||
// Search if cached data matches
|
||||
auto func = spu.compiled_cache[spu.pc / 4];
|
||||
|
||||
// Check shared db if we dont have a match
|
||||
if (!func || !std::equal(func->data.begin(), func->data.end(), _ls + spu.pc / 4, [](const be_t<u32>& l, const be_t<u32>& r) { return *(u32*)(u8*)&l == *(u32*)(u8*)&r; }))
|
||||
auto & func = spu.compiled_cache[spu.pc / 4];
|
||||
if (func.dirty_bit)
|
||||
{
|
||||
func = spu.spu_db->analyse(_ls, spu.pc);
|
||||
spu.compiled_cache[spu.pc / 4] = func;
|
||||
func.dirty_bit = false;
|
||||
|
||||
// This memcmp acts as a fast path instead of finding it again in analyse.
|
||||
if (memcmp(func.contents->data.data(), _ls + (spu.pc / 4), func.contents->size) != 0)
|
||||
{
|
||||
func.contents = spu.spu_db->analyse(_ls, spu.pc, func.contents.get());
|
||||
}
|
||||
}
|
||||
else if (!func)
|
||||
{
|
||||
func.contents = spu.spu_db->analyse(_ls, spu.pc);
|
||||
spu.compiled_functions.push_back(&func);
|
||||
}
|
||||
|
||||
// Reset callstack if necessary
|
||||
if ((func->does_reset_stack && spu.recursion_level) || spu.recursion_level >= 128)
|
||||
if ((func.contents->does_reset_stack && spu.recursion_level) || spu.recursion_level >= 128)
|
||||
{
|
||||
spu.state += cpu_flag::ret;
|
||||
return;
|
||||
}
|
||||
|
||||
// Compile if needed
|
||||
if (!func->compiled)
|
||||
if (!func.contents->compiled)
|
||||
{
|
||||
if (!spu.spu_rec)
|
||||
{
|
||||
spu.spu_rec = fxm::get_always<spu_recompiler>();
|
||||
}
|
||||
|
||||
spu.spu_rec->compile(*func);
|
||||
spu.spu_rec->compile(func.contents);
|
||||
|
||||
if (!func->compiled) fmt::throw_exception("Compilation failed" HERE);
|
||||
if (!func.contents->compiled) fmt::throw_exception("Compilation failed" HERE);
|
||||
}
|
||||
|
||||
const u32 res = func->compiled(&spu, _ls);
|
||||
const u32 res = func.contents->compiled(&spu, _ls);
|
||||
|
||||
if (const auto exception = spu.pending_exception)
|
||||
if (spu.pending_exception)
|
||||
{
|
||||
const auto exception = spu.pending_exception;
|
||||
spu.pending_exception = nullptr;
|
||||
std::rethrow_exception(exception);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ class spu_recompiler_base
|
|||
protected:
|
||||
std::mutex m_mutex; // must be locked in compile()
|
||||
|
||||
const spu_function_t* m_func; // current function
|
||||
std::shared_ptr<const spu_function_contents_t> m_func; // current function
|
||||
|
||||
u32 m_pos; // current position
|
||||
|
||||
|
|
@ -18,7 +18,7 @@ public:
|
|||
virtual ~spu_recompiler_base();
|
||||
|
||||
// Compile specified function
|
||||
virtual void compile(spu_function_t& f) = 0;
|
||||
virtual bool compile(std::shared_ptr<spu_function_contents_t>) = 0;
|
||||
|
||||
// Run
|
||||
static void enter(class SPUThread&);
|
||||
|
|
|
|||
|
|
@ -41,6 +41,18 @@ bool operator ==(const u128& lhs, const u128& rhs)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifndef _MSC_VER
|
||||
// Stand-in for the `__movsq` intrinsic on compilers that do not define
// `_MSC_VER`: copies `Count` consecutive `unsigned long` words from `Source`
// to `Destination` with a single `rep movsq` instruction.
//
// Destination - first word of the destination buffer (passed in RDI)
// Source      - first word of the source buffer (passed in RSI)
// Count       - number of words to copy (passed in RCX)
//
// NOTE(review): `movsq` always moves 8 bytes per iteration, so this assumes
// `unsigned long` is 64 bits wide on every non-MSVC target - confirm.
FORCE_INLINE void __movsq(unsigned long * Destination, const unsigned long * Source, size_t Count)
{
	// The instruction advances RDI/RSI and counts RCX down, so all three
	// registers are declared as outputs tied to the matching inputs.
	// The buffers themselves are accessed through those pointers and are not
	// visible in the operand list; the "memory" clobber tells the compiler
	// that memory is read and written here, so it must not keep buffer
	// contents cached in registers or reorder accesses across the asm.
	__asm__ __volatile__
	(
		"rep; movsq"
		: [Destination] "=D" (Destination), [Source] "=S" (Source), [Count] "=c" (Count)
		: "[Destination]" (Destination), "[Source]" (Source), "[Count]" (Count)
		: "memory"
	);
}
|
||||
#endif
|
||||
|
||||
extern u64 get_timebased_time();
|
||||
extern u64 get_system_time();
|
||||
|
||||
|
|
@ -134,8 +146,8 @@ namespace spu
|
|||
{
|
||||
if (timeout_ms > 0)
|
||||
{
|
||||
const u64 timeout = timeout_ms * 1000u; //convert to microseconds
|
||||
const u64 start = get_system_time();
|
||||
const auto timeout = timeout_ms * 1000ull; //convert to microseconds
|
||||
const auto start = get_system_time();
|
||||
auto remaining = timeout;
|
||||
|
||||
while (atomic_instruction_table[pc_offset].load(std::memory_order_consume) >= max_concurrent_instructions)
|
||||
|
|
@ -162,14 +174,14 @@ namespace spu
|
|||
}
|
||||
}
|
||||
|
||||
atomic_instruction_table[pc_offset]++;
|
||||
++atomic_instruction_table[pc_offset];
|
||||
}
|
||||
|
||||
void release_pc_address(u32 pc)
|
||||
{
|
||||
const u32 pc_offset = pc >> 2;
|
||||
|
||||
atomic_instruction_table[pc_offset]--;
|
||||
--atomic_instruction_table[pc_offset];
|
||||
}
|
||||
|
||||
struct concurrent_execution_watchdog
|
||||
|
|
@ -290,6 +302,8 @@ void SPUThread::on_spawn()
|
|||
{
|
||||
thread_ctrl::set_native_priority(-1);
|
||||
}
|
||||
|
||||
++g_num_spu_threads;
|
||||
}
|
||||
|
||||
void SPUThread::on_init(const std::shared_ptr<void>& _this)
|
||||
|
|
@ -532,81 +546,11 @@ void SPUThread::do_dma_transfer(const spu_mfc_cmd& args, bool from_mfc)
|
|||
}
|
||||
default:
|
||||
{
|
||||
auto vdst = static_cast<__m128i*>(dst);
|
||||
auto vsrc = static_cast<const __m128i*>(src);
|
||||
auto vcnt = size / sizeof(__m128i);
|
||||
auto vdst = static_cast<u64*>(dst);
|
||||
auto vsrc = static_cast<const u64*>(src);
|
||||
auto vcnt = size / sizeof(u64);
|
||||
|
||||
//if (is_get && !from_mfc)
|
||||
{
|
||||
while (vcnt >= 8)
|
||||
{
|
||||
const __m128i data[]
|
||||
{
|
||||
_mm_load_si128(vsrc + 0),
|
||||
_mm_load_si128(vsrc + 1),
|
||||
_mm_load_si128(vsrc + 2),
|
||||
_mm_load_si128(vsrc + 3),
|
||||
_mm_load_si128(vsrc + 4),
|
||||
_mm_load_si128(vsrc + 5),
|
||||
_mm_load_si128(vsrc + 6),
|
||||
_mm_load_si128(vsrc + 7),
|
||||
};
|
||||
|
||||
_mm_store_si128(vdst + 0, data[0]);
|
||||
_mm_store_si128(vdst + 1, data[1]);
|
||||
_mm_store_si128(vdst + 2, data[2]);
|
||||
_mm_store_si128(vdst + 3, data[3]);
|
||||
_mm_store_si128(vdst + 4, data[4]);
|
||||
_mm_store_si128(vdst + 5, data[5]);
|
||||
_mm_store_si128(vdst + 6, data[6]);
|
||||
_mm_store_si128(vdst + 7, data[7]);
|
||||
|
||||
vcnt -= 8;
|
||||
vsrc += 8;
|
||||
vdst += 8;
|
||||
}
|
||||
|
||||
while (vcnt--)
|
||||
{
|
||||
_mm_store_si128(vdst++, _mm_load_si128(vsrc++));
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
// Disabled
|
||||
while (vcnt >= 8)
|
||||
{
|
||||
const __m128i data[]
|
||||
{
|
||||
_mm_load_si128(vsrc + 0),
|
||||
_mm_load_si128(vsrc + 1),
|
||||
_mm_load_si128(vsrc + 2),
|
||||
_mm_load_si128(vsrc + 3),
|
||||
_mm_load_si128(vsrc + 4),
|
||||
_mm_load_si128(vsrc + 5),
|
||||
_mm_load_si128(vsrc + 6),
|
||||
_mm_load_si128(vsrc + 7),
|
||||
};
|
||||
|
||||
_mm_stream_si128(vdst + 0, data[0]);
|
||||
_mm_stream_si128(vdst + 1, data[1]);
|
||||
_mm_stream_si128(vdst + 2, data[2]);
|
||||
_mm_stream_si128(vdst + 3, data[3]);
|
||||
_mm_stream_si128(vdst + 4, data[4]);
|
||||
_mm_stream_si128(vdst + 5, data[5]);
|
||||
_mm_stream_si128(vdst + 6, data[6]);
|
||||
_mm_stream_si128(vdst + 7, data[7]);
|
||||
|
||||
vcnt -= 8;
|
||||
vsrc += 8;
|
||||
vdst += 8;
|
||||
}
|
||||
|
||||
while (vcnt--)
|
||||
{
|
||||
_mm_stream_si128(vdst++, _mm_load_si128(vsrc++));
|
||||
}
|
||||
__movsq(vdst, vsrc, vcnt);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -662,15 +606,14 @@ void SPUThread::process_mfc_cmd()
|
|||
|
||||
if (is_polling)
|
||||
{
|
||||
vm::waiter waiter;
|
||||
waiter.owner = this;
|
||||
waiter.addr = raddr;
|
||||
waiter.size = 128;
|
||||
waiter.stamp = rtime;
|
||||
waiter.data = rdata.data();
|
||||
waiter.init();
|
||||
vm::waiter* waiter = new vm::waiter();
|
||||
waiter->owner = this;
|
||||
waiter->addr = raddr;
|
||||
waiter->stamp = rtime;
|
||||
waiter->data = rdata.data();
|
||||
waiter->init();
|
||||
|
||||
while (vm::reservation_acquire(raddr, 128) == waiter.stamp && rdata == data)
|
||||
while (vm::reservation_acquire(raddr, 128) == waiter->stamp && rdata == data)
|
||||
{
|
||||
if (test(state, cpu_flag::stop))
|
||||
{
|
||||
|
|
@ -679,6 +622,8 @@ void SPUThread::process_mfc_cmd()
|
|||
|
||||
thread_ctrl::wait_for(100);
|
||||
}
|
||||
|
||||
waiter->remove();
|
||||
}
|
||||
else if (s_use_rtm && utils::transaction_enter())
|
||||
{
|
||||
|
|
@ -704,9 +649,11 @@ void SPUThread::process_mfc_cmd()
|
|||
if (is_polling || UNLIKELY(vm::reservation_acquire(raddr, 128) != rtime))
|
||||
{
|
||||
// TODO: vm::check_addr
|
||||
vm::reader_lock lock;
|
||||
rtime = vm::reservation_acquire(raddr, 128);
|
||||
rdata = data;
|
||||
{
|
||||
vm::reader_lock lock;
|
||||
rtime = vm::reservation_acquire(raddr, 128);
|
||||
}
|
||||
memcpy(rdata.data(), data.data(), rdata.size() * sizeof(rdata[0]));
|
||||
}
|
||||
|
||||
// Copy to LS
|
||||
|
|
@ -723,38 +670,54 @@ void SPUThread::process_mfc_cmd()
|
|||
|
||||
bool result = false;
|
||||
|
||||
if (raddr == ch_mfc_cmd.eal && rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
|
||||
{
|
||||
// TODO: vm::check_addr
|
||||
if (s_use_rtm && utils::transaction_enter())
|
||||
{
|
||||
if (!vm::reader_lock{vm::try_to_lock})
|
||||
{
|
||||
_xabort(0);
|
||||
}
|
||||
// Check for fast exit in the beginning as well
|
||||
vm::writer_lock lock(vm::try_to_lock);
|
||||
|
||||
if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
|
||||
if (lock.locked || memcmp(rdata.data(), data.data(), rdata.size() * sizeof(rdata[0])) == 0) {
|
||||
if (raddr == ch_mfc_cmd.eal && rtime == vm::reservation_acquire(raddr, 128))
|
||||
{
|
||||
// TODO: vm::check_addr
|
||||
if (s_use_rtm && utils::transaction_enter())
|
||||
{
|
||||
if (!lock.locked && !vm::reader_lock{ vm::try_to_lock })
|
||||
{
|
||||
_xabort(0);
|
||||
}
|
||||
|
||||
if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
|
||||
{
|
||||
data = to_write;
|
||||
result = true;
|
||||
|
||||
vm::reservation_update(raddr, 128);
|
||||
vm::notify(raddr, 128);
|
||||
}
|
||||
|
||||
_xend();
|
||||
}
|
||||
else if (lock.locked)
|
||||
{
|
||||
data = to_write;
|
||||
result = true;
|
||||
|
||||
vm::reservation_update(raddr, 128);
|
||||
lock.unlock();
|
||||
|
||||
result = true;
|
||||
vm::notify(raddr, 128);
|
||||
}
|
||||
|
||||
_xend();
|
||||
}
|
||||
else
|
||||
{
|
||||
vm::writer_lock lock;
|
||||
|
||||
if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
|
||||
else
|
||||
{
|
||||
data = to_write;
|
||||
result = true;
|
||||
// TODO maybe timeout and check if the lock is still needed in long waits (If rtime changes, no use)
|
||||
vm::writer_lock lock(0);
|
||||
|
||||
vm::reservation_update(raddr, 128);
|
||||
vm::notify(raddr, 128);
|
||||
if (rtime == vm::reservation_acquire(raddr, 128))
|
||||
{
|
||||
data = to_write;
|
||||
vm::reservation_update(raddr, 128);
|
||||
lock.unlock();
|
||||
|
||||
result = true;
|
||||
vm::notify(raddr, 128);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -766,11 +729,10 @@ void SPUThread::process_mfc_cmd()
|
|||
else
|
||||
{
|
||||
ch_atomic_stat.set_value(MFC_PUTLLC_FAILURE);
|
||||
}
|
||||
|
||||
if (raddr && !result)
|
||||
{
|
||||
ch_event_stat |= SPU_EVENT_LR;
|
||||
if (raddr)
|
||||
{
|
||||
ch_event_stat |= SPU_EVENT_LR;
|
||||
}
|
||||
}
|
||||
|
||||
raddr = 0;
|
||||
|
|
@ -808,9 +770,11 @@ void SPUThread::process_mfc_cmd()
|
|||
return;
|
||||
}
|
||||
|
||||
vm::writer_lock lock(0);
|
||||
data = to_write;
|
||||
vm::reservation_update(ch_mfc_cmd.eal, 128);
|
||||
{
|
||||
vm::writer_lock lock(0);
|
||||
vm::reservation_update(ch_mfc_cmd.eal, 128);
|
||||
}
|
||||
vm::notify(ch_mfc_cmd.eal, 128);
|
||||
|
||||
ch_atomic_stat.set_value(MFC_PUTLLUC_SUCCESS);
|
||||
|
|
@ -1189,28 +1153,33 @@ bool SPUThread::get_ch_value(u32 ch, u32& out)
|
|||
return true;
|
||||
}
|
||||
|
||||
vm::waiter waiter;
|
||||
|
||||
vm::waiter* waiter = nullptr;
|
||||
if (ch_event_mask & SPU_EVENT_LR)
|
||||
{
|
||||
waiter.owner = this;
|
||||
waiter.addr = raddr;
|
||||
waiter.size = 128;
|
||||
waiter.stamp = rtime;
|
||||
waiter.data = rdata.data();
|
||||
waiter.init();
|
||||
waiter = new vm::waiter();
|
||||
waiter->owner = this;
|
||||
waiter->addr = raddr;
|
||||
waiter->stamp = rtime;
|
||||
waiter->data = rdata.data();
|
||||
waiter->init();
|
||||
}
|
||||
|
||||
while (!(res = get_events(true)))
|
||||
{
|
||||
if (test(state & cpu_flag::stop))
|
||||
{
|
||||
waiter->remove();
|
||||
return false;
|
||||
}
|
||||
|
||||
thread_ctrl::wait_for(100);
|
||||
}
|
||||
|
||||
if (waiter != nullptr)
|
||||
{
|
||||
waiter->remove();
|
||||
}
|
||||
|
||||
out = res;
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
#include "Emu/Cell/Common.h"
|
||||
#include "Emu/CPU/CPUThread.h"
|
||||
#include "Emu/Cell/SPUInterpreter.h"
|
||||
#include "Emu/Cell/SPURecompiler.h"
|
||||
#include "MFC.h"
|
||||
|
||||
struct lv2_event_queue;
|
||||
|
|
@ -212,22 +213,16 @@ public:
|
|||
// returns true on success
|
||||
bool try_pop(u32& out)
|
||||
{
|
||||
const auto old = data.fetch_op([&](sync_var_t& data)
|
||||
const auto old = data.fetch_op([](sync_var_t& data)
|
||||
{
|
||||
if (data.count)
|
||||
{
|
||||
data.wait = false;
|
||||
out = data.value;
|
||||
}
|
||||
else
|
||||
{
|
||||
data.wait = true;
|
||||
}
|
||||
|
||||
data.count = false;
|
||||
data.value = 0; // ???
|
||||
sync_var_t t;
|
||||
*reinterpret_cast<u64*>(&t) = 0;
|
||||
t.wait = !data.count;
|
||||
data = t;
|
||||
});
|
||||
|
||||
out = old.value;
|
||||
|
||||
return old.count;
|
||||
}
|
||||
|
||||
|
|
@ -585,7 +580,10 @@ public:
|
|||
|
||||
std::exception_ptr pending_exception;
|
||||
|
||||
std::array<struct spu_function_t*, 65536> compiled_cache{};
|
||||
// No need for shared_ptr in the following two, as whenever something is removed or added to one,
|
||||
// the same goes for the other.
|
||||
std::array<spu_function_t, 65536> compiled_cache{};
|
||||
std::vector<spu_function_t*> compiled_functions{};
|
||||
std::shared_ptr<class SPUDatabase> spu_db;
|
||||
std::shared_ptr<class spu_recompiler_base> spu_rec;
|
||||
u32 recursion_level = 0;
|
||||
|
|
|
|||
|
|
@ -8,16 +8,15 @@
|
|||
#include "Emu/Cell/lv2/sys_memory.h"
|
||||
#include "Emu/RSX/GSRender.h"
|
||||
|
||||
#include <atomic>
|
||||
#include <deque>
|
||||
|
||||
namespace vm
|
||||
{
|
||||
static u8* memory_reserve_4GiB(std::uintptr_t _addr = 0)
|
||||
std::array<memory_page, 0x100000000 / 4096> g_pages{};
|
||||
|
||||
static u8* memory_reserve_4GiB(const std::uintptr_t _addr = 0)
|
||||
{
|
||||
for (u64 addr = _addr + 0x100000000;; addr += 0x100000000)
|
||||
for (auto addr = _addr + 0x100000000;; addr += 0x100000000)
|
||||
{
|
||||
if (auto ptr = utils::memory_reserve(0x100000000, (void*)addr))
|
||||
if (const auto ptr = utils::memory_reserve(0x100000000, reinterpret_cast<void*>(addr)))
|
||||
{
|
||||
return static_cast<u8*>(ptr);
|
||||
}
|
||||
|
|
@ -39,11 +38,12 @@ namespace vm
|
|||
// Memory locations
|
||||
std::vector<std::shared_ptr<block_t>> g_locations;
|
||||
|
||||
// Reservations (lock lines) in a single memory page
|
||||
using reservation_info = std::array<std::atomic<u64>, 4096 / 128>;
|
||||
|
||||
// Registered waiters
|
||||
std::deque<vm::waiter*> g_waiters;
|
||||
std::vector<vm::waiter*> g_waiters;
|
||||
|
||||
// Waiters which will be removed once the lock is freed
|
||||
std::mutex g_waiters_to_remove_lock;
|
||||
std::vector<vm::waiter*> g_waiters_to_remove;
|
||||
|
||||
// Memory mutex core
|
||||
shared_mutex g_mutex;
|
||||
|
|
@ -201,78 +201,52 @@ namespace vm
|
|||
{
|
||||
}
|
||||
|
||||
writer_lock::~writer_lock()
|
||||
void writer_lock::unlock()
|
||||
{
|
||||
if (locked)
|
||||
{
|
||||
g_mutex.unlock();
|
||||
}
|
||||
}
|
||||
|
||||
// Page information
|
||||
struct memory_page
|
||||
{
|
||||
// Memory flags
|
||||
atomic_t<u8> flags;
|
||||
|
||||
atomic_t<u32> waiters;
|
||||
|
||||
// Reservations
|
||||
atomic_t<reservation_info*> reservations;
|
||||
|
||||
// Access reservation info
|
||||
std::atomic<u64>& operator [](u32 addr)
|
||||
{
|
||||
auto ptr = reservations.load();
|
||||
|
||||
if (!ptr)
|
||||
if (!g_waiters_to_remove.empty())
|
||||
{
|
||||
// Opportunistic memory allocation
|
||||
ptr = new reservation_info{};
|
||||
|
||||
if (auto old_ptr = reservations.compare_and_swap(nullptr, ptr))
|
||||
std::lock_guard<std::mutex> lock(g_waiters_to_remove_lock);
|
||||
for (auto ptr : g_waiters_to_remove)
|
||||
{
|
||||
const auto found = std::find(g_waiters.cbegin(), g_waiters.cend(), ptr);
|
||||
if (found != g_waiters.cend())
|
||||
{
|
||||
g_waiters.erase(found);
|
||||
}
|
||||
delete ptr;
|
||||
ptr = old_ptr;
|
||||
}
|
||||
g_waiters_to_remove.clear();
|
||||
}
|
||||
|
||||
return (*ptr)[(addr & 0xfff) >> 7];
|
||||
g_mutex.unlock();
|
||||
|
||||
locked = false;
|
||||
}
|
||||
};
|
||||
|
||||
// Memory pages
|
||||
std::array<memory_page, 0x100000000 / 4096> g_pages{};
|
||||
|
||||
u64 reservation_acquire(u32 addr, u32 _size)
|
||||
{
|
||||
// Access reservation info: stamp and the lock bit
|
||||
return g_pages[addr >> 12][addr].load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
void reservation_update(u32 addr, u32 _size)
|
||||
writer_lock::~writer_lock()
|
||||
{
|
||||
// Update reservation info with new timestamp (unsafe, assume allocated)
|
||||
(*g_pages[addr >> 12].reservations)[(addr & 0xfff) >> 7].store(__rdtsc(), std::memory_order_release);
|
||||
unlock();
|
||||
}
|
||||
|
||||
void waiter::init()
|
||||
{
|
||||
// Register waiter
|
||||
writer_lock lock(0);
|
||||
|
||||
g_waiters.emplace_back(this);
|
||||
}
|
||||
|
||||
void waiter::test() const
|
||||
{
|
||||
if (std::memcmp(data, vm::base(addr), size) == 0)
|
||||
const auto owner_copy = owner;
|
||||
if (!owner_copy)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
memory_page& page = g_pages[addr >> 12];
|
||||
|
||||
if (page.reservations == nullptr)
|
||||
{
|
||||
return;
|
||||
|
|
@ -283,23 +257,40 @@ namespace vm
|
|||
return;
|
||||
}
|
||||
|
||||
if (owner)
|
||||
if (memcmp(data, vm::base(addr), size) == 0)
|
||||
{
|
||||
owner->notify();
|
||||
return;
|
||||
}
|
||||
|
||||
owner_copy->notify();
|
||||
}
|
||||
|
||||
waiter::~waiter()
|
||||
void waiter::remove()
|
||||
{
|
||||
|
||||
// Unregister waiter
|
||||
writer_lock lock(0);
|
||||
const writer_lock lock(try_to_lock);
|
||||
|
||||
// Find waiter
|
||||
const auto found = std::find(g_waiters.cbegin(), g_waiters.cend(), this);
|
||||
|
||||
if (found != g_waiters.cend())
|
||||
if (lock.locked)
|
||||
{
|
||||
g_waiters.erase(found);
|
||||
// Find waiter
|
||||
const auto found = std::find(g_waiters.cbegin(), g_waiters.cend(), this);
|
||||
|
||||
if (found != g_waiters.cend())
|
||||
{
|
||||
g_waiters.erase(found);
|
||||
delete this;
|
||||
}
|
||||
else
|
||||
{
|
||||
verify("Waiter not found during removal"), false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
this->owner = nullptr; // Iterations of the object will ignore it from now on
|
||||
std::lock_guard<std::mutex> lock(g_waiters_to_remove_lock);
|
||||
g_waiters_to_remove.push_back(this);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -369,8 +360,8 @@ namespace vm
|
|||
|
||||
const u8 flags_both = flags_set & flags_clear;
|
||||
|
||||
flags_test |= page_allocated;
|
||||
flags_set &= ~flags_both;
|
||||
flags_test |= page_allocated;
|
||||
flags_set &= ~flags_both;
|
||||
flags_clear &= ~flags_both;
|
||||
|
||||
for (u32 i = addr / 4096; i < addr / 4096 + size / 4096; i++)
|
||||
|
|
@ -736,7 +727,7 @@ namespace vm
|
|||
{
|
||||
writer_lock lock(0);
|
||||
|
||||
for (auto it = g_locations.begin(); it != g_locations.end(); it++)
|
||||
for (auto it = g_locations.begin(); it != g_locations.end(); ++it)
|
||||
{
|
||||
if (*it && (*it)->addr == addr)
|
||||
{
|
||||
|
|
@ -835,7 +826,7 @@ void fmt_class_string<vm::_ptr_base<const char>>::format(std::string& out, u64 a
|
|||
|
||||
out += u8"“";
|
||||
|
||||
for (vm::_ptr_base<const volatile char> ptr = vm::cast(arg);; ptr++)
|
||||
for (vm::_ptr_base<const volatile char> ptr = vm::cast(arg);; ++ptr)
|
||||
{
|
||||
if (!vm::check_addr(ptr.addr()))
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
#pragma once
|
||||
|
||||
#include <map>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <atomic>
|
||||
|
||||
class named_thread;
|
||||
class cpu_thread;
|
||||
|
|
@ -42,9 +42,10 @@ namespace vm
|
|||
{
|
||||
named_thread* owner;
|
||||
u32 addr;
|
||||
u32 size;
|
||||
bool inserted = false;
|
||||
u64 stamp;
|
||||
const void* data;
|
||||
static const u32 size = 128; // Always 128 currently
|
||||
|
||||
waiter() = default;
|
||||
|
||||
|
|
@ -53,7 +54,7 @@ namespace vm
|
|||
void init();
|
||||
void test() const;
|
||||
|
||||
~waiter();
|
||||
void remove();
|
||||
};
|
||||
|
||||
// Address type
|
||||
|
|
@ -90,21 +91,64 @@ namespace vm
|
|||
|
||||
struct writer_lock final
|
||||
{
|
||||
const bool locked;
|
||||
bool locked;
|
||||
|
||||
writer_lock(const writer_lock&) = delete;
|
||||
writer_lock(int full = 1);
|
||||
writer_lock(const try_to_lock_t&);
|
||||
void unlock();
|
||||
~writer_lock();
|
||||
|
||||
explicit operator bool() const { return locked; }
|
||||
};
|
||||
|
||||
// Get reservation status for further atomic update: last update timestamp
|
||||
u64 reservation_acquire(u32 addr, u32 size);
|
||||
// Reservations (lock lines) in a single memory page
|
||||
using reservation_info = std::array<std::atomic<u64>, 4096 / 128>;
|
||||
|
||||
// End atomic update
|
||||
void reservation_update(u32 addr, u32 size);
|
||||
// Page information
|
||||
struct memory_page
|
||||
{
|
||||
// Reservations
|
||||
atomic_t<reservation_info*> reservations;
|
||||
//atomic_t<u32> waiters;
|
||||
// Memory flags
|
||||
atomic_t<u8> flags;
|
||||
|
||||
// Access reservation info
|
||||
FORCE_INLINE std::atomic<u64>& operator [](const u32 addr)
|
||||
{
|
||||
auto ptr = reservations.load();
|
||||
|
||||
if (!ptr)
|
||||
{
|
||||
ptr = new reservation_info();
|
||||
// Opportunistic memory allocation
|
||||
|
||||
if (const auto old_ptr = reservations.compare_and_swap(nullptr, ptr))
|
||||
{
|
||||
delete ptr;
|
||||
ptr = old_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
return (*ptr)[(addr & 0xfff) >> 7];
|
||||
}
|
||||
};
|
||||
|
||||
// Memory pages
|
||||
extern std::array<memory_page, 0x100000000 / 4096> g_pages;
|
||||
|
||||
// Reads the current reservation value (stamp and the lock bit) for the
// 128-byte line containing `addr`. `_size` is not used by this implementation.
FORCE_INLINE u64 reservation_acquire(u32 addr, u32 _size)
{
	// addr >> 12 selects the 4096-byte page; the page's operator[] selects the line
	memory_page& page = g_pages[addr >> 12];
	std::atomic<u64>& stamp = page[addr];

	return stamp.load(std::memory_order_acquire);
}
|
||||
|
||||
// Stores a new timestamp (taken from __rdtsc()) into the reservation slot of
// the 128-byte line containing `addr`. `_size` is not used by this implementation.
FORCE_INLINE void reservation_update(u32 addr, u32 _size)
{
	// Unsafe: the page's reservation table is dereferenced directly and is
	// assumed to have been allocated already
	reservation_info& table = *g_pages[addr >> 12].reservations;

	table[(addr & 0xfff) >> 7].store(__rdtsc(), std::memory_order_release);
}
|
||||
|
||||
// Check and notify memory changes at address
|
||||
void notify(u32 addr, u32 size);
|
||||
|
|
|
|||
|
|
@ -532,7 +532,7 @@ std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::spa
|
|||
|
||||
verify(HERE), (dst.size_bytes() >= src.size_bytes());
|
||||
|
||||
u32 dst_idx = 0;
|
||||
u32 dst_idx = -1;
|
||||
for (T index : src)
|
||||
{
|
||||
if (is_primitive_restart_enabled && index == primitive_restart_index)
|
||||
|
|
@ -549,9 +549,9 @@ std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::spa
|
|||
min_index = std::min(min_index, index);
|
||||
}
|
||||
|
||||
dst[dst_idx++] = index;
|
||||
dst[++dst_idx] = index;
|
||||
}
|
||||
return std::make_tuple(min_index, max_index, dst_idx);
|
||||
return std::make_tuple(min_index, max_index, dst_idx + 1);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue