Initial optimizations (Squashed to fix submodules)

This commit is contained in:
Farseer 2018-01-25 11:51:11 +02:00 committed by Zangetsu38
parent db96c355ce
commit 2d4af4c88c
13 changed files with 426 additions and 558 deletions

View file

@ -14,7 +14,10 @@ void shared_mutex::imp_lock_shared(s64 _old)
for (int i = 0; i < 10; i++)
{
busy_wait();
if (i != 0)
{
busy_wait();
}
const s64 value = m_value.load();
@ -173,7 +176,10 @@ void shared_mutex::imp_lock(s64 _old)
for (int i = 0; i < 10; i++)
{
busy_wait();
if (i != 0)
{
busy_wait();
}
const s64 value = m_value.load();
@ -236,6 +242,10 @@ void shared_mutex::imp_lock_degrade()
bool shared_mutex::try_lock_shared()
{
if (m_value < c_min) // Fast path
{
return false;
}
// Conditional decrement
return m_value.fetch_op([](s64& value) { if (value >= c_min) value -= c_min; }) >= c_min;
}

View file

@ -36,25 +36,25 @@ spu_recompiler::spu_recompiler()
}
}
void spu_recompiler::compile(spu_function_t& f)
bool spu_recompiler::compile(std::shared_ptr<spu_function_contents_t> f)
{
std::lock_guard<std::mutex> lock(m_mutex);
if (f.compiled)
if (f->compiled)
{
// return if function already compiled
return;
return true;
}
if (f.addr >= 0x40000 || f.addr % 4 || f.size == 0 || f.size > 0x40000 - f.addr || f.size % 4)
if (f->addr >= 0x40000 || f->addr % 4 || f->size == 0 || f->size > 0x40000 - f->addr || f->size % 4)
{
fmt::throw_exception("Invalid SPU function (addr=0x%05x, size=0x%x)" HERE, f.addr, f.size);
fmt::throw_exception("Invalid SPU function (addr=0x%05x, size=0x%x)" HERE, f->addr, f->size);
}
using namespace asmjit;
SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode);
dis_asm.offset = reinterpret_cast<u8*>(f.data.data()) - f.addr;
dis_asm.offset = reinterpret_cast<u8*>(f->data.data()) - f->addr;
StringLogger logger;
logger.addOptions(Logger::kOptionBinaryForm);
@ -63,10 +63,10 @@ void spu_recompiler::compile(spu_function_t& f)
if (g_cfg.core.spu_debug)
{
fmt::append(log, "========== SPU FUNCTION 0x%05x - 0x%05x ==========\n\n", f.addr, f.addr + f.size);
fmt::append(log, "========== SPU FUNCTION 0x%05x - 0x%05x ==========\n\n", f->addr, f->addr + f->size);
}
this->m_func = &f;
this->m_func = f;
asmjit::CodeHolder code;
code.init(m_jit->getCodeInfo());
@ -121,13 +121,13 @@ void spu_recompiler::compile(spu_function_t& f)
compiler.alloc(vec_vars[5], asmjit::x86::xmm5);
// Initialize labels
std::vector<Label> pos_labels{ 0x10000 };
this->labels = pos_labels.data();
this->labels = std::unique_ptr<Label[]>(reinterpret_cast<Label*>(new u8[0x10000 * sizeof(Label)]()));
auto pos_labels = this->labels.get();
// Register labels for block entries
for (const u32 addr : f.blocks)
for (const u32 addr : f->blocks)
{
if (addr < f.addr || addr >= f.addr + f.size || addr % 4)
if (addr < f->addr || addr >= f->addr + f->size || addr % 4)
{
fmt::throw_exception("Invalid function block entry (0x%05x)" HERE, addr);
}
@ -136,15 +136,15 @@ void spu_recompiler::compile(spu_function_t& f)
}
// Register label for post-the-end address
pos_labels[(f.addr + f.size) / 4 % 0x10000] = compiler.newLabel();
pos_labels[(f->addr + f->size) / 4 % 0x10000] = compiler.newLabel();
// Register label for jump table resolver
Label jt_label = compiler.newLabel();
this->jt = &jt_label;
for (const u32 addr : f.jtable)
for (const u32 addr : f->jtable)
{
if (addr < f.addr || addr >= f.addr + f.size || addr % 4)
if (addr < f->addr || addr >= f->addr + f->size || addr % 4)
{
fmt::throw_exception("Invalid jump table entry (0x%05x)" HERE, addr);
}
@ -155,24 +155,16 @@ void spu_recompiler::compile(spu_function_t& f)
this->end = &end_label;
// Start compilation
m_pos = f.addr;
m_pos = f->addr;
if (utils::has_avx())
{
compiler.vzeroupper();
//compiler.pxor(asmjit::x86::xmm0, asmjit::x86::xmm0);
//compiler.vptest(asmjit::x86::ymm0, asmjit::x86::ymm0);
//compiler.jnz(end_label);
}
for (const u32 op : f.data)
for (const u32 op : f->data)
{
// Bind label if initialized
if (pos_labels[m_pos / 4].isValid())
{
compiler.bind(pos_labels[m_pos / 4]);
if (f.blocks.find(m_pos) != f.blocks.end())
if (f->blocks.find(m_pos) != f->blocks.end())
{
compiler.comment("Block:");
}
@ -219,12 +211,12 @@ void spu_recompiler::compile(spu_function_t& f)
// Generate jump table resolver (uses addr_var)
compiler.bind(jt_label);
if (f.jtable.size())
if (f->jtable.size())
{
compiler.comment("Jump table resolver:");
}
for (const u32 addr : f.jtable)
for (const u32 addr : f->jtable)
{
if ((addr % 4) == 0 && addr < 0x40000 && pos_labels[addr / 4].isValid())
{
@ -253,8 +245,8 @@ void spu_recompiler::compile(spu_function_t& f)
Func fn;
m_jit->add(&fn, codeHolder);
f.compiled = asmjit::Internal::ptr_cast<decltype(f.compiled)>(fn);
f->compiled = asmjit::Internal::ptr_cast<decltype(f->compiled)>(fn);
if (g_cfg.core.spu_debug)
{
// Add ASMJIT logs
@ -462,10 +454,19 @@ void spu_recompiler::LNOP(spu_opcode_t op)
{
}
// Marks every entry listed in _spu->compiled_functions as dirty.
// Called from generated code (see spu_recompiler::SYNC); the flag is read back
// in spu_recompiler_base::enter, which re-validates the cached function.
void invalidate_jit(SPUThread* _spu)
{
	auto& tracked = _spu->compiled_functions;

	for (auto it = tracked.begin(), last = tracked.end(); it != last; ++it)
	{
		(*it)->dirty_bit = true;
	}
}
// SYNC: emits a host call to invalidate_jit(cpu) followed by a full memory fence.
// `op` is not used: the generated code does not depend on any opcode field.
void spu_recompiler::SYNC(spu_opcode_t op)
{
	// invalidate_jit is declared as void(SPUThread*), so the call signature uses a void
	// return type to match the callee (it was previously declared as returning u32).
	asmjit::CCFuncCall* call = c->call(asmjit::imm_ptr(asmjit::Internal::ptr_cast<void*, void(SPUThread*)>(invalidate_jit)), asmjit::FuncSignature1<void, SPUThread*>(asmjit::CallConv::kIdHost));
	call->setArg(0, *cpu);

	// This instruction must be used following a store instruction that modifies the instruction stream.
	c->mfence();
}
void spu_recompiler::DSYNC(spu_opcode_t op)
@ -623,16 +624,6 @@ void spu_recompiler::ROT(spu_opcode_t op)
return;
}
if (utils::has_xop())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->vprotd(vt, va, vb);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}
auto body = [](u32* t, const u32* a, const s32* b) noexcept
{
for (u32 i = 0; i < 4; i++)
@ -672,22 +663,6 @@ void spu_recompiler::ROTM(spu_opcode_t op)
return;
}
if (utils::has_xop())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->psubd(vb, XmmConst(_mm_set1_epi32(1)));
c->pandn(vb, XmmConst(_mm_set1_epi32(0x3f)));
c->pxor(vt, vt);
c->psubd(vt, vb);
c->pcmpgtd(vb, XmmConst(_mm_set1_epi32(31)));
c->vpshld(vt, va, vt);
c->vpandn(vt, vb, vt);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}
auto body = [](u32* t, const u32* a, const u32* b) noexcept
{
for (u32 i = 0; i < 4; i++)
@ -728,21 +703,6 @@ void spu_recompiler::ROTMA(spu_opcode_t op)
return;
}
if (utils::has_xop())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->psubd(vb, XmmConst(_mm_set1_epi32(1)));
c->pandn(vb, XmmConst(_mm_set1_epi32(0x3f)));
c->pxor(vt, vt);
c->pminud(vb, XmmConst(_mm_set1_epi32(31)));
c->psubd(vt, vb);
c->vpshad(vt, va, vt);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}
auto body = [](s32* t, const s32* a, const u32* b) noexcept
{
for (u32 i = 0; i < 4; i++)
@ -782,19 +742,6 @@ void spu_recompiler::SHL(spu_opcode_t op)
return;
}
if (utils::has_xop())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->pand(vb, XmmConst(_mm_set1_epi32(0x3f)));
c->vpcmpgtd(vt, vb, XmmConst(_mm_set1_epi32(31)));
c->vpshld(vb, va, vb);
c->pandn(vt, vb);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}
auto body = [](u32* t, const u32* a, const u32* b) noexcept
{
for (u32 i = 0; i < 4; i++)
@ -828,24 +775,13 @@ void spu_recompiler::ROTH(spu_opcode_t op) //nf
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
const XmmLink& v4 = XmmAlloc();
c->vmovdqa(v4, XmmConst(_mm_set_epi32(0x0d0c0d0c, 0x09080908, 0x05040504, 0x01000100)));
c->vpshufb(vt, va, v4); // duplicate low word
c->vpsrld(va, va, 16);
c->vpshufb(va, va, v4);
c->vpsrld(v4, vb, 16);
c->vprolvd(va, va, v4);
c->vprolvd(vb, vt, vb);
c->vpblendw(vt, vb, va, 0xaa);
c->vmovdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}
if (utils::has_xop())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->vprotw(vt, va, vb);
c->movdqa(v4, XmmConst(_mm_set1_epi16(0xf)));
c->pand(vb, v4);
c->vpsllvw(vt, va, vb);
c->psubw(vb, XmmConst(_mm_set1_epi16(1)));
c->pandn(vb, v4);
c->vpsrlvw(va, va, vb);
c->por(vt, va);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}
@ -889,42 +825,6 @@ void spu_recompiler::ROTHM(spu_opcode_t op)
return;
}
if (utils::has_avx2())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
const XmmLink& v4 = XmmAlloc();
const XmmLink& v5 = XmmAlloc();
c->psubw(vb, XmmConst(_mm_set1_epi16(1)));
c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f)));
c->movdqa(vt, XmmConst(_mm_set1_epi32(0xffff0000))); // mask: select high words
c->vpsrld(v4, vb, 16);
c->vpsubusw(v5, vb, vt); // clear high words (using saturation sub for throughput)
c->vpandn(vb, vt, va); // clear high words
c->vpsrlvd(va, va, v4);
c->vpsrlvd(vb, vb, v5);
c->vpblendw(vt, vb, va, 0xaa); // can use vpblendvb with 0xffff0000 mask (vt)
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}
if (utils::has_xop())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->psubw(vb, XmmConst(_mm_set1_epi16(1)));
c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f)));
c->pxor(vt, vt);
c->psubw(vt, vb);
c->pcmpgtw(vb, XmmConst(_mm_set1_epi16(15)));
c->vpshlw(vt, va, vt);
c->vpandn(vt, vb, vt);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}
auto body = [](u16* t, const u16* a, const u16* b) noexcept
{
for (u32 i = 0; i < 8; i++)
@ -965,43 +865,6 @@ void spu_recompiler::ROTMAH(spu_opcode_t op)
return;
}
if (utils::has_avx2())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
const XmmLink& v4 = XmmAlloc();
const XmmLink& v5 = XmmAlloc();
c->psubw(vb, XmmConst(_mm_set1_epi16(1)));
c->movdqa(vt, XmmConst(_mm_set1_epi16(0x1f)));
c->vpandn(v4, vb, vt);
c->vpand(v5, vb, vt);
c->movdqa(vt, XmmConst(_mm_set1_epi32(0x2f)));
c->vpsrld(v4, v4, 16);
c->vpsubusw(v5, vt, v5); // clear high word and add 16 to low word
c->vpslld(vb, va, 16);
c->vpsravd(va, va, v4);
c->vpsravd(vb, vb, v5);
c->vpblendw(vt, vb, va, 0xaa);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}
if (utils::has_xop())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->psubw(vb, XmmConst(_mm_set1_epi16(1)));
c->pandn(vb, XmmConst(_mm_set1_epi16(0x1f)));
c->pxor(vt, vt);
c->pminuw(vb, XmmConst(_mm_set1_epi16(15)));
c->psubw(vt, vb);
c->vpshaw(vt, va, vt);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}
auto body = [](s16* t, const s16* a, const u16* b) noexcept
{
for (u32 i = 0; i < 8; i++)
@ -1041,38 +904,6 @@ void spu_recompiler::SHLH(spu_opcode_t op)
return;
}
if (utils::has_avx2())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
const XmmLink& v4 = XmmAlloc();
const XmmLink& v5 = XmmAlloc();
c->pand(vb, XmmConst(_mm_set1_epi16(0x1f)));
c->movdqa(vt, XmmConst(_mm_set1_epi32(0xffff0000))); // mask: select high words
c->vpsrld(v4, vb, 16);
c->vpsubusw(v5, vb, vt); // clear high words (using saturation sub for throughput)
c->vpand(vb, vt, va); // clear low words
c->vpsllvd(va, va, v5);
c->vpsllvd(vb, vb, v4);
c->vpblendw(vt, vb, va, 0x55);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}
if (utils::has_xop())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->pand(vb, XmmConst(_mm_set1_epi16(0x1f)));
c->vpcmpgtw(vt, vb, XmmConst(_mm_set1_epi16(15)));
c->vpshlw(vb, va, vb);
c->pandn(vt, vb);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
return;
}
auto body = [](u16* t, const u16* a, const u16* b) noexcept
{
for (u32 i = 0; i < 8; i++)
@ -1111,14 +942,6 @@ void spu_recompiler::ROTI(spu_opcode_t op)
return;
}
if (utils::has_xop())
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
c->vprotd(va, va, s);
c->movdqa(SPU_OFF_128(gpr, op.rt), va);
return;
}
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& v1 = XmmAlloc();
c->movdqa(v1, va);
@ -1716,57 +1539,50 @@ void spu_recompiler::CDX(spu_opcode_t op)
void spu_recompiler::ROTQBI(spu_opcode_t op)
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
const XmmLink& v4 = XmmAlloc();
c->psrldq(vb, 12);
c->pand(vb, XmmConst(_mm_set_epi64x(0, 7)));
c->movdqa(v4, XmmConst(_mm_set_epi64x(0, 64)));
c->pshufd(vt, va, 0x4e);
c->psubq(v4, vb);
c->psllq(va, vb);
c->psrlq(vt, v4);
c->por(vt, va);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
c->mov(*qw2, *qw0);
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
c->and_(*addr, 7);
c->shld(*qw0, *qw1, *addr);
c->shld(*qw1, *qw2, *addr);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);
c->unuse(*addr);
c->unuse(*qw0);
c->unuse(*qw1);
c->unuse(*qw2);
}
void spu_recompiler::ROTQMBI(spu_opcode_t op)
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmAlloc();
const XmmLink& vt = XmmGet(op.rb, XmmType::Int);
const XmmLink& v4 = XmmAlloc();
c->psrldq(vt, 12);
c->pxor(vb, vb);
c->psubq(vb, vt);
c->pand(vb, XmmConst(_mm_set_epi64x(0, 7)));
c->movdqa(v4, XmmConst(_mm_set_epi64x(0, 64)));
c->movdqa(vt, va);
c->psrldq(vt, 8);
c->psubq(v4, vb);
c->psrlq(va, vb);
c->psllq(vt, v4);
c->por(vt, va);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
c->neg(*addr);
c->and_(*addr, 7);
c->shrd(*qw0, *qw1, *addr);
c->shr(*qw1, *addr);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);
c->unuse(*addr);
c->unuse(*qw0);
c->unuse(*qw1);
}
void spu_recompiler::SHLQBI(spu_opcode_t op)
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
const XmmLink& vt = XmmAlloc();
const XmmLink& v4 = XmmAlloc();
c->psrldq(vb, 12);
c->pand(vb, XmmConst(_mm_set_epi64x(0, 7)));
c->movdqa(v4, XmmConst(_mm_set_epi64x(0, 64)));
c->movdqa(vt, va);
c->pslldq(vt, 8);
c->psubq(v4, vb);
c->psllq(va, vb);
c->psrlq(vt, v4);
c->por(vt, va);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3));
c->and_(*addr, 7);
c->shld(*qw1, *qw0, *addr);
c->shl(*qw0, *addr);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);
c->unuse(*addr);
c->unuse(*qw0);
c->unuse(*qw1);
}
void spu_recompiler::ROTQBY(spu_opcode_t op)
@ -1867,14 +1683,16 @@ void spu_recompiler::SHLQBY(spu_opcode_t op)
void spu_recompiler::ORX(spu_opcode_t op)
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& v1 = XmmAlloc();
c->pshufd(v1, va, 0xb1);
c->por(va, v1);
c->pshufd(v1, va, 0x4e);
c->por(va, v1);
c->pslldq(va, 12);
c->movdqa(SPU_OFF_128(gpr, op.rt), va);
c->mov(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 0));
c->or_(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 1));
c->or_(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 2));
c->or_(*addr, SPU_OFF_32(gpr, op.ra, &v128::_u32, 3));
c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, 3), *addr);
c->xor_(*addr, *addr);
c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, 0), *addr);
c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, 1), *addr);
c->mov(SPU_OFF_32(gpr, op.rt, &v128::_u32, 2), *addr);
c->unuse(*addr);
}
void spu_recompiler::CBD(spu_opcode_t op)
@ -1981,37 +1799,40 @@ void spu_recompiler::CDD(spu_opcode_t op)
void spu_recompiler::ROTQBII(spu_opcode_t op)
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->pshufd(vt, va, 0x4e); // swap 64-bit parts
c->psllq(va, (op.i7 & 0x7));
c->psrlq(vt, 64 - (op.i7 & 0x7));
c->por(vt, va);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
c->mov(*qw2, *qw0);
c->shld(*qw0, *qw1, op.i7 & 0x7);
c->shld(*qw1, *qw2, op.i7 & 0x7);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);
c->unuse(*qw0);
c->unuse(*qw1);
c->unuse(*qw2);
}
void spu_recompiler::ROTQMBII(spu_opcode_t op)
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->movdqa(vt, va);
c->psrldq(vt, 8);
c->psrlq(va, ((0 - op.i7) & 0x7));
c->psllq(vt, 64 - ((0 - op.i7) & 0x7));
c->por(vt, va);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
c->shrd(*qw0, *qw1, 0-op.i7 & 0x7);
c->shr(*qw1, 0-op.i7 & 0x7);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);
c->unuse(*qw0);
c->unuse(*qw1);
}
void spu_recompiler::SHLQBII(spu_opcode_t op)
{
const XmmLink& va = XmmGet(op.ra, XmmType::Int);
const XmmLink& vt = XmmAlloc();
c->movdqa(vt, va);
c->pslldq(vt, 8);
c->psllq(va, (op.i7 & 0x7));
c->psrlq(vt, 64 - (op.i7 & 0x7));
c->por(vt, va);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0));
c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1));
c->shld(*qw1, *qw0, op.i7 & 0x7);
c->shl(*qw0, op.i7 & 0x7);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1);
c->unuse(*qw0);
c->unuse(*qw1);
}
void spu_recompiler::ROTQBYI(spu_opcode_t op)
@ -3478,13 +3299,6 @@ void spu_recompiler::SELB(spu_opcode_t op)
return;
}
if (utils::has_xop())
{
c->vpcmov(vc, vb, SPU_OFF_128(gpr, op.ra), vc);
c->movdqa(SPU_OFF_128(gpr, op.rt4), vc);
return;
}
c->pand(vb, vc);
c->pandn(vc, SPU_OFF_128(gpr, op.ra));
c->por(vb, vc);
@ -3609,10 +3423,6 @@ void spu_recompiler::SHUFB(spu_opcode_t op)
{
c->vpternlogd(vc, va, vb, 0xca /* A?B:C */);
}
else if (utils::has_xop())
{
c->vpcmov(vc, va, vb, vc);
}
else
{
c->pand(va, vc);

View file

@ -21,7 +21,7 @@ class spu_recompiler : public spu_recompiler_base
public:
spu_recompiler();
virtual void compile(spu_function_t& f) override;
virtual bool compile(std::shared_ptr<spu_function_contents_t> f) override;
private:
// emitter:
@ -41,7 +41,7 @@ private:
std::array<asmjit::X86Xmm*, 6> vec;
// labels:
asmjit::Label* labels; // array[0x10000]
std::unique_ptr<asmjit::Label[]> labels; // array[0x10000]
asmjit::Label* jt; // jump table resolver (uses *addr)
asmjit::Label* end; // function end (return *addr)

View file

@ -5,16 +5,18 @@
const spu_decoder<spu_itype> s_spu_itype;
spu_function_t* SPUDatabase::find(const be_t<u32>* data, u64 key, u32 max_size)
std::shared_ptr<spu_function_contents_t> SPUDatabase::find(const be_t<u32>* data, u64 key, u32 max_size, void* ignore)
{
for (auto found = m_db.equal_range(key); found.first != found.second; found.first++)
{
const auto& func = found.first->second;
const auto & func = found.first->second;
// TODO remove code after a while if it hasn't been touched, else there's a big memory bloat here
// Compare binary data explicitly (TODO: optimize)
if (LIKELY(func->size <= max_size) && std::memcmp(func->data.data(), data, func->size) == 0)
if (func.get() != ignore && LIKELY(func->size <= max_size) && memcmp(func->data.data(), data, func->size) == 0)
{
return func.get();
return func;
}
}
@ -33,8 +35,9 @@ SPUDatabase::~SPUDatabase()
// TODO: serialize database
}
spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_limit)
std::shared_ptr<spu_function_contents_t> SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, void* ignore /*=nullptr*/)
{
const u32 max_limit = 0x40000;
// Check arguments (bounds and alignment)
if (max_limit > 0x40000 || entry >= max_limit || entry % 4 || max_limit % 4)
{
@ -42,7 +45,12 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
}
// Key for multimap
const u64 key = entry | u64{ ls[entry / 4] } << 32;
u32 xor_base = 0;
for (u32 i = 0; i < 10; i++)
{
xor_base ^= ls[(entry / 4) + i];
}
const u64 key = entry | u64{ xor_base } << 32;
const be_t<u32>* base = ls + entry / 4;
const u32 block_sz = max_limit - entry;
@ -56,7 +64,7 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
}
}
{
/*{
writer_lock lock(m_mutex);
// Double-check
@ -64,7 +72,7 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
{
return func;
}
}
}*/
// Initialize block entries with the function entry point
std::set<u32> blocks{ entry };
@ -84,15 +92,21 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
// Find preliminary set of possible block entries (first pass), `start` is the current block address
for (u32 start = entry, pos = entry; pos < limit; pos += 4)
{
u32 xor_base = 0;
for (u32 i = 0; i < 10; i++)
{
xor_base ^= ls[(pos / 4) + i];
}
const spu_opcode_t op{ ls[pos / 4] };
const auto type = s_spu_itype.decode(op.opcode);
if (pos != entry)
{
reader_lock lock(m_mutex);
// Find existing function
if (pos != entry && find(ls + pos / 4, pos | u64{ op.opcode } << 32, limit - pos))
if (find(ls + pos / 4, pos | u64{ ls[pos / 4] } << 32, limit - pos))
{
limit = pos;
break;
@ -308,7 +322,7 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
}
// Prepare new function (set addr and size)
auto func = std::make_shared<spu_function_t>(entry, limit - entry);
auto func = std::make_shared<spu_function_contents_t>(entry, limit - entry);
// Copy function contents
func->data = { ls + entry / 4, ls + limit / 4 };
@ -354,5 +368,5 @@ spu_function_t* SPUDatabase::analyse(const be_t<u32>* ls, u32 entry, u32 max_lim
LOG_NOTICE(SPU, "Function detected [0x%05x-0x%05x] (size=0x%x)", func->addr, func->addr + func->size, func->size);
return func.get();
return func;
}

View file

@ -2,7 +2,9 @@
#include "Utilities/mutex.h"
#include <map>
#include <set>
#include <memory>
// SPU Instruction Type
struct spu_itype
@ -246,8 +248,10 @@ struct spu_itype
class SPUThread;
typedef u32(*CompiledFunc)(SPUThread* _spu, be_t<u32>* _ls);
// SPU basic function information structure
struct spu_function_t
struct spu_function_contents_t
{
// Entry point (LS address)
const u32 addr;
@ -271,30 +275,48 @@ struct spu_function_t
bool does_reset_stack;
// Pointer to the compiled function
u32(*compiled)(SPUThread* _spu, be_t<u32>* _ls) = nullptr;
CompiledFunc compiled = nullptr;
spu_function_t(u32 addr, u32 size)
: addr(addr)
, size(size)
spu_function_contents_t(u32 addr, u32 size)
: addr(addr),
size(size)
{
}
};
// A single instance of a compiled function, currently in use
union spu_function_t
{
// The function itself and its data
std::shared_ptr<spu_function_contents_t> contents;
// Whether pages the function is in were written to since its last execution
bool dirty_bit : 1;
operator bool()
{
return contents != nullptr;
}
spu_function_t() : contents(nullptr) {};
~spu_function_t() {dirty_bit = false; contents.reset();};
};
// SPU Function Database (must be global or PS3 process-local)
class SPUDatabase final : spu_itype
{
shared_mutex m_mutex;
// All registered functions (uses addr and first instruction as a key)
std::unordered_multimap<u64, std::shared_ptr<spu_function_t>> m_db;
std::unordered_multimap<u64, std::shared_ptr<spu_function_contents_t>> m_db;
// For internal use
spu_function_t* find(const be_t<u32>* data, u64 key, u32 max_size);
std::shared_ptr<spu_function_contents_t> find(const be_t<u32>* data, u64 key, u32 max_size, void* ignore = nullptr);
public:
SPUDatabase();
~SPUDatabase();
// Try to retrieve SPU function information
spu_function_t* analyse(const be_t<u32>* ls, u32 entry, u32 limit = 0x40000);
std::shared_ptr<spu_function_contents_t> analyse(const be_t<u32>* ls, u32 entry, void * ignore=nullptr);
};

View file

@ -101,7 +101,8 @@ void spu_interpreter::RDCH(SPUThread& spu, spu_opcode_t op)
}
else
{
spu.gpr[op.rt] = v128::from32r(result);
memset(&spu.gpr[op.rt], 0, 3*sizeof(u32));
spu.gpr[op.rt]._u32[3] = result; v128::from32r(result);
}
}

View file

@ -24,39 +24,48 @@ void spu_recompiler_base::enter(SPUThread& spu)
const auto _ls = vm::_ptr<u32>(spu.offset);
// Search if cached data matches
auto func = spu.compiled_cache[spu.pc / 4];
// Check shared db if we dont have a match
if (!func || !std::equal(func->data.begin(), func->data.end(), _ls + spu.pc / 4, [](const be_t<u32>& l, const be_t<u32>& r) { return *(u32*)(u8*)&l == *(u32*)(u8*)&r; }))
auto & func = spu.compiled_cache[spu.pc / 4];
if (func.dirty_bit)
{
func = spu.spu_db->analyse(_ls, spu.pc);
spu.compiled_cache[spu.pc / 4] = func;
func.dirty_bit = false;
// This memcmp acts as a fast path instead of finding it again in analyse.
if (memcmp(func.contents->data.data(), _ls + (spu.pc / 4), func.contents->size) != 0)
{
func.contents = spu.spu_db->analyse(_ls, spu.pc, func.contents.get());
}
}
else if (!func)
{
func.contents = spu.spu_db->analyse(_ls, spu.pc);
spu.compiled_functions.push_back(&func);
}
// Reset callstack if necessary
if ((func->does_reset_stack && spu.recursion_level) || spu.recursion_level >= 128)
if ((func.contents->does_reset_stack && spu.recursion_level) || spu.recursion_level >= 128)
{
spu.state += cpu_flag::ret;
return;
}
// Compile if needed
if (!func->compiled)
if (!func.contents->compiled)
{
if (!spu.spu_rec)
{
spu.spu_rec = fxm::get_always<spu_recompiler>();
}
spu.spu_rec->compile(*func);
spu.spu_rec->compile(func.contents);
if (!func->compiled) fmt::throw_exception("Compilation failed" HERE);
if (!func.contents->compiled) fmt::throw_exception("Compilation failed" HERE);
}
const u32 res = func->compiled(&spu, _ls);
const u32 res = func.contents->compiled(&spu, _ls);
if (const auto exception = spu.pending_exception)
if (spu.pending_exception)
{
const auto exception = spu.pending_exception;
spu.pending_exception = nullptr;
std::rethrow_exception(exception);
}

View file

@ -10,7 +10,7 @@ class spu_recompiler_base
protected:
std::mutex m_mutex; // must be locked in compile()
const spu_function_t* m_func; // current function
std::shared_ptr<const spu_function_contents_t> m_func; // current function
u32 m_pos; // current position
@ -18,7 +18,7 @@ public:
virtual ~spu_recompiler_base();
// Compile specified function
virtual void compile(spu_function_t& f) = 0;
virtual bool compile(std::shared_ptr<spu_function_contents_t>) = 0;
// Run
static void enter(class SPUThread&);

View file

@ -41,6 +41,18 @@ bool operator ==(const u128& lhs, const u128& rhs)
}
#endif
#ifndef _MSC_VER
// Copies Count 64-bit words from Source to Destination with a single `rep movsq`.
// Only compiled when _MSC_VER is not defined (presumably standing in for the MSVC
// intrinsic of the same name - confirm).
//
// Destination/Source/Count are bound to RDI/RSI/RCX as both inputs and outputs because
// the instruction advances the pointers and counts RCX down to zero.
FORCE_INLINE void __movsq(unsigned long * Destination, const unsigned long * Source, size_t Count)
{
	__asm__ __volatile__
	(
		"rep; movsq" :
		[Destination] "=D" (Destination), [Source] "=S" (Source), [Count] "=c" (Count) :
		"[Destination]" (Destination), "[Source]" (Source), "[Count]" (Count) :
		// The instruction reads *Source and writes *Destination, which the register operands
		// alone do not express. Without this clobber the compiler may reorder or cache
		// memory accesses across the copy.
		"memory"
	);
}
#endif
extern u64 get_timebased_time();
extern u64 get_system_time();
@ -134,8 +146,8 @@ namespace spu
{
if (timeout_ms > 0)
{
const u64 timeout = timeout_ms * 1000u; //convert to microseconds
const u64 start = get_system_time();
const auto timeout = timeout_ms * 1000ull; //convert to microseconds
const auto start = get_system_time();
auto remaining = timeout;
while (atomic_instruction_table[pc_offset].load(std::memory_order_consume) >= max_concurrent_instructions)
@ -162,14 +174,14 @@ namespace spu
}
}
atomic_instruction_table[pc_offset]++;
++atomic_instruction_table[pc_offset];
}
void release_pc_address(u32 pc)
{
const u32 pc_offset = pc >> 2;
atomic_instruction_table[pc_offset]--;
--atomic_instruction_table[pc_offset];
}
struct concurrent_execution_watchdog
@ -290,6 +302,8 @@ void SPUThread::on_spawn()
{
thread_ctrl::set_native_priority(-1);
}
++g_num_spu_threads;
}
void SPUThread::on_init(const std::shared_ptr<void>& _this)
@ -532,81 +546,11 @@ void SPUThread::do_dma_transfer(const spu_mfc_cmd& args, bool from_mfc)
}
default:
{
auto vdst = static_cast<__m128i*>(dst);
auto vsrc = static_cast<const __m128i*>(src);
auto vcnt = size / sizeof(__m128i);
auto vdst = static_cast<u64*>(dst);
auto vsrc = static_cast<const u64*>(src);
auto vcnt = size / sizeof(u64);
//if (is_get && !from_mfc)
{
while (vcnt >= 8)
{
const __m128i data[]
{
_mm_load_si128(vsrc + 0),
_mm_load_si128(vsrc + 1),
_mm_load_si128(vsrc + 2),
_mm_load_si128(vsrc + 3),
_mm_load_si128(vsrc + 4),
_mm_load_si128(vsrc + 5),
_mm_load_si128(vsrc + 6),
_mm_load_si128(vsrc + 7),
};
_mm_store_si128(vdst + 0, data[0]);
_mm_store_si128(vdst + 1, data[1]);
_mm_store_si128(vdst + 2, data[2]);
_mm_store_si128(vdst + 3, data[3]);
_mm_store_si128(vdst + 4, data[4]);
_mm_store_si128(vdst + 5, data[5]);
_mm_store_si128(vdst + 6, data[6]);
_mm_store_si128(vdst + 7, data[7]);
vcnt -= 8;
vsrc += 8;
vdst += 8;
}
while (vcnt--)
{
_mm_store_si128(vdst++, _mm_load_si128(vsrc++));
}
break;
}
// Disabled
while (vcnt >= 8)
{
const __m128i data[]
{
_mm_load_si128(vsrc + 0),
_mm_load_si128(vsrc + 1),
_mm_load_si128(vsrc + 2),
_mm_load_si128(vsrc + 3),
_mm_load_si128(vsrc + 4),
_mm_load_si128(vsrc + 5),
_mm_load_si128(vsrc + 6),
_mm_load_si128(vsrc + 7),
};
_mm_stream_si128(vdst + 0, data[0]);
_mm_stream_si128(vdst + 1, data[1]);
_mm_stream_si128(vdst + 2, data[2]);
_mm_stream_si128(vdst + 3, data[3]);
_mm_stream_si128(vdst + 4, data[4]);
_mm_stream_si128(vdst + 5, data[5]);
_mm_stream_si128(vdst + 6, data[6]);
_mm_stream_si128(vdst + 7, data[7]);
vcnt -= 8;
vsrc += 8;
vdst += 8;
}
while (vcnt--)
{
_mm_stream_si128(vdst++, _mm_load_si128(vsrc++));
}
__movsq(vdst, vsrc, vcnt);
}
}
@ -662,15 +606,14 @@ void SPUThread::process_mfc_cmd()
if (is_polling)
{
vm::waiter waiter;
waiter.owner = this;
waiter.addr = raddr;
waiter.size = 128;
waiter.stamp = rtime;
waiter.data = rdata.data();
waiter.init();
vm::waiter* waiter = new vm::waiter();
waiter->owner = this;
waiter->addr = raddr;
waiter->stamp = rtime;
waiter->data = rdata.data();
waiter->init();
while (vm::reservation_acquire(raddr, 128) == waiter.stamp && rdata == data)
while (vm::reservation_acquire(raddr, 128) == waiter->stamp && rdata == data)
{
if (test(state, cpu_flag::stop))
{
@ -679,6 +622,8 @@ void SPUThread::process_mfc_cmd()
thread_ctrl::wait_for(100);
}
waiter->remove();
}
else if (s_use_rtm && utils::transaction_enter())
{
@ -704,9 +649,11 @@ void SPUThread::process_mfc_cmd()
if (is_polling || UNLIKELY(vm::reservation_acquire(raddr, 128) != rtime))
{
// TODO: vm::check_addr
vm::reader_lock lock;
rtime = vm::reservation_acquire(raddr, 128);
rdata = data;
{
vm::reader_lock lock;
rtime = vm::reservation_acquire(raddr, 128);
}
memcpy(rdata.data(), data.data(), rdata.size() * sizeof(rdata[0]));
}
// Copy to LS
@ -723,38 +670,54 @@ void SPUThread::process_mfc_cmd()
bool result = false;
if (raddr == ch_mfc_cmd.eal && rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
{
// TODO: vm::check_addr
if (s_use_rtm && utils::transaction_enter())
{
if (!vm::reader_lock{vm::try_to_lock})
{
_xabort(0);
}
// Check for fast exit in the beginning as well
vm::writer_lock lock(vm::try_to_lock);
if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
if (lock.locked || memcmp(rdata.data(), data.data(), rdata.size() * sizeof(rdata[0])) == 0) {
if (raddr == ch_mfc_cmd.eal && rtime == vm::reservation_acquire(raddr, 128))
{
// TODO: vm::check_addr
if (s_use_rtm && utils::transaction_enter())
{
if (!lock.locked && !vm::reader_lock{ vm::try_to_lock })
{
_xabort(0);
}
if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
{
data = to_write;
result = true;
vm::reservation_update(raddr, 128);
vm::notify(raddr, 128);
}
_xend();
}
else if (lock.locked)
{
data = to_write;
result = true;
vm::reservation_update(raddr, 128);
lock.unlock();
result = true;
vm::notify(raddr, 128);
}
_xend();
}
else
{
vm::writer_lock lock;
if (rtime == vm::reservation_acquire(raddr, 128) && rdata == data)
else
{
data = to_write;
result = true;
// TODO maybe timeout and check if the lock is still needed in long waits (If rtime changes, no use)
vm::writer_lock lock(0);
vm::reservation_update(raddr, 128);
vm::notify(raddr, 128);
if (rtime == vm::reservation_acquire(raddr, 128))
{
data = to_write;
vm::reservation_update(raddr, 128);
lock.unlock();
result = true;
vm::notify(raddr, 128);
}
}
}
}
@ -766,11 +729,10 @@ void SPUThread::process_mfc_cmd()
else
{
ch_atomic_stat.set_value(MFC_PUTLLC_FAILURE);
}
if (raddr && !result)
{
ch_event_stat |= SPU_EVENT_LR;
if (raddr)
{
ch_event_stat |= SPU_EVENT_LR;
}
}
raddr = 0;
@ -808,9 +770,11 @@ void SPUThread::process_mfc_cmd()
return;
}
vm::writer_lock lock(0);
data = to_write;
vm::reservation_update(ch_mfc_cmd.eal, 128);
{
vm::writer_lock lock(0);
vm::reservation_update(ch_mfc_cmd.eal, 128);
}
vm::notify(ch_mfc_cmd.eal, 128);
ch_atomic_stat.set_value(MFC_PUTLLUC_SUCCESS);
@ -1189,28 +1153,33 @@ bool SPUThread::get_ch_value(u32 ch, u32& out)
return true;
}
vm::waiter waiter;
vm::waiter* waiter = nullptr;
if (ch_event_mask & SPU_EVENT_LR)
{
waiter.owner = this;
waiter.addr = raddr;
waiter.size = 128;
waiter.stamp = rtime;
waiter.data = rdata.data();
waiter.init();
waiter = new vm::waiter();
waiter->owner = this;
waiter->addr = raddr;
waiter->stamp = rtime;
waiter->data = rdata.data();
waiter->init();
}
// Poll for pending events until one is reported or the thread is asked to stop.
while (!(res = get_events(true)))
{
	if (test(state & cpu_flag::stop))
	{
		// `waiter` stays nullptr unless SPU_EVENT_LR is set in ch_event_mask,
		// so it must be checked before use (same guard as the normal exit path).
		if (waiter != nullptr)
		{
			waiter->remove();
		}

		return false;
	}

	thread_ctrl::wait_for(100);
}
if (waiter != nullptr)
{
waiter->remove();
}
out = res;
return true;
}

View file

@ -3,6 +3,7 @@
#include "Emu/Cell/Common.h"
#include "Emu/CPU/CPUThread.h"
#include "Emu/Cell/SPUInterpreter.h"
#include "Emu/Cell/SPURecompiler.h"
#include "MFC.h"
struct lv2_event_queue;
@ -212,22 +213,16 @@ public:
// returns true on success
bool try_pop(u32& out)
{
const auto old = data.fetch_op([&](sync_var_t& data)
const auto old = data.fetch_op([](sync_var_t& data)
{
if (data.count)
{
data.wait = false;
out = data.value;
}
else
{
data.wait = true;
}
data.count = false;
data.value = 0; // ???
sync_var_t t;
*reinterpret_cast<u64*>(&t) = 0;
t.wait = !data.count;
data = t;
});
out = old.value;
return old.count;
}
@ -585,7 +580,10 @@ public:
std::exception_ptr pending_exception;
std::array<struct spu_function_t*, 65536> compiled_cache{};
// No need for shared_ptr in the following two, as whenever something is removed or added to one,
// the same goes for the other.
std::array<spu_function_t, 65536> compiled_cache{};
std::vector<spu_function_t*> compiled_functions{};
std::shared_ptr<class SPUDatabase> spu_db;
std::shared_ptr<class spu_recompiler_base> spu_rec;
u32 recursion_level = 0;

View file

@ -8,16 +8,15 @@
#include "Emu/Cell/lv2/sys_memory.h"
#include "Emu/RSX/GSRender.h"
#include <atomic>
#include <deque>
namespace vm
{
static u8* memory_reserve_4GiB(std::uintptr_t _addr = 0)
std::array<memory_page, 0x100000000 / 4096> g_pages{};
static u8* memory_reserve_4GiB(const std::uintptr_t _addr = 0)
{
for (u64 addr = _addr + 0x100000000;; addr += 0x100000000)
for (auto addr = _addr + 0x100000000;; addr += 0x100000000)
{
if (auto ptr = utils::memory_reserve(0x100000000, (void*)addr))
if (const auto ptr = utils::memory_reserve(0x100000000, reinterpret_cast<void*>(addr)))
{
return static_cast<u8*>(ptr);
}
@ -39,11 +38,12 @@ namespace vm
// Memory locations
std::vector<std::shared_ptr<block_t>> g_locations;
// Reservations (lock lines) in a single memory page
using reservation_info = std::array<std::atomic<u64>, 4096 / 128>;
// Registered waiters
std::deque<vm::waiter*> g_waiters;
std::vector<vm::waiter*> g_waiters;
// Waiters which will be removed once the lock is freed
std::mutex g_waiters_to_remove_lock;
std::vector<vm::waiter*> g_waiters_to_remove;
// Memory mutex core
shared_mutex g_mutex;
@ -201,78 +201,52 @@ namespace vm
{
}
writer_lock::~writer_lock()
void writer_lock::unlock()
{
if (locked)
{
g_mutex.unlock();
}
}
// Page information
struct memory_page
{
// Memory flags
atomic_t<u8> flags;
atomic_t<u32> waiters;
// Reservations
atomic_t<reservation_info*> reservations;
// Access reservation info
std::atomic<u64>& operator [](u32 addr)
{
auto ptr = reservations.load();
if (!ptr)
if (!g_waiters_to_remove.empty())
{
// Opportunistic memory allocation
ptr = new reservation_info{};
if (auto old_ptr = reservations.compare_and_swap(nullptr, ptr))
std::lock_guard<std::mutex> lock(g_waiters_to_remove_lock);
for (auto ptr : g_waiters_to_remove)
{
const auto found = std::find(g_waiters.cbegin(), g_waiters.cend(), ptr);
if (found != g_waiters.cend())
{
g_waiters.erase(found);
}
delete ptr;
ptr = old_ptr;
}
g_waiters_to_remove.clear();
}
return (*ptr)[(addr & 0xfff) >> 7];
g_mutex.unlock();
locked = false;
}
};
// Memory pages
std::array<memory_page, 0x100000000 / 4096> g_pages{};
u64 reservation_acquire(u32 addr, u32 _size)
{
// Access reservation info: stamp and the lock bit
return g_pages[addr >> 12][addr].load(std::memory_order_acquire);
}
void reservation_update(u32 addr, u32 _size)
writer_lock::~writer_lock()
{
// Update reservation info with new timestamp (unsafe, assume allocated)
(*g_pages[addr >> 12].reservations)[(addr & 0xfff) >> 7].store(__rdtsc(), std::memory_order_release);
unlock();
}
void waiter::init()
{
// Register waiter
writer_lock lock(0);
g_waiters.emplace_back(this);
}
void waiter::test() const
{
if (std::memcmp(data, vm::base(addr), size) == 0)
const auto owner_copy = owner;
if (!owner_copy)
{
return;
}
memory_page& page = g_pages[addr >> 12];
if (page.reservations == nullptr)
{
return;
@ -283,23 +257,40 @@ namespace vm
return;
}
if (owner)
if (memcmp(data, vm::base(addr), size) == 0)
{
owner->notify();
return;
}
owner_copy->notify();
}
waiter::~waiter()
void waiter::remove()
{
// Unregister waiter
writer_lock lock(0);
const writer_lock lock(try_to_lock);
// Find waiter
const auto found = std::find(g_waiters.cbegin(), g_waiters.cend(), this);
if (found != g_waiters.cend())
if (lock.locked)
{
g_waiters.erase(found);
// Find waiter
const auto found = std::find(g_waiters.cbegin(), g_waiters.cend(), this);
if (found != g_waiters.cend())
{
g_waiters.erase(found);
delete this;
}
else
{
verify("Waiter not found during removal"), false;
}
}
else
{
this->owner = nullptr; // Iterations of the object will ignore it from now on
std::lock_guard<std::mutex> lock(g_waiters_to_remove_lock);
g_waiters_to_remove.push_back(this);
}
}
@ -369,8 +360,8 @@ namespace vm
const u8 flags_both = flags_set & flags_clear;
flags_test |= page_allocated;
flags_set &= ~flags_both;
flags_test |= page_allocated;
flags_set &= ~flags_both;
flags_clear &= ~flags_both;
for (u32 i = addr / 4096; i < addr / 4096 + size / 4096; i++)
@ -736,7 +727,7 @@ namespace vm
{
writer_lock lock(0);
for (auto it = g_locations.begin(); it != g_locations.end(); it++)
for (auto it = g_locations.begin(); it != g_locations.end(); ++it)
{
if (*it && (*it)->addr == addr)
{
@ -835,7 +826,7 @@ void fmt_class_string<vm::_ptr_base<const char>>::format(std::string& out, u64 a
out += u8"";
for (vm::_ptr_base<const volatile char> ptr = vm::cast(arg);; ptr++)
for (vm::_ptr_base<const volatile char> ptr = vm::cast(arg);; ++ptr)
{
if (!vm::check_addr(ptr.addr()))
{

View file

@ -1,8 +1,8 @@
#pragma once
#include <map>
#include <functional>
#include <memory>
#include <atomic>
class named_thread;
class cpu_thread;
@ -42,9 +42,10 @@ namespace vm
{
named_thread* owner;
u32 addr;
u32 size;
bool inserted = false;
u64 stamp;
const void* data;
static const u32 size = 128; // Always 128 currently
waiter() = default;
@ -53,7 +54,7 @@ namespace vm
void init();
void test() const;
~waiter();
void remove();
};
// Address type
@ -90,21 +91,64 @@ namespace vm
struct writer_lock final
{
const bool locked;
bool locked;
writer_lock(const writer_lock&) = delete;
writer_lock(int full = 1);
writer_lock(const try_to_lock_t&);
void unlock();
~writer_lock();
explicit operator bool() const { return locked; }
};
// Get reservation status for further atomic update: last update timestamp
u64 reservation_acquire(u32 addr, u32 size);
// Reservations (lock lines) in a single memory page
using reservation_info = std::array<std::atomic<u64>, 4096 / 128>;
// End atomic update
void reservation_update(u32 addr, u32 size);
// Page information
struct memory_page
{
	// Lazily allocated table of per-128-byte-line reservation stamps (null until first use)
	atomic_t<reservation_info*> reservations;

	//atomic_t<u32> waiters;

	// Memory flags
	atomic_t<u8> flags;

	// Returns the reservation stamp slot for the 128-byte line containing `addr`.
	// Only the low 12 bits of `addr` (offset within the 4096-byte page) are used.
	// The table is allocated on first access.
	FORCE_INLINE std::atomic<u64>& operator [](const u32 addr)
	{
		reservation_info* table = reservations.load();

		if (table == nullptr)
		{
			// Opportunistic allocation: build a fresh table, then try to publish it.
			reservation_info* const fresh = new reservation_info();
			const auto winner = reservations.compare_and_swap(nullptr, fresh);

			if (winner)
			{
				// The swap reported an already-installed table: drop ours and use that one.
				delete fresh;
				table = winner;
			}
			else
			{
				table = fresh;
			}
		}

		const u32 line_index = (addr % 4096) / 128;
		return (*table)[line_index];
	}
};
// Memory pages
extern std::array<memory_page, 0x100000000 / 4096> g_pages;
// Reads the current reservation value for the 128-byte line containing `addr`
// (per the original comment: the stamp and the lock bit).
// `_size` is not used by this function.
FORCE_INLINE u64 reservation_acquire(u32 addr, u32 _size)
{
	// Select the 4096-byte page, then the line slot inside it; memory_page::operator[]
	// allocates the page's reservation table if it does not exist yet.
	memory_page& page = g_pages[addr >> 12];
	std::atomic<u64>& line = page[addr];

	return line.load(std::memory_order_acquire);
}
// Stores a new timestamp (taken from __rdtsc()) into the reservation slot of the
// 128-byte line containing `addr`. `_size` is not used by this function.
FORCE_INLINE void reservation_update(u32 addr, u32 _size)
{
	// Unsafe by design: the page's reservation table is dereferenced without a null
	// check, so it is assumed to be allocated already.
	reservation_info& lines = *g_pages[addr >> 12].reservations;

	lines[(addr & 0xfff) >> 7].store(__rdtsc(), std::memory_order_release);
}
// Check and notify memory changes at address
void notify(u32 addr, u32 size);

View file

@ -532,7 +532,7 @@ std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::spa
verify(HERE), (dst.size_bytes() >= src.size_bytes());
u32 dst_idx = 0;
u32 dst_idx = -1;
for (T index : src)
{
if (is_primitive_restart_enabled && index == primitive_restart_index)
@ -549,9 +549,9 @@ std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::spa
min_index = std::min(min_index, index);
}
dst[dst_idx++] = index;
dst[++dst_idx] = index;
}
return std::make_tuple(min_index, max_index, dst_idx);
return std::make_tuple(min_index, max_index, dst_idx + 1);
}
template<typename T>