Initial Linux AArch64 support

* Update asmjit dependency (aarch64 branch)
* Disable USE_DISCORD_RPC by default
* Dump some JIT objects in rpcs3 cache dir
* Add SIGILL handler for all platforms
* Fix resetting zeroing denormals in thread pool
* Refactor most v128:: utils into global gv_** functions
* Refactor PPU interpreter (incomplete), remove "precise"
* - Instruction specializations with multiple accuracy flags
* - Adjust calling convention for speed
* - Removed precise/fast setting, replaced with static
* - Started refactoring interpreters so they can be built at runtime by the JIT
*   (I got tired of poor compiler optimizations)
* - Expose some accuracy settings (SAT, NJ, VNAN, FPCC)
* - Add exec_bytes PPU thread variable (akin to cycle count)
* PPU LLVM: fix VCTUXS+VCTSXS instruction NaN results
* SPU interpreter: remove "precise" for now (extremely non-portable)
* - As with PPU, settings changed to static/dynamic for interpreters.
* - Precise options will be implemented later
* Fix termination after fatal error dialog
This commit is contained in:
Nekotekina 2021-12-30 19:39:18 +03:00
parent d6aa834b5f
commit 580bd2b25e
89 changed files with 20360 additions and 5612 deletions

View file

@ -62,7 +62,7 @@
#include "util/asm.hpp"
#include "util/vm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
extern atomic_t<u64> g_watchdog_hold_ctr;
@ -131,9 +131,8 @@ void fmt_class_string<typename ppu_thread::call_history_t>::format(std::string&
}
}
const ppu_decoder<ppu_interpreter_precise> g_ppu_interpreter_precise;
const ppu_decoder<ppu_interpreter_fast> g_ppu_interpreter_fast;
const ppu_decoder<ppu_itype> g_ppu_itype;
extern const ppu_decoder<ppu_itype> g_ppu_itype{};
extern const ppu_decoder<ppu_iname> g_ppu_iname{};
extern void ppu_initialize();
extern void ppu_finalize(const ppu_module& info);
@ -143,15 +142,16 @@ extern std::pair<std::shared_ptr<lv2_overlay>, CellError> ppu_load_overlay(const
extern void ppu_unload_prx(const lv2_prx&);
extern std::shared_ptr<lv2_prx> ppu_load_prx(const ppu_prx_object&, const std::string&, s64 file_offset);
extern void ppu_execute_syscall(ppu_thread& ppu, u64 code);
static bool ppu_break(ppu_thread& ppu, ppu_opcode_t op);
static void ppu_break(ppu_thread&, ppu_opcode_t, be_t<u32>*, ppu_intrp_func*);
extern void do_cell_atomic_128_store(u32 addr, const void* to_write);
const auto ppu_gateway = built_function<void(*)(ppu_thread*)>("ppu_gateway", [](asmjit::x86::Assembler& c, auto& args)
const auto ppu_gateway = built_function<void(*)(ppu_thread*)>("ppu_gateway", [](native_asm& c, auto& args)
{
// Gateway for PPU, converts from native to GHC calling convention, also saves RSP value for escape
using namespace asmjit;
#if defined(ARCH_X64)
#ifdef _WIN32
c.push(x86::r15);
c.push(x86::r14);
@ -192,10 +192,10 @@ const auto ppu_gateway = built_function<void(*)(ppu_thread*)>("ppu_gateway", [](
c.mov(x86::rax, x86::qword_ptr(x86::r13, x86::edx, 1, 0)); // Load call target
c.mov(x86::rdx, x86::rax);
c.shl(x86::rax, 17);
c.shr(x86::rax, 17);
c.shr(x86::rdx, 47);
c.shl(x86::rdx, 12);
c.shl(x86::rax, 16);
c.shr(x86::rax, 16);
c.shr(x86::rdx, 48);
c.shl(x86::edx, 13);
c.mov(x86::r12d, x86::edx); // Load relocation base
c.mov(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
@ -246,116 +246,113 @@ const auto ppu_gateway = built_function<void(*)(ppu_thread*)>("ppu_gateway", [](
#endif
c.ret();
#else
c.ret(a64::x30);
#endif
});
const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>("ppu_escape", [](asmjit::x86::Assembler& c, auto& args)
const extern auto ppu_escape = build_function_asm<void(*)(ppu_thread*)>("ppu_escape", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
// Restore native stack pointer (longjmp emulation)
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&ppu_thread::saved_native_sp)));
// Return to the return location
c.sub(x86::rsp, 8);
c.ret();
#endif
});
void ppu_recompiler_fallback(ppu_thread& ppu);
const auto ppu_recompiler_fallback_ghc = build_function_asm<void(*)(ppu_thread& ppu)>("ppu_trampolineb", [](asmjit::x86::Assembler& c, auto& args)
#if defined(ARCH_X64)
const auto ppu_recompiler_fallback_ghc = build_function_asm<void(*)(ppu_thread& ppu)>("ppu_trampolineb", [](native_asm& c, auto& args)
{
using namespace asmjit;
c.mov(args[0], x86::rbp);
c.jmp(imm_ptr(ppu_recompiler_fallback));
});
#elif defined(ARCH_ARM64)
const auto ppu_recompiler_fallback_ghc = &ppu_recompiler_fallback;
#endif
// Get pointer to executable cache
static u64& ppu_ref(u32 addr)
static ppu_intrp_func_t& ppu_ref(u32 addr)
{
return *reinterpret_cast<u64*>(vm::g_exec_addr + u64{addr} * 2);
return *reinterpret_cast<ppu_intrp_func_t*>(vm::g_exec_addr + u64{addr} * 2);
}
// Get interpreter cache value
static u64 ppu_cache(u32 addr)
static ppu_intrp_func_t ppu_cache(u32 addr)
{
if (g_cfg.core.ppu_decoder > ppu_decoder_type::fast)
if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static)
{
fmt::throw_exception("Invalid PPU decoder");
}
// Select opcode table
const auto& table = *(
g_cfg.core.ppu_decoder == ppu_decoder_type::precise
? &g_ppu_interpreter_precise.get_table()
: &g_ppu_interpreter_fast.get_table());
return reinterpret_cast<uptr>(table[ppu_decode(vm::read32(addr))]);
return g_fxo->get<ppu_interpreter_rt>().decode(vm::read32(addr));
}
static bool ppu_fallback(ppu_thread& ppu, ppu_opcode_t op)
static ppu_intrp_func ppu_ret = {[](ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func*)
{
if (g_cfg.core.ppu_debug)
{
ppu_log.error("Unregistered instruction: 0x%08x", op.opcode);
}
// Fix PC and return (step execution)
ppu.cia = vm::get_addr(this_op);
return;
}};
ppu_ref(ppu.cia) = ppu_cache(ppu.cia);
return false;
static void ppu_fallback(ppu_thread& ppu, ppu_opcode_t op, be_t<u32>* this_op, ppu_intrp_func* next_fn)
{
const auto _pc = vm::get_addr(this_op);
const auto _fn = ppu_cache(_pc);
ppu_ref(_pc) = _fn;
return _fn(ppu, op, this_op, next_fn);
}
// TODO: Make this a dispatch call
void ppu_recompiler_fallback(ppu_thread& ppu)
{
perf_meter<"PPUFALL1"_u64> perf0;
if (g_cfg.core.ppu_debug)
{
ppu_log.error("Unregistered PPU Function (LR=0x%llx)", ppu.lr);
ppu_log.error("Unregistered PPU Function (LR=0x%x)", ppu.lr);
}
const auto& table = g_ppu_interpreter_fast.get_table();
u64 ctr = 0;
const auto& table = g_fxo->get<ppu_interpreter_rt>();
while (true)
{
if (uptr func = ppu_ref(ppu.cia); (func << 17 >> 17) != reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc))
if (uptr func = uptr(ppu_ref(ppu.cia)); (func << 16 >> 16) != reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc))
{
// We found a recompiler function at cia, return
break;
}
// Run instructions in interpreter
if (const u32 op = vm::read32(ppu.cia); ctr++, table[ppu_decode(op)](ppu, {op})) [[likely]]
{
ppu.cia += 4;
continue;
}
// Run one instruction in interpreter (TODO)
const u32 op = vm::read32(ppu.cia);
table.decode(op)(ppu, {op}, vm::_ptr<u32>(ppu.cia), &ppu_ret);
if (ppu.test_stopped())
{
break;
}
}
if (g_cfg.core.ppu_debug)
{
ppu_log.warning("Exiting interpreter at 0x%x (executed %u functions)", ppu.cia, ctr);
}
}
void ppu_reservation_fallback(ppu_thread& ppu)
{
const auto& table = g_ppu_interpreter_fast.get_table();
perf_meter<"PPUFALL2"_u64> perf0;
const auto& table = g_fxo->get<ppu_interpreter_rt>();
while (true)
{
// Run instructions in interpreter
// Run one instruction in interpreter (TODO)
const u32 op = vm::read32(ppu.cia);
if (table[ppu_decode(op)](ppu, {op})) [[likely]]
{
ppu.cia += 4;
}
table.decode(op)(ppu, {op}, vm::_ptr<u32>(ppu.cia), &ppu_ret);
if (!ppu.raddr || !ppu.use_full_rdata)
{
@ -372,7 +369,7 @@ void ppu_reservation_fallback(ppu_thread& ppu)
static std::unordered_map<u32, u32>* s_ppu_toc;
static bool ppu_check_toc(ppu_thread& ppu, ppu_opcode_t)
static void ppu_check_toc(ppu_thread& ppu, ppu_opcode_t op, be_t<u32>* this_op, ppu_intrp_func* next_fn)
{
// Compare TOC with expected value
const auto found = s_ppu_toc->find(ppu.cia);
@ -383,18 +380,12 @@ static bool ppu_check_toc(ppu_thread& ppu, ppu_opcode_t)
if (!ppu.state.test_and_set(cpu_flag::dbg_pause) && ppu.check_state())
{
return false;
return;
}
}
// Fallback to the interpreter function
const u64 val = ppu_cache(ppu.cia);
if (reinterpret_cast<decltype(&ppu_interpreter::UNK)>(val & 0xffffffff)(ppu, {static_cast<u32>(val >> 32)}))
{
ppu.cia += 4;
}
return false;
return ppu_cache(ppu.cia)(ppu, op, this_op, next_fn);
}
extern void ppu_register_range(u32 addr, u32 size)
@ -417,7 +408,6 @@ extern void ppu_register_range(u32 addr, u32 size)
utils::memory_commit(vm::g_stat_addr + addr, size);
}
const u64 fallback = reinterpret_cast<uptr>(ppu_fallback);
const u64 seg_base = addr;
while (size)
@ -425,11 +415,11 @@ extern void ppu_register_range(u32 addr, u32 size)
if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm)
{
// Assume addr is the start of first segment of PRX
ppu_ref(addr) = reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc) | (seg_base << (32 + 3));
ppu_ref(addr) = reinterpret_cast<ppu_intrp_func_t>(reinterpret_cast<uptr>(ppu_recompiler_fallback_ghc) | (seg_base << (32 + 3)));
}
else
{
ppu_ref(addr) = fallback;
ppu_ref(addr) = ppu_fallback;
}
addr += 4;
@ -437,14 +427,14 @@ extern void ppu_register_range(u32 addr, u32 size)
}
}
static bool ppu_far_jump(ppu_thread& ppu);
static void ppu_far_jump(ppu_thread&, ppu_opcode_t, be_t<u32>*, ppu_intrp_func*);
extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr = nullptr)
extern void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr = nullptr)
{
// Initialize specific function
if (ptr)
{
ppu_ref(addr) = (reinterpret_cast<uptr>(ptr) & 0x7fff'ffff'ffffu) | (ppu_ref(addr) & ~0x7fff'ffff'ffffu);
ppu_ref(addr) = reinterpret_cast<ppu_intrp_func_t>((reinterpret_cast<uptr>(ptr) & 0xffff'ffff'ffffu) | (uptr(ppu_ref(addr)) & ~0xffff'ffff'ffffu));
return;
}
@ -464,12 +454,9 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr = nu
}
// Initialize interpreter cache
const u64 _break = reinterpret_cast<uptr>(ppu_break);
const u64 far_jump = reinterpret_cast<uptr>(ppu_far_jump);
while (size)
{
if (ppu_ref(addr) != _break && ppu_ref(addr) != far_jump)
if (ppu_ref(addr) != ppu_break && ppu_ref(addr) != ppu_far_jump)
{
ppu_ref(addr) = ppu_cache(addr);
}
@ -481,12 +468,12 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_function_t ptr = nu
extern void ppu_register_function_at(u32 addr, u32 size, u64 ptr)
{
return ppu_register_function_at(addr, size, reinterpret_cast<ppu_function_t>(ptr));
return ppu_register_function_at(addr, size, reinterpret_cast<ppu_intrp_func_t>(ptr));
}
u32 ppu_get_exported_func_addr(u32 fnid, const std::string& module_name);
bool ppu_return_from_far_jump(ppu_thread& ppu)
void ppu_return_from_far_jump(ppu_thread& ppu, ppu_opcode_t, be_t<u32>*, ppu_intrp_func*)
{
auto& calls_info = ppu.hle_func_calls_with_toc_info;
ensure(!calls_info.empty());
@ -498,7 +485,6 @@ bool ppu_return_from_far_jump(ppu_thread& ppu)
ppu.gpr[2] = restore_info->saved_r2;
calls_info.pop_back();
return false;
}
static const bool s_init_return_far_jump_func = []
@ -586,9 +572,9 @@ u32 ppu_get_far_jump(u32 pc)
return g_fxo->get<ppu_far_jumps_t>().get_target(pc);
}
static bool ppu_far_jump(ppu_thread& ppu)
static void ppu_far_jump(ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func*)
{
const u32 cia = g_fxo->get<ppu_far_jumps_t>().get_target(ppu.cia, &ppu);
const u32 cia = g_fxo->get<ppu_far_jumps_t>().get_target(vm::get_addr(this_op), &ppu);
if (!vm::check_addr(cia, vm::page_executable))
{
@ -596,7 +582,6 @@ static bool ppu_far_jump(ppu_thread& ppu)
}
ppu.cia = cia;
return false;
}
bool ppu_form_branch_to_code(u32 entry, u32 target, bool link, bool with_toc, std::string module_name)
@ -658,7 +643,7 @@ bool ppu_form_branch_to_code(u32 entry, u32 target, bool link, bool with_toc, st
auto& jumps = g_fxo->get<ppu_far_jumps_t>();
std::lock_guard lock(jumps.mutex);
jumps.vals.insert_or_assign(entry, std::type_identity_t<typename ppu_far_jumps_t::all_info_t>{target, link, with_toc, std::move(module_name)});
jumps.vals.insert_or_assign(entry, ppu_far_jumps_t::all_info_t{target, link, with_toc, std::move(module_name)});
ppu_register_function_at(entry, 4, &ppu_far_jump);
return true;
@ -702,10 +687,13 @@ void ppu_remove_hle_instructions(u32 addr, u32 size)
atomic_t<bool> g_debugger_pause_all_threads_on_bp = true;
// Breakpoint entry point
static bool ppu_break(ppu_thread& ppu, ppu_opcode_t)
static void ppu_break(ppu_thread& ppu, ppu_opcode_t, be_t<u32>* this_op, ppu_intrp_func* next_fn)
{
const bool pause_all = g_debugger_pause_all_threads_on_bp;
const u32 old_cia = vm::get_addr(this_op);
ppu.cia = old_cia;
// Pause
ppu.state.atomic_op([&](bs_t<cpu_flag>& state)
{
@ -719,19 +707,14 @@ static bool ppu_break(ppu_thread& ppu, ppu_opcode_t)
Emu.CallAfter([]() { Emu.Pause(); });
}
if (ppu.check_state())
if (ppu.check_state() || old_cia != atomic_storage<u32>::load(ppu.cia))
{
return false;
// Do not execute if PC changed
return;
}
// Fallback to the interpreter function
const u64 val = ppu_cache(ppu.cia);
if (reinterpret_cast<decltype(&ppu_interpreter::UNK)>(val)(ppu, {vm::read32(ppu.cia).get()}))
{
ppu.cia += 4;
}
return false;
return ppu_cache(ppu.cia)(ppu, {*this_op}, this_op, next_fn);
}
// Set or remove breakpoint
@ -742,11 +725,9 @@ extern bool ppu_breakpoint(u32 addr, bool is_adding)
return false;
}
const u64 _break = reinterpret_cast<uptr>(&ppu_break);
// Remove breakpoint parameters
u64 to_set = 0;
u64 expected = _break;
ppu_intrp_func_t to_set = 0;
ppu_intrp_func_t expected = &ppu_break;
if (u32 hle_addr{}; g_fxo->is_init<ppu_function_manager>() && (hle_addr = g_fxo->get<ppu_function_manager>().addr))
{
@ -756,7 +737,7 @@ extern bool ppu_breakpoint(u32 addr, bool is_adding)
if (addr % 8 == 4 && index < ppu_function_manager::get().size())
{
// HLE function placement
to_set = reinterpret_cast<uptr>(ppu_function_manager::get()[index]);
to_set = ppu_function_manager::get()[index];
}
}
@ -766,23 +747,21 @@ extern bool ppu_breakpoint(u32 addr, bool is_adding)
to_set = ppu_cache(addr);
}
u64& _ref = ppu_ref(addr);
ppu_intrp_func_t& _ref = ppu_ref(addr);
if (is_adding)
{
// Swap if adding
std::swap(to_set, expected);
const u64 _fall = reinterpret_cast<uptr>(&ppu_fallback);
if (_ref == _fall)
if (_ref == &ppu_fallback)
{
ppu_log.error("Unregistered instruction replaced with a breakpoint at 0x%08x", addr);
expected = _fall;
expected = ppu_fallback;
}
}
return atomic_storage<u64>::compare_exchange(_ref, expected, to_set);
return atomic_storage<ppu_intrp_func_t>::compare_exchange(_ref, expected, to_set);
}
extern bool ppu_patch(u32 addr, u32 value)
@ -812,12 +791,9 @@ extern bool ppu_patch(u32 addr, u32 value)
*vm::get_super_ptr<u32>(addr) = value;
const u64 _break = reinterpret_cast<uptr>(&ppu_break);
const u64 fallback = reinterpret_cast<uptr>(&ppu_fallback);
if (is_exec)
{
if (ppu_ref(addr) != _break && ppu_ref(addr) != fallback)
if (ppu_ref(addr) != ppu_break && ppu_ref(addr) != ppu_fallback)
{
ppu_ref(addr) = ppu_cache(addr);
}
@ -1182,10 +1158,13 @@ void ppu_thread::cpu_task()
{
std::fesetround(FE_TONEAREST);
if (g_cfg.core.set_daz_and_ftz && g_cfg.core.ppu_decoder != ppu_decoder_type::precise)
if (g_cfg.core.set_daz_and_ftz)
{
// Set DAZ and FTZ
_mm_setcsr(_mm_getcsr() | 0x8840);
gv_set_zeroing_denormals();
}
else
{
gv_unset_zeroing_denormals();
}
// Execute cmd_queue
@ -1197,9 +1176,7 @@ void ppu_thread::cpu_task()
{
case ppu_cmd::opcode:
{
cmd_pop(), g_cfg.core.ppu_decoder == ppu_decoder_type::precise
? g_ppu_interpreter_precise.decode(arg)(*this, {arg})
: g_ppu_interpreter_fast.decode(arg)(*this, {arg});
cmd_pop(), g_fxo->get<ppu_interpreter_rt>().decode(arg)(*this, {arg}, vm::_ptr<u32>(cia - 4), &ppu_ret);
break;
}
case ppu_cmd::set_gpr:
@ -1236,7 +1213,7 @@ void ppu_thread::cpu_task()
}
case ppu_cmd::hle_call:
{
cmd_pop(), ppu_function_manager::get().at(arg)(*this);
cmd_pop(), ppu_function_manager::get().at(arg)(*this, {arg}, vm::_ptr<u32>(cia - 4), &ppu_ret);
break;
}
case ppu_cmd::opd_call:
@ -1247,8 +1224,8 @@ void ppu_thread::cpu_task()
}
case ppu_cmd::ptr_call:
{
const ppu_function_t func = cmd_get(1).as<ppu_function_t>();
cmd_pop(1), func(*this);
const ppu_intrp_func_t func = cmd_get(1).as<ppu_intrp_func_t>();
cmd_pop(1), func(*this, {}, vm::_ptr<u32>(cia - 4), &ppu_ret);
break;
}
case ppu_cmd::initialize:
@ -1323,7 +1300,7 @@ void ppu_thread::cpu_on_stop()
void ppu_thread::exec_task()
{
if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm)
if (g_cfg.core.ppu_decoder != ppu_decoder_type::_static)
{
while (true)
{
@ -1340,79 +1317,28 @@ void ppu_thread::exec_task()
}
const auto cache = vm::g_exec_addr;
using func_t = decltype(&ppu_interpreter::UNK);
const auto mem_ = vm::g_base_addr;
while (true)
{
const auto exec_op = [this](u64 op)
if (test_stopped()) [[unlikely]]
{
return reinterpret_cast<func_t>(op)(*this, {vm::read32(cia).get()});
};
if (cia % 8 || state) [[unlikely]]
{
if (test_stopped()) return;
// Decode single instruction (may be step)
if (exec_op(*reinterpret_cast<u64*>(cache + u64{cia} * 2))) { cia += 4; }
continue;
return;
}
u64 op0, op1, op2, op3;
u64 _pos = u64{cia} * 2;
gv_zeroupper();
// Reinitialize
{
const v128 _op0 = *reinterpret_cast<const v128*>(cache + _pos);
const v128 _op1 = *reinterpret_cast<const v128*>(cache + _pos + 16);
op0 = _op0._u64[0];
op1 = _op0._u64[1];
op2 = _op1._u64[0];
op3 = _op1._u64[1];
}
while (exec_op(op0)) [[likely]]
{
cia += 4;
if (exec_op(op1)) [[likely]]
{
cia += 4;
if (exec_op(op2)) [[likely]]
{
cia += 4;
if (exec_op(op3)) [[likely]]
{
cia += 4;
if (state) [[unlikely]]
{
break;
}
_pos += 32;
const v128 _op0 = *reinterpret_cast<const v128*>(cache + _pos);
const v128 _op1 = *reinterpret_cast<const v128*>(cache + _pos + 16);
op0 = _op0._u64[0];
op1 = _op0._u64[1];
op2 = _op1._u64[0];
op3 = _op1._u64[1];
continue;
}
break;
}
break;
}
break;
}
// Execute instruction (may be step; execute only one instruction if state)
const auto op = reinterpret_cast<be_t<u32>*>(mem_ + u64{cia});
const auto fn = reinterpret_cast<ppu_intrp_func*>(cache + u64{cia} * 2);
fn->fn(*this, {*op}, op, state ? &ppu_ret : fn + 1);
}
}
ppu_thread::~ppu_thread()
{
perf_log.notice("Perf stats for STCX reload: successs %u, failure %u", last_succ, last_fail);
perf_log.notice("Perf stats for instructions: total %u", exec_bytes / 4);
}
ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u32 prio, int detached)
@ -1638,7 +1564,7 @@ void ppu_thread::stack_pop_verbose(u32 addr, u32 size) noexcept
ppu_log.error("Invalid thread");
}
extern ppu_function_t ppu_get_syscall(u64 code);
extern ppu_intrp_func_t ppu_get_syscall(u64 code);
void ppu_trap(ppu_thread& ppu, u64 addr)
{
@ -1728,7 +1654,7 @@ static T ppu_load_acquire_reservation(ppu_thread& ppu, u32 addr)
{
const auto _inst = v128::loadu(inst + i) & mask_vec;
if (_mm_movemask_epi8(v128::eq32(_inst, store_vec).vi))
if (!gv_testz(gv_eq32(_inst, store_vec)))
{
return false;
}
@ -1817,10 +1743,11 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr)
return ppu_load_acquire_reservation<u64>(ppu, addr);
}
const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>("ppu_stcx_accurate_tx", [](asmjit::x86::Assembler& c, auto& args)
const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>("ppu_stcx_accurate_tx", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
Label fall = c.newLabel();
Label fail = c.newLabel();
Label _ret = c.newLabel();
@ -2024,6 +1951,9 @@ const auto ppu_stcx_accurate_tx = built_function<u64(*)(u32 raddr, u64 rtime, co
c.bind(ret2);
#endif
c.ret();
#else
c.ret(a64::x30);
#endif
});
template <typename T>
@ -2147,7 +2077,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
utils::prefetch_read(ppu.rdata + 64);
ppu.last_faddr = addr;
ppu.last_ftime = res.load() & -128;
ppu.last_ftsc = __rdtsc();
ppu.last_ftsc = utils::get_tsc();
return false;
}
default:
@ -2249,7 +2179,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
ppu.last_faddr = addr;
ppu.last_ftime = old_rtime & -128;
ppu.last_ftsc = __rdtsc();
ppu.last_ftsc = utils::get_tsc();
std::memcpy(&ppu.rdata[addr & 0x78], &old_data, 8);
}
@ -2286,7 +2216,7 @@ namespace
// Compiled PPU module info
struct jit_module
{
std::vector<ppu_function_t> funcs;
std::vector<ppu_intrp_func_t> funcs;
std::shared_ptr<jit_compiler> pjit;
bool init = false;
};
@ -2829,7 +2759,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
if (g_cfg.core.ppu_debug && func.size && func.toc != umax)
{
s_ppu_toc->emplace(func.addr, func.toc);
ppu_ref(func.addr) = reinterpret_cast<uptr>(&ppu_check_toc);
ppu_ref(func.addr) = &ppu_check_toc;
}
}
@ -3022,7 +2952,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
// Fixup some information
entry.name = fmt::format("__0x%x", entry.addr - reloc);
if (has_mfvscr)
if (has_mfvscr && g_cfg.core.ppu_set_sat_bit)
{
// TODO
entry.attr += ppu_attr::has_mfvscr;
@ -3139,13 +3069,15 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
enum class ppu_settings : u32
{
non_win32,
accurate_fma,
accurate_ppu_vector_nan,
java_mode_handling,
accurate_dfma,
fixup_vnan,
accurate_jm,
accurate_cache_line_stores,
reservations_128_byte,
greedy_mode,
has_mfvscr,
accurate_sat,
accurate_fpcc,
accurate_vnan,
__bitset_enum_max
};
@ -3155,20 +3087,24 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
#ifndef _WIN32
settings += ppu_settings::non_win32;
#endif
if (g_cfg.core.llvm_accurate_dfma)
settings += ppu_settings::accurate_fma;
if (g_cfg.core.llvm_ppu_accurate_vector_nan)
settings += ppu_settings::accurate_ppu_vector_nan;
if (g_cfg.core.llvm_ppu_jm_handling)
settings += ppu_settings::java_mode_handling;
if (g_cfg.core.use_accurate_dfma)
settings += ppu_settings::accurate_dfma;
if (g_cfg.core.ppu_fix_vnan)
settings += ppu_settings::fixup_vnan;
if (g_cfg.core.ppu_use_nj_bit)
settings += ppu_settings::accurate_jm;
if (has_dcbz == 2)
settings += ppu_settings::accurate_cache_line_stores;
if (g_cfg.core.ppu_128_reservations_loop_max_length)
settings += ppu_settings::reservations_128_byte;
if (g_cfg.core.ppu_llvm_greedy_mode)
settings += ppu_settings::greedy_mode;
if (has_mfvscr)
settings += ppu_settings::has_mfvscr;
if (has_mfvscr && g_cfg.core.ppu_set_sat_bit)
settings += ppu_settings::accurate_sat;
if (g_cfg.core.ppu_set_fpcc)
settings += ppu_settings::accurate_fpcc, fmt::throw_exception("FPCC Not implemented");
if (g_cfg.core.ppu_set_vnan)
settings += ppu_settings::accurate_vnan, fmt::throw_exception("VNAN Not implemented");
// Write version, hash, CPU, settings
fmt::append(obj_name, "v5-kusa-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
@ -3319,10 +3255,10 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
if (!func.size) continue;
const auto name = fmt::format("__0x%x", func.addr - reloc);
const auto addr = ensure(reinterpret_cast<ppu_function_t>(jit->get(name)));
const auto addr = ensure(reinterpret_cast<ppu_intrp_func_t>(jit->get(name)));
jit_mod.funcs.emplace_back(addr);
if (ppu_ref(func.addr) != reinterpret_cast<u64>(ppu_far_jump))
if (ppu_ref(func.addr) != ppu_far_jump)
ppu_register_function_at(func.addr, 4, addr);
if (g_cfg.core.ppu_debug)
@ -3342,7 +3278,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
const u64 addr = reinterpret_cast<uptr>(ensure(jit_mod.funcs[index++]));
if (ppu_ref(func.addr) != reinterpret_cast<u64>(ppu_far_jump))
if (ppu_ref(func.addr) != ppu_far_jump)
ppu_register_function_at(func.addr, 4, addr);
if (g_cfg.core.ppu_debug)