mirror of
https://github.com/RPCSX/rpcsx.git
synced 2026-04-05 14:37:08 +00:00
PPU LLVM arm64+macOS port (#12115)
* BufferUtils: use naive function pointer on Apple arm64
Use naive function pointer on Apple arm64 because ASLR breaks asmjit.
See BufferUtils.cpp comment for explanation on why this happens and how
to fix if you want to use asmjit.
* build-macos: fix source maps for Mac
Tell Qt not to strip debug symbols when we're in debug or relwithdebinfo
modes.
* LLVM PPU: fix aarch64 on macOS
Force MachO on macOS to fix LLVM being unable to patch relocations
during codegen. Adds AArch64 NEON intrinsics for the x86 intrinsics used
by PPUTranslator/Recompiler.
* virtual memory: use 16k pages on aarch64 macOS
Temporary hack to get things working by using 16k pages instead of 4k
pages in VM emulation.
* PPU/SPU: fix NEON intrinsics and compilation for arm64 macOS
Fixes some intrinsics usage and patches usages of asmjit to emit
absolute jumps, so ASLR doesn't cause out-of-range relative jumps. Also
patches the SPU recompiler to work properly on arm64 by telling LLVM to
target arm64.
* virtual memory: fix W^X toggles on macOS aarch64
Fixes W^X on macOS aarch64 by setting all JIT mmap'd regions to default
to RW mode. For both SPU and PPU execution threads, we toggle to RX mode
when initialization finishes. This exploits Apple's per-thread RW/RX
setting to remain technically compliant with the OS's W^X enforcement
while not needing to actually separate the memory allocated for
code/data.
* PPU: implement aarch64 specific functions
Implements ppu_gateway for arm64 and patches LLVM initialization to use
the correct triple. Adds some fixes for macOS W^X JIT restrictions when
entering/exiting JITed code.
* PPU: Mark rpcs3 calls as non-tail
Strictly speaking, rpcs3 JIT -> C++ calls are not tail calls. If you
call a function inside e.g. an L2 syscall, it will clobber LR on arm64
and subtly break returns in emulated code. Only JIT -> JIT "calls"
should be tail.
* macOS/arm64: compatibility fixes
* vm: patch virtual memory for arm64 macOS
Tag mmap calls with MAP_JIT to allow W^X on macOS. Fix mmap calls to
existing mmap'd addresses that were tagged with MAP_JIT on macOS. Fix
memory unmapping on 16K page machines with a hack to mark "unmapped"
pages as RW.
* PPU: remove wrong comment
* PPU: fix a merge regression
* vm: remove 16k page hacks
* PPU: formatting fixes
* PPU: fix arm64 null function assembly
* ppu: clean up arch-specific instructions
This commit is contained in:
parent
264253757c
commit
cefc37a553
14 changed files with 306 additions and 16 deletions
|
|
@ -65,6 +65,10 @@
|
|||
#include "util/simd.hpp"
|
||||
#include "util/sysinfo.hpp"
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <libkern/OSCacheControl.h>
|
||||
#endif
|
||||
|
||||
extern atomic_t<u64> g_watchdog_hold_ctr;
|
||||
|
||||
// Should be of the same type
|
||||
|
|
@ -247,7 +251,104 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
|
|||
|
||||
c.ret();
|
||||
#else
|
||||
// See https://github.com/ghc/ghc/blob/master/rts/include/stg/MachRegs.h
|
||||
// for GHC calling convention definitions on Aarch64
|
||||
// and https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers
|
||||
// for AArch64 calling convention
|
||||
|
||||
// Push callee saved registers to the stack
|
||||
// We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
|
||||
c.sub(a64::sp, a64::sp, Imm(112));
|
||||
c.stp(a64::x18, a64::x19, arm::Mem(a64::sp));
|
||||
c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
|
||||
c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
|
||||
c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
|
||||
c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
|
||||
c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
|
||||
c.str(a64::x30, arm::Mem(a64::sp, 96));
|
||||
|
||||
// Save sp for native longjmp emulation
|
||||
Label native_sp_offset = c.newLabel();
|
||||
c.ldr(a64::x10, arm::Mem(native_sp_offset));
|
||||
c.str(a64::sp, arm::Mem(args[0], a64::x10));
|
||||
|
||||
// Load REG_Base - use absolute jump target to bypass rel jmp range limits
|
||||
Label exec_addr = c.newLabel();
|
||||
c.ldr(a64::x19, arm::Mem(exec_addr));
|
||||
c.ldr(a64::x19, arm::Mem(a64::x19));
|
||||
// Load PPUThread struct base -> REG_Sp
|
||||
const arm::GpX ppu_t_base = a64::x20;
|
||||
c.mov(ppu_t_base, args[0]);
|
||||
// Load PC
|
||||
const arm::GpX pc = a64::x26;
|
||||
Label cia_offset = c.newLabel();
|
||||
const arm::GpX cia_addr_reg = a64::x11;
|
||||
// Load offset value
|
||||
c.ldr(cia_addr_reg, arm::Mem(cia_offset));
|
||||
// Load cia
|
||||
c.ldr(pc, arm::Mem(ppu_t_base, cia_addr_reg));
|
||||
// Zero top 32 bits
|
||||
c.mov(a64::w26, a64::w26);
|
||||
// Multiply by 2 to index into ptr table
|
||||
const arm::GpX index_shift = a64::x27;
|
||||
c.mov(index_shift, Imm(2));
|
||||
c.mul(pc, pc, index_shift);
|
||||
|
||||
// Load call target
|
||||
const arm::GpX call_target = a64::x28;
|
||||
c.ldr(call_target, arm::Mem(a64::x19, pc));
|
||||
// Compute REG_Hp
|
||||
const arm::GpX reg_hp = a64::x21;
|
||||
c.mov(reg_hp, call_target);
|
||||
c.lsr(reg_hp, reg_hp, 48);
|
||||
c.lsl(reg_hp, reg_hp, 13);
|
||||
|
||||
// Zero top 16 bits of call target
|
||||
c.lsl(call_target, call_target, Imm(16));
|
||||
c.lsr(call_target, call_target, Imm(16));
|
||||
|
||||
// Load registers
|
||||
Label base_addr = c.newLabel();
|
||||
c.ldr(a64::x22, arm::Mem(base_addr));
|
||||
c.ldr(a64::x22, arm::Mem(a64::x22));
|
||||
|
||||
Label gpr_addr_offset = c.newLabel();
|
||||
const arm::GpX gpr_addr_reg = a64::x9;
|
||||
c.ldr(gpr_addr_reg, arm::Mem(gpr_addr_offset));
|
||||
c.add(gpr_addr_reg, gpr_addr_reg, ppu_t_base);
|
||||
c.ldr(a64::x23, arm::Mem(gpr_addr_reg));
|
||||
c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8));
|
||||
c.ldr(a64::x25, arm::Mem(gpr_addr_reg, 16));
|
||||
|
||||
// Execute LLE call
|
||||
c.blr(call_target);
|
||||
|
||||
// Restore stack ptr
|
||||
c.ldr(a64::x10, arm::Mem(native_sp_offset));
|
||||
c.ldr(a64::sp, arm::Mem(args[0], a64::x10));
|
||||
// Restore registers from the stack
|
||||
c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp));
|
||||
c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
|
||||
c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
|
||||
c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
|
||||
c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
|
||||
c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
|
||||
c.ldr(a64::x30, arm::Mem(a64::sp, 96));
|
||||
// Restore stack ptr
|
||||
c.add(a64::sp, a64::sp, Imm(112));
|
||||
// Return
|
||||
c.ret(a64::x30);
|
||||
|
||||
c.bind(exec_addr);
|
||||
c.embedUInt64(reinterpret_cast<u64>(&vm::g_exec_addr));
|
||||
c.bind(base_addr);
|
||||
c.embedUInt64(reinterpret_cast<u64>(&vm::g_base_addr));
|
||||
c.bind(cia_offset);
|
||||
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::cia)));
|
||||
c.bind(gpr_addr_offset);
|
||||
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::gpr)));
|
||||
c.bind(native_sp_offset);
|
||||
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::saved_native_sp)));
|
||||
#endif
|
||||
});
|
||||
|
||||
|
|
@ -1252,6 +1353,9 @@ void ppu_thread::cpu_task()
|
|||
}
|
||||
case ppu_cmd::initialize:
|
||||
{
|
||||
#ifdef __APPLE__
|
||||
pthread_jit_write_protect_np(false);
|
||||
#endif
|
||||
cmd_pop();
|
||||
|
||||
while (!g_fxo->get<rsx::thread>().is_inited && !is_stopped())
|
||||
|
|
@ -1267,6 +1371,15 @@ void ppu_thread::cpu_task()
|
|||
thread_ctrl::wait_on<atomic_wait::op_ne>(g_progr_ptotal, 0);
|
||||
g_fxo->get<progress_dialog_workaround>().skip_the_progress_dialog = true;
|
||||
|
||||
#ifdef __APPLE__
|
||||
pthread_jit_write_protect_np(true);
|
||||
#endif
|
||||
#ifdef ARCH_ARM64
|
||||
// Flush all cache lines after potentially writing executable code
|
||||
asm("ISB");
|
||||
asm("DSB ISH");
|
||||
#endif
|
||||
|
||||
break;
|
||||
}
|
||||
case ppu_cmd::sleep:
|
||||
|
|
@ -1396,6 +1509,15 @@ ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u3
|
|||
{
|
||||
call_history.data.resize(call_history_max_size);
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
pthread_jit_write_protect_np(true);
|
||||
#endif
|
||||
#ifdef ARCH_ARM64
|
||||
// Flush all cache lines after potentially writing executable code
|
||||
asm("ISB");
|
||||
asm("DSB ISH");
|
||||
#endif
|
||||
}
|
||||
|
||||
ppu_thread::thread_name_t::operator std::string() const
|
||||
|
|
@ -1974,6 +2096,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
|
|||
#endif
|
||||
c.ret();
|
||||
#else
|
||||
// Unimplemented should fail.
|
||||
c.brk(Imm(0x42));
|
||||
c.ret(a64::x30);
|
||||
#endif
|
||||
});
|
||||
|
|
@ -2552,6 +2676,9 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<lv2_
|
|||
|
||||
named_thread_group workers("SPRX Worker ", std::min<u32>(utils::get_thread_count(), ::size32(file_queue)), [&]
|
||||
{
|
||||
#ifdef __APPLE__
|
||||
pthread_jit_write_protect_np(false);
|
||||
#endif
|
||||
// Set low priority
|
||||
thread_ctrl::scoped_priority low_prio(-1);
|
||||
|
||||
|
|
@ -3226,6 +3353,9 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
|
|||
// Set low priority
|
||||
thread_ctrl::scoped_priority low_prio(-1);
|
||||
|
||||
#ifdef __APPLE__
|
||||
pthread_jit_write_protect_np(false);
|
||||
#endif
|
||||
for (u32 i = work_cv++; i < workload.size(); i = work_cv++, g_progr_pdone++)
|
||||
{
|
||||
if (Emu.IsStopped())
|
||||
|
|
@ -3287,6 +3417,9 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
|
|||
}
|
||||
|
||||
// Jit can be null if the loop doesn't ever enter.
|
||||
#ifdef __APPLE__
|
||||
pthread_jit_write_protect_np(false);
|
||||
#endif
|
||||
if (jit && !jit_mod.init)
|
||||
{
|
||||
jit->fin();
|
||||
|
|
@ -3345,7 +3478,12 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
|
|||
std::unique_ptr<Module> _module = std::make_unique<Module>(obj_name, jit.get_context());
|
||||
|
||||
// Initialize target
|
||||
#if defined(__APPLE__) && defined(ARCH_ARM64)
|
||||
// Force target linux on macOS arm64 to bypass some 64-bit address space linking issues
|
||||
_module->setTargetTriple(utils::c_llvm_default_triple);
|
||||
#else
|
||||
_module->setTargetTriple(Triple::normalize(sys::getProcessTriple()));
|
||||
#endif
|
||||
_module->setDataLayout(jit.get_engine().getTargetMachine()->createDataLayout());
|
||||
|
||||
// Initialize translator
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue