rpcsx/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp

5019 lines
111 KiB
C++
Raw Normal View History

2020-12-05 13:08:24 +01:00
#include "stdafx.h"
#include "SPURecompiler.h"
#include "Emu/System.h"
#include "Emu/system_config.h"
#include "Emu/system_progress.hpp"
#include "Emu/system_utils.hpp"
#include "Emu/cache_utils.hpp"
#include "Emu/IdManager.h"
2018-05-02 20:49:19 +02:00
#include "Crypto/sha1.h"
#include "Utilities/StrUtil.h"
#include "Utilities/JIT.h"
#include "util/init_mutex.hpp"
#include "util/shared_ptr.hpp"
#include "SPUThread.h"
#include "SPUAnalyser.h"
#include "SPUInterpreter.h"
#include "SPUDisAsm.h"
2017-07-20 16:20:28 +02:00
#include <algorithm>
#include <optional>
#include <unordered_set>
#include "util/v128.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
const extern spu_decoder<spu_itype> g_spu_itype;
const extern spu_decoder<spu_iname> g_spu_iname;
const extern spu_decoder<spu_iflag> g_spu_iflag;
// Move 4 args for calling native function from a GHC calling convention function
#if defined(ARCH_X64)
static u8* move_args_ghc_to_native(u8* raw)
{
#ifdef _WIN32
// mov rcx, r13
// mov rdx, rbp
// mov r8, r12
// mov r9, rbx
std::memcpy(raw, "\x4C\x89\xE9\x48\x89\xEA\x4D\x89\xE0\x49\x89\xD9", 12);
#else
// mov rdi, r13
// mov rsi, rbp
// mov rdx, r12
// mov rcx, rbx
std::memcpy(raw, "\x4C\x89\xEF\x48\x89\xEE\x4C\x89\xE2\x48\x89\xD9", 12);
#endif
return raw + 12;
}
#elif defined(ARCH_ARM64)
static void ghc_cpp_trampoline(u64 fn_target, native_asm& c, auto& args)
{
using namespace asmjit;
Label target = c.newLabel();
c.mov(args[0], a64::x19);
c.mov(args[1], a64::x20);
c.mov(args[2], a64::x21);
c.mov(args[3], a64::x22);
c.ldr(a64::x15, arm::Mem(target));
c.br(a64::x15);
c.brk(Imm(0x42)); // Unreachable
c.bind(target);
c.embedUInt64(fn_target);
}
#endif
DECLARE(spu_runtime::tr_dispatch) = []
{
PPU LLVM arm64+macOS port (#12115) * BufferUtils: use naive function pointer on Apple arm64 Use naive function pointer on Apple arm64 because ASLR breaks asmjit. See BufferUtils.cpp comment for explanation on why this happens and how to fix if you want to use asmjit. * build-macos: fix source maps for Mac Tell Qt not to strip debug symbols when we're in debug or relwithdebinfo modes. * LLVM PPU: fix aarch64 on macOS Force MachO on macOS to fix LLVM being unable to patch relocations during codegen. Adds Aarch64 NEON intrinsics for x86 intrinsics used by PPUTranslator/Recompiler. * virtual memory: use 16k pages on aarch64 macOS Temporary hack to get things working by using 16k pages instead of 4k pages in VM emulation. * PPU/SPU: fix NEON intrinsics and compilation for arm64 macOS Fixes some intrinsics usage and patches usages of asmjit to properly emit absolute jmps so ASLR doesn't cause out of bounds rel jumps. Also patches the SPU recompiler to properly work on arm64 by telling LLVM to target arm64. * virtual memory: fix W^X toggles on macOS aarch64 Fixes W^X on macOS aarch64 by setting all JIT mmap'd regions to default to RW mode. For both SPU and PPU execution threads, when initialization finishes we toggle to RX mode. This exploits Apple's per-thread setting for RW/RX to let us be technically compliant with the OS's W^X enforcement while not needing to actually separate the memory allocated for code/data. * PPU: implement aarch64 specific functions Implements ppu_gateway for arm64 and patches LLVM initialization to use the correct triple. Adds some fixes for macOS W^X JIT restrictions when entering/exiting JITed code. * PPU: Mark rpcs3 calls as non-tail Strictly speaking, rpcs3 JIT -> C++ calls are not tail calls. If you call a function inside e.g. an L2 syscall, it will clobber LR on arm64 and subtly break returns in emulated code. Only JIT -> JIT "calls" should be tail. * macOS/arm64: compatibility fixes * vm: patch virtual memory for arm64 macOS Tag mmap calls with MAP_JIT to allow W^X on macOS. Fix mmap calls to existing mmap'd addresses that were tagged with MAP_JIT on macOS. Fix memory unmapping on 16K page machines with a hack to mark "unmapped" pages as RW. * PPU: remove wrong comment * PPU: fix a merge regression * vm: remove 16k page hacks * PPU: formatting fixes * PPU: fix arm64 null function assembly * ppu: clean up arch-specific instructions
2022-06-14 14:28:38 +02:00
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
#if defined(ARCH_X64)
// Generate a special trampoline to spu_recompiler_base::dispatch with pause instruction
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = move_args_ghc_to_native(trptr);
*raw++ = 0xf3; // pause
*raw++ = 0x90;
*raw++ = 0xff; // jmp [rip]
*raw++ = 0x25;
std::memset(raw, 0, 4);
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::dispatch);
std::memcpy(raw + 4, &target, 8);
return reinterpret_cast<spu_function_t>(trptr);
#elif defined(ARCH_ARM64)
auto trptr = build_function_asm<spu_function_t>("tr_dispatch",
[](native_asm& c, auto& args)
{
c.yield();
ghc_cpp_trampoline(reinterpret_cast<u64>(&spu_recompiler_base::dispatch), c, args);
c.embed("tr_dispatch", 11);
});
return trptr;
#else
#error "Unimplemented"
#endif
}();
DECLARE(spu_runtime::tr_branch) = []
{
#if defined(ARCH_X64)
// Generate a trampoline to spu_recompiler_base::branch
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = move_args_ghc_to_native(trptr);
*raw++ = 0xff; // jmp [rip]
*raw++ = 0x25;
std::memset(raw, 0, 4);
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::branch);
std::memcpy(raw + 4, &target, 8);
return reinterpret_cast<spu_function_t>(trptr);
#elif defined(ARCH_ARM64)
auto trptr = build_function_asm<spu_function_t>("tr_branch",
[](native_asm& c, auto& args)
{
ghc_cpp_trampoline(reinterpret_cast<u64>(&spu_recompiler_base::branch), c, args);
c.embed("tr_branch", 9);
});
return trptr;
#else
#error "Unimplemented"
#endif
}();
DECLARE(spu_runtime::tr_interpreter) = []
{
#if defined(ARCH_X64)
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = move_args_ghc_to_native(trptr);
*raw++ = 0xff; // jmp [rip]
*raw++ = 0x25;
std::memset(raw, 0, 4);
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::old_interpreter);
std::memcpy(raw + 4, &target, 8);
return reinterpret_cast<spu_function_t>(trptr);
#elif defined(ARCH_ARM64)
auto trptr = build_function_asm<spu_function_t>("tr_interpreter",
[](native_asm& c, auto& args)
{
ghc_cpp_trampoline(reinterpret_cast<u64>(&spu_recompiler_base::old_interpreter), c, args);
c.embed("tr_interpreter", 14);
});
return trptr;
#endif
}();
DECLARE(spu_runtime::g_dispatcher) = []
{
// Allocate 2^20 positions in data area
const auto ptr = reinterpret_cast<std::remove_const_t<decltype(spu_runtime::g_dispatcher)>>(jit_runtime::alloc(sizeof(*g_dispatcher), 64, false));
for (auto& x : *ptr)
{
x.raw() = tr_dispatch;
}
return ptr;
}();
DECLARE(spu_runtime::tr_all) = []
{
#if defined(ARCH_X64)
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = trptr;
// Load PC: mov eax, [r13 + spu_thread::pc]
*raw++ = 0x41;
*raw++ = 0x8b;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
// Get LS address starting from PC: lea rcx, [rbp + rax]
*raw++ = 0x48;
*raw++ = 0x8d;
*raw++ = 0x4c;
*raw++ = 0x05;
*raw++ = 0x00;
// mov eax, [rcx]
*raw++ = 0x8b;
*raw++ = 0x01;
// shr eax, (32 - 20)
*raw++ = 0xc1;
*raw++ = 0xe8;
*raw++ = 0x0c;
// Load g_dispatcher to rdx
*raw++ = 0x48;
*raw++ = 0x8d;
*raw++ = 0x15;
const s32 r32 = ::narrow<s32>(reinterpret_cast<u64>(g_dispatcher) - reinterpret_cast<u64>(raw) - 4);
std::memcpy(raw, &r32, 4);
raw += 4;
// Update block_hash (set zero): mov [r13 + spu_thread::m_block_hash], 0
*raw++ = 0x49;
*raw++ = 0xc7;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x00;
// jmp [rdx + rax * 8]
*raw++ = 0xff;
*raw++ = 0x24;
*raw++ = 0xc2;
return reinterpret_cast<spu_function_t>(trptr);
#elif defined(ARCH_ARM64)
auto trptr = build_function_asm<spu_function_t>("tr_all",
[](native_asm& c, auto& args)
{
using namespace asmjit;
// w1: PC (eax in x86 SPU)
// x7: lsa (rcx in x86 SPU)
// Load PC
Label pc_offset = c.newLabel();
c.ldr(a64::x0, arm::Mem(pc_offset));
c.ldr(a64::w1, arm::Mem(a64::x19, a64::x0)); // REG_Base + offset(spu_thread::pc)
// Compute LS address = REG_Sp + PC, store into x7 (use later)
c.add(a64::x7, a64::x20, a64::x1);
// Load 32b from LS address
c.ldr(a64::w3, arm::Mem(a64::x7));
// shr (32 - 20)
c.lsr(a64::w3, a64::w3, Imm(32 - 20));
// Load g_dispatcher
Label g_dispatcher_offset = c.newLabel();
c.ldr(a64::x4, arm::Mem(g_dispatcher_offset));
// Update block hash
Label block_hash_offset = c.newLabel();
c.mov(a64::x5, Imm(0));
c.ldr(a64::x6, arm::Mem(block_hash_offset));
c.str(a64::x5, arm::Mem(a64::x19, a64::x6)); // REG_Base + offset(spu_thread::block_hash)
// Jump to [g_dispatcher + idx * 8]
c.mov(a64::x6, Imm(8));
c.mul(a64::x6, a64::x3, a64::x6);
c.add(a64::x4, a64::x4, a64::x6);
c.ldr(a64::x4, arm::Mem(a64::x4));
c.br(a64::x4);
c.bind(pc_offset);
c.embedUInt64(::offset32(&spu_thread::pc));
c.bind(g_dispatcher_offset);
c.embedUInt64(reinterpret_cast<u64>(g_dispatcher));
c.bind(block_hash_offset);
c.embedUInt64(::offset32(&spu_thread::block_hash));
c.embed("tr_all", 6);
});
return trptr;
#else
#error "Unimplemented"
#endif
}();
DECLARE(spu_runtime::g_gateway) = build_function_asm<spu_function_t>("spu_gateway", [](native_asm& c, auto& args)
{
// Gateway for SPU dispatcher, converts from native to GHC calling convention, also saves RSP value for spu_escape
using namespace asmjit;
#if defined(ARCH_X64)
#ifdef _WIN32
c.push(x86::r15);
c.push(x86::r14);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rsi);
c.push(x86::rdi);
c.push(x86::rbp);
c.push(x86::rbx);
c.sub(x86::rsp, 0xa8);
c.movaps(x86::oword_ptr(x86::rsp, 0x90), x86::xmm15);
c.movaps(x86::oword_ptr(x86::rsp, 0x80), x86::xmm14);
c.movaps(x86::oword_ptr(x86::rsp, 0x70), x86::xmm13);
c.movaps(x86::oword_ptr(x86::rsp, 0x60), x86::xmm12);
c.movaps(x86::oword_ptr(x86::rsp, 0x50), x86::xmm11);
c.movaps(x86::oword_ptr(x86::rsp, 0x40), x86::xmm10);
c.movaps(x86::oword_ptr(x86::rsp, 0x30), x86::xmm9);
c.movaps(x86::oword_ptr(x86::rsp, 0x20), x86::xmm8);
c.movaps(x86::oword_ptr(x86::rsp, 0x10), x86::xmm7);
c.movaps(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
#else
c.push(x86::rbp);
c.push(x86::r15);
c.push(x86::r14);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.push(x86::rax);
#endif
// Save native stack pointer for longjmp emulation
c.mov(x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)), x86::rsp);
// Move 4 args (despite spu_function_t def)
c.mov(x86::r13, args[0]);
c.mov(x86::rbp, args[1]);
c.mov(x86::r12, args[2]);
c.mov(x86::rbx, args[3]);
if (utils::has_avx())
{
c.vzeroupper();
}
2022-01-17 17:24:53 +01:00
c.call(spu_runtime::tr_all);
if (utils::has_avx())
{
c.vzeroupper();
}
#ifdef _WIN32
c.movaps(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rsp, 0x10));
c.movaps(x86::xmm8, x86::oword_ptr(x86::rsp, 0x20));
c.movaps(x86::xmm9, x86::oword_ptr(x86::rsp, 0x30));
c.movaps(x86::xmm10, x86::oword_ptr(x86::rsp, 0x40));
c.movaps(x86::xmm11, x86::oword_ptr(x86::rsp, 0x50));
c.movaps(x86::xmm12, x86::oword_ptr(x86::rsp, 0x60));
c.movaps(x86::xmm13, x86::oword_ptr(x86::rsp, 0x70));
c.movaps(x86::xmm14, x86::oword_ptr(x86::rsp, 0x80));
c.movaps(x86::xmm15, x86::oword_ptr(x86::rsp, 0x90));
c.add(x86::rsp, 0xa8);
c.pop(x86::rbx);
c.pop(x86::rbp);
c.pop(x86::rdi);
c.pop(x86::rsi);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::r14);
c.pop(x86::r15);
#else
c.add(x86::rsp, +8);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::r14);
c.pop(x86::r15);
c.pop(x86::rbp);
#endif
c.ret();
#elif defined(ARCH_ARM64)
// Push callee saved registers to the stack
// We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
c.sub(a64::sp, a64::sp, Imm(112));
c.stp(a64::x18, a64::x19, arm::Mem(a64::sp));
c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
c.str(a64::x30, arm::Mem(a64::sp, 96));
// Save native stack pointer for longjmp emulation
Label sp_offset = c.newLabel();
c.ldr(a64::x26, arm::Mem(sp_offset));
// sp not allowed to be used in load/stores directly
c.mov(a64::x15, a64::sp);
c.str(a64::x15, arm::Mem(args[0], a64::x26));
// Move 4 args (despite spu_function_t def)
c.mov(a64::x19, args[0]);
c.mov(a64::x20, args[1]);
c.mov(a64::x21, args[2]);
c.mov(a64::x22, args[3]);
// Save ret address to stack
// since non-tail calls to cpp fns may corrupt lr and
// g_tail_escape may jump out of a fn before the epilogue can restore lr
Label ret_addr = c.newLabel();
c.adr(a64::x0, ret_addr);
c.str(a64::x0, arm::Mem(a64::sp, 104));
Label call_target = c.newLabel();
c.ldr(a64::x0, arm::Mem(call_target));
c.blr(a64::x0);
c.bind(ret_addr);
// Restore stack ptr
c.ldr(a64::x26, arm::Mem(sp_offset));
c.ldr(a64::x15, arm::Mem(a64::x19, a64::x26));
c.mov(a64::sp, a64::x15);
// Restore registers from the stack
c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp));
c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
c.ldr(a64::x30, arm::Mem(a64::sp, 96));
// Restore stack ptr
c.add(a64::sp, a64::sp, Imm(112));
// Return
c.ret(a64::x30);
c.bind(sp_offset);
c.embedUInt64(::offset32(&spu_thread::saved_native_sp));
c.bind(call_target);
c.embedUInt64(reinterpret_cast<u64>(spu_runtime::tr_all));
c.embed("spu_gateway", 11);
#else
#error "Unimplemented"
#endif
});
DECLARE(spu_runtime::g_escape) = build_function_asm<void(*)(spu_thread*)>("spu_escape", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
// Restore native stack pointer (longjmp emulation)
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)));
// Return to the return location
c.sub(x86::rsp, 8);
c.ret();
#elif defined(ARCH_ARM64)
// Restore native stack pointer (longjmp emulation)
Label sp_offset = c.newLabel();
c.ldr(a64::x15, arm::Mem(sp_offset));
c.ldr(a64::x15, arm::Mem(args[0], a64::x15));
c.mov(a64::sp, a64::x15);
c.ldr(a64::x30, arm::Mem(a64::sp, 104));
c.ret(a64::x30);
c.bind(sp_offset);
c.embedUInt64(::offset32(&spu_thread::saved_native_sp));
c.embed("spu_escape", 10);
#else
#error "Unimplemented"
#endif
});
DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, spu_function_t, u8*)>("spu_tail_escape", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
// Restore native stack pointer (longjmp emulation)
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)));
// Adjust stack for initial call instruction in the gateway
c.sub(x86::rsp, 16);
// Tail call, GHC CC (second arg)
c.mov(x86::r13, args[0]);
c.mov(x86::rbp, x86::qword_ptr(args[0], ::offset32(&spu_thread::ls)));
c.mov(x86::r12, args[2]);
c.xor_(x86::ebx, x86::ebx);
c.mov(x86::qword_ptr(x86::rsp), args[1]);
c.ret();
#elif defined(ARCH_ARM64)
// Restore native stack pointer (longjmp emulation)
Label sp_offset = c.newLabel();
c.ldr(a64::x15, arm::Mem(sp_offset));
c.ldr(a64::x15, arm::Mem(args[0], a64::x15));
c.mov(a64::sp, a64::x15);
// Reload lr, since it might've been clobbered by a cpp fn
// and g_tail_escape runs before epilogue
c.ldr(a64::x30, arm::Mem(a64::sp, 104));
// Tail call, GHC CC
c.mov(a64::x19, args[0]); // REG_Base
Label ls_offset = c.newLabel();
c.ldr(a64::x20, arm::Mem(ls_offset));
c.ldr(a64::x20, arm::Mem(args[0], a64::x20)); // REG_Sp
c.mov(a64::x21, args[2]); // REG_Hp
c.eor(a64::w22, a64::w22, a64::w22); // REG_R1
c.br(args[1]);
c.bind(ls_offset);
c.embedUInt64(::offset32(&spu_thread::ls));
c.bind(sp_offset);
c.embedUInt64(::offset32(&spu_thread::saved_native_sp));
c.embed("spu_tail_escape", 15);
#else
#error "Unimplemented"
#endif
});
DECLARE(spu_runtime::g_interpreter_table) = {};
DECLARE(spu_runtime::g_interpreter) = nullptr;
spu_cache::spu_cache(const std::string& loc)
: m_file(loc, fs::read + fs::write + fs::create + fs::append)
{
}
spu_cache::~spu_cache()
{
}
extern void utilize_spu_data_segment(u32 vaddr, const void* ls_data_vaddr, u32 size)
{
if (vaddr % 4)
{
return;
}
size &= -4;
if (!size || vaddr + size > SPU_LS_SIZE)
{
return;
}
if (!g_cfg.core.llvm_precompilation)
{
return;
}
g_fxo->need<spu_cache>();
if (!g_fxo->get<spu_cache>().collect_funcs_to_precompile)
{
return;
}
std::basic_string<u32> data(size / 4, 0);
std::memcpy(data.data(), ls_data_vaddr, size);
spu_cache::precompile_data_t obj{vaddr, std::move(data)};
obj.funcs = spu_thread::discover_functions(vaddr, { reinterpret_cast<const u8*>(ls_data_vaddr), size }, vaddr != 0, umax);
if (obj.funcs.empty())
{
// Nothing to add
return;
}
for (u32 addr : obj.funcs)
{
spu_log.notice("Found SPU function at: 0x%05x", addr);
}
spu_log.notice("Found %u SPU functions", obj.funcs.size());
g_fxo->get<spu_cache>().precompile_funcs.push(std::move(obj));
}
2023-09-02 10:55:21 +02:00
// For SPU cache validity check
static u16 calculate_crc16(const uchar* data, usz length)
{
u16 crc = umax;
while (length--)
{
u8 x = (crc >> 8) ^ *data++;
x ^= (x >> 4);
crc = static_cast<u16>((crc << 8) ^ (x << 12) ^ (x << 5) ^ x);
}
return crc;
}
std::deque<spu_program> spu_cache::get()
{
std::deque<spu_program> result;
if (!m_file)
{
return result;
}
m_file.seek(0);
// TODO: signal truncated or otherwise broken file
while (true)
{
2023-09-02 10:55:21 +02:00
struct block_info_t
{
be_t<u16> crc;
be_t<u16> size;
be_t<u32> addr;
} block_info{};
2023-09-02 10:55:21 +02:00
if (!m_file.read(block_info))
{
break;
}
2023-09-02 10:55:21 +02:00
const u32 crc = block_info.crc;
const u32 size = block_info.size;
const u32 addr = block_info.addr;
if (utils::add_saturate<u32>(addr, size * 4) > SPU_LS_SIZE)
{
break;
}
2023-09-02 10:55:21 +02:00
std::vector<u32> func;
if (!m_file.read(func, size))
{
break;
}
if (!size || !func[0])
{
// Skip old format Giga entries
continue;
}
2023-09-02 10:55:21 +02:00
// CRC check is optional to be compatible with old format
if (crc && std::max<u32>(calculate_crc16(reinterpret_cast<const uchar*>(func.data()), size * 4), 1) != crc)
{
// Invalid, but continue anyway
continue;
}
spu_program res;
res.entry_point = addr;
res.lower_bound = addr;
res.data = std::move(func);
result.emplace_front(std::move(res));
}
return result;
}
void spu_cache::add(const spu_program& func)
{
if (!m_file)
{
return;
}
be_t<u32> size = ::size32(func.data);
be_t<u32> addr = func.entry_point;
2023-09-02 10:55:21 +02:00
// Add CRC (forced non-zero)
size |= std::max<u32>(calculate_crc16(reinterpret_cast<const uchar*>(func.data.data()), size * 4), 1) << 16;
const fs::iovec_clone gather[3]
{
{&size, sizeof(size)},
{&addr, sizeof(addr)},
{func.data.data(), func.data.size() * 4}
};
// Append data
m_file.write_gather(gather, 3);
}
void spu_cache::initialize(bool build_existing_cache)
{
spu_runtime::g_interpreter = spu_runtime::g_gateway;
if (g_cfg.core.spu_decoder == spu_decoder_type::_static || g_cfg.core.spu_decoder == spu_decoder_type::dynamic)
{
for (auto& x : *spu_runtime::g_dispatcher)
{
x.raw() = spu_runtime::tr_interpreter;
}
}
const std::string ppu_cache = rpcs3::cache::get_ppu_cache();
if (ppu_cache.empty())
{
return;
}
// SPU cache file (version + block size type)
const std::string loc = ppu_cache + "spu-" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v1-tane.dat";
2019-09-26 17:45:29 +02:00
spu_cache cache(loc);
2019-09-26 17:45:29 +02:00
if (!cache)
{
spu_log.error("Failed to initialize SPU cache at: %s", loc);
return;
}
// Read cache
2019-09-26 17:45:29 +02:00
auto func_list = cache.get();
2020-12-18 08:39:54 +01:00
atomic_t<usz> fnext{};
atomic_t<u8> fail_flag{0};
auto data_list = g_fxo->get<spu_cache>().precompile_funcs.pop_all();
g_fxo->get<spu_cache>().collect_funcs_to_precompile = false;
2023-12-29 18:33:29 +01:00
usz total_precompile = 0;
for (auto& sec : data_list)
{
total_precompile += sec.funcs.size();
}
const bool spu_precompilation_enabled = func_list.empty() && g_cfg.core.spu_cache && g_cfg.core.llvm_precompilation;
2023-08-28 11:20:17 +02:00
if (spu_precompilation_enabled)
{
// What compiles in this case goes straight to disk
g_fxo->get<spu_cache>() = std::move(cache);
}
else if (!build_existing_cache)
{
return;
}
2023-08-28 11:20:17 +02:00
else
{
total_precompile = 0;
data_list = {};
2023-08-28 11:20:17 +02:00
}
atomic_t<usz> data_indexer = 0;
if (g_cfg.core.spu_decoder == spu_decoder_type::dynamic || g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
if (auto compiler = spu_recompiler_base::make_llvm_recompiler(11))
{
compiler->init();
2019-10-25 23:52:56 +02:00
if (compiler->compile({}) && spu_runtime::g_interpreter)
{
spu_log.success("SPU Runtime: Built the interpreter.");
if (g_cfg.core.spu_decoder != spu_decoder_type::llvm)
{
return;
}
}
2019-10-25 23:52:56 +02:00
else
{
spu_log.fatal("SPU Runtime: Failed to build the interpreter.");
2019-10-25 23:52:56 +02:00
}
}
}
2020-04-07 14:41:38 +02:00
u32 worker_count = 0;
std::optional<scoped_progress_dialog> progr;
u32 total_funcs = 0;
2020-02-29 12:57:41 +01:00
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit || g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
2023-12-29 18:33:29 +01:00
const usz add_count = func_list.size() + total_precompile;
if (add_count)
{
2023-12-29 18:33:29 +01:00
total_funcs = build_existing_cache ? ::narrow<u32>(add_count) : 0;
}
2023-12-29 18:33:29 +01:00
worker_count = std::min<u32>(rpcs3::utils::get_max_threads(), ::narrow<u32>(add_count));
2020-02-29 12:57:41 +01:00
}
atomic_t<u32> pending_progress = 0;
atomic_t<bool> showing_progress = false;
if (!g_progr_ptotal)
{
g_progr_ptotal += total_funcs;
showing_progress.release(true);
progr.emplace("Building SPU cache...");
}
2020-04-07 14:41:38 +02:00
named_thread_group workers("SPU Worker ", worker_count, [&]() -> uint
{
PPU LLVM arm64+macOS port (#12115) * BufferUtils: use naive function pointer on Apple arm64 Use naive function pointer on Apple arm64 because ASLR breaks asmjit. See BufferUtils.cpp comment for explanation on why this happens and how to fix if you want to use asmjit. * build-macos: fix source maps for Mac Tell Qt not to strip debug symbols when we're in debug or relwithdebinfo modes. * LLVM PPU: fix aarch64 on macOS Force MachO on macOS to fix LLVM being unable to patch relocations during codegen. Adds Aarch64 NEON intrinsics for x86 intrinsics used by PPUTranslator/Recompiler. * virtual memory: use 16k pages on aarch64 macOS Temporary hack to get things working by using 16k pages instead of 4k pages in VM emulation. * PPU/SPU: fix NEON intrinsics and compilation for arm64 macOS Fixes some intrinsics usage and patches usages of asmjit to properly emit absolute jmps so ASLR doesn't cause out of bounds rel jumps. Also patches the SPU recompiler to properly work on arm64 by telling LLVM to target arm64. * virtual memory: fix W^X toggles on macOS aarch64 Fixes W^X on macOS aarch64 by setting all JIT mmap'd regions to default to RW mode. For both SPU and PPU execution threads, when initialization finishes we toggle to RX mode. This exploits Apple's per-thread setting for RW/RX to let us be technically compliant with the OS's W^X enforcement while not needing to actually separate the memory allocated for code/data. * PPU: implement aarch64 specific functions Implements ppu_gateway for arm64 and patches LLVM initialization to use the correct triple. Adds some fixes for macOS W^X JIT restrictions when entering/exiting JITed code. * PPU: Mark rpcs3 calls as non-tail Strictly speaking, rpcs3 JIT -> C++ calls are not tail calls. If you call a function inside e.g. an L2 syscall, it will clobber LR on arm64 and subtly break returns in emulated code. Only JIT -> JIT "calls" should be tail. * macOS/arm64: compatibility fixes * vm: patch virtual memory for arm64 macOS Tag mmap calls with MAP_JIT to allow W^X on macOS. Fix mmap calls to existing mmap'd addresses that were tagged with MAP_JIT on macOS. Fix memory unmapping on 16K page machines with a hack to mark "unmapped" pages as RW. * PPU: remove wrong comment * PPU: fix a merge regression * vm: remove 16k page hacks * PPU: formatting fixes * PPU: fix arm64 null function assembly * ppu: clean up arch-specific instructions
2022-06-14 14:28:38 +02:00
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
// Set low priority
thread_ctrl::scoped_priority low_prio(-1);
2020-02-29 12:57:41 +01:00
// Initialize compiler instances for parallel compilation
std::unique_ptr<spu_recompiler_base> compiler;
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
{
compiler = spu_recompiler_base::make_asmjit_recompiler();
}
else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
compiler = spu_recompiler_base::make_llvm_recompiler();
}
compiler->init();
2020-02-29 12:57:41 +01:00
// How much every thread compiled
uint result = 0;
// Fake LS
std::vector<be_t<u32>> ls(0x10000);
usz func_i = fnext++;
// Ensure some actions are performed on a single thread
const bool is_first_thread = func_i == 0;
// Build functions
for (; func_i < func_list.size(); func_i = fnext++, (showing_progress ? g_progr_pdone : pending_progress) += build_existing_cache ? 1 : 0)
{
const spu_program& func = std::as_const(func_list)[func_i];
if (Emu.IsStopped() || fail_flag)
2018-06-01 14:52:31 +02:00
{
continue;
}
// Get data start
const u32 start = func.lower_bound;
const u32 size0 = ::size32(func.data);
be_t<u64> hash_start;
{
sha1_context ctx;
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data.data()), func.data.size() * 4);
sha1_finish(&ctx, output);
std::memcpy(&hash_start, output, sizeof(hash_start));
}
// Check hash against allowed bounds
const bool inverse_bounds = g_cfg.core.spu_llvm_lower_bound > g_cfg.core.spu_llvm_upper_bound;
if ((!inverse_bounds && (hash_start < g_cfg.core.spu_llvm_lower_bound || hash_start > g_cfg.core.spu_llvm_upper_bound)) ||
(inverse_bounds && (hash_start < g_cfg.core.spu_llvm_lower_bound && hash_start > g_cfg.core.spu_llvm_upper_bound)))
{
spu_log.error("[Debug] Skipped function %s", fmt::base57(hash_start));
result++;
continue;
}
// Initialize LS with function data only
for (u32 i = 0, pos = start; i < size0; i++, pos += 4)
{
ls[pos / 4] = std::bit_cast<be_t<u32>>(func.data[i]);
}
// Call analyser
spu_program func2 = compiler->analyse(ls.data(), func.entry_point);
if (func2 != func)
{
spu_log.error("[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0);
}
else if (!compiler->compile(std::move(func2)))
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
continue;
}
// Clear fake LS
2019-10-25 23:52:56 +02:00
std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1));
2020-02-29 12:57:41 +01:00
result++;
if (is_first_thread && !showing_progress)
{
if (!g_progr.load() && !g_progr_ptotal && !g_progr_ftotal)
{
showing_progress = true;
g_progr_pdone += pending_progress.exchange(0);
g_progr_ptotal += total_funcs;
progr.emplace("Building SPU cache...");
}
}
else if (showing_progress && pending_progress)
{
// Cover missing progress due to a race
g_progr_pdone += pending_progress.exchange(0);
}
2018-06-01 14:52:31 +02:00
}
2020-02-29 12:57:41 +01:00
u32 last_sec_idx = umax;
for (func_i = data_indexer++;; func_i = data_indexer++, (showing_progress ? g_progr_pdone : pending_progress) += build_existing_cache ? 1 : 0)
{
2023-12-29 18:33:29 +01:00
usz passed_count = 0;
u32 func_addr = 0;
u32 next_func = 0;
u32 sec_addr = umax;
u32 sec_idx = 0;
std::basic_string_view<u32> inst_data;
// Try to get the data this index points to
for (auto& sec : data_list)
{
if (func_i < passed_count + sec.funcs.size())
{
2023-12-29 18:33:29 +01:00
const usz func_idx = func_i - passed_count;
sec_addr = sec.vaddr;
func_addr = ::at32(sec.funcs, func_idx);
inst_data = sec.inst_data;
2023-12-29 18:33:29 +01:00
next_func = sec.funcs.size() >= func_idx ? ::narrow<u32>(sec_addr + inst_data.size() * 4) : sec.funcs[func_idx];
break;
}
passed_count += sec.funcs.size();
sec_idx++;
}
if (sec_addr == umax)
{
// End of compilation for thread
break;
}
if (Emu.IsStopped() || fail_flag)
{
continue;
}
if (last_sec_idx != sec_idx)
{
if (last_sec_idx != umax)
{
// Clear fake LS of previous section
auto& sec = data_list[last_sec_idx];
std::memset(ls.data() + sec.vaddr / 4, 0, sec.inst_data.size() * 4);
}
// Initialize LS with the entire section data
for (u32 i = 0, pos = sec_addr; i < inst_data.size(); i++, pos += 4)
{
ls[pos / 4] = std::bit_cast<be_t<u32>>(inst_data[i]);
}
last_sec_idx = sec_idx;
}
u32 block_addr = func_addr;
std::map<u32, std::basic_string<u32>> targets;
// Call analyser
spu_program func2 = compiler->analyse(ls.data(), block_addr, &targets);
while (!func2.data.empty())
{
const u32 last_inst = std::bit_cast<be_t<u32>>(func2.data.back());
2023-12-29 18:33:29 +01:00
const u32 prog_size = ::size32(func2.data);
if (!compiler->compile(std::move(func2)))
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
break;
}
result++;
const u32 start_new = block_addr + prog_size * 4;
if (start_new >= next_func || (start_new == next_func - 4 && ls[start_new / 4] == 0x200000u))
{
// Completed
break;
}
if (auto type = g_spu_itype.decode(last_inst);
type == spu_itype::BRSL || type == spu_itype::BRASL || type == spu_itype::BISL || type == spu_itype::SYNC)
{
if (ls[start_new / 4] && g_spu_itype.decode(ls[start_new / 4]) != spu_itype::UNK)
{
spu_log.notice("Precompiling fallthrough to 0x%05x", start_new);
func2 = compiler->analyse(ls.data(), start_new, &targets);
block_addr = start_new;
continue;
}
}
if (targets.empty())
{
break;
}
const auto upper = targets.upper_bound(func_addr);
if (upper == targets.begin())
{
break;
}
u32 new_entry = umax;
// Find the lowest target in the space in-between
for (auto it = std::prev(upper); it != targets.end() && it->first < start_new && new_entry > start_new; it++)
{
for (u32 target : it->second)
{
if (target >= start_new && target < next_func)
{
if (target < new_entry)
{
new_entry = target;
if (new_entry == start_new)
{
// Cannot go lower
break;
}
}
}
}
}
if (new_entry != umax && !spu_thread::is_exec_code(new_entry, { reinterpret_cast<const u8*>(ls.data()), SPU_LS_SIZE }))
{
new_entry = umax;
}
if (new_entry == umax)
{
new_entry = start_new;
while (new_entry < next_func && (ls[start_new / 4] < 0x3fffc || !spu_thread::is_exec_code(new_entry, { reinterpret_cast<const u8*>(ls.data()), SPU_LS_SIZE })))
{
new_entry += 4;
}
if (new_entry >= next_func || (new_entry == next_func - 4 && ls[new_entry / 4] == 0x200000u))
{
// Completed
break;
}
}
spu_log.notice("Precompiling filler space at 0x%05x (next=0x%05x)", new_entry, next_func);
func2 = compiler->analyse(ls.data(), new_entry, &targets);
block_addr = new_entry;
}
if (is_first_thread && !showing_progress)
{
if (!g_progr.load() && !g_progr_ptotal && !g_progr_ftotal)
{
showing_progress = true;
g_progr_pdone += pending_progress.exchange(0);
g_progr_ptotal += total_funcs;
progr.emplace("Building SPU cache...");
}
}
else if (showing_progress && pending_progress)
{
// Cover missing progress due to a race
g_progr_pdone += pending_progress.exchange(0);
}
}
if (showing_progress && pending_progress)
{
// Cover missing progress due to a race
g_progr_pdone += pending_progress.exchange(0);
}
2020-02-29 12:57:41 +01:00
return result;
});
u32 built_total = 0;
2020-02-29 12:57:41 +01:00
// Join (implicitly) and print individual results
for (u32 i = 0; i < workers.size(); i++)
{
2020-02-29 12:57:41 +01:00
spu_log.notice("SPU Runtime: Worker %u built %u programs.", i + 1, workers[i]);
built_total += workers[i];
}
spu_log.notice("SPU Runtime: Workers built %u programs.", built_total);
if (Emu.IsStopped())
{
spu_log.error("SPU Runtime: Cache building aborted.");
return;
}
if (fail_flag)
{
spu_log.fatal("SPU Runtime: Cache building failed (out of memory).");
return;
}
2020-02-29 12:57:41 +01:00
if ((g_cfg.core.spu_decoder == spu_decoder_type::asmjit || g_cfg.core.spu_decoder == spu_decoder_type::llvm) && !func_list.empty())
{
spu_log.success("SPU Runtime: Built %u functions.", func_list.size());
if (g_cfg.core.spu_debug)
{
std::string dump;
dump.reserve(10'000'000);
std::map<std::basic_string_view<u8>, spu_program*> sorted;
for (auto&& f : func_list)
{
// Interpret as a byte string
std::basic_string_view<u8> data = {reinterpret_cast<u8*>(f.data.data()), f.data.size() * sizeof(u32)};
sorted[data] = &f;
}
std::unordered_set<u32> depth_n;
u32 n_max = 0;
for (auto&& [bytes, f] : sorted)
{
{
sha1_context ctx;
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, bytes.data(), bytes.size());
sha1_finish(&ctx, output);
fmt::append(dump, "\n\t[%s] ", fmt::base57(output));
}
u32 depth_m = 0;
for (auto&& [data, f2] : sorted)
{
u32 depth = 0;
if (f2 == f)
{
continue;
}
for (u32 i = 0; i < bytes.size(); i++)
{
if (i < data.size() && data[i] == bytes[i])
{
depth++;
}
else
{
break;
}
}
depth_n.emplace(depth);
depth_m = std::max(depth, depth_m);
}
fmt::append(dump, "c=%06d,d=%06d ", depth_n.size(), depth_m);
bool sk = false;
for (u32 i = 0; i < bytes.size(); i++)
{
if (depth_m == i)
{
dump += '|';
sk = true;
}
fmt::append(dump, "%02x", bytes[i]);
if (i % 4 == 3)
{
if (sk)
{
sk = false;
}
else
{
dump += ' ';
}
dump += ' ';
}
}
fmt::append(dump, "\n\t%49s", "");
for (u32 i = 0; i < f->data.size(); i++)
{
fmt::append(dump, "%-10s", g_spu_iname.decode(std::bit_cast<be_t<u32>>(f->data[i])));
}
n_max = std::max(n_max, ::size32(depth_n));
depth_n.clear();
}
spu_log.notice("SPU Cache Dump (max_c=%d): %s", n_max, dump);
}
}
2019-09-26 17:45:29 +02:00
// Initialize global cache instance
2023-08-28 11:20:17 +02:00
if (g_cfg.core.spu_cache && cache)
{
2021-03-02 12:59:19 +01:00
g_fxo->get<spu_cache>() = std::move(cache);
}
}
bool spu_program::operator==(const spu_program& rhs) const noexcept
{
// TODO
return entry_point - lower_bound == rhs.entry_point - rhs.lower_bound && data == rhs.data;
}
bool spu_program::operator<(const spu_program& rhs) const noexcept
{
const u32 lhs_offs = (entry_point - lower_bound) / 4;
const u32 rhs_offs = (rhs.entry_point - rhs.lower_bound) / 4;
// Select range for comparison
std::basic_string_view<u32> lhs_data(data.data() + lhs_offs, data.size() - lhs_offs);
std::basic_string_view<u32> rhs_data(rhs.data.data() + rhs_offs, rhs.data.size() - rhs_offs);
const auto cmp0 = lhs_data.compare(rhs_data);
if (cmp0 < 0)
return true;
else if (cmp0 > 0)
return false;
// Compare from address 0 to the point before the entry point (TODO: undesirable)
lhs_data = {data.data(), lhs_offs};
rhs_data = {rhs.data.data(), rhs_offs};
const auto cmp1 = lhs_data.compare(rhs_data);
if (cmp1 < 0)
return true;
else if (cmp1 > 0)
return false;
// TODO
return lhs_offs < rhs_offs;
}
spu_runtime::spu_runtime()
{
// Clear LLVM output
m_cache_path = rpcs3::cache::get_ppu_cache();
2019-08-27 16:25:49 +02:00
if (m_cache_path.empty())
{
return;
}
fs::create_dir(m_cache_path + "llvm/");
fs::remove_all(m_cache_path + "llvm/", false);
if (g_cfg.core.spu_debug)
{
fs::file(m_cache_path + "spu.log", fs::rewrite);
fs::file(m_cache_path + "spu-ir.log", fs::rewrite);
}
}
spu_item* spu_runtime::add_empty(spu_program&& data)
{
if (data.data.empty())
{
2019-10-25 23:52:56 +02:00
return nullptr;
}
2019-10-25 23:52:56 +02:00
// Store previous item if already added
spu_item* prev = nullptr;
2019-10-25 23:52:56 +02:00
//Try to add item that doesn't exist yet
const auto ret = m_stuff[data.data[0] >> 12].push_if([&](spu_item& _new, spu_item& _old)
{
if (_new.data == _old.data)
{
2019-10-25 23:52:56 +02:00
prev = &_old;
return false;
}
2019-10-25 23:52:56 +02:00
return true;
}, std::move(data));
if (ret)
{
return ret;
}
2019-10-25 23:52:56 +02:00
return prev;
}
spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
{
// Prepare sorted list
2019-10-25 23:52:56 +02:00
static thread_local std::vector<std::pair<std::basic_string_view<u32>, spu_function_t>> m_flat_list;
// Remember top position
auto stuff_it = ::at32(m_stuff, id_inst >> 12).begin();
auto stuff_end = ::at32(m_stuff, id_inst >> 12).end();
{
2019-10-25 23:52:56 +02:00
if (stuff_it->trampoline)
{
return stuff_it->trampoline;
}
2019-10-25 23:52:56 +02:00
m_flat_list.clear();
for (auto it = stuff_it; it != stuff_end; ++it)
{
if (const auto ptr = it->compiled.load())
{
std::basic_string_view<u32> range{it->data.data.data(), it->data.data.size()};
range.remove_prefix((it->data.entry_point - it->data.lower_bound) / 4);
2019-10-25 23:52:56 +02:00
m_flat_list.emplace_back(range, ptr);
}
else
{
// Pull oneself deeper (TODO)
++stuff_it;
}
}
}
std::sort(m_flat_list.begin(), m_flat_list.end(), FN(x.first < y.first));
2019-10-25 23:52:56 +02:00
struct work
{
u32 size;
u16 from;
u16 level;
u8* rel32;
decltype(m_flat_list)::iterator beg;
decltype(m_flat_list)::iterator end;
};
// Scratch vector
static thread_local std::vector<work> workload;
// Generate a dispatcher (übertrampoline)
const auto beg = m_flat_list.begin();
const auto _end = m_flat_list.end();
const u32 size0 = ::size32(m_flat_list);
2019-10-25 23:52:56 +02:00
auto result = beg->second;
if (size0 != 1)
{
#if defined(ARCH_ARM64)
// Allocate some writable executable memory
u8* const wxptr = jit_runtime::alloc(size0 * 128 + 16, 16);
if (!wxptr)
{
return nullptr;
}
// Raw assembly pointer
u8* raw = wxptr;
auto make_jump = [&](asmjit::arm::CondCode op, auto target)
{
// 36 bytes
// Fallback to dispatch if no target
const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
// ldr x9, #16 -> ldr x9, taddr
*raw++ = 0x89;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x58;
if (op == asmjit::arm::CondCode::kAlways)
{
// br x9
*raw++ = 0x20;
*raw++ = 0x01;
*raw++ = 0x1F;
*raw++ = 0xD6;
// nop
*raw++ = 0x1F;
*raw++ = 0x20;
*raw++ = 0x03;
*raw++ = 0xD5;
// nop
*raw++ = 0x1F;
*raw++ = 0x20;
*raw++ = 0x03;
*raw++ = 0xD5;
}
else
{
// b.COND #8 -> b.COND do_branch
switch (op)
{
case asmjit::arm::CondCode::kUnsignedLT:
*raw++ = 0x43;
break;
case asmjit::arm::CondCode::kUnsignedGT:
*raw++ = 0x48;
break;
default:
asm("brk 0x42");
}
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x54;
// b #16 -> b cont
*raw++ = 0x04;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x14;
// do_branch: br x9
*raw++ = 0x20;
*raw++ = 0x01;
*raw++ = 0x1f;
*raw++ = 0xD6;
}
// taddr
std::memcpy(raw, &taddr, 8);
raw += 8;
// cont: next instruction
};
#elif defined(ARCH_X64)
// Allocate some writable executable memory
u8* const wxptr = jit_runtime::alloc(size0 * 22 + 14, 16);
if (!wxptr)
{
return nullptr;
}
// Raw assembly pointer
u8* raw = wxptr;
// Write jump instruction with rel32 immediate
auto make_jump = [&](u8 op, auto target)
{
ensure(raw + 8 <= wxptr + size0 * 22 + 16);
// Fallback to dispatch if no target
const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
// Compute the distance
const s64 rel = taddr - reinterpret_cast<u64>(raw) - (op != 0xe9 ? 6 : 5);
ensure(rel >= s32{smin} && rel <= s32{smax});
if (op != 0xe9)
{
// First jcc byte
*raw++ = 0x0f;
ensure((op >> 4) == 0x8);
}
*raw++ = op;
const s32 r32 = static_cast<s32>(rel);
std::memcpy(raw, &r32, 4);
raw += 4;
};
#endif
workload.clear();
workload.reserve(size0);
workload.emplace_back();
workload.back().size = size0;
workload.back().level = 0;
workload.back().from = -1;
2021-04-09 21:12:47 +02:00
workload.back().rel32 = nullptr;
workload.back().beg = beg;
workload.back().end = _end;
// LS address starting from PC is already loaded into rcx (see spu_runtime::tr_all)
2020-12-18 08:39:54 +01:00
for (usz i = 0; i < workload.size(); i++)
{
// Get copy of the workload info
auto w = workload[i];
// Split range in two parts
auto it = w.beg;
auto it2 = w.beg;
u32 size1 = w.size / 2;
u32 size2 = w.size - size1;
std::advance(it2, w.size / 2);
while (ensure(w.level < umax))
{
it = it2;
size1 = w.size - size2;
if (w.level >= w.beg->first.size())
{
// Cannot split: smallest function is a prefix of bigger ones (TODO)
break;
}
const u32 x1 = ::at32(w.beg->first, w.level);
if (!x1)
{
// Cannot split: some functions contain holes at this level
w.level++;
// Resort subrange starting from the new level
std::stable_sort(w.beg, w.end, [&](const auto& a, const auto& b)
{
std::basic_string_view<u32> lhs = a.first;
std::basic_string_view<u32> rhs = b.first;
lhs.remove_prefix(w.level);
rhs.remove_prefix(w.level);
return lhs < rhs;
});
continue;
}
// Adjust ranges (forward)
while (it != w.end && x1 == ::at32(it->first, w.level))
{
it++;
size1++;
}
if (it == w.end)
{
// Cannot split: words are identical within the range at this level
w.level++;
}
else
{
size2 = w.size - size1;
break;
}
}
if (w.rel32)
{
#if defined(ARCH_X64)
// Patch rel32 linking it to the current location if necessary
const s32 r32 = ::narrow<s32>(raw - w.rel32);
std::memcpy(w.rel32 - 4, &r32, 4);
#elif defined(ARCH_ARM64)
// Rewrite jump address
{
u64 raw64 = reinterpret_cast<u64>(raw);
memcpy(w.rel32 - 8, &raw64, 8);
}
#else
#error "Unimplemented"
#endif
}
if (w.level >= w.beg->first.size() || w.level >= it->first.size())
{
// If functions cannot be compared, assume smallest function
spu_log.error("Trampoline simplified at ??? (level=%u)", w.level);
#if defined(ARCH_X64)
make_jump(0xe9, w.beg->second); // jmp rel32
#elif defined(ARCH_ARM64)
u64 branch_target = reinterpret_cast<u64>(w.beg->second);
make_jump(asmjit::arm::CondCode::kAlways, branch_target);
#else
#error "Unimplemented"
#endif
continue;
}
// Value for comparison
const u32 x = ::at32(it->first, w.level);
// Adjust ranges (backward)
while (it != m_flat_list.begin())
{
it--;
if (w.level >= it->first.size())
{
it = m_flat_list.end();
break;
}
if (::at32(it->first, w.level) != x)
{
it++;
break;
}
ensure(it != w.beg);
size1--;
size2++;
}
if (it == m_flat_list.end())
{
spu_log.error("Trampoline simplified (II) at ??? (level=%u)", w.level);
#if defined(ARCH_X64)
make_jump(0xe9, w.beg->second); // jmp rel32
#elif defined(ARCH_ARM64)
u64 branch_target = reinterpret_cast<u64>(w.beg->second);
make_jump(asmjit::arm::CondCode::kAlways, branch_target);
#else
#error "Unimplemented"
#endif
continue;
}
// Emit 32-bit comparison
#if defined(ARCH_X64)
ensure(raw + 12 <= wxptr + size0 * 22 + 16); // "Asm overflow"
#elif defined(ARCH_ARM64)
ensure(raw + (4 * 4) <= wxptr + size0 * 128 + 16);
#else
#error "Unimplemented"
#endif
if (w.from != w.level)
{
// If necessary (level has advanced), emit load: mov eax, [rcx + addr]
const u32 cmp_lsa = w.level * 4u;
#if defined(ARCH_X64)
if (cmp_lsa < 0x80)
{
*raw++ = 0x8b;
*raw++ = 0x41;
*raw++ = ::narrow<s8>(cmp_lsa);
}
else
{
*raw++ = 0x8b;
*raw++ = 0x81;
std::memcpy(raw, &cmp_lsa, 4);
raw += 4;
}
#elif defined(ARCH_ARM64)
// ldr w9, #8
*raw++ = 0x49;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x18;
// b #8
*raw++ = 0x02;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x14;
// cmp_lsa
std::memcpy(raw, &cmp_lsa, 4);
raw += 4;
// ldr w1, [x7, x9]
*raw++ = 0xE1;
*raw++ = 0x68;
*raw++ = 0x69;
*raw++ = 0xB8;
#else
#error "Unimplemented"
#endif
}
// Emit comparison: cmp eax, imm32
#if defined(ARCH_X64)
*raw++ = 0x3d;
std::memcpy(raw, &x, 4);
raw += 4;
#elif defined(ARCH_ARM64)
// ldr w9, #8
*raw++ = 0x49;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x18;
// b #8
*raw++ = 0x02;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x14;
// x
std::memcpy(raw, &x, 4);
raw += 4;
// cmp w1, w9
*raw++ = 0x3f;
*raw++ = 0x00;
*raw++ = 0x09;
*raw++ = 0x6B;
#else
#error "Unimplemented"
#endif
// Low subrange target
if (size1 == 1)
{
#if defined(ARCH_X64)
make_jump(0x82, w.beg->second); // jb rel32
#elif defined(ARCH_ARM64)
u64 branch_target = reinterpret_cast<u64>(w.beg->second);
make_jump(asmjit::arm::CondCode::kUnsignedLT, branch_target);
#else
#error "Unimplemented"
#endif
}
else
{
#if defined(ARCH_X64)
make_jump(0x82, raw); // jb rel32 (stub)
#elif defined(ARCH_ARM64)
make_jump(asmjit::arm::CondCode::kUnsignedLT, raw);
#else
#error "Unimplemented"
#endif
auto& to = workload.emplace_back(w);
to.end = it;
to.size = size1;
to.rel32 = raw;
to.from = w.level;
}
// Second subrange target
if (size2 == 1)
{
#if defined(ARCH_X64)
make_jump(0xe9, it->second); // jmp rel32
#elif defined(ARCH_ARM64)
u64 branch_target = reinterpret_cast<u64>(it->second);
make_jump(asmjit::arm::CondCode::kAlways, branch_target);
#else
#error "Unimplemented"
#endif
}
else
{
it2 = it;
// Select additional midrange for equality comparison
while (it2 != w.end && ::at32(it2->first, w.level) == x)
{
size2--;
it2++;
}
if (it2 != w.end)
{
// High subrange target
if (size2 == 1)
{
#if defined(ARCH_X64)
make_jump(0x87, it2->second); // ja rel32
#elif defined(ARCH_ARM64)
u64 branch_target = reinterpret_cast<u64>(it2->second);
make_jump(asmjit::arm::CondCode::kUnsignedGT, branch_target);
#else
#throw "Unimplemented"
#endif
}
else
{
#if defined(ARCH_X64)
make_jump(0x87, raw); // ja rel32 (stub)
#elif defined(ARCH_ARM64)
make_jump(asmjit::arm::CondCode::kUnsignedGT, raw);
#else
#error "Unimplemented"
#endif
auto& to = workload.emplace_back(w);
to.beg = it2;
to.size = size2;
to.rel32 = raw;
to.from = w.level;
}
const u32 size3 = w.size - size1 - size2;
if (size3 == 1)
{
#if defined(ARCH_X64)
make_jump(0xe9, it->second); // jmp rel32
#elif defined(ARCH_ARM64)
u64 branch_target = reinterpret_cast<u64>(it->second);
make_jump(asmjit::arm::CondCode::kAlways, branch_target);
#else
#error "Unimplemented"
#endif
}
else
{
#if defined(ARCH_X64)
make_jump(0xe9, raw); // jmp rel32 (stub)
#elif defined(ARCH_ARM64)
make_jump(asmjit::arm::CondCode::kAlways, raw);
#else
#error "Unimplemented"
#endif
auto& to = workload.emplace_back(w);
to.beg = it;
to.end = it2;
to.size = size3;
to.rel32 = raw;
to.from = w.level;
}
}
else
{
#if defined(ARCH_X64)
make_jump(0xe9, raw); // jmp rel32 (stub)
#elif defined(ARCH_ARM64)
make_jump(asmjit::arm::CondCode::kAlways, raw);
#else
#error "Unimplemented"
#endif
auto& to = workload.emplace_back(w);
to.beg = it;
to.size = w.size - size1;
to.rel32 = raw;
to.from = w.level;
}
}
}
workload.clear();
2019-10-25 23:52:56 +02:00
result = reinterpret_cast<spu_function_t>(reinterpret_cast<u64>(wxptr));
std::string fname;
fmt::append(fname, "__ub%u", m_flat_list.size());
jit_announce(wxptr, raw - wxptr, fname);
}
2019-10-25 23:52:56 +02:00
if (auto _old = stuff_it->trampoline.compare_and_swap(nullptr, result))
{
2019-10-25 23:52:56 +02:00
return _old;
}
2019-10-25 23:52:56 +02:00
// Install ubertrampoline
auto& insert_to = ::at32(*spu_runtime::g_dispatcher, id_inst >> 12);
2019-10-25 23:52:56 +02:00
auto _old = insert_to.load();
2019-10-25 23:52:56 +02:00
do
{
2019-10-25 23:52:56 +02:00
// Make sure we are replacing an older ubertrampoline but not newer one
if (_old != tr_dispatch)
{
2019-10-25 23:52:56 +02:00
bool ok = false;
2019-10-25 23:52:56 +02:00
for (auto it = stuff_it; it != stuff_end; ++it)
{
if (it->trampoline == _old)
{
ok = true;
break;
}
}
if (!ok)
{
return result;
}
}
}
2019-10-25 23:52:56 +02:00
while (!insert_to.compare_exchange(_old, result));
2019-10-25 23:52:56 +02:00
return result;
}
spu_function_t spu_runtime::find(const u32* ls, u32 addr) const
{
const u32 index = ls[addr / 4] >> 12;
for (const auto& item : ::at32(m_stuff, index))
{
2019-10-25 23:52:56 +02:00
if (const auto ptr = item.compiled.load())
{
std::basic_string_view<u32> range{item.data.data.data(), item.data.data.size()};
range.remove_prefix((item.data.entry_point - item.data.lower_bound) / 4);
2019-10-25 23:52:56 +02:00
if (addr / 4 + range.size() > 0x10000)
{
continue;
}
if (range.compare(0, range.size(), ls + addr / 4, range.size()) == 0)
{
return ptr;
}
}
}
return nullptr;
}
spu_function_t spu_runtime::make_branch_patchpoint(u16 data) const
{
#if defined(ARCH_X64)
u8* const raw = jit_runtime::alloc(16, 16);
if (!raw)
{
return nullptr;
}
// Save address of the following jmp (GHC CC 3rd argument)
raw[0] = 0x4c; // lea r12, [rip+1]
raw[1] = 0x8d;
raw[2] = 0x25;
raw[3] = 0x01;
raw[4] = 0x00;
raw[5] = 0x00;
raw[6] = 0x00;
raw[7] = 0x90; // nop
// Jump to spu_recompiler_base::branch
raw[8] = 0xe9;
// Compute the distance
const s64 rel = reinterpret_cast<u64>(tr_branch) - reinterpret_cast<u64>(raw + 8) - 5;
std::memcpy(raw + 9, &rel, 4);
raw[13] = 0xcc;
raw[14] = data >> 8;
raw[15] = data & 0xff;
return reinterpret_cast<spu_function_t>(raw);
#elif defined(ARCH_ARM64)
#if defined(__APPLE__)
pthread_jit_write_protect_np(false);
#endif
u8* const patch_fn = ensure(jit_runtime::alloc(36, 16));
u8* raw = patch_fn;
// adr x21, #16
*raw++ = 0x95;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x10;
// nop x3
for (int i = 0; i < 3; i++)
{
*raw++ = 0x1F;
*raw++ = 0x20;
*raw++ = 0x03;
*raw++ = 0xD5;
}
// ldr x9, #8
*raw++ = 0x49;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x58;
// br x9
*raw++ = 0x20;
*raw++ = 0x01;
*raw++ = 0x1F;
*raw++ = 0xD6;
u64 branch_target = reinterpret_cast<u64>(tr_branch);
std::memcpy(raw, &branch_target, 8);
raw += 8;
*raw++ = static_cast<u8>(data >> 8);
*raw++ = static_cast<u8>(data & 0xff);
#if defined(__APPLE__)
pthread_jit_write_protect_np(true);
#endif
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
return reinterpret_cast<spu_function_t>(patch_fn);
#else
#error "Unimplemented"
#endif
}
spu_recompiler_base::spu_recompiler_base()
{
}
2017-01-25 00:22:19 +01:00
spu_recompiler_base::~spu_recompiler_base()
{
}
void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
{
// If code verification failed from a patched patchpoint, clear it with a dispatcher jump
if (rip)
{
#if defined(ARCH_X64)
const s64 rel = reinterpret_cast<u64>(spu_runtime::tr_all) - reinterpret_cast<u64>(rip - 8) - 5;
union
{
u8 bytes[8];
u64 result;
};
bytes[0] = 0xe9; // jmp rel32
std::memcpy(bytes + 1, &rel, 4);
2019-10-25 23:52:56 +02:00
bytes[5] = 0x66; // lnop (2 bytes)
bytes[6] = 0x90;
bytes[7] = 0x90;
atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip - 8), result);
#elif defined(ARCH_ARM64)
union
{
u8 bytes[16];
u128 result;
};
// ldr x9, #8
bytes[0] = 0x49;
bytes[1] = 0x00;
bytes[2] = 0x00;
bytes[3] = 0x58;
// br x9
bytes[4] = 0x20;
bytes[5] = 0x01;
bytes[6] = 0x1F;
bytes[7] = 0xD6;
const u64 target = reinterpret_cast<u64>(spu_runtime::tr_all);
std::memcpy(bytes + 8, &target, 8);
#if defined(__APPLE__)
pthread_jit_write_protect_np(false);
#endif
atomic_storage<u128>::release(*reinterpret_cast<u128*>(rip), result);
#if defined(__APPLE__)
pthread_jit_write_protect_np(true);
#endif
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
#else
#error "Unimplemented"
#endif
}
// Second attempt (recover from the recursion after repeated unsuccessful trampoline call)
if (spu.block_counter != spu.block_recover && &dispatch != ::at32(*spu_runtime::g_dispatcher, spu._ref<nse_t<u32>>(spu.pc) >> 12))
{
spu.block_recover = spu.block_counter;
return;
}
2019-10-25 23:52:56 +02:00
spu.jit->init();
// Compile
if (spu._ref<u32>(spu.pc) == 0u)
{
spu_runtime::g_escape(&spu);
return;
}
2019-10-25 23:52:56 +02:00
const auto func = spu.jit->compile(spu.jit->analyse(spu._ptr<u32>(0), spu.pc));
if (!func)
{
spu_log.fatal("[0x%05x] Compilation failed.", spu.pc);
2019-10-25 23:52:56 +02:00
return;
}
// Diagnostic
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
const v128 _info = spu.stack_mirror[(spu.gpr[1]._u32[3] & 0x3fff0) >> 4];
if (_info._u64[0] + 1)
{
spu_log.trace("Called from 0x%x", _info._u32[2] - 4);
}
}
#if defined(__APPLE__)
pthread_jit_write_protect_np(true);
#endif
2019-10-25 23:52:56 +02:00
#if defined(ARCH_ARM64)
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
#endif
2019-10-25 23:52:56 +02:00
spu_runtime::g_tail_escape(&spu, func, nullptr);
}
void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
{
#if defined(ARCH_X64)
if (const u32 ls_off = ((rip[6] << 8) | rip[7]) * 4)
#elif defined(ARCH_ARM64)
if (const u32 ls_off = ((rip[16] << 8) | rip[17]) * 4) // See branch_patchpoint `data`
#else
#error "Unimplemented"
#endif
{
spu_log.todo("Special branch patchpoint hit.\nPlease report to the developer (0x%05x).", ls_off);
}
// Find function
const auto func = spu.jit->get_runtime().find(static_cast<u32*>(spu._ptr<void>(0)), spu.pc);
if (!func)
{
return;
}
#if defined(ARCH_X64)
// Overwrite jump to this function with jump to the compiled function
const s64 rel = reinterpret_cast<u64>(func) - reinterpret_cast<u64>(rip) - 5;
union
{
u8 bytes[8];
u64 result;
};
if (rel >= s32{smin} && rel <= s32{smax})
2015-09-04 01:23:31 +02:00
{
const s64 rel8 = (rel + 5) - 2;
if (rel8 >= s8{smin} && rel8 <= s8{smax})
2016-04-14 01:09:41 +02:00
{
bytes[0] = 0xeb; // jmp rel8
bytes[1] = static_cast<s8>(rel8);
std::memset(bytes + 2, 0xcc, 4);
}
else
{
bytes[0] = 0xe9; // jmp rel32
std::memcpy(bytes + 1, &rel, 4);
bytes[5] = 0xcc;
2016-04-14 01:09:41 +02:00
}
bytes[6] = rip[6];
bytes[7] = rip[7];
}
else
{
fmt::throw_exception("Impossible far jump: %p -> %p", rip, func);
}
atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip), result);
#elif defined(ARCH_ARM64)
union
{
u8 bytes[16];
u128 result;
};
// ldr x9, #8
bytes[0] = 0x49;
bytes[1] = 0x00;
bytes[2] = 0x00;
bytes[3] = 0x58;
// br x9
bytes[4] = 0x20;
bytes[5] = 0x01;
bytes[6] = 0x1F;
bytes[7] = 0xD6;
const u64 target = reinterpret_cast<u64>(func);
std::memcpy(bytes + 8, &target, 8);
#if defined(__APPLE__)
pthread_jit_write_protect_np(false);
#endif
atomic_storage<u128>::release(*reinterpret_cast<u128*>(rip), result);
#if defined(__APPLE__)
pthread_jit_write_protect_np(true);
#endif
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
#else
#error "Unimplemented"
#endif
2019-10-25 23:52:56 +02:00
spu_runtime::g_tail_escape(&spu, func, rip);
}
2021-03-05 20:05:37 +01:00
void spu_recompiler_base::old_interpreter(spu_thread& spu, void* ls, u8* /*rip*/)
{
if (g_cfg.core.spu_decoder != spu_decoder_type::_static)
{
fmt::throw_exception("Invalid SPU decoder");
}
// Select opcode table
const auto& table = g_fxo->get<spu_interpreter_rt>();
// LS pointer
const auto base = static_cast<const u8*>(ls);
while (true)
{
if (spu.state) [[unlikely]]
{
if (spu.check_state())
break;
}
const u32 op = *reinterpret_cast<const be_t<u32>*>(base + spu.pc);
if (table.decode(op)(spu, {op}))
spu.pc += 4;
}
}
2023-08-29 13:50:50 +02:00
std::vector<u32> spu_thread::discover_functions(u32 base_addr, std::span<const u8> ls, bool is_known_addr, u32 /*entry*/)
{
std::vector<u32> calls;
std::vector<u32> branches;
calls.reserve(100);
// Discover functions
// Use the most simple method: search for instructions that calls them
// And then filter invalid cases
// TODO: Does not detect jumptables or fixed-addr indirect calls
const v128 brasl_mask = is_known_addr ? v128::from32p(0x62u << 23) : v128::from32p(umax);
2023-08-29 13:50:50 +02:00
2023-12-29 18:33:29 +01:00
for (u32 i = utils::align<u32>(base_addr, 0x10); i < std::min<u32>(base_addr + ::size32(ls), 0x3FFF0); i += 0x10)
{
// Search for BRSL LR and BRASL LR or BR
// TODO: BISL
2023-08-29 13:50:50 +02:00
const v128 inst = read_from_ptr<be_t<v128>>(ls.data(), i - base_addr);
const v128 cleared_i16 = gv_and32(inst, v128::from32p(utils::rol32(~0xffff, 7)));
const v128 eq_brsl = gv_eq32(cleared_i16, v128::from32p(0x66u << 23));
const v128 eq_brasl = gv_eq32(cleared_i16, brasl_mask);
const v128 eq_br = gv_eq32(cleared_i16, v128::from32p(0x64u << 23));
const v128 result = eq_brsl | eq_brasl;
if (!gv_testz(result))
{
for (u32 j = 0; j < 4; j++)
{
if (result.u32r[j])
{
calls.push_back(i + j * 4);
}
}
}
if (!gv_testz(eq_br))
{
for (u32 j = 0; j < 4; j++)
{
if (eq_br.u32r[j])
{
branches.push_back(i + j * 4);
}
}
}
}
calls.erase(std::remove_if(calls.begin(), calls.end(), [&](u32 caller)
{
// Check the validity of both the callee code and the following caller code
2023-08-29 13:50:50 +02:00
return !is_exec_code(caller, ls, base_addr) || !is_exec_code(caller + 4, ls, base_addr);
}), calls.end());
branches.erase(std::remove_if(branches.begin(), branches.end(), [&](u32 caller)
{
// Check the validity of the callee code
return !is_exec_code(caller, ls, base_addr);
}), branches.end());
std::vector<u32> addrs;
for (u32 addr : calls)
{
2023-08-29 13:50:50 +02:00
const spu_opcode_t op{read_from_ptr<be_t<u32>>(ls, addr - base_addr)};
const u32 func = op_branch_targets(addr, op)[0];
if (func == umax || addr + 4 == func || func == addr || std::count(addrs.begin(), addrs.end(), func))
{
continue;
}
addrs.push_back(func);
}
for (u32 addr : branches)
{
const spu_opcode_t op{read_from_ptr<be_t<u32>>(ls, addr - base_addr)};
const u32 func = op_branch_targets(addr, op)[0];
if (func == umax || addr + 4 == func || func == addr || !addr)
{
continue;
}
// Search for AI R1, +x or OR R3/4, Rx, 0
// Reasoning: AI R1, +x means stack pointer restoration, branch after that is likely a tail call
// R3 and R4 are common function arguments because they are the first two
2023-12-29 18:33:29 +01:00
for (u32 back = addr - 4, it = 10; it && back >= base_addr && back < std::min<u32>(base_addr + ::size32(ls), 0x3FFF0); it--, back -= 4)
{
const spu_opcode_t test_op{read_from_ptr<be_t<u32>>(ls, back - base_addr)};
const auto type = g_spu_itype.decode(test_op.opcode);
if (type & spu_itype::branch)
{
break;
}
bool is_tail = false;
if (type == spu_itype::AI && test_op.rt == 1u && test_op.ra == 1u)
{
if (test_op.si10 <= 0)
{
break;
}
is_tail = true;
}
else if (!(type & spu_itype::zregmod))
{
const u32 op_rt = type & spu_itype::_quadrop ? +test_op.rt4 : +test_op.rt;
if (op_rt >= 80u && (type != spu_itype::LQD || test_op.ra != 1u))
{
// Modifying non-volatile registers, not a call (and not context restoration)
break;
}
//is_tail = op_rt == 3u || op_rt == 4u;
}
if (!is_tail)
{
continue;
}
if (std::count(addrs.begin(), addrs.end(), func))
{
break;
}
addrs.push_back(func);
break;
}
}
std::sort(addrs.begin(), addrs.end());
return addrs;
}
spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, std::map<u32, std::basic_string<u32>>* out_target_list)
{
// Result: addr + raw instruction data
spu_program result;
result.data.reserve(10000);
result.entry_point = entry_point;
result.lower_bound = entry_point;
// Initialize block entries
m_block_info.reset();
m_block_info.set(entry_point / 4);
m_entry_info.reset();
m_entry_info.set(entry_point / 4);
m_ret_info.reset();
// Simple block entry workload list
workload.clear();
workload.push_back(entry_point);
std::memset(m_regmod.data(), 0xff, sizeof(m_regmod));
std::memset(m_use_ra.data(), 0xff, sizeof(m_use_ra));
std::memset(m_use_rb.data(), 0xff, sizeof(m_use_rb));
std::memset(m_use_rc.data(), 0xff, sizeof(m_use_rc));
m_targets.clear();
m_preds.clear();
m_preds[entry_point];
m_bbs.clear();
m_chunks.clear();
m_funcs.clear();
// Value flags (TODO: only is_const is implemented)
enum class vf : u32
{
is_const,
is_mask,
is_rel,
__bitset_enum_max
};
// Weak constant propagation context (for guessing branch targets)
std::array<bs_t<vf>, 128> vflags{};
// Associated constant values for 32-bit preferred slot
std::array<u32, 128> values;
// SYNC instruction found
bool sync = false;
u32 hbr_loc = 0;
u32 hbr_tg = -1;
// Result bounds
u32 lsa = entry_point;
u32 limit = 0x40000;
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
}
for (u32 wi = 0, wa = workload[0]; wi < workload.size();)
{
const auto next_block = [&]
{
// Reset value information
vflags.fill({});
sync = false;
hbr_loc = 0;
hbr_tg = -1;
wi++;
if (wi < workload.size())
{
wa = workload[wi];
}
};
const u32 pos = wa;
const auto add_block = [&](u32 target)
{
// Validate new target (TODO)
if (target >= lsa && target < limit)
{
// Check for redundancy
if (!m_block_info[target / 4])
{
m_block_info[target / 4] = true;
workload.push_back(target);
}
// Add predecessor
if (m_preds[target].find_first_of(pos) + 1 == 0)
{
m_preds[target].push_back(pos);
}
}
};
2017-02-13 14:12:24 +01:00
if (pos < lsa || pos >= limit)
{
// Don't analyse if already beyond the limit
next_block();
continue;
}
const u32 data = ls[pos / 4];
const auto op = spu_opcode_t{data};
wa += 4;
m_targets.erase(pos);
// Fill register access info
if (auto iflags = g_spu_iflag.decode(data))
{
if (+iflags & +spu_iflag::use_ra)
m_use_ra[pos / 4] = op.ra;
if (+iflags & +spu_iflag::use_rb)
m_use_rb[pos / 4] = op.rb;
if (+iflags & +spu_iflag::use_rc)
m_use_rc[pos / 4] = op.rc;
}
// Analyse instruction
switch (const auto type = g_spu_itype.decode(data))
{
case spu_itype::UNK:
case spu_itype::DFCEQ:
case spu_itype::DFCMEQ:
case spu_itype::DFCGT:
case spu_itype::DFCMGT:
case spu_itype::DFTSV:
{
// Stop before invalid instructions (TODO)
next_block();
continue;
}
case spu_itype::SYNC:
case spu_itype::STOP:
case spu_itype::STOPD:
{
if (data == 0)
{
// Stop before null data
next_block();
continue;
}
if (g_cfg.core.spu_block_size == spu_block_size_type::safe)
{
// Stop on special instructions (TODO)
m_targets[pos];
next_block();
break;
}
if (type == spu_itype::SYNC)
{
// Remember
sync = true;
}
break;
}
case spu_itype::IRET:
{
if (op.d && op.e)
{
spu_log.error("[0x%x] Invalid interrupt flags (DE)", pos);
}
m_targets[pos];
next_block();
break;
}
case spu_itype::BI:
case spu_itype::BISL:
case spu_itype::BISLED:
case spu_itype::BIZ:
case spu_itype::BINZ:
case spu_itype::BIHZ:
case spu_itype::BIHNZ:
{
if (op.d && op.e)
{
spu_log.error("[0x%x] Invalid interrupt flags (DE)", pos);
}
const auto af = vflags[op.ra];
const auto av = values[op.ra];
const bool sl = type == spu_itype::BISL || type == spu_itype::BISLED;
if (sl)
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = pos + 4;
}
if (af & vf::is_const)
{
const u32 target = spu_branch_target(av);
spu_log.warning("[0x%x] At 0x%x: indirect branch to 0x%x%s", entry_point, pos, target, op.d ? " (D)" : op.e ? " (E)" : "");
2023-09-01 14:48:43 +02:00
if (type == spu_itype::BI && target == pos + 4 && op.d)
{
// Disable interrupts idiom
break;
}
m_targets[pos].push_back(target);
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
if (sync)
{
spu_log.notice("[0x%x] At 0x%x: ignoring %scall to 0x%x (SYNC)", entry_point, pos, sl ? "" : "tail ", target);
if (target > entry_point)
{
limit = std::min<u32>(limit, target);
}
}
else
{
m_entry_info[target / 4] = true;
add_block(target);
}
}
else if (target > entry_point)
{
limit = std::min<u32>(limit, target);
}
if (sl && g_cfg.core.spu_block_size != spu_block_size_type::safe)
{
m_ret_info[pos / 4 + 1] = true;
m_entry_info[pos / 4 + 1] = true;
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
}
else if (type == spu_itype::BI && g_cfg.core.spu_block_size != spu_block_size_type::safe && !op.d && !op.e && !sync)
{
// Analyse jump table (TODO)
std::basic_string<u32> jt_abs;
std::basic_string<u32> jt_rel;
const u32 start = pos + 4;
2018-05-04 22:51:35 +02:00
u64 dabs = 0;
u64 drel = 0;
for (u32 i = start; i < limit; i += 4)
{
const u32 target = ls[i / 4];
2018-05-04 22:51:35 +02:00
if (target == 0 || target % 4)
{
// Address cannot be misaligned: abort
break;
}
if (target >= lsa && target < 0x40000)
{
// Possible jump table entry (absolute)
jt_abs.push_back(target);
}
if (target + start >= lsa && target + start < 0x40000)
{
// Possible jump table entry (relative)
jt_rel.push_back(target + start);
}
if (std::max(jt_abs.size(), jt_rel.size()) * 4 + start <= i)
{
// Neither type of jump table completes
jt_abs.clear();
jt_rel.clear();
break;
}
}
2018-05-04 22:51:35 +02:00
// Choose position after the jt as an anchor and compute the average distance
for (u32 target : jt_abs)
{
dabs += std::abs(static_cast<s32>(target - start - jt_abs.size() * 4));
}
for (u32 target : jt_rel)
{
drel += std::abs(static_cast<s32>(target - start - jt_rel.size() * 4));
}
// Add detected jump table blocks
if (jt_abs.size() >= 3 || jt_rel.size() >= 3)
{
2018-05-04 22:51:35 +02:00
if (jt_abs.size() == jt_rel.size())
{
if (dabs < drel)
{
jt_rel.clear();
}
if (dabs > drel)
{
jt_abs.clear();
}
ensure(jt_abs.size() != jt_rel.size());
2018-05-04 22:51:35 +02:00
}
if (jt_abs.size() >= jt_rel.size())
{
2020-03-04 15:08:40 +01:00
const u32 new_size = (start - lsa) / 4 + ::size32(jt_abs);
2018-05-04 22:51:35 +02:00
if (result.data.size() < new_size)
{
result.data.resize(new_size);
2018-05-04 22:51:35 +02:00
}
for (u32 i = 0; i < jt_abs.size(); i++)
{
add_block(jt_abs[i]);
result.data[(start - lsa) / 4 + i] = std::bit_cast<u32, be_t<u32>>(jt_abs[i]);
m_targets[start + i * 4];
}
m_targets.emplace(pos, std::move(jt_abs));
}
if (jt_rel.size() >= jt_abs.size())
{
2020-03-04 15:08:40 +01:00
const u32 new_size = (start - lsa) / 4 + ::size32(jt_rel);
2018-05-04 22:51:35 +02:00
if (result.data.size() < new_size)
2018-05-04 22:51:35 +02:00
{
result.data.resize(new_size);
2018-05-04 22:51:35 +02:00
}
for (u32 i = 0; i < jt_rel.size(); i++)
{
2018-05-04 22:51:35 +02:00
add_block(jt_rel[i]);
result.data[(start - lsa) / 4 + i] = std::bit_cast<u32, be_t<u32>>(jt_rel[i] - start);
m_targets[start + i * 4];
}
m_targets.emplace(pos, std::move(jt_rel));
}
}
else if (start + 12 * 4 < limit &&
ls[start / 4 + 0] == 0x1ce00408u &&
ls[start / 4 + 1] == 0x24000389u &&
ls[start / 4 + 2] == 0x24004809u &&
ls[start / 4 + 3] == 0x24008809u &&
ls[start / 4 + 4] == 0x2400c809u &&
ls[start / 4 + 5] == 0x24010809u &&
ls[start / 4 + 6] == 0x24014809u &&
ls[start / 4 + 7] == 0x24018809u &&
ls[start / 4 + 8] == 0x1c200807u &&
ls[start / 4 + 9] == 0x2401c809u)
{
spu_log.warning("[0x%x] Pattern 1 detected (hbr=0x%x:0x%x)", pos, hbr_loc, hbr_tg);
// Add 8 targets (TODO)
for (u32 addr = start + 4; addr < start + 36; addr += 4)
{
m_targets[pos].push_back(addr);
add_block(addr);
}
}
else if (hbr_loc > start && hbr_loc < limit && hbr_tg == start)
{
spu_log.warning("[0x%x] No patterns detected (hbr=0x%x:0x%x)", pos, hbr_loc, hbr_tg);
}
}
else if (type == spu_itype::BI && sync)
{
spu_log.notice("[0x%x] At 0x%x: ignoring indirect branch (SYNC)", entry_point, pos);
}
if (type == spu_itype::BI || sl)
{
if (type == spu_itype::BI || g_cfg.core.spu_block_size == spu_block_size_type::safe)
{
m_targets[pos];
}
else
{
m_ret_info[pos / 4 + 1] = true;
m_entry_info[pos / 4 + 1] = true;
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
}
else
{
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
next_block();
break;
}
case spu_itype::BRSL:
case spu_itype::BRASL:
{
const u32 target = spu_branch_target(type == spu_itype::BRASL ? 0 : pos, op.i16);
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = pos + 4;
if (type == spu_itype::BRSL && target == pos + 4)
{
// Get next instruction address idiom
break;
}
m_targets[pos].push_back(target);
if (g_cfg.core.spu_block_size != spu_block_size_type::safe)
{
m_ret_info[pos / 4 + 1] = true;
m_entry_info[pos / 4 + 1] = true;
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
if (g_cfg.core.spu_block_size == spu_block_size_type::giga && !sync)
{
m_entry_info[target / 4] = true;
add_block(target);
}
else
{
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
spu_log.notice("[0x%x] At 0x%x: ignoring fixed call to 0x%x (SYNC)", entry_point, pos, target);
}
if (target > entry_point)
{
limit = std::min<u32>(limit, target);
}
}
next_block();
break;
}
case spu_itype::BRA:
{
const u32 target = spu_branch_target(0, op.i16);
if (g_cfg.core.spu_block_size == spu_block_size_type::giga && !sync)
{
m_entry_info[target / 4] = true;
add_block(target);
}
else
{
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
spu_log.notice("[0x%x] At 0x%x: ignoring fixed tail call to 0x%x (SYNC)", entry_point, pos, target);
}
if (target > entry_point)
{
limit = std::min<u32>(limit, target);
}
}
next_block();
break;
}
case spu_itype::BR:
case spu_itype::BRZ:
case spu_itype::BRNZ:
case spu_itype::BRHZ:
case spu_itype::BRHNZ:
{
const u32 target = spu_branch_target(pos, op.i16);
if (target == pos + 4)
{
// Nop
break;
}
m_targets[pos].push_back(target);
add_block(target);
if (type != spu_itype::BR)
{
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
next_block();
break;
}
case spu_itype::DSYNC:
case spu_itype::HEQ:
case spu_itype::HEQI:
case spu_itype::HGT:
case spu_itype::HGTI:
case spu_itype::HLGT:
case spu_itype::HLGTI:
case spu_itype::LNOP:
case spu_itype::NOP:
case spu_itype::MTSPR:
case spu_itype::FSCRWR:
case spu_itype::STQA:
case spu_itype::STQD:
case spu_itype::STQR:
case spu_itype::STQX:
{
// Do nothing
break;
}
case spu_itype::WRCH:
{
switch (op.ra)
{
case MFC_EAL:
{
m_regmod[pos / 4] = s_reg_mfc_eal;
break;
}
case MFC_LSA:
{
m_regmod[pos / 4] = s_reg_mfc_lsa;
break;
}
case MFC_TagID:
{
m_regmod[pos / 4] = s_reg_mfc_tag;
break;
}
case MFC_Size:
{
m_regmod[pos / 4] = s_reg_mfc_size;
break;
}
case MFC_Cmd:
{
m_use_rb[pos / 4] = s_reg_mfc_eal;
break;
}
2021-04-09 21:12:47 +02:00
default: break;
}
break;
}
case spu_itype::LQA:
case spu_itype::LQD:
case spu_itype::LQR:
case spu_itype::LQX:
{
// Unconst
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = {};
break;
}
case spu_itype::HBR:
{
hbr_loc = spu_branch_target(pos, op.roh << 7 | op.rt);
hbr_tg = vflags[op.ra] & vf::is_const && !op.c ? values[op.ra] & 0x3fffc : -1;
break;
}
case spu_itype::HBRA:
{
hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt);
hbr_tg = spu_branch_target(0x0, op.i16);
break;
}
case spu_itype::HBRR:
{
hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt);
hbr_tg = spu_branch_target(pos, op.i16);
break;
}
case spu_itype::IL:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = op.si16;
break;
}
case spu_itype::ILA:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = op.i18;
break;
}
case spu_itype::ILH:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = op.i16 << 16 | op.i16;
break;
}
case spu_itype::ILHU:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = op.i16 << 16;
break;
}
case spu_itype::IOHL:
{
m_regmod[pos / 4] = op.rt;
values[op.rt] = values[op.rt] | op.i16;
break;
}
case spu_itype::ORI:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] | op.si10;
break;
}
case spu_itype::OR:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.ra] | values[op.rb];
break;
}
case spu_itype::ANDI:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] & op.si10;
break;
}
case spu_itype::AND:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.ra] & values[op.rb];
break;
}
case spu_itype::AI:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] + op.si10;
break;
}
case spu_itype::A:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.ra] + values[op.rb];
break;
}
case spu_itype::SFI:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = op.si10 - values[op.ra];
break;
}
case spu_itype::SF:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.rb] - values[op.ra];
break;
}
case spu_itype::ROTMI:
{
m_regmod[pos / 4] = op.rt;
2020-03-04 15:08:40 +01:00
if ((0 - op.i7) & 0x20)
{
vflags[op.rt] = +vf::is_const;
values[op.rt] = 0;
break;
}
vflags[op.rt] = vflags[op.ra] & vf::is_const;
2020-03-04 15:08:40 +01:00
values[op.rt] = values[op.ra] >> ((0 - op.i7) & 0x1f);
break;
}
case spu_itype::SHLI:
{
m_regmod[pos / 4] = op.rt;
if (op.i7 & 0x20)
{
vflags[op.rt] = +vf::is_const;
values[op.rt] = 0;
break;
}
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] << (op.i7 & 0x1f);
break;
}
default:
{
// Unconst
const u32 op_rt = type & spu_itype::_quadrop ? +op.rt4 : +op.rt;
m_regmod[pos / 4] = op_rt;
vflags[op_rt] = {};
break;
}
}
// Insert raw instruction value
const u32 new_size = (pos - lsa) / 4;
if (result.data.size() <= new_size)
{
if (result.data.size() < new_size)
{
result.data.resize(new_size);
}
result.data.emplace_back(std::bit_cast<u32, be_t<u32>>(data));
}
else if (u32& raw_val = result.data[new_size])
{
ensure(raw_val == std::bit_cast<u32, be_t<u32>>(data));
}
else
{
raw_val = std::bit_cast<u32, be_t<u32>>(data);
}
}
while (lsa > 0 || limit < 0x40000)
{
2020-03-04 15:08:40 +01:00
const u32 initial_size = ::size32(result.data);
// Check unreachable blocks
limit = std::min<u32>(limit, lsa + initial_size * 4);
for (auto& pair : m_preds)
{
bool reachable = false;
2018-06-01 23:33:48 +02:00
if (pair.first >= limit)
{
2018-06-01 23:33:48 +02:00
continue;
}
// All (direct and indirect) predecessors to check
std::basic_string<u32> workload;
// Bit array used to deduplicate workload list
workload.push_back(pair.first);
m_bits[pair.first / 4] = true;
2020-12-18 08:39:54 +01:00
for (usz i = 0; !reachable && i < workload.size(); i++)
2018-06-01 23:33:48 +02:00
{
for (u32 j = workload[i];; j -= 4)
{
// Go backward from an address until the entry point is reached
if (j == entry_point)
2018-06-01 23:33:48 +02:00
{
reachable = true;
break;
}
const auto found = m_preds.find(j);
bool had_fallthrough = false;
if (found != m_preds.end())
{
for (u32 new_pred : found->second)
{
// Check whether the predecessor is previous instruction
if (new_pred == j - 4)
{
had_fallthrough = true;
continue;
}
// Check whether in range and not already added
if (new_pred >= lsa && new_pred < limit && !m_bits[new_pred / 4])
{
workload.push_back(new_pred);
m_bits[new_pred / 4] = true;
}
}
}
// Check for possible fallthrough predecessor
if (!had_fallthrough)
{
if (::at32(result.data, (j - lsa) / 4 - 1) == 0 || m_targets.count(j - 4))
2018-06-01 23:33:48 +02:00
{
break;
}
}
if (i == 0)
{
// TODO
}
}
}
2018-06-01 23:33:48 +02:00
for (u32 pred : workload)
{
m_bits[pred / 4] = false;
}
if (!reachable && pair.first < limit)
{
limit = pair.first;
}
}
result.data.resize((limit - lsa) / 4);
// Check holes in safe mode (TODO)
u32 valid_size = 0;
for (u32 i = 0; i < result.data.size(); i++)
{
if (result.data[i] == 0)
{
const u32 pos = lsa + i * 4;
const u32 data = ls[pos / 4];
// Allow only NOP or LNOP instructions in holes
if (data == 0x200000 || (data & 0xffffff80) == 0x40200000)
{
continue;
}
if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
{
result.data.resize(valid_size);
break;
}
}
else
{
valid_size = i + 1;
}
}
// Even if NOP or LNOP, should be removed at the end
result.data.resize(valid_size);
// Repeat if blocks were removed
if (result.data.size() == initial_size)
{
break;
}
}
limit = std::min<u32>(limit, lsa + ::size32(result.data) * 4);
// Cleanup block info
for (u32 i = 0; i < workload.size(); i++)
{
const u32 addr = workload[i];
if (addr < lsa || addr >= limit || !result.data[(addr - lsa) / 4])
{
m_block_info[addr / 4] = false;
m_entry_info[addr / 4] = false;
m_ret_info[addr / 4] = false;
m_preds.erase(addr);
}
}
// Complete m_preds and associated m_targets for adjacent blocks
for (auto it = m_preds.begin(); it != m_preds.end();)
{
if (it->first < lsa || it->first >= limit)
{
it = m_preds.erase(it);
continue;
}
// Erase impossible predecessors
const auto new_end = std::remove_if(it->second.begin(), it->second.end(), [&](u32 addr)
{
return addr < lsa || addr >= limit;
});
it->second.erase(new_end, it->second.end());
// Don't add fallthrough target if all predecessors are removed
if (it->second.empty() && !m_entry_info[it->first / 4])
{
// If not an entry point, remove the block completely
m_block_info[it->first / 4] = false;
it = m_preds.erase(it);
continue;
}
// Previous instruction address
const u32 prev = (it->first - 4) & 0x3fffc;
// TODO: check the correctness
if (m_targets.count(prev) == 0 && prev >= lsa && prev < limit && result.data[(prev - lsa) / 4])
{
// Add target and the predecessor
m_targets[prev].push_back(it->first);
it->second.push_back(prev);
}
it++;
}
if (out_target_list)
{
out_target_list->insert(m_targets.begin(), m_targets.end());
}
// Remove unnecessary target lists
for (auto it = m_targets.begin(); it != m_targets.end();)
{
if (it->first < lsa || it->first >= limit)
{
it = m_targets.erase(it);
continue;
}
it++;
}
// Fill holes which contain only NOP and LNOP instructions (TODO: compile)
for (u32 i = 0, nnop = 0, vsize = 0; i <= result.data.size(); i++)
{
if (i >= result.data.size() || result.data[i])
{
if (nnop && nnop == i - vsize)
{
// Write only complete NOP sequence
for (u32 j = vsize; j < i; j++)
{
result.data[j] = std::bit_cast<u32, be_t<u32>>(ls[lsa / 4 + j]);
}
}
nnop = 0;
vsize = i + 1;
}
else
{
const u32 pos = lsa + i * 4;
const u32 data = ls[pos / 4];
if (data == 0x200000 || (data & 0xffffff80) == 0x40200000)
{
nnop++;
}
}
}
// Fill block info
for (auto& pred : m_preds)
{
auto& block = m_bbs[pred.first];
// Copy predeccessors (wrong at this point, needs a fixup later)
block.preds = pred.second;
// Fill register usage info
2019-05-11 23:12:09 +02:00
for (u32 ia = pred.first; ia < limit; ia += 4)
{
block.size++;
// Decode instruction
const spu_opcode_t op{std::bit_cast<be_t<u32>>(result.data[(ia - lsa) / 4])};
const auto type = g_spu_itype.decode(op.opcode);
u8 reg_save = 255;
if (type == spu_itype::STQD && op.ra == s_reg_sp && !block.reg_mod[op.rt] && !block.reg_use[op.rt])
{
// Register saved onto the stack before use
block.reg_save_dom[op.rt] = true;
reg_save = op.rt;
}
for (auto* _use : {&m_use_ra, &m_use_rb, &m_use_rc})
{
if (u8 reg = (*_use)[ia / 4]; reg < s_reg_max)
{
// Register reg use only if it happens before reg mod
if (!block.reg_mod[reg])
{
block.reg_use.set(reg);
if (reg_save != reg && block.reg_save_dom[reg])
{
// Register is still used after saving; probably not eligible for optimization
block.reg_save_dom[reg] = false;
}
}
}
}
if (m_use_rb[ia / 4] == s_reg_mfc_eal)
{
// Expand MFC_Cmd reg use
for (u8 reg : {s_reg_mfc_lsa, s_reg_mfc_tag, s_reg_mfc_size})
{
if (!block.reg_mod[reg])
block.reg_use.set(reg);
}
}
// Register reg modification
if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max)
{
block.reg_mod.set(reg);
block.reg_mod_xf.set(reg, type & spu_itype::xfloat);
if (type == spu_itype::SELB && (block.reg_mod_xf[op.ra] || block.reg_mod_xf[op.rb]))
block.reg_mod_xf.set(reg);
// Possible post-dominating register load
if (type == spu_itype::LQD && op.ra == s_reg_sp)
block.reg_load_mod[reg] = ia + 1;
else
block.reg_load_mod[reg] = 0;
}
// Find targets (also means end of the block)
const auto tfound = m_targets.find(ia);
if (tfound != m_targets.end())
{
// Copy targets
block.targets = tfound->second;
// Assume that the call reads and modifies all volatile registers (TODO)
bool is_call = false;
bool is_tail = false;
switch (type)
{
case spu_itype::BRSL:
is_call = spu_branch_target(ia, op.i16) != ia + 4;
break;
case spu_itype::BRASL:
is_call = spu_branch_target(0, op.i16) != ia + 4;
break;
case spu_itype::BRA:
is_call = true;
is_tail = true;
break;
case spu_itype::BISL:
case spu_itype::BISLED:
is_call = true;
break;
default:
break;
}
if (is_call)
{
for (u32 i = 0; i < s_reg_max; ++i)
{
if (i == s_reg_lr || (i >= 2 && i < s_reg_80) || i > s_reg_127)
{
if (!block.reg_mod[i])
block.reg_use.set(i);
if (!is_tail)
{
block.reg_mod.set(i);
block.reg_mod_xf[i] = false;
}
}
}
}
break;
}
}
}
// Fixup block predeccessors to point to basic blocks, not last instructions
for (auto& bb : m_bbs)
{
const u32 addr = bb.first;
for (u32& pred : bb.second.preds)
{
pred = std::prev(m_bbs.upper_bound(pred))->first;
}
if (m_entry_info[addr / 4] && g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
// Register empty chunk
m_chunks.push_back(addr);
// Register function if necessary
if (!m_ret_info[addr / 4])
{
m_funcs[addr];
}
}
}
// Ensure there is a function at the lowest address
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
if (auto emp = m_funcs.try_emplace(m_bbs.begin()->first); emp.second)
{
const u32 addr = emp.first->first;
spu_log.error("[0x%05x] Fixed first function at 0x%05x", entry_point, addr);
m_entry_info[addr / 4] = true;
m_ret_info[addr / 4] = false;
}
}
// Split functions
while (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
bool need_repeat = false;
u32 start = 0;
u32 limit = 0x40000;
// Walk block list in ascending order
for (auto& block : m_bbs)
{
const u32 addr = block.first;
if (m_entry_info[addr / 4] && !m_ret_info[addr / 4])
{
const auto upper = m_funcs.upper_bound(addr);
start = addr;
limit = upper == m_funcs.end() ? 0x40000 : upper->first;
}
// Find targets that exceed [start; limit) range and make new functions from them
for (u32 target : block.second.targets)
{
const auto tfound = m_bbs.find(target);
if (tfound == m_bbs.end())
{
continue;
}
if (target < start || target >= limit)
{
if (!m_entry_info[target / 4] || m_ret_info[target / 4])
{
// Create new function entry (likely a tail call)
m_entry_info[target / 4] = true;
m_ret_info[target / 4] = false;
m_funcs.try_emplace(target);
if (target < limit)
{
need_repeat = true;
}
}
}
}
block.second.func = start;
}
if (!need_repeat)
{
break;
}
}
if (!m_bbs.count(entry_point))
{
// Invalid code
spu_log.error("[0x%x] Invalid code", entry_point);
return {};
}
// Fill entry map
while (true)
{
workload.clear();
workload.push_back(entry_point);
2023-08-28 11:20:17 +02:00
ensure(m_bbs.count(entry_point));
std::basic_string<u32> new_entries;
for (u32 wi = 0; wi < workload.size(); wi++)
{
const u32 addr = workload[wi];
auto& block = ::at32(m_bbs, addr);
const u32 _new = block.chunk;
if (!m_entry_info[addr / 4])
{
// Check block predecessors
for (u32 pred : block.preds)
{
const u32 _old = ::at32(m_bbs, pred).chunk;
if (_old < 0x40000 && _old != _new)
{
// If block has multiple 'entry' points, it becomes an entry point itself
new_entries.push_back(addr);
}
}
}
// Update chunk address
block.chunk = m_entry_info[addr / 4] ? addr : _new;
// Process block targets
for (u32 target : block.targets)
{
const auto tfound = m_bbs.find(target);
if (tfound == m_bbs.end())
{
continue;
}
auto& tb = tfound->second;
const u32 value = m_entry_info[target / 4] ? target : block.chunk;
if (u32& tval = tb.chunk; tval < 0x40000)
{
// TODO: fix condition
if (tval != value && !m_entry_info[target / 4])
{
new_entries.push_back(target);
}
}
else
{
tval = value;
workload.emplace_back(target);
}
}
}
if (new_entries.empty())
{
break;
}
for (u32 entry : new_entries)
{
m_entry_info[entry / 4] = true;
// Acknowledge artificial (reversible) chunk entry point
m_ret_info[entry / 4] = true;
}
for (auto& bb : m_bbs)
{
// Reset chunk info
bb.second.chunk = 0x40000;
}
}
workload.clear();
workload.push_back(entry_point);
// Fill workload adding targets
for (u32 wi = 0; wi < workload.size(); wi++)
{
const u32 addr = workload[wi];
auto& block = ::at32(m_bbs, addr);
block.analysed = true;
for (u32 target : block.targets)
{
const auto tfound = m_bbs.find(target);
if (tfound == m_bbs.end())
{
continue;
}
auto& tb = tfound->second;
if (!tb.analysed)
{
workload.push_back(target);
tb.analysed = true;
}
// Limited xfloat hint propagation (possibly TODO)
if (tb.chunk == block.chunk)
{
tb.reg_maybe_xf &= block.reg_mod_xf;
}
else
{
tb.reg_maybe_xf.reset();
}
}
block.reg_origin.fill(0x80000000);
block.reg_origin_abs.fill(0x80000000);
}
// Fill register origin info
while (true)
{
bool must_repeat = false;
for (u32 wi = 0; wi < workload.size(); wi++)
{
const u32 addr = workload[wi];
auto& block = ::at32(m_bbs, addr);
// Initialize entry point with default value: unknown origin (requires load)
if (m_entry_info[addr / 4])
{
for (u32 i = 0; i < s_reg_max; i++)
{
if (block.reg_origin[i] == 0x80000000)
block.reg_origin[i] = 0x40000;
}
}
if (g_cfg.core.spu_block_size == spu_block_size_type::giga && m_entry_info[addr / 4] && !m_ret_info[addr / 4])
{
for (u32 i = 0; i < s_reg_max; i++)
{
if (block.reg_origin_abs[i] == 0x80000000)
block.reg_origin_abs[i] = 0x40000;
else if (block.reg_origin_abs[i] + 1 == 0)
block.reg_origin_abs[i] = -2;
}
}
for (u32 target : block.targets)
{
const auto tfound = m_bbs.find(target);
if (tfound == m_bbs.end())
{
continue;
}
auto& tb = tfound->second;
for (u32 i = 0; i < s_reg_max; i++)
{
if (tb.chunk == block.chunk && tb.reg_origin[i] + 1)
{
const u32 expected = block.reg_mod[i] ? addr : block.reg_origin[i];
if (tb.reg_origin[i] == 0x80000000)
{
tb.reg_origin[i] = expected;
}
else if (tb.reg_origin[i] != expected)
{
// Set -1 if multiple origins merged (requires PHI node)
tb.reg_origin[i] = -1;
must_repeat |= !tb.targets.empty();
}
}
if (g_cfg.core.spu_block_size == spu_block_size_type::giga && tb.func == block.func && tb.reg_origin_abs[i] + 2)
{
const u32 expected = block.reg_mod[i] ? addr : block.reg_origin_abs[i];
if (tb.reg_origin_abs[i] == 0x80000000)
{
tb.reg_origin_abs[i] = expected;
}
else if (tb.reg_origin_abs[i] != expected)
{
if (tb.reg_origin_abs[i] == 0x40000 || expected + 2 == 0 || expected == 0x40000)
{
// Set -2: sticky value indicating possible external reg origin (0x40000)
tb.reg_origin_abs[i] = -2;
must_repeat |= !tb.targets.empty();
}
else if (tb.reg_origin_abs[i] + 1)
{
tb.reg_origin_abs[i] = -1;
must_repeat |= !tb.targets.empty();
}
}
}
}
}
}
if (!must_repeat)
{
break;
}
for (u32 wi = 0; wi < workload.size(); wi++)
{
const u32 addr = workload[wi];
auto& block = ::at32(m_bbs, addr);
// Reset values for the next attempt (keep negative values)
for (u32 i = 0; i < s_reg_max; i++)
{
if (block.reg_origin[i] <= 0x40000)
block.reg_origin[i] = 0x80000000;
if (block.reg_origin_abs[i] <= 0x40000)
block.reg_origin_abs[i] = 0x80000000;
}
}
}
// Fill more block info
for (u32 wi = 0; wi < workload.size(); wi++)
{
if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
{
break;
}
const u32 addr = workload[wi];
auto& bb = ::at32(m_bbs, addr);
auto& func = ::at32(m_funcs, bb.func);
2018-05-02 20:49:19 +02:00
// Update function size
func.size = std::max<u16>(func.size, bb.size + (addr - bb.func) / 4);
// Copy constants according to reg origin info
for (u32 i = 0; i < s_reg_max; i++)
{
const u32 orig = bb.reg_origin_abs[i];
if (orig < 0x40000)
{
auto& src = ::at32(m_bbs, orig);
bb.reg_const[i] = src.reg_const[i];
bb.reg_val32[i] = src.reg_val32[i];
}
if (!bb.reg_save_dom[i] && bb.reg_use[i] && (orig == 0x40000 || orig + 2 == 0))
{
// Destroy offset if external reg value is used
func.reg_save_off[i] = -1;
}
}
if (u32 orig = bb.reg_origin_abs[s_reg_sp]; orig < 0x40000)
{
auto& prologue = ::at32(m_bbs, orig);
// Copy stack offset (from the assumed prologue)
bb.stack_sub = prologue.stack_sub;
}
else if (orig > 0x40000)
{
// Unpredictable stack
bb.stack_sub = 0x80000000;
}
spu_opcode_t op{};
2018-05-12 21:55:52 +02:00
auto last_inst = spu_itype::UNK;
2018-05-02 20:49:19 +02:00
for (u32 ia = addr; ia < addr + bb.size * 4; ia += 4)
{
// Decode instruction again
op.opcode = std::bit_cast<be_t<u32>>(result.data[(ia - lsa) / 41]);
last_inst = g_spu_itype.decode(op.opcode);
2018-05-02 20:49:19 +02:00
// Propagate some constants
switch (last_inst)
{
case spu_itype::IL:
{
bb.reg_const[op.rt] = true;
bb.reg_val32[op.rt] = op.si16;
break;
}
case spu_itype::ILA:
{
bb.reg_const[op.rt] = true;
bb.reg_val32[op.rt] = op.i18;
break;
}
case spu_itype::ILHU:
{
bb.reg_const[op.rt] = true;
bb.reg_val32[op.rt] = op.i16 << 16;
break;
}
case spu_itype::ILH:
{
bb.reg_const[op.rt] = true;
bb.reg_val32[op.rt] = op.i16 << 16 | op.i16;
break;
}
case spu_itype::IOHL:
{
bb.reg_val32[op.rt] = bb.reg_val32[op.rt] | op.i16;
break;
}
case spu_itype::ORI:
{
bb.reg_const[op.rt] = bb.reg_const[op.ra];
bb.reg_val32[op.rt] = bb.reg_val32[op.ra] | op.si10;
break;
}
case spu_itype::OR:
{
2022-09-13 15:08:55 +02:00
bb.reg_const[op.rt] = bb.reg_const[op.ra] && bb.reg_const[op.rb];
bb.reg_val32[op.rt] = bb.reg_val32[op.ra] | bb.reg_val32[op.rb];
break;
}
case spu_itype::AI:
{
bb.reg_const[op.rt] = bb.reg_const[op.ra];
bb.reg_val32[op.rt] = bb.reg_val32[op.ra] + op.si10;
break;
}
case spu_itype::A:
{
2022-09-13 15:08:55 +02:00
bb.reg_const[op.rt] = bb.reg_const[op.ra] && bb.reg_const[op.rb];
bb.reg_val32[op.rt] = bb.reg_val32[op.ra] + bb.reg_val32[op.rb];
break;
}
case spu_itype::SFI:
{
bb.reg_const[op.rt] = bb.reg_const[op.ra];
bb.reg_val32[op.rt] = op.si10 - bb.reg_val32[op.ra];
break;
}
case spu_itype::SF:
{
2022-09-13 15:08:55 +02:00
bb.reg_const[op.rt] = bb.reg_const[op.ra] && bb.reg_const[op.rb];
bb.reg_val32[op.rt] = bb.reg_val32[op.rb] - bb.reg_val32[op.ra];
break;
}
case spu_itype::STQD:
{
if (op.ra == s_reg_sp && bb.stack_sub != 0x80000000 && bb.reg_save_dom[op.rt])
{
const u32 offset = 0x80000000 + op.si10 * 16 - bb.stack_sub;
if (func.reg_save_off[op.rt] == 0)
{
// Store reg save offset
func.reg_save_off[op.rt] = offset;
}
else if (func.reg_save_off[op.rt] != offset)
{
// Conflict of different offsets
func.reg_save_off[op.rt] = -1;
}
}
2018-05-02 20:49:19 +02:00
break;
}
case spu_itype::LQD:
{
if (op.ra == s_reg_sp && bb.stack_sub != 0x80000000 && bb.reg_load_mod[op.rt] == ia + 1)
{
// Adjust reg load offset
bb.reg_load_mod[op.rt] = 0x80000000 + op.si10 * 16 - bb.stack_sub;
}
2018-05-02 20:49:19 +02:00
// Clear const
bb.reg_const[op.rt] = false;
break;
}
default:
{
// Clear const if reg is modified here
if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max)
bb.reg_const[reg] = false;
break;
}
}
// $SP is modified
if (m_regmod[ia / 4] == s_reg_sp)
{
if (bb.reg_const[s_reg_sp])
{
// Making $SP a constant is a funny thing too.
bb.stack_sub = 0x80000000;
}
if (bb.stack_sub != 0x80000000)
{
switch (last_inst)
{
case spu_itype::AI:
{
if (op.ra == s_reg_sp)
bb.stack_sub -= op.si10;
else
bb.stack_sub = 0x80000000;
break;
}
case spu_itype::A:
{
if (op.ra == s_reg_sp && bb.reg_const[op.rb])
bb.stack_sub -= bb.reg_val32[op.rb];
else if (op.rb == s_reg_sp && bb.reg_const[op.ra])
bb.stack_sub -= bb.reg_val32[op.ra];
else
bb.stack_sub = 0x80000000;
break;
}
case spu_itype::SF:
{
if (op.rb == s_reg_sp && bb.reg_const[op.ra])
bb.stack_sub += bb.reg_val32[op.ra];
else
bb.stack_sub = 0x80000000;
break;
}
default:
{
bb.stack_sub = 0x80000000;
break;
}
}
}
2018-05-02 20:49:19 +02:00
// Check for funny values.
if (bb.stack_sub >= 0x40000 || bb.stack_sub % 16)
{
bb.stack_sub = 0x80000000;
}
}
}
// Analyse terminator instruction
const u32 tia = addr + bb.size * 4 - 4;
switch (last_inst)
{
case spu_itype::BR:
case spu_itype::BRNZ:
case spu_itype::BRZ:
case spu_itype::BRHNZ:
case spu_itype::BRHZ:
case spu_itype::BRSL:
{
const u32 target = spu_branch_target(tia, op.i16);
if (target == tia + 4)
{
bb.terminator = term_type::fallthrough;
}
else if (last_inst != spu_itype::BRSL)
{
// No-op terminator or simple branch instruction
bb.terminator = term_type::br;
if (target == bb.func)
{
// Recursive tail call
bb.terminator = term_type::ret;
}
}
else if (op.rt == s_reg_lr)
{
bb.terminator = term_type::call;
}
else
{
bb.terminator = term_type::interrupt_call;
}
break;
}
case spu_itype::BRA:
case spu_itype::BRASL:
{
bb.terminator = term_type::indirect_call;
break;
}
case spu_itype::BI:
{
if (op.d || op.e || bb.targets.size() == 1)
{
bb.terminator = term_type::interrupt_call;
}
else if (bb.targets.size() > 1)
{
// Jump table
bb.terminator = term_type::br;
}
else if (op.ra == s_reg_lr)
{
// Return (TODO)
bb.terminator = term_type::ret;
}
else
{
// Indirect tail call (TODO)
bb.terminator = term_type::interrupt_call;
}
break;
}
case spu_itype::BISLED:
case spu_itype::IRET:
{
bb.terminator = term_type::interrupt_call;
break;
}
case spu_itype::BISL:
case spu_itype::BIZ:
case spu_itype::BINZ:
case spu_itype::BIHZ:
case spu_itype::BIHNZ:
{
if (op.d || op.e || bb.targets.size() != 1)
{
bb.terminator = term_type::interrupt_call;
}
else if (last_inst != spu_itype::BISL && bb.targets[0] == tia + 4 && op.ra == s_reg_lr)
{
// Conditional return (TODO)
bb.terminator = term_type::ret;
}
else if (last_inst == spu_itype::BISL)
{
// Indirect call
bb.terminator = term_type::indirect_call;
}
else
{
// TODO
bb.terminator = term_type::interrupt_call;
}
break;
}
default:
{
// Normal instruction
bb.terminator = term_type::fallthrough;
break;
}
}
}
// Check function blocks, verify and print some reasons
for (auto& f : m_funcs)
{
if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
{
break;
}
bool is_ok = true;
u32 used_stack = 0;
for (auto it = m_bbs.lower_bound(f.first); it != m_bbs.end() && it->second.func == f.first; ++it)
{
auto& bb = it->second;
auto& func = ::at32(m_funcs, bb.func);
const u32 addr = it->first;
const u32 flim = bb.func + func.size * 4;
used_stack |= bb.stack_sub;
if (is_ok && bb.terminator >= term_type::indirect_call)
{
is_ok = false;
}
if (is_ok && bb.terminator == term_type::ret)
{
// Check $LR (alternative return registers are currently not supported)
if (u32 lr_orig = bb.reg_mod[s_reg_lr] ? addr : bb.reg_origin_abs[s_reg_lr]; lr_orig < 0x40000)
{
auto& src = ::at32(m_bbs, lr_orig);
if (src.reg_load_mod[s_reg_lr] != func.reg_save_off[s_reg_lr])
{
spu_log.error("Function 0x%05x: [0x%05x] $LR mismatch (src=0x%x; 0x%x vs 0x%x)", f.first, addr, lr_orig, src.reg_load_mod[0], func.reg_save_off[0]);
is_ok = false;
}
else if (src.reg_load_mod[s_reg_lr] == 0)
{
spu_log.error("Function 0x%05x: [0x%05x] $LR modified (src=0x%x)", f.first, addr, lr_orig);
is_ok = false;
}
}
else if (lr_orig > 0x40000)
{
spu_log.todo("Function 0x%05x: [0x%05x] $LR unpredictable (src=0x%x)", f.first, addr, lr_orig);
is_ok = false;
}
// Check $80..$127 (should be restored or unmodified)
for (u32 i = s_reg_80; is_ok && i <= s_reg_127; i++)
{
if (u32 orig = bb.reg_mod[i] ? addr : bb.reg_origin_abs[i]; orig < 0x40000)
{
auto& src = ::at32(m_bbs, orig);
if (src.reg_load_mod[i] != func.reg_save_off[i])
{
spu_log.error("Function 0x%05x: [0x%05x] $%u mismatch (src=0x%x; 0x%x vs 0x%x)", f.first, addr, i, orig, src.reg_load_mod[i], func.reg_save_off[i]);
is_ok = false;
}
}
else if (orig > 0x40000)
{
spu_log.todo("Function 0x%05x: [0x%05x] $%u unpredictable (src=0x%x)", f.first, addr, i, orig);
is_ok = false;
}
if (func.reg_save_off[i] + 1 == 0)
{
spu_log.error("Function 0x%05x: [0x%05x] $%u used incorrectly", f.first, addr, i);
is_ok = false;
}
}
// Check $SP (should be restored or unmodified)
if (bb.stack_sub != 0 && bb.stack_sub != 0x80000000)
{
spu_log.error("Function 0x%05x: [0x%05x] return with stack frame 0x%x", f.first, addr, bb.stack_sub);
is_ok = false;
}
}
if (is_ok && bb.terminator == term_type::call)
{
// Check call instruction (TODO)
if (bb.stack_sub == 0)
{
// Call without a stack frame
spu_log.error("Function 0x%05x: [0x%05x] frameless call", f.first, addr);
is_ok = false;
}
}
if (is_ok && bb.terminator == term_type::fallthrough)
{
// Can't just fall out of the function
if (bb.targets.size() != 1 || bb.targets[0] >= flim)
{
spu_log.error("Function 0x%05x: [0x%05x] bad fallthrough to 0x%x", f.first, addr, bb.targets[0]);
is_ok = false;
}
}
if (is_ok && bb.stack_sub == 0x80000000)
{
spu_log.error("Function 0x%05x: [0x%05x] bad stack frame", f.first, addr);
is_ok = false;
}
// Fill external function targets (calls, possibly tail calls)
for (u32 target : bb.targets)
{
if (target < bb.func || target >= flim || (bb.terminator == term_type::call && target == bb.func))
{
if (func.calls.find_first_of(target) + 1 == 0)
{
func.calls.push_back(target);
}
}
}
}
if (is_ok && used_stack && f.first == entry_point)
{
spu_log.error("Function 0x%05x: considered possible chunk", f.first);
is_ok = false;
}
// if (is_ok && f.first > 0x1d240 && f.first < 0x1e000)
// {
// spu_log.error("Function 0x%05x: manually disabled", f.first);
// is_ok = false;
// }
f.second.good = is_ok;
}
// Check function call graph
while (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
bool need_repeat = false;
for (auto& f : m_funcs)
{
if (!f.second.good)
{
continue;
}
for (u32 call : f.second.calls)
{
const auto ffound = std::as_const(m_funcs).find(call);
if (ffound == m_funcs.cend() || ffound->second.good == false)
{
need_repeat = true;
if (f.second.good)
{
spu_log.error("Function 0x%05x: calls bad function (0x%05x)", f.first, call);
f.second.good = false;
}
}
}
}
if (!need_repeat)
{
break;
}
}
if (result.data.empty())
{
// Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback
}
return result;
}
void spu_recompiler_base::dump(const spu_program& result, std::string& out)
{
SPUDisAsm dis_asm(cpu_disasm_mode::dump, reinterpret_cast<const u8*>(result.data.data()), result.lower_bound);
std::string hash;
{
sha1_context ctx;
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, reinterpret_cast<const u8*>(result.data.data()), result.data.size() * 4);
sha1_finish(&ctx, output);
fmt::append(hash, "%s", fmt::base57(output));
}
fmt::append(out, "========== SPU BLOCK 0x%05x (size %u, %s) ==========\n\n", result.entry_point, result.data.size(), hash);
for (auto& bb : m_bbs)
{
for (u32 pos = bb.first, end = bb.first + bb.second.size * 4; pos < end; pos += 4)
{
dis_asm.disasm(pos);
if (!dis_asm.last_opcode.ends_with('\n'))
{
dis_asm.last_opcode += '\n';
}
fmt::append(out, ">%s", dis_asm.last_opcode);
}
out += '\n';
if (m_block_info[bb.first / 4])
{
fmt::append(out, "A: [0x%05x] %s\n", bb.first, m_entry_info[bb.first / 4] ? (m_ret_info[bb.first / 4] ? "Chunk" : "Entry") : "Block");
fmt::append(out, "\tF: 0x%05x\n", bb.second.func);
for (u32 pred : bb.second.preds)
{
fmt::append(out, "\t<- 0x%05x\n", pred);
}
for (u32 target : bb.second.targets)
{
fmt::append(out, "\t-> 0x%05x%s\n", target, m_bbs.count(target) ? "" : " (null)");
}
}
else
{
fmt::append(out, "A: [0x%05x] ?\n", bb.first);
}
out += '\n';
}
for (auto& f : m_funcs)
{
fmt::append(out, "F: [0x%05x]%s\n", f.first, f.second.good ? " (good)" : " (bad)");
fmt::append(out, "\tN: 0x%05x\n", f.second.size * 4 + f.first);
for (u32 call : f.second.calls)
{
fmt::append(out, "\t>> 0x%05x%s\n", call, m_funcs.count(call) ? "" : " (null)");
}
}
out += '\n';
}
struct spu_llvm_worker
{
lf_queue<std::pair<u64, const spu_program*>> registered;
void operator()()
{
// SPU LLVM Recompiler instance
const auto compiler = spu_recompiler_base::make_llvm_recompiler();
compiler->init();
// Fake LS
std::vector<be_t<u32>> ls(0x10000);
for (auto slice = registered.pop_all();; [&]
{
if (slice)
{
slice.pop_front();
}
if (slice || thread_ctrl::state() == thread_state::aborting)
{
return;
}
thread_ctrl::wait_on(utils::bless<atomic_t<u32>>(&registered)[1], 0);
slice = registered.pop_all();
}())
{
auto* prog = slice.get();
if (thread_ctrl::state() == thread_state::aborting)
{
break;
}
if (!prog)
{
continue;
}
if (!prog->second)
{
break;
}
const auto& func = *prog->second;
// Get data start
const u32 start = func.lower_bound;
const u32 size0 = ::size32(func.data);
// Initialize LS with function data only
for (u32 i = 0, pos = start; i < size0; i++, pos += 4)
{
ls[pos / 4] = std::bit_cast<be_t<u32>>(func.data[i]);
}
// Call analyser
spu_program func2 = compiler->analyse(ls.data(), func.entry_point);
if (func2 != func)
{
spu_log.error("[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0);
}
else if (const auto target = compiler->compile(std::move(func2)))
{
// Redirect old function (TODO: patch in multiple places)
const s64 rel = reinterpret_cast<u64>(target) - prog->first - 5;
union
{
u8 bytes[8];
u64 result;
};
bytes[0] = 0xe9; // jmp rel32
std::memcpy(bytes + 1, &rel, 4);
bytes[5] = 0x90;
bytes[6] = 0x90;
bytes[7] = 0x90;
atomic_storage<u64>::release(*reinterpret_cast<u64*>(prog->first), result);
}
else
{
spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point);
return;
}
// Clear fake LS
std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1));
}
}
};
// SPU LLVM recompiler thread context
struct spu_llvm
{
// Workload
lf_queue<std::pair<const u64, spu_item*>> registered;
atomic_ptr<named_thread_group<spu_llvm_worker>> m_workers;
spu_llvm()
{
// Dependency
g_fxo->init<spu_cache>();
}
void operator()()
{
if (g_cfg.core.spu_decoder != spu_decoder_type::llvm)
{
return;
}
// To compile (hash -> item)
std::unordered_multimap<u64, spu_item*, value_hash<u64>> enqueued;
// Mini-profiler (hash -> number of occurrences)
std::unordered_map<u64, atomic_t<u64>, value_hash<u64>> samples;
// For synchronization with profiler thread
stx::init_mutex prof_mutex;
named_thread profiler("SPU LLVM Profiler"sv, [&]()
{
while (thread_ctrl::state() != thread_state::aborting)
{
{
// Lock if enabled
const auto lock = prof_mutex.access();
if (!lock)
{
// Wait when the profiler is disabled
prof_mutex.wait_for_initialized();
continue;
}
// Collect profiling samples
2021-03-05 20:05:37 +01:00
idm::select<named_thread<spu_thread>>([&](u32 /*id*/, spu_thread& spu)
{
const u64 name = atomic_storage<u64>::load(spu.block_hash);
2021-04-17 14:39:13 +02:00
if (auto state = +spu.state; !::is_paused(state) && !::is_stopped(state) && cpu_flag::wait - state)
{
2021-01-12 11:01:06 +01:00
const auto found = std::as_const(samples).find(name);
if (found != std::as_const(samples).end())
{
const_cast<atomic_t<u64>&>(found->second)++;
}
}
});
}
// Sleep for a short period if enabled
thread_ctrl::wait_for(20, false);
}
});
u32 worker_count = 1;
if (uint hc = utils::get_thread_count(); hc >= 12)
{
worker_count = hc - 10;
}
u32 worker_index = 0;
m_workers = make_single<named_thread_group<spu_llvm_worker>>("SPUW.", worker_count);
auto workers_ptr = m_workers.load();
auto& workers = *workers_ptr;
while (thread_ctrl::state() != thread_state::aborting)
{
for (const auto& pair : registered.pop_all())
{
enqueued.emplace(pair);
// Interrupt and kick profiler thread
const auto lock = prof_mutex.init_always([&]{});
// Register new blocks to collect samples
samples.emplace(pair.first, 0);
}
if (enqueued.empty())
{
// Interrupt profiler thread and put it to sleep
static_cast<void>(prof_mutex.reset());
thread_ctrl::wait_on(utils::bless<atomic_t<u32>>(&registered)[1], 0);
continue;
}
// Find the most used enqueued item
u64 sample_max = 0;
auto found_it = enqueued.begin();
for (auto it = enqueued.begin(), end = enqueued.end(); it != end; ++it)
{
const u64 cur = ::at32(std::as_const(samples), it->first);
if (cur > sample_max)
{
sample_max = cur;
found_it = it;
}
}
// Start compiling
const spu_program& func = found_it->second->data;
2019-10-25 23:52:56 +02:00
// Old function pointer (pre-recompiled)
const spu_function_t _old = found_it->second->compiled;
// Remove item from the queue
enqueued.erase(found_it);
// Push the workload
(workers.begin() + (worker_index++ % worker_count))->registered.push(reinterpret_cast<u64>(_old), &func);
}
2021-02-21 15:16:06 +01:00
static_cast<void>(prof_mutex.init_always([&]{ samples.clear(); }));
m_workers.reset();
for (u32 i = 0; i < worker_count; i++)
{
(workers.begin() + i)->operator=(thread_state::aborting);
}
}
spu_llvm& operator=(thread_state)
{
if (const auto workers = m_workers.load())
{
for (u32 i = 0; i < workers->size(); i++)
{
(workers->begin() + i)->operator=(thread_state::aborting);
}
}
return *this;
}
static constexpr auto thread_name = "SPU LLVM"sv;
};
using spu_llvm_thread = named_thread<spu_llvm>;
struct spu_fast : public spu_recompiler_base
{
virtual void init() override
{
if (!m_spurt)
{
2021-03-02 12:59:19 +01:00
m_spurt = &g_fxo->get<spu_runtime>();
}
}
virtual spu_function_t compile(spu_program&& _func) override
{
2019-10-25 23:52:56 +02:00
const auto add_loc = m_spurt->add_empty(std::move(_func));
2019-10-25 23:52:56 +02:00
if (!add_loc)
{
2019-10-25 23:52:56 +02:00
return nullptr;
}
2019-10-25 23:52:56 +02:00
if (add_loc->compiled)
{
2019-10-25 23:52:56 +02:00
return add_loc->compiled;
}
const spu_program& func = add_loc->data;
2019-10-25 23:52:56 +02:00
if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1))
{
std::string log;
2019-10-25 23:52:56 +02:00
this->dump(func, log);
fs::write_file(m_spurt->get_cache_path() + "spu.log", fs::create + fs::write + fs::append, log);
}
// Allocate executable area with necessary size
const auto result = jit_runtime::alloc(22 + 1 + 9 + ::size32(func.data) * (16 + 16) + 36 + 47, 16);
if (!result)
{
return nullptr;
}
m_pos = func.lower_bound;
m_size = ::size32(func.data) * 4;
{
sha1_context ctx;
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data.data()), func.data.size() * 4);
sha1_finish(&ctx, output);
be_t<u64> hash_start;
std::memcpy(&hash_start, output, sizeof(hash_start));
m_hash_start = hash_start;
}
u8* raw = result;
// 8-byte intruction for patching (long NOP)
*raw++ = 0x0f;
*raw++ = 0x1f;
*raw++ = 0x84;
*raw++ = 0;
*raw++ = 0;
*raw++ = 0;
*raw++ = 0;
*raw++ = 0;
// mov rax, m_hash_start
*raw++ = 0x48;
*raw++ = 0xb8;
std::memcpy(raw, &m_hash_start, sizeof(m_hash_start));
raw += 8;
// Update block_hash: mov [r13 + spu_thread::m_block_hash], rax
*raw++ = 0x49;
*raw++ = 0x89;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
// Load PC: mov eax, [r13 + spu_thread::pc]
*raw++ = 0x41;
*raw++ = 0x8b;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
// Get LS address starting from PC: lea rcx, [rbp + rax]
*raw++ = 0x48;
*raw++ = 0x8d;
*raw++ = 0x4c;
*raw++ = 0x05;
*raw++ = 0x00;
// Verification (slow)
for (u32 i = 0; i < func.data.size(); i++)
{
if (!func.data[i])
{
continue;
}
// cmp dword ptr [rcx + off], opc
*raw++ = 0x81;
*raw++ = 0xb9;
const u32 off = i * 4;
const u32 opc = func.data[i];
std::memcpy(raw + 0, &off, 4);
std::memcpy(raw + 4, &opc, 4);
raw += 8;
// jne tr_dispatch
const s64 rel = reinterpret_cast<u64>(spu_runtime::tr_dispatch) - reinterpret_cast<u64>(raw) - 6;
*raw++ = 0x0f;
*raw++ = 0x85;
std::memcpy(raw + 0, &rel, 4);
raw += 4;
}
// trap
//*raw++ = 0xcc;
// Secondary prologue: sub rsp,0x28
*raw++ = 0x48;
*raw++ = 0x83;
*raw++ = 0xec;
*raw++ = 0x28;
// Fix args: xchg r13,rbp
*raw++ = 0x49;
*raw++ = 0x87;
*raw++ = 0xed;
// mov r12d, eax
*raw++ = 0x41;
*raw++ = 0x89;
*raw++ = 0xc4;
// mov esi, 0x7f0
*raw++ = 0xbe;
*raw++ = 0xf0;
*raw++ = 0x07;
*raw++ = 0x00;
*raw++ = 0x00;
// lea rdi, [rbp + spu_thread::gpr]
*raw++ = 0x48;
*raw++ = 0x8d;
*raw++ = 0x7d;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::gpr));
// Save base pc: mov [rbp + spu_thread::base_pc], eax
*raw++ = 0x89;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::base_pc));
// inc block_counter
*raw++ = 0x48;
*raw++ = 0xff;
*raw++ = 0x85;
const u32 blc_off = ::offset32(&spu_thread::block_counter);
std::memcpy(raw, &blc_off, 4);
raw += 4;
// lea r14, [local epilogue]
*raw++ = 0x4c;
*raw++ = 0x8d;
*raw++ = 0x35;
const u32 epi_off = ::size32(func.data) * 16;
std::memcpy(raw, &epi_off, 4);
raw += 4;
// Instructions (each instruction occupies fixed number of bytes)
for (u32 i = 0; i < func.data.size(); i++)
{
const u32 pos = m_pos + i * 4;
if (!func.data[i])
{
// Save pc: mov [rbp + spu_thread::pc], r12d
*raw++ = 0x44;
*raw++ = 0x89;
*raw++ = 0x65;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
// Epilogue: add rsp,0x28
*raw++ = 0x48;
*raw++ = 0x83;
*raw++ = 0xc4;
*raw++ = 0x28;
// ret (TODO)
*raw++ = 0xc3;
std::memset(raw, 0xcc, 16 - 9);
raw += 16 - 9;
continue;
}
// Fix endianness
const spu_opcode_t op{std::bit_cast<be_t<u32>>(func.data[i])};
switch (auto type = g_spu_itype.decode(op.opcode))
{
case spu_itype::BRZ:
case spu_itype::BRHZ:
case spu_itype::BRNZ:
case spu_itype::BRHNZ:
{
const u32 target = spu_branch_target(pos, op.i16);
if (0 && target >= m_pos && target < m_pos + m_size)
{
*raw++ = type == spu_itype::BRHZ || type == spu_itype::BRHNZ ? 0x66 : 0x90;
*raw++ = 0x83;
*raw++ = 0xbd;
const u32 off = ::offset32(&spu_thread::gpr, op.rt) + 12;
std::memcpy(raw, &off, 4);
raw += 4;
*raw++ = 0x00;
*raw++ = 0x0f;
*raw++ = type == spu_itype::BRZ || type == spu_itype::BRHZ ? 0x84 : 0x85;
const u32 dif = (target - (pos + 4)) / 4 * 16 + 2;
std::memcpy(raw, &dif, 4);
raw += 4;
*raw++ = 0x66;
*raw++ = 0x90;
break;
}
[[fallthrough]];
}
default:
{
// Ballast: mov r15d, pos
*raw++ = 0x41;
*raw++ = 0xbf;
std::memcpy(raw, &pos, 4);
raw += 4;
// mov ebx, opc
*raw++ = 0xbb;
std::memcpy(raw, &op, 4);
raw += 4;
// call spu_* (specially built interpreter function)
2023-09-06 05:53:10 +02:00
const s64 rel = spu_runtime::g_interpreter_table[static_cast<usz>(type)] - reinterpret_cast<u64>(raw) - 5;
*raw++ = 0xe8;
std::memcpy(raw, &rel, 4);
raw += 4;
break;
}
}
}
// Local dispatcher/epilogue: fix stack after branch instruction, then dispatch or return
// add rsp, 8
*raw++ = 0x48;
*raw++ = 0x83;
*raw++ = 0xc4;
*raw++ = 0x08;
// and rsp, -16
*raw++ = 0x48;
*raw++ = 0x83;
*raw++ = 0xe4;
*raw++ = 0xf0;
// lea rax, [r12 - size]
*raw++ = 0x49;
*raw++ = 0x8d;
*raw++ = 0x84;
*raw++ = 0x24;
const u32 msz = 0u - m_size;
std::memcpy(raw, &msz, 4);
raw += 4;
// sub eax, [rbp + spu_thread::base_pc]
*raw++ = 0x2b;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::base_pc));
// cmp eax, (0 - size)
*raw++ = 0x3d;
std::memcpy(raw, &msz, 4);
raw += 4;
// jb epilogue
*raw++ = 0x72;
*raw++ = +12;
// movsxd rax, eax
*raw++ = 0x48;
*raw++ = 0x63;
*raw++ = 0xc0;
// shl rax, 2
*raw++ = 0x48;
*raw++ = 0xc1;
*raw++ = 0xe0;
*raw++ = 0x02;
// add rax, r14
*raw++ = 0x4c;
*raw++ = 0x01;
*raw++ = 0xf0;
// jmp rax
*raw++ = 0xff;
*raw++ = 0xe0;
// Save pc: mov [rbp + spu_thread::pc], r12d
*raw++ = 0x44;
*raw++ = 0x89;
*raw++ = 0x65;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
// Epilogue: add rsp,0x28 ; ret
*raw++ = 0x48;
*raw++ = 0x83;
*raw++ = 0xc4;
*raw++ = 0x28;
*raw++ = 0xc3;
2019-10-25 23:52:56 +02:00
const auto fn = reinterpret_cast<spu_function_t>(result);
// Install pointer carefully
const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn);
// Check hash against allowed bounds
const bool inverse_bounds = g_cfg.core.spu_llvm_lower_bound > g_cfg.core.spu_llvm_upper_bound;
if ((!inverse_bounds && (m_hash_start < g_cfg.core.spu_llvm_lower_bound || m_hash_start > g_cfg.core.spu_llvm_upper_bound)) ||
(inverse_bounds && (m_hash_start < g_cfg.core.spu_llvm_lower_bound && m_hash_start > g_cfg.core.spu_llvm_upper_bound)))
{
spu_log.error("[Debug] Skipped function %s", fmt::base57(be_t<u64>{m_hash_start}));
}
else if (added)
2019-10-25 23:52:56 +02:00
{
// Send work to LLVM compiler thread
2021-03-02 12:59:19 +01:00
g_fxo->get<spu_llvm_thread>().registered.push(m_hash_start, add_loc);
2019-10-25 23:52:56 +02:00
}
// Rebuild trampoline if necessary
if (!m_spurt->rebuild_ubertrampoline(func.data[0]))
{
return nullptr;
}
2019-10-25 23:52:56 +02:00
if (added)
{
add_loc->compiled.notify_all();
}
2019-10-25 23:52:56 +02:00
return fn;
}
};
std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_fast_llvm_recompiler()
{
return std::make_unique<spu_fast>();
}