// rpcsx/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
#include "stdafx.h"
#include "SPURecompiler.h"
#include "Emu/System.h"
#include "Emu/system_config.h"
#include "Emu/system_progress.hpp"
#include "Emu/system_utils.hpp"
#include "Emu/cache_utils.hpp"
#include "Emu/IdManager.h"
#include "Crypto/sha1.h"
#include "Utilities/StrUtil.h"
#include "Utilities/JIT.h"
#include "util/init_mutex.hpp"
#include "util/shared_ptr.hpp"
#include "Emu/Cell/Modules/cellSync.h"
#include "SPUThread.h"
#include "SPUAnalyser.h"
#include "SPUInterpreter.h"
#include "SPUDisAsm.h"
#include <algorithm>
#include <optional>
#include <unordered_set>
#include "util/v128.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
const extern spu_decoder<spu_itype> g_spu_itype;
const extern spu_decoder<spu_iname> g_spu_iname;
const extern spu_decoder<spu_iflag> g_spu_iflag;
constexpr u32 s_reg_max = spu_recompiler_base::s_reg_max;
// Move 4 args for calling native function from a GHC calling convention function
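// GHC calling convention register mapping used by SPU JIT code
// (established in g_gateway below, see also g_tail_escape):
//   x86-64: r13 = spu_thread* (Base), rbp = LS base (Sp), r12 = Hp, rbx = R1
//   arm64:  x19 = spu_thread* (Base), x20 = LS base (Sp), x21 = Hp, x22 = R1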
#if defined(ARCH_X64)
static u8* move_args_ghc_to_native(u8* raw)
{
#ifdef _WIN32
// mov rcx, r13
// mov rdx, rbp
// mov r8, r12
// mov r9, rbx
std::memcpy(raw, "\x4C\x89\xE9\x48\x89\xEA\x4D\x89\xE0\x49\x89\xD9", 12);
#else
// mov rdi, r13
// mov rsi, rbp
// mov rdx, r12
// mov rcx, rbx
std::memcpy(raw, "\x4C\x89\xEF\x48\x89\xEE\x4C\x89\xE2\x48\x89\xD9", 12);
#endif
return raw + 12;
}
#elif defined(ARCH_ARM64)
static void ghc_cpp_trampoline(u64 fn_target, native_asm& c, auto& args)
{
using namespace asmjit;
Label target = c.newLabel();
c.mov(args[0], a64::x19);
c.mov(args[1], a64::x20);
c.mov(args[2], a64::x21);
c.mov(args[3], a64::x22);
c.ldr(a64::x15, arm::Mem(target));
c.br(a64::x15);
c.brk(Imm(0x42)); // Unreachable
c.bind(target);
c.embedUInt64(fn_target);
}
#endif
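// The tr_* trampolines below transfer control from JIT code (GHC CC) back to
// the C++ runtime: tr_dispatch re-dispatches (with a pause/yield for polite
// spinning), tr_branch patches branch patchpoints, tr_interpreter falls back
// to the interpreter.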
DECLARE(spu_runtime::tr_dispatch) = []
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
#if defined(ARCH_X64)
// Generate a special trampoline to spu_recompiler_base::dispatch with pause instruction
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = move_args_ghc_to_native(trptr);
*raw++ = 0xf3; // pause
*raw++ = 0x90;
*raw++ = 0xff; // jmp [rip]
*raw++ = 0x25;
std::memset(raw, 0, 4);
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::dispatch);
std::memcpy(raw + 4, &target, 8);
return reinterpret_cast<spu_function_t>(trptr);
#elif defined(ARCH_ARM64)
auto trptr = build_function_asm<spu_function_t>("tr_dispatch",
[](native_asm& c, auto& args)
{
c.yield();
ghc_cpp_trampoline(reinterpret_cast<u64>(&spu_recompiler_base::dispatch), c, args);
c.embed("tr_dispatch", 11);
});
return trptr;
#else
#error "Unimplemented"
#endif
}();
DECLARE(spu_runtime::tr_branch) = []
{
#if defined(ARCH_X64)
// Generate a trampoline to spu_recompiler_base::branch
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = move_args_ghc_to_native(trptr);
*raw++ = 0xff; // jmp [rip]
*raw++ = 0x25;
std::memset(raw, 0, 4);
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::branch);
std::memcpy(raw + 4, &target, 8);
return reinterpret_cast<spu_function_t>(trptr);
#elif defined(ARCH_ARM64)
auto trptr = build_function_asm<spu_function_t>("tr_branch",
[](native_asm& c, auto& args)
{
ghc_cpp_trampoline(reinterpret_cast<u64>(&spu_recompiler_base::branch), c, args);
c.embed("tr_branch", 9);
});
return trptr;
#else
#error "Unimplemented"
#endif
}();
DECLARE(spu_runtime::tr_interpreter) = []
{
#if defined(ARCH_X64)
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = move_args_ghc_to_native(trptr);
*raw++ = 0xff; // jmp [rip]
*raw++ = 0x25;
std::memset(raw, 0, 4);
const u64 target = reinterpret_cast<u64>(&spu_recompiler_base::old_interpreter);
std::memcpy(raw + 4, &target, 8);
return reinterpret_cast<spu_function_t>(trptr);
#elif defined(ARCH_ARM64)
auto trptr = build_function_asm<spu_function_t>("tr_interpreter",
[](native_asm& c, auto& args)
{
ghc_cpp_trampoline(reinterpret_cast<u64>(&spu_recompiler_base::old_interpreter), c, args);
c.embed("tr_interpreter", 14);
});
return trptr;
#else
#error "Unimplemented"
#endif
}();
DECLARE(spu_runtime::g_dispatcher) = []
{
// Allocate 2^20 positions in data area
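// The table is indexed by the upper 20 bits of the instruction word at PC (see tr_all)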
const auto ptr = reinterpret_cast<std::remove_const_t<decltype(spu_runtime::g_dispatcher)>>(jit_runtime::alloc(sizeof(*g_dispatcher), 64, false));
for (auto& x : *ptr)
{
x.raw() = tr_dispatch;
}
return ptr;
}();
DECLARE(spu_runtime::tr_all) = []
{
#if defined(ARCH_X64)
u8* const trptr = jit_runtime::alloc(32, 16);
u8* raw = trptr;
// Load PC: mov eax, [r13 + spu_thread::pc]
*raw++ = 0x41;
*raw++ = 0x8b;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
// Get LS address starting from PC: lea rcx, [rbp + rax]
*raw++ = 0x48;
*raw++ = 0x8d;
*raw++ = 0x4c;
*raw++ = 0x05;
*raw++ = 0x00;
// mov eax, [rcx]
*raw++ = 0x8b;
*raw++ = 0x01;
// shr eax, (32 - 20)
*raw++ = 0xc1;
*raw++ = 0xe8;
*raw++ = 0x0c;
// Load g_dispatcher to rdx
*raw++ = 0x48;
*raw++ = 0x8d;
*raw++ = 0x15;
const s32 r32 = ::narrow<s32>(reinterpret_cast<u64>(g_dispatcher) - reinterpret_cast<u64>(raw) - 4);
std::memcpy(raw, &r32, 4);
raw += 4;
// Update block_hash (set zero): mov [r13 + spu_thread::block_hash], 0
*raw++ = 0x49;
*raw++ = 0xc7;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x00;
// jmp [rdx + rax * 8]
*raw++ = 0xff;
*raw++ = 0x24;
*raw++ = 0xc2;
return reinterpret_cast<spu_function_t>(trptr);
#elif defined(ARCH_ARM64)
auto trptr = build_function_asm<spu_function_t>("tr_all",
[](native_asm& c, auto& args)
{
using namespace asmjit;
// w1: PC (eax in x86 SPU)
// x7: lsa (rcx in x86 SPU)
// Load PC
Label pc_offset = c.newLabel();
c.ldr(a64::x0, arm::Mem(pc_offset));
c.ldr(a64::w1, arm::Mem(a64::x19, a64::x0)); // REG_Base + offset(spu_thread::pc)
// Compute LS address = REG_Sp + PC, store into x7 (use later)
c.add(a64::x7, a64::x20, a64::x1);
// Load 32b from LS address
c.ldr(a64::w3, arm::Mem(a64::x7));
// shr (32 - 20)
c.lsr(a64::w3, a64::w3, Imm(32 - 20));
// Load g_dispatcher
Label g_dispatcher_offset = c.newLabel();
c.ldr(a64::x4, arm::Mem(g_dispatcher_offset));
// Update block hash
Label block_hash_offset = c.newLabel();
c.mov(a64::x5, Imm(0));
c.ldr(a64::x6, arm::Mem(block_hash_offset));
c.str(a64::x5, arm::Mem(a64::x19, a64::x6)); // REG_Base + offset(spu_thread::block_hash)
// Jump to [g_dispatcher + idx * 8]
c.mov(a64::x6, Imm(8));
c.mul(a64::x6, a64::x3, a64::x6);
c.add(a64::x4, a64::x4, a64::x6);
c.ldr(a64::x4, arm::Mem(a64::x4));
c.br(a64::x4);
c.bind(pc_offset);
c.embedUInt64(::offset32(&spu_thread::pc));
c.bind(g_dispatcher_offset);
c.embedUInt64(reinterpret_cast<u64>(g_dispatcher));
c.bind(block_hash_offset);
c.embedUInt64(::offset32(&spu_thread::block_hash));
c.embed("tr_all", 6);
});
return trptr;
#else
#error "Unimplemented"
#endif
}();
DECLARE(spu_runtime::g_gateway) = build_function_asm<spu_function_t>("spu_gateway", [](native_asm& c, auto& args)
{
// Gateway for the SPU dispatcher: converts from the native to the GHC calling convention, and saves the RSP value for spu_escape
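// GHC CC treats effectively every register as scratch, so the whole callee-saved
// set must be preserved around the JIT call (including xmm6-xmm15 on Win64)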
using namespace asmjit;
#if defined(ARCH_X64)
#ifdef _WIN32
c.push(x86::r15);
c.push(x86::r14);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rsi);
c.push(x86::rdi);
c.push(x86::rbp);
c.push(x86::rbx);
c.sub(x86::rsp, 0xa8);
c.movaps(x86::oword_ptr(x86::rsp, 0x90), x86::xmm15);
c.movaps(x86::oword_ptr(x86::rsp, 0x80), x86::xmm14);
c.movaps(x86::oword_ptr(x86::rsp, 0x70), x86::xmm13);
c.movaps(x86::oword_ptr(x86::rsp, 0x60), x86::xmm12);
c.movaps(x86::oword_ptr(x86::rsp, 0x50), x86::xmm11);
c.movaps(x86::oword_ptr(x86::rsp, 0x40), x86::xmm10);
c.movaps(x86::oword_ptr(x86::rsp, 0x30), x86::xmm9);
c.movaps(x86::oword_ptr(x86::rsp, 0x20), x86::xmm8);
c.movaps(x86::oword_ptr(x86::rsp, 0x10), x86::xmm7);
c.movaps(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
#else
c.push(x86::rbp);
c.push(x86::r15);
c.push(x86::r14);
c.push(x86::r13);
c.push(x86::r12);
c.push(x86::rbx);
c.push(x86::rax);
#endif
// Save native stack pointer for longjmp emulation
c.mov(x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)), x86::rsp);
// Move 4 args (despite spu_function_t def)
c.mov(x86::r13, args[0]);
c.mov(x86::rbp, args[1]);
c.mov(x86::r12, args[2]);
c.mov(x86::rbx, args[3]);
if (utils::has_avx())
{
c.vzeroupper();
}
c.call(spu_runtime::tr_all);
if (utils::has_avx())
{
c.vzeroupper();
}
#ifdef _WIN32
c.movaps(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rsp, 0x10));
c.movaps(x86::xmm8, x86::oword_ptr(x86::rsp, 0x20));
c.movaps(x86::xmm9, x86::oword_ptr(x86::rsp, 0x30));
c.movaps(x86::xmm10, x86::oword_ptr(x86::rsp, 0x40));
c.movaps(x86::xmm11, x86::oword_ptr(x86::rsp, 0x50));
c.movaps(x86::xmm12, x86::oword_ptr(x86::rsp, 0x60));
c.movaps(x86::xmm13, x86::oword_ptr(x86::rsp, 0x70));
c.movaps(x86::xmm14, x86::oword_ptr(x86::rsp, 0x80));
c.movaps(x86::xmm15, x86::oword_ptr(x86::rsp, 0x90));
c.add(x86::rsp, 0xa8);
c.pop(x86::rbx);
c.pop(x86::rbp);
c.pop(x86::rdi);
c.pop(x86::rsi);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::r14);
c.pop(x86::r15);
#else
c.add(x86::rsp, +8);
c.pop(x86::rbx);
c.pop(x86::r12);
c.pop(x86::r13);
c.pop(x86::r14);
c.pop(x86::r15);
c.pop(x86::rbp);
#endif
c.ret();
#elif defined(ARCH_ARM64)
// Push callee saved registers to the stack
// We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
c.sub(a64::sp, a64::sp, Imm(112));
c.stp(a64::x18, a64::x19, arm::Mem(a64::sp));
c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
c.str(a64::x30, arm::Mem(a64::sp, 96));
// Save native stack pointer for longjmp emulation
Label sp_offset = c.newLabel();
c.ldr(a64::x26, arm::Mem(sp_offset));
// sp not allowed to be used in load/stores directly
c.mov(a64::x15, a64::sp);
c.str(a64::x15, arm::Mem(args[0], a64::x26));
// Move 4 args (despite spu_function_t def)
c.mov(a64::x19, args[0]);
c.mov(a64::x20, args[1]);
c.mov(a64::x21, args[2]);
c.mov(a64::x22, args[3]);
// Save ret address to stack
// since non-tail calls to cpp fns may corrupt lr and
// g_tail_escape may jump out of a fn before the epilogue can restore lr
Label ret_addr = c.newLabel();
c.adr(a64::x0, ret_addr);
c.str(a64::x0, arm::Mem(a64::sp, 104));
Label call_target = c.newLabel();
c.ldr(a64::x0, arm::Mem(call_target));
c.blr(a64::x0);
c.bind(ret_addr);
// Restore stack ptr
c.ldr(a64::x26, arm::Mem(sp_offset));
c.ldr(a64::x15, arm::Mem(a64::x19, a64::x26));
c.mov(a64::sp, a64::x15);
// Restore registers from the stack
c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp));
c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
c.ldr(a64::x30, arm::Mem(a64::sp, 96));
// Restore stack ptr
c.add(a64::sp, a64::sp, Imm(112));
// Return
c.ret(a64::x30);
c.bind(sp_offset);
c.embedUInt64(::offset32(&spu_thread::saved_native_sp));
c.bind(call_target);
c.embedUInt64(reinterpret_cast<u64>(spu_runtime::tr_all));
c.embed("spu_gateway", 11);
#else
#error "Unimplemented"
#endif
});
DECLARE(spu_runtime::g_escape) = build_function_asm<void(*)(spu_thread*)>("spu_escape", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
// Restore native stack pointer (longjmp emulation)
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)));
// Return to the return location
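// (saved_native_sp was captured before the gateway's call pushed its return
// address, so stepping rsp back 8 bytes re-exposes that address for ret)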
c.sub(x86::rsp, 8);
c.ret();
#elif defined(ARCH_ARM64)
// Restore native stack pointer (longjmp emulation)
Label sp_offset = c.newLabel();
c.ldr(a64::x15, arm::Mem(sp_offset));
c.ldr(a64::x15, arm::Mem(args[0], a64::x15));
c.mov(a64::sp, a64::x15);
c.ldr(a64::x30, arm::Mem(a64::sp, 104));
c.ret(a64::x30);
c.bind(sp_offset);
c.embedUInt64(::offset32(&spu_thread::saved_native_sp));
c.embed("spu_escape", 10);
#else
#error "Unimplemented"
#endif
});
DECLARE(spu_runtime::g_tail_escape) = build_function_asm<void(*)(spu_thread*, spu_function_t, u8*)>("spu_tail_escape", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
// Restore native stack pointer (longjmp emulation)
c.mov(x86::rsp, x86::qword_ptr(args[0], ::offset32(&spu_thread::saved_native_sp)));
// Adjust stack for initial call instruction in the gateway
c.sub(x86::rsp, 16);
// Tail call, GHC CC (second arg)
c.mov(x86::r13, args[0]);
c.mov(x86::rbp, x86::qword_ptr(args[0], ::offset32(&spu_thread::ls)));
c.mov(x86::r12, args[2]);
c.xor_(x86::ebx, x86::ebx);
c.mov(x86::qword_ptr(x86::rsp), args[1]);
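// 'ret' pops the stored target: an indirect tail jump that leaves rsp exactly
// as if the gateway's original call had just entered the function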
c.ret();
#elif defined(ARCH_ARM64)
// Restore native stack pointer (longjmp emulation)
Label sp_offset = c.newLabel();
c.ldr(a64::x15, arm::Mem(sp_offset));
c.ldr(a64::x15, arm::Mem(args[0], a64::x15));
c.mov(a64::sp, a64::x15);
// Reload lr, since it might've been clobbered by a cpp fn
// and g_tail_escape runs before epilogue
c.ldr(a64::x30, arm::Mem(a64::sp, 104));
// Tail call, GHC CC
c.mov(a64::x19, args[0]); // REG_Base
Label ls_offset = c.newLabel();
c.ldr(a64::x20, arm::Mem(ls_offset));
c.ldr(a64::x20, arm::Mem(args[0], a64::x20)); // REG_Sp
c.mov(a64::x21, args[2]); // REG_Hp
c.eor(a64::w22, a64::w22, a64::w22); // REG_R1
c.br(args[1]);
c.bind(ls_offset);
c.embedUInt64(::offset32(&spu_thread::ls));
c.bind(sp_offset);
c.embedUInt64(::offset32(&spu_thread::saved_native_sp));
c.embed("spu_tail_escape", 15);
#else
#error "Unimplemented"
#endif
});
DECLARE(spu_runtime::g_interpreter_table) = {};
DECLARE(spu_runtime::g_interpreter) = nullptr;
spu_cache::spu_cache(const std::string& loc)
: m_file(loc, fs::read + fs::write + fs::create + fs::append)
{
}
spu_cache::~spu_cache()
{
}
extern void utilize_spu_data_segment(u32 vaddr, const void* ls_data_vaddr, u32 size)
{
if (vaddr % 4)
{
return;
}
size &= -4;
if (!size || vaddr + size > SPU_LS_SIZE)
{
return;
}
if (!g_cfg.core.llvm_precompilation)
{
return;
}
g_fxo->need<spu_cache>();
if (!g_fxo->get<spu_cache>().collect_funcs_to_precompile)
{
return;
}
std::basic_string<u32> data(size / 4, 0);
std::memcpy(data.data(), ls_data_vaddr, size);
spu_cache::precompile_data_t obj{vaddr, std::move(data)};
obj.funcs = spu_thread::discover_functions(vaddr, { reinterpret_cast<const u8*>(ls_data_vaddr), size }, vaddr != 0, umax);
if (obj.funcs.empty())
{
// Nothing to add
return;
}
for (u32 addr : obj.funcs)
{
spu_log.notice("Found SPU function at: 0x%05x", addr);
}
spu_log.notice("Found %u SPU functions", obj.funcs.size());
g_fxo->get<spu_cache>().precompile_funcs.push(std::move(obj));
}
// For SPU cache validity check
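// (byte-wise CRC-16, polynomial 0x1021, initial value 0xFFFF)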
static u16 calculate_crc16(const uchar* data, usz length)
{
u16 crc = umax;
while (length--)
{
u8 x = (crc >> 8) ^ *data++;
x ^= (x >> 4);
crc = static_cast<u16>((crc << 8) ^ (x << 12) ^ (x << 5) ^ x);
}
return crc;
}
std::deque<spu_program> spu_cache::get()
{
std::deque<spu_program> result;
if (!m_file)
{
return result;
}
m_file.seek(0);
// TODO: signal truncated or otherwise broken file
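// On-disk record layout (big-endian): u16 crc, u16 size (in 32-bit words),
// u32 entry address, then `size` instruction words (written by spu_cache::add)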
while (true)
{
struct block_info_t
{
be_t<u16> crc;
be_t<u16> size;
be_t<u32> addr;
} block_info{};
if (!m_file.read(block_info))
{
break;
}
const u32 crc = block_info.crc;
const u32 size = block_info.size;
const u32 addr = block_info.addr;
if (utils::add_saturate<u32>(addr, size * 4) > SPU_LS_SIZE)
{
break;
}
std::vector<u32> func;
if (!m_file.read(func, size))
{
break;
}
if (!size || !func[0])
{
// Skip old format Giga entries
continue;
}
// CRC check is optional to be compatible with old format
if (crc && std::max<u32>(calculate_crc16(reinterpret_cast<const uchar*>(func.data()), size * 4), 1) != crc)
{
// Invalid, but continue anyway
continue;
}
spu_program res;
res.entry_point = addr;
res.lower_bound = addr;
res.data = std::move(func);
result.emplace_front(std::move(res));
}
return result;
}
void spu_cache::add(const spu_program& func)
{
if (!m_file)
{
return;
}
be_t<u32> size = ::size32(func.data);
be_t<u32> addr = func.entry_point;
// Add CRC (forced non-zero)
size |= std::max<u32>(calculate_crc16(reinterpret_cast<const uchar*>(func.data.data()), size * 4), 1) << 16;
const fs::iovec_clone gather[3]
{
{&size, sizeof(size)},
{&addr, sizeof(addr)},
{func.data.data(), func.data.size() * 4}
};
// Append data
m_file.write_gather(gather, 3);
}
void spu_cache::initialize(bool build_existing_cache)
{
spu_runtime::g_interpreter = spu_runtime::g_gateway;
if (g_cfg.core.spu_decoder == spu_decoder_type::_static || g_cfg.core.spu_decoder == spu_decoder_type::dynamic)
{
for (auto& x : *spu_runtime::g_dispatcher)
{
x.raw() = spu_runtime::tr_interpreter;
}
}
const std::string ppu_cache = rpcs3::cache::get_ppu_cache();
if (ppu_cache.empty())
{
return;
}
// SPU cache file (version + block size type)
const std::string loc = ppu_cache + "spu-" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v1-tane.dat";
spu_cache cache(loc);
if (!cache)
{
spu_log.error("Failed to initialize SPU cache at: %s", loc);
return;
}
// Read cache
auto func_list = cache.get();
atomic_t<usz> fnext{};
atomic_t<u8> fail_flag{0};
auto data_list = g_fxo->get<spu_cache>().precompile_funcs.pop_all();
g_fxo->get<spu_cache>().collect_funcs_to_precompile = false;
usz total_precompile = 0;
for (auto& sec : data_list)
{
total_precompile += sec.funcs.size();
}
const bool spu_precompilation_enabled = func_list.empty() && g_cfg.core.spu_cache && g_cfg.core.llvm_precompilation;
if (spu_precompilation_enabled)
{
// Whatever is compiled in this case goes straight to disk
g_fxo->get<spu_cache>() = std::move(cache);
}
else if (!build_existing_cache)
{
return;
}
else
{
total_precompile = 0;
data_list = {};
}
atomic_t<usz> data_indexer = 0;
if (g_cfg.core.spu_decoder == spu_decoder_type::dynamic || g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
if (auto compiler = spu_recompiler_base::make_llvm_recompiler(11))
{
compiler->init();
if (compiler->compile({}) && spu_runtime::g_interpreter)
{
spu_log.success("SPU Runtime: Built the interpreter.");
if (g_cfg.core.spu_decoder != spu_decoder_type::llvm)
{
return;
}
}
else
{
spu_log.fatal("SPU Runtime: Failed to build the interpreter.");
}
}
}
u32 worker_count = 0;
std::optional<scoped_progress_dialog> progr;
u32 total_funcs = 0;
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit || g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
const usz add_count = func_list.size() + total_precompile;
if (add_count)
{
total_funcs = build_existing_cache ? ::narrow<u32>(add_count) : 0;
}
worker_count = std::min<u32>(rpcs3::utils::get_max_threads(), ::narrow<u32>(add_count));
}
atomic_t<u32> pending_progress = 0;
atomic_t<bool> showing_progress = false;
if (!g_progr_ptotal)
{
g_progr_ptotal += total_funcs;
showing_progress.release(true);
progr.emplace("Building SPU cache...");
}
named_thread_group workers("SPU Worker ", worker_count, [&]() -> uint
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
// Set low priority
thread_ctrl::scoped_priority low_prio(-1);
// Initialize compiler instances for parallel compilation
std::unique_ptr<spu_recompiler_base> compiler;
if (g_cfg.core.spu_decoder == spu_decoder_type::asmjit)
{
compiler = spu_recompiler_base::make_asmjit_recompiler();
}
else if (g_cfg.core.spu_decoder == spu_decoder_type::llvm)
{
compiler = spu_recompiler_base::make_llvm_recompiler();
}
compiler->init();
// Counter for error reporting
u32 logged_error = 0;
// How much every thread compiled
uint result = 0;
// Fake LS
std::vector<be_t<u32>> ls(0x10000);
usz func_i = fnext++;
// Ensure some actions are performed on a single thread
const bool is_first_thread = func_i == 0;
// Build functions
for (; func_i < func_list.size(); func_i = fnext++, (showing_progress ? g_progr_pdone : pending_progress) += build_existing_cache ? 1 : 0)
{
const spu_program& func = std::as_const(func_list)[func_i];
if (Emu.IsStopped() || fail_flag)
{
continue;
}
// Get data start
const u32 start = func.lower_bound;
const u32 size0 = ::size32(func.data);
be_t<u64> hash_start;
{
sha1_context ctx;
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data.data()), func.data.size() * 4);
sha1_finish(&ctx, output);
std::memcpy(&hash_start, output, sizeof(hash_start));
}
// Check hash against allowed bounds
const bool inverse_bounds = g_cfg.core.spu_llvm_lower_bound > g_cfg.core.spu_llvm_upper_bound;
if ((!inverse_bounds && (hash_start < g_cfg.core.spu_llvm_lower_bound || hash_start > g_cfg.core.spu_llvm_upper_bound)) ||
(inverse_bounds && (hash_start < g_cfg.core.spu_llvm_lower_bound && hash_start > g_cfg.core.spu_llvm_upper_bound)))
{
spu_log.error("[Debug] Skipped function %s", fmt::base57(hash_start));
result++;
continue;
}
// Initialize LS with function data only
for (u32 i = 0, pos = start; i < size0; i++, pos += 4)
{
ls[pos / 4] = std::bit_cast<be_t<u32>>(func.data[i]);
}
// Call analyser
spu_program func2 = compiler->analyse(ls.data(), func.entry_point);
if (func2 != func)
{
spu_log.error("[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0);
if (logged_error < 2)
{
std::string log;
compiler->dump(func, log);
spu_log.notice("[0x%05x] Function: %s", func.entry_point, log);
logged_error++;
}
}
else if (!compiler->compile(std::move(func2)))
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
continue;
}
// Clear fake LS
std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1));
result++;
if (is_first_thread && !showing_progress)
{
if (!g_progr.load() && !g_progr_ptotal && !g_progr_ftotal)
{
showing_progress = true;
g_progr_pdone += pending_progress.exchange(0);
g_progr_ptotal += total_funcs;
progr.emplace("Building SPU cache...");
}
}
else if (showing_progress && pending_progress)
{
// Cover missing progress due to a race
g_progr_pdone += pending_progress.exchange(0);
}
}
u32 last_sec_idx = umax;
for (func_i = data_indexer++;; func_i = data_indexer++, (showing_progress ? g_progr_pdone : pending_progress) += build_existing_cache ? 1 : 0)
{
usz passed_count = 0;
u32 func_addr = 0;
u32 next_func = 0;
u32 sec_addr = umax;
u32 sec_idx = 0;
std::basic_string_view<u32> inst_data;
// Try to get the data this index points to
for (auto& sec : data_list)
{
if (func_i < passed_count + sec.funcs.size())
{
const usz func_idx = func_i - passed_count;
sec_addr = sec.vaddr;
func_addr = ::at32(sec.funcs, func_idx);
inst_data = sec.inst_data;
next_func = func_idx + 1 >= sec.funcs.size() ? ::narrow<u32>(sec_addr + inst_data.size() * 4) : sec.funcs[func_idx + 1]; // Bound: next function in section, or section end for the last one
break;
}
passed_count += sec.funcs.size();
sec_idx++;
}
if (sec_addr == umax)
{
// End of compilation for thread
break;
}
if (Emu.IsStopped() || fail_flag)
{
continue;
}
if (last_sec_idx != sec_idx)
{
if (last_sec_idx != umax)
{
// Clear fake LS of previous section
auto& sec = data_list[last_sec_idx];
std::memset(ls.data() + sec.vaddr / 4, 0, sec.inst_data.size() * 4);
}
// Initialize LS with the entire section data
for (u32 i = 0, pos = sec_addr; i < inst_data.size(); i++, pos += 4)
{
ls[pos / 4] = std::bit_cast<be_t<u32>>(inst_data[i]);
}
last_sec_idx = sec_idx;
}
u32 block_addr = func_addr;
std::map<u32, std::basic_string<u32>> targets;
// Call analyser
spu_program func2 = compiler->analyse(ls.data(), block_addr, &targets);
while (!func2.data.empty())
{
const u32 last_inst = std::bit_cast<be_t<u32>>(func2.data.back());
const u32 prog_size = ::size32(func2.data);
if (!compiler->compile(std::move(func2)))
{
// Likely, out of JIT memory. Signal to prevent further building.
fail_flag |= 1;
break;
}
result++;
const u32 start_new = block_addr + prog_size * 4;
if (start_new >= next_func || (start_new == next_func - 4 && ls[start_new / 4] == 0x200000u))
{
// Completed
break;
}
if (auto type = g_spu_itype.decode(last_inst);
type == spu_itype::BRSL || type == spu_itype::BRASL || type == spu_itype::BISL || type == spu_itype::SYNC)
{
if (ls[start_new / 4] && g_spu_itype.decode(ls[start_new / 4]) != spu_itype::UNK)
{
spu_log.notice("Precompiling fallthrough to 0x%05x", start_new);
func2 = compiler->analyse(ls.data(), start_new, &targets);
block_addr = start_new;
continue;
}
}
if (targets.empty())
{
break;
}
const auto upper = targets.upper_bound(func_addr);
if (upper == targets.begin())
{
break;
}
u32 new_entry = umax;
// Find the lowest target in the space in-between
for (auto it = std::prev(upper); it != targets.end() && it->first < start_new && new_entry > start_new; it++)
{
for (u32 target : it->second)
{
if (target >= start_new && target < next_func)
{
if (target < new_entry)
{
new_entry = target;
if (new_entry == start_new)
{
// Cannot go lower
break;
}
}
}
}
}
if (new_entry != umax && !spu_thread::is_exec_code(new_entry, { reinterpret_cast<const u8*>(ls.data()), SPU_LS_SIZE }, 0, true))
{
new_entry = umax;
}
if (new_entry == umax)
{
new_entry = start_new;
while (new_entry < next_func && (ls[start_new / 4] < 0x3fffc || !spu_thread::is_exec_code(new_entry, { reinterpret_cast<const u8*>(ls.data()), SPU_LS_SIZE }, 0, true)))
{
new_entry += 4;
}
if (new_entry >= next_func || (new_entry == next_func - 4 && ls[new_entry / 4] == 0x200000u))
{
// Completed
break;
}
}
spu_log.notice("Precompiling filler space at 0x%05x (next=0x%05x)", new_entry, next_func);
func2 = compiler->analyse(ls.data(), new_entry, &targets);
block_addr = new_entry;
}
if (is_first_thread && !showing_progress)
{
if (!g_progr.load() && !g_progr_ptotal && !g_progr_ftotal)
{
showing_progress = true;
g_progr_pdone += pending_progress.exchange(0);
g_progr_ptotal += total_funcs;
progr.emplace("Building SPU cache...");
}
}
else if (showing_progress && pending_progress)
{
// Cover missing progress due to a race
g_progr_pdone += pending_progress.exchange(0);
}
}
if (showing_progress && pending_progress)
{
// Cover missing progress due to a race
g_progr_pdone += pending_progress.exchange(0);
}
return result;
});
u32 built_total = 0;
// Join (implicitly) and print individual results
for (u32 i = 0; i < workers.size(); i++)
{
spu_log.notice("SPU Runtime: Worker %u built %u programs.", i + 1, workers[i]);
built_total += workers[i];
}
spu_log.notice("SPU Runtime: Workers built %u programs.", built_total);
if (Emu.IsStopped())
{
spu_log.error("SPU Runtime: Cache building aborted.");
return;
}
if (fail_flag)
{
spu_log.fatal("SPU Runtime: Cache building failed (out of memory).");
return;
}
if ((g_cfg.core.spu_decoder == spu_decoder_type::asmjit || g_cfg.core.spu_decoder == spu_decoder_type::llvm) && !func_list.empty())
{
spu_log.success("SPU Runtime: Built %u functions.", func_list.size());
if (g_cfg.core.spu_debug)
{
std::string dump;
dump.reserve(10'000'000);
std::map<std::basic_string_view<u8>, spu_program*> sorted;
for (auto&& f : func_list)
{
// Interpret as a byte string
std::basic_string_view<u8> data = {reinterpret_cast<u8*>(f.data.data()), f.data.size() * sizeof(u32)};
sorted[data] = &f;
}
std::unordered_set<u32> depth_n;
u32 n_max = 0;
for (auto&& [bytes, f] : sorted)
{
{
sha1_context ctx;
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, bytes.data(), bytes.size());
sha1_finish(&ctx, output);
fmt::append(dump, "\n\t[%s] ", fmt::base57(output));
}
u32 depth_m = 0;
for (auto&& [data, f2] : sorted)
{
u32 depth = 0;
if (f2 == f)
{
continue;
}
for (u32 i = 0; i < bytes.size(); i++)
{
if (i < data.size() && data[i] == bytes[i])
{
depth++;
}
else
{
break;
}
}
depth_n.emplace(depth);
depth_m = std::max(depth, depth_m);
}
fmt::append(dump, "c=%06d,d=%06d ", depth_n.size(), depth_m);
bool sk = false;
for (u32 i = 0; i < std::min<usz>(bytes.size(), std::max<usz>(256, depth_m)); i++)
{
if (depth_m == i)
{
dump += '|';
sk = true;
}
fmt::append(dump, "%02x", bytes[i]);
if (i % 4 == 3)
{
if (sk)
{
sk = false;
}
else
{
dump += ' ';
}
dump += ' ';
}
}
fmt::append(dump, "\n\t%49s", "");
for (u32 i = 0; i < std::min<usz>(f->data.size(), std::max<usz>(64, utils::aligned_div<u32>(depth_m, 4))); i++)
{
fmt::append(dump, "%-10s", g_spu_iname.decode(std::bit_cast<be_t<u32>>(f->data[i])));
}
n_max = std::max(n_max, ::size32(depth_n));
depth_n.clear();
}
spu_log.notice("SPU Cache Dump (max_c=%d): %s", n_max, dump);
}
}
// Initialize global cache instance
if (g_cfg.core.spu_cache && cache)
{
g_fxo->get<spu_cache>() = std::move(cache);
}
}
bool spu_program::operator==(const spu_program& rhs) const noexcept
{
// TODO
return entry_point - lower_bound == rhs.entry_point - rhs.lower_bound && data == rhs.data;
}
bool spu_program::operator<(const spu_program& rhs) const noexcept
{
const u32 lhs_offs = (entry_point - lower_bound) / 4;
const u32 rhs_offs = (rhs.entry_point - rhs.lower_bound) / 4;
// Select range for comparison
std::basic_string_view<u32> lhs_data(data.data() + lhs_offs, data.size() - lhs_offs);
std::basic_string_view<u32> rhs_data(rhs.data.data() + rhs_offs, rhs.data.size() - rhs_offs);
const auto cmp0 = lhs_data.compare(rhs_data);
if (cmp0 < 0)
return true;
else if (cmp0 > 0)
return false;
// Compare from address 0 to the point before the entry point (TODO: undesirable)
lhs_data = {data.data(), lhs_offs};
rhs_data = {rhs.data.data(), rhs_offs};
const auto cmp1 = lhs_data.compare(rhs_data);
if (cmp1 < 0)
return true;
else if (cmp1 > 0)
return false;
// TODO
return lhs_offs < rhs_offs;
}
spu_runtime::spu_runtime()
{
// Clear LLVM output
m_cache_path = rpcs3::cache::get_ppu_cache();
if (m_cache_path.empty())
{
return;
}
if (g_cfg.core.spu_debug && g_cfg.core.spu_decoder != spu_decoder_type::dynamic && g_cfg.core.spu_decoder != spu_decoder_type::_static)
{
if (!fs::create_dir(m_cache_path + "llvm/"))
{
fs::remove_all(m_cache_path + "llvm/", false);
}
fs::file(m_cache_path + "spu.log", fs::rewrite);
fs::file(m_cache_path + "spu-ir.log", fs::rewrite);
}
}
spu_item* spu_runtime::add_empty(spu_program&& data)
{
if (data.data.empty())
{
return nullptr;
}
// Store previous item if already added
spu_item* prev = nullptr;
// Try to add an item that doesn't exist yet
const auto ret = m_stuff[data.data[0] >> 12].push_if([&](spu_item& _new, spu_item& _old)
{
if (_new.data == _old.data)
{
prev = &_old;
return false;
}
return true;
}, std::move(data));
if (ret)
{
return ret;
}
return prev;
}
spu_function_t spu_runtime::rebuild_ubertrampoline(u32 id_inst)
{
// Prepare sorted list
static thread_local std::vector<std::pair<std::basic_string_view<u32>, spu_function_t>> m_flat_list;
// Remember top position
auto stuff_it = ::at32(m_stuff, id_inst >> 12).begin();
auto stuff_end = ::at32(m_stuff, id_inst >> 12).end();
{
if (stuff_it->trampoline)
{
return stuff_it->trampoline;
}
m_flat_list.clear();
for (auto it = stuff_it; it != stuff_end; ++it)
{
if (const auto ptr = it->compiled.load())
{
std::basic_string_view<u32> range{it->data.data.data(), it->data.data.size()};
range.remove_prefix((it->data.entry_point - it->data.lower_bound) / 4);
m_flat_list.emplace_back(range, ptr);
}
else
{
// Pull oneself deeper (TODO)
++stuff_it;
}
}
}
std::sort(m_flat_list.begin(), m_flat_list.end(), FN(x.first < y.first));
struct work
{
u32 size;
u16 from;
u16 level;
u8* rel32;
decltype(m_flat_list)::iterator beg;
decltype(m_flat_list)::iterator end;
};
// Scratch vector
static thread_local std::vector<work> workload;
// Generate a dispatcher (übertrampoline)
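// The übertrampoline is a tree of 32-bit compares over successive LS words
// (LS address pre-loaded by tr_all) that binary-searches the sorted function
// list; paths with no match fall back to tr_dispatch.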
const auto beg = m_flat_list.begin();
const auto _end = m_flat_list.end();
const u32 size0 = ::size32(m_flat_list);
auto result = beg->second;
if (size0 != 1)
{
#if defined(ARCH_ARM64)
// Allocate some writable executable memory
u8* const wxptr = jit_runtime::alloc(size0 * 128 + 16, 16);
if (!wxptr)
{
return nullptr;
}
// Raw assembly pointer
u8* raw = wxptr;
auto make_jump = [&](asmjit::arm::CondCode op, auto target)
{
// 36 bytes
// Fallback to dispatch if no target
const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
// ldr x9, #16 -> ldr x9, taddr
*raw++ = 0x89;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x58;
if (op == asmjit::arm::CondCode::kAlways)
{
// br x9
*raw++ = 0x20;
*raw++ = 0x01;
*raw++ = 0x1F;
*raw++ = 0xD6;
// nop
*raw++ = 0x1F;
*raw++ = 0x20;
*raw++ = 0x03;
*raw++ = 0xD5;
// nop
*raw++ = 0x1F;
*raw++ = 0x20;
*raw++ = 0x03;
*raw++ = 0xD5;
}
else
{
// b.COND #8 -> b.COND do_branch
switch (op)
{
case asmjit::arm::CondCode::kUnsignedLT:
*raw++ = 0x43;
break;
case asmjit::arm::CondCode::kUnsignedGT:
*raw++ = 0x48;
break;
default:
asm("brk 0x42");
}
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x54;
// b #16 -> b cont
*raw++ = 0x04;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x14;
// do_branch: br x9
*raw++ = 0x20;
*raw++ = 0x01;
*raw++ = 0x1f;
*raw++ = 0xD6;
}
// taddr
std::memcpy(raw, &taddr, 8);
raw += 8;
// cont: next instruction
};
#elif defined(ARCH_X64)
// Allocate some writable executable memory
u8* const wxptr = jit_runtime::alloc(size0 * 22 + 14, 16);
if (!wxptr)
{
return nullptr;
}
// Raw assembly pointer
u8* raw = wxptr;
// Write jump instruction with rel32 immediate
auto make_jump = [&](u8 op, auto target)
{
ensure(raw + 8 <= wxptr + size0 * 22 + 16);
// Fallback to dispatch if no target
const u64 taddr = target ? reinterpret_cast<u64>(target) : reinterpret_cast<u64>(tr_dispatch);
// Compute the distance
const s64 rel = taddr - reinterpret_cast<u64>(raw) - (op != 0xe9 ? 6 : 5);
ensure(rel >= s32{smin} && rel <= s32{smax});
if (op != 0xe9)
{
// First jcc byte
*raw++ = 0x0f;
ensure((op >> 4) == 0x8);
}
*raw++ = op;
const s32 r32 = static_cast<s32>(rel);
std::memcpy(raw, &r32, 4);
raw += 4;
};
#endif
workload.clear();
workload.reserve(size0);
workload.emplace_back();
workload.back().size = size0;
workload.back().level = 0;
workload.back().from = -1;
workload.back().rel32 = nullptr;
workload.back().beg = beg;
workload.back().end = _end;
// LS address starting from PC is already loaded into rcx (see spu_runtime::tr_all)
for (usz i = 0; i < workload.size(); i++)
{
// Get copy of the workload info
auto w = workload[i];
// Split range in two parts
auto it = w.beg;
auto it2 = w.beg;
u32 size1 = w.size / 2;
u32 size2 = w.size - size1;
std::advance(it2, w.size / 2);
while (ensure(w.level < umax))
{
it = it2;
size1 = w.size - size2;
if (w.level >= w.beg->first.size())
{
// Cannot split: smallest function is a prefix of bigger ones (TODO)
break;
}
const u32 x1 = ::at32(w.beg->first, w.level);
if (!x1)
{
// Cannot split: some functions contain holes at this level
w.level++;
// Resort subrange starting from the new level
std::stable_sort(w.beg, w.end, [&](const auto& a, const auto& b)
{
std::basic_string_view<u32> lhs = a.first;
std::basic_string_view<u32> rhs = b.first;
lhs.remove_prefix(w.level);
rhs.remove_prefix(w.level);
return lhs < rhs;
});
continue;
}
// Adjust ranges (forward)
while (it != w.end && x1 == ::at32(it->first, w.level))
{
it++;
size1++;
}
if (it == w.end)
{
// Cannot split: words are identical within the range at this level
w.level++;
}
else
{
size2 = w.size - size1;
break;
}
}
if (w.rel32)
{
#if defined(ARCH_X64)
// Patch rel32 linking it to the current location if necessary
const s32 r32 = ::narrow<s32>(raw - w.rel32);
std::memcpy(w.rel32 - 4, &r32, 4);
#elif defined(ARCH_ARM64)
// Rewrite jump address
{
u64 raw64 = reinterpret_cast<u64>(raw);
memcpy(w.rel32 - 8, &raw64, 8);
}
#else
#error "Unimplemented"
#endif
}
if (w.level >= w.beg->first.size() || w.level >= it->first.size())
{
// If functions cannot be compared, assume smallest function
spu_log.error("Trampoline simplified at ??? (level=%u)", w.level);
#if defined(ARCH_X64)
make_jump(0xe9, w.beg->second); // jmp rel32
#elif defined(ARCH_ARM64)
u64 branch_target = reinterpret_cast<u64>(w.beg->second);
make_jump(asmjit::arm::CondCode::kAlways, branch_target);
#else
#error "Unimplemented"
#endif
continue;
}
// Value for comparison
const u32 x = ::at32(it->first, w.level);
// Adjust ranges (backward)
while (it != m_flat_list.begin())
{
it--;
if (w.level >= it->first.size())
{
it = m_flat_list.end();
break;
}
if (::at32(it->first, w.level) != x)
{
it++;
break;
}
ensure(it != w.beg);
size1--;
size2++;
}
if (it == m_flat_list.end())
{
spu_log.error("Trampoline simplified (II) at ??? (level=%u)", w.level);
#if defined(ARCH_X64)
make_jump(0xe9, w.beg->second); // jmp rel32
#elif defined(ARCH_ARM64)
u64 branch_target = reinterpret_cast<u64>(w.beg->second);
make_jump(asmjit::arm::CondCode::kAlways, branch_target);
#else
#error "Unimplemented"
#endif
continue;
}
// Emit 32-bit comparison
#if defined(ARCH_X64)
ensure(raw + 12 <= wxptr + size0 * 22 + 16); // "Asm overflow"
#elif defined(ARCH_ARM64)
ensure(raw + (4 * 4) <= wxptr + size0 * 128 + 16);
#else
#error "Unimplemented"
#endif
if (w.from != w.level)
{
// If necessary (level has advanced), emit load: mov eax, [rcx + addr]
const u32 cmp_lsa = w.level * 4u;
#if defined(ARCH_X64)
if (cmp_lsa < 0x80)
{
*raw++ = 0x8b;
*raw++ = 0x41;
*raw++ = ::narrow<s8>(cmp_lsa);
}
else
{
*raw++ = 0x8b;
*raw++ = 0x81;
std::memcpy(raw, &cmp_lsa, 4);
raw += 4;
}
#elif defined(ARCH_ARM64)
// ldr w9, #8
*raw++ = 0x49;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x18;
// b #8
*raw++ = 0x02;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x14;
// cmp_lsa
std::memcpy(raw, &cmp_lsa, 4);
raw += 4;
// ldr w1, [x7, x9]
*raw++ = 0xE1;
*raw++ = 0x68;
*raw++ = 0x69;
*raw++ = 0xB8;
#else
#error "Unimplemented"
#endif
}
// Emit comparison: cmp eax, imm32
#if defined(ARCH_X64)
*raw++ = 0x3d;
std::memcpy(raw, &x, 4);
raw += 4;
#elif defined(ARCH_ARM64)
// ldr w9, #8
*raw++ = 0x49;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x18;
// b #8
*raw++ = 0x02;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x14;
// x
std::memcpy(raw, &x, 4);
raw += 4;
// cmp w1, w9
*raw++ = 0x3f;
*raw++ = 0x00;
*raw++ = 0x09;
*raw++ = 0x6B;
#else
#error "Unimplemented"
#endif
// Low subrange target
if (size1 == 1)
{
#if defined(ARCH_X64)
make_jump(0x82, w.beg->second); // jb rel32
#elif defined(ARCH_ARM64)
u64 branch_target = reinterpret_cast<u64>(w.beg->second);
make_jump(asmjit::arm::CondCode::kUnsignedLT, branch_target);
#else
#error "Unimplemented"
#endif
}
else
{
#if defined(ARCH_X64)
make_jump(0x82, raw); // jb rel32 (stub)
#elif defined(ARCH_ARM64)
make_jump(asmjit::arm::CondCode::kUnsignedLT, raw);
#else
#error "Unimplemented"
#endif
auto& to = workload.emplace_back(w);
to.end = it;
to.size = size1;
to.rel32 = raw;
to.from = w.level;
}
// Second subrange target
if (size2 == 1)
{
#if defined(ARCH_X64)
make_jump(0xe9, it->second); // jmp rel32
#elif defined(ARCH_ARM64)
u64 branch_target = reinterpret_cast<u64>(it->second);
make_jump(asmjit::arm::CondCode::kAlways, branch_target);
#else
#error "Unimplemented"
#endif
}
else
{
it2 = it;
// Select additional midrange for equality comparison
while (it2 != w.end && ::at32(it2->first, w.level) == x)
{
size2--;
it2++;
}
if (it2 != w.end)
{
// High subrange target
if (size2 == 1)
{
#if defined(ARCH_X64)
make_jump(0x87, it2->second); // ja rel32
#elif defined(ARCH_ARM64)
u64 branch_target = reinterpret_cast<u64>(it2->second);
make_jump(asmjit::arm::CondCode::kUnsignedGT, branch_target);
#else
#throw "Unimplemented"
#endif
}
else
{
#if defined(ARCH_X64)
make_jump(0x87, raw); // ja rel32 (stub)
#elif defined(ARCH_ARM64)
make_jump(asmjit::arm::CondCode::kUnsignedGT, raw);
#else
#error "Unimplemented"
#endif
auto& to = workload.emplace_back(w);
to.beg = it2;
to.size = size2;
to.rel32 = raw;
to.from = w.level;
}
const u32 size3 = w.size - size1 - size2;
if (size3 == 1)
{
#if defined(ARCH_X64)
make_jump(0xe9, it->second); // jmp rel32
#elif defined(ARCH_ARM64)
u64 branch_target = reinterpret_cast<u64>(it->second);
make_jump(asmjit::arm::CondCode::kAlways, branch_target);
#else
#error "Unimplemented"
#endif
}
else
{
#if defined(ARCH_X64)
make_jump(0xe9, raw); // jmp rel32 (stub)
#elif defined(ARCH_ARM64)
make_jump(asmjit::arm::CondCode::kAlways, raw);
#else
#error "Unimplemented"
#endif
auto& to = workload.emplace_back(w);
to.beg = it;
to.end = it2;
to.size = size3;
to.rel32 = raw;
to.from = w.level;
}
}
else
{
#if defined(ARCH_X64)
make_jump(0xe9, raw); // jmp rel32 (stub)
#elif defined(ARCH_ARM64)
make_jump(asmjit::arm::CondCode::kAlways, raw);
#else
#error "Unimplemented"
#endif
auto& to = workload.emplace_back(w);
to.beg = it;
to.size = w.size - size1;
to.rel32 = raw;
to.from = w.level;
}
}
}
workload.clear();
result = reinterpret_cast<spu_function_t>(reinterpret_cast<u64>(wxptr));
std::string fname;
fmt::append(fname, "__ub%u", m_flat_list.size());
jit_announce(wxptr, raw - wxptr, fname);
}
if (auto _old = stuff_it->trampoline.compare_and_swap(nullptr, result))
{
return _old;
}
// Install ubertrampoline
auto& insert_to = ::at32(*spu_runtime::g_dispatcher, id_inst >> 12);
auto _old = insert_to.load();
do
{
// Make sure we are replacing an older ubertrampoline but not newer one
if (_old != tr_dispatch)
{
bool ok = false;
for (auto it = stuff_it; it != stuff_end; ++it)
{
if (it->trampoline == _old)
{
ok = true;
break;
}
}
if (!ok)
{
return result;
}
}
}
while (!insert_to.compare_exchange(_old, result));
return result;
}
spu_function_t spu_runtime::find(const u32* ls, u32 addr) const
{
const u32 index = ls[addr / 4] >> 12;
for (const auto& item : ::at32(m_stuff, index))
{
if (const auto ptr = item.compiled.load())
{
std::basic_string_view<u32> range{item.data.data.data(), item.data.data.size()};
range.remove_prefix((item.data.entry_point - item.data.lower_bound) / 4);
if (addr / 4 + range.size() > 0x10000)
{
continue;
}
if (range.compare(0, range.size(), ls + addr / 4, range.size()) == 0)
{
return ptr;
}
}
}
return nullptr;
}
spu_function_t spu_runtime::make_branch_patchpoint(u16 data) const
{
#if defined(ARCH_X64)
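// Patchpoint layout (16 bytes): lea r12, [rip+1] captures the address of the
// following jmp (GHC CC 3rd argument for spu_recompiler_base::branch); the
// final two bytes hold `data`, read back as rip[6]/rip[7] in branch()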
u8* const raw = jit_runtime::alloc(16, 16);
if (!raw)
{
return nullptr;
}
// Save address of the following jmp (GHC CC 3rd argument)
raw[0] = 0x4c; // lea r12, [rip+1]
raw[1] = 0x8d;
raw[2] = 0x25;
raw[3] = 0x01;
raw[4] = 0x00;
raw[5] = 0x00;
raw[6] = 0x00;
raw[7] = 0x90; // nop
// Jump to spu_recompiler_base::branch
raw[8] = 0xe9;
// Compute the distance
const s64 rel = reinterpret_cast<u64>(tr_branch) - reinterpret_cast<u64>(raw + 8) - 5;
std::memcpy(raw + 9, &rel, 4);
raw[13] = 0xcc;
raw[14] = data >> 8;
raw[15] = data & 0xff;
return reinterpret_cast<spu_function_t>(raw);
#elif defined(ARCH_ARM64)
#if defined(__APPLE__)
pthread_jit_write_protect_np(false);
#endif
u8* const patch_fn = ensure(jit_runtime::alloc(36, 16));
u8* raw = patch_fn;
// adr x21, #16
*raw++ = 0x95;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x10;
// nop x3
for (int i = 0; i < 3; i++)
{
*raw++ = 0x1F;
*raw++ = 0x20;
*raw++ = 0x03;
*raw++ = 0xD5;
}
// ldr x9, #8
*raw++ = 0x49;
*raw++ = 0x00;
*raw++ = 0x00;
*raw++ = 0x58;
// br x9
*raw++ = 0x20;
*raw++ = 0x01;
*raw++ = 0x1F;
*raw++ = 0xD6;
u64 branch_target = reinterpret_cast<u64>(tr_branch);
std::memcpy(raw, &branch_target, 8);
raw += 8;
*raw++ = static_cast<u8>(data >> 8);
*raw++ = static_cast<u8>(data & 0xff);
#if defined(__APPLE__)
pthread_jit_write_protect_np(true);
#endif
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
return reinterpret_cast<spu_function_t>(patch_fn);
#else
#error "Unimplemented"
#endif
}
spu_recompiler_base::spu_recompiler_base()
{
}
spu_recompiler_base::~spu_recompiler_base()
{
}
void spu_recompiler_base::dispatch(spu_thread& spu, void*, u8* rip)
{
// If code verification failed from a patched patchpoint, clear it with a dispatcher jump
if (rip)
{
#if defined(ARCH_X64)
const s64 rel = reinterpret_cast<u64>(spu_runtime::tr_all) - reinterpret_cast<u64>(rip - 8) - 5;
union
{
u8 bytes[8];
u64 result;
};
bytes[0] = 0xe9; // jmp rel32
std::memcpy(bytes + 1, &rel, 4);
bytes[5] = 0x66; // lnop (2 bytes)
bytes[6] = 0x90;
bytes[7] = 0x90;
atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip - 8), result);
#elif defined(ARCH_ARM64)
union
{
u8 bytes[16];
u128 result;
};
// ldr x9, #8
bytes[0] = 0x49;
bytes[1] = 0x00;
bytes[2] = 0x00;
bytes[3] = 0x58;
// br x9
bytes[4] = 0x20;
bytes[5] = 0x01;
bytes[6] = 0x1F;
bytes[7] = 0xD6;
const u64 target = reinterpret_cast<u64>(spu_runtime::tr_all);
std::memcpy(bytes + 8, &target, 8);
#if defined(__APPLE__)
pthread_jit_write_protect_np(false);
#endif
atomic_storage<u128>::release(*reinterpret_cast<u128*>(rip), result);
#if defined(__APPLE__)
pthread_jit_write_protect_np(true);
#endif
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
#else
#error "Unimplemented"
#endif
}
// Second attempt (recover from the recursion after repeated unsuccessful trampoline call)
if (spu.block_counter != spu.block_recover && &dispatch != ::at32(*spu_runtime::g_dispatcher, spu._ref<nse_t<u32>>(spu.pc) >> 12))
{
spu.block_recover = spu.block_counter;
return;
}
spu.jit->init();
// Compile
if (spu._ref<u32>(spu.pc) == 0u)
{
spu_runtime::g_escape(&spu);
return;
}
const auto func = spu.jit->compile(spu.jit->analyse(spu._ptr<u32>(0), spu.pc));
if (!func)
{
spu_log.fatal("[0x%05x] Compilation failed.", spu.pc);
return;
}
// Diagnostic
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
const v128 _info = spu.stack_mirror[(spu.gpr[1]._u32[3] & 0x3fff0) >> 4];
if (_info._u64[0] + 1)
{
spu_log.trace("Called from 0x%x", _info._u32[2] - 4);
}
}
#if defined(__APPLE__)
pthread_jit_write_protect_np(true);
#endif
#if defined(ARCH_ARM64)
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
#endif
spu_runtime::g_tail_escape(&spu, func, nullptr);
}
void spu_recompiler_base::branch(spu_thread& spu, void*, u8* rip)
{
#if defined(ARCH_X64)
if (const u32 ls_off = ((rip[6] << 8) | rip[7]) * 4)
#elif defined(ARCH_ARM64)
if (const u32 ls_off = ((rip[16] << 8) | rip[17]) * 4) // See make_branch_patchpoint `data`
#else
#error "Unimplemented"
#endif
{
spu_log.todo("Special branch patchpoint hit.\nPlease report to the developer (0x%05x).", ls_off);
}
// Find function
const auto func = spu.jit->get_runtime().find(static_cast<u32*>(spu._ptr<void>(0)), spu.pc);
if (!func)
{
return;
}
#if defined(ARCH_X64)
// Overwrite jump to this function with jump to the compiled function
const s64 rel = reinterpret_cast<u64>(func) - reinterpret_cast<u64>(rip) - 5;
union
{
u8 bytes[8];
u64 result;
};
if (rel >= s32{smin} && rel <= s32{smax})
{
const s64 rel8 = (rel + 5) - 2;
if (rel8 >= s8{smin} && rel8 <= s8{smax})
{
bytes[0] = 0xeb; // jmp rel8
bytes[1] = static_cast<s8>(rel8);
std::memset(bytes + 2, 0xcc, 4);
}
else
{
bytes[0] = 0xe9; // jmp rel32
std::memcpy(bytes + 1, &rel, 4);
bytes[5] = 0xcc;
}
bytes[6] = rip[6];
bytes[7] = rip[7];
}
else
{
fmt::throw_exception("Impossible far jump: %p -> %p", rip, func);
}
atomic_storage<u64>::release(*reinterpret_cast<u64*>(rip), result);
#elif defined(ARCH_ARM64)
union
{
u8 bytes[16];
u128 result;
};
// ldr x9, #8
bytes[0] = 0x49;
bytes[1] = 0x00;
bytes[2] = 0x00;
bytes[3] = 0x58;
// br x9
bytes[4] = 0x20;
bytes[5] = 0x01;
bytes[6] = 0x1F;
bytes[7] = 0xD6;
const u64 target = reinterpret_cast<u64>(func);
std::memcpy(bytes + 8, &target, 8);
#if defined(__APPLE__)
pthread_jit_write_protect_np(false);
#endif
atomic_storage<u128>::release(*reinterpret_cast<u128*>(rip), result);
#if defined(__APPLE__)
pthread_jit_write_protect_np(true);
#endif
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
#else
#error "Unimplemented"
#endif
spu_runtime::g_tail_escape(&spu, func, rip);
}
void spu_recompiler_base::old_interpreter(spu_thread& spu, void* ls, u8* /*rip*/)
{
if (g_cfg.core.spu_decoder != spu_decoder_type::_static)
{
fmt::throw_exception("Invalid SPU decoder");
}
// Select opcode table
const auto& table = g_fxo->get<spu_interpreter_rt>();
// LS pointer
const auto base = static_cast<const u8*>(ls);
while (true)
{
if (spu.state) [[unlikely]]
{
if (spu.check_state())
break;
}
const u32 op = *reinterpret_cast<const be_t<u32>*>(base + spu.pc);
if (table.decode(op)(spu, {op}))
spu.pc += 4;
}
}
std::vector<u32> spu_thread::discover_functions(u32 base_addr, std::span<const u8> ls, bool is_known_addr, u32 /*entry*/)
{
std::vector<u32> calls;
std::vector<u32> branches;
calls.reserve(100);
// Discover functions
// Use the simplest method: search for the instructions that call them,
// and then filter out invalid cases
// TODO: Does not detect jumptables or fixed-addr indirect calls
const v128 brasl_mask = is_known_addr ? v128::from32p(0x62u << 23) : v128::from32p(umax);
for (u32 i = utils::align<u32>(base_addr, 0x10); i < std::min<u32>(base_addr + ::size32(ls), 0x3FFF0); i += 0x10)
{
// Search for BRSL LR and BRASL LR or BR
// TODO: BISL
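// The mask rol32(~0xffff, 7) == 0xFF80007F keeps the 9-bit opcode and 7-bit RT
// fields while clearing the 16-bit immediate, so whole instruction words can be
// compared against the BRSL/BRASL/BR encodings with a zero RT field ($LR for the *SL forms)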
const v128 inst = read_from_ptr<be_t<v128>>(ls.data(), i - base_addr);
const v128 cleared_i16 = gv_and32(inst, v128::from32p(utils::rol32(~0xffff, 7)));
const v128 eq_brsl = gv_eq32(cleared_i16, v128::from32p(0x66u << 23));
const v128 eq_brasl = gv_eq32(cleared_i16, brasl_mask);
const v128 eq_br = gv_eq32(cleared_i16, v128::from32p(0x64u << 23));
const v128 result = eq_brsl | eq_brasl;
if (!gv_testz(result))
{
for (u32 j = 0; j < 4; j++)
{
if (result.u32r[j])
{
calls.push_back(i + j * 4);
}
}
}
if (!gv_testz(eq_br))
{
for (u32 j = 0; j < 4; j++)
{
if (eq_br.u32r[j])
{
branches.push_back(i + j * 4);
}
}
}
}
calls.erase(std::remove_if(calls.begin(), calls.end(), [&](u32 caller)
{
// Check the validity of both the callee code and the following caller code
return !is_exec_code(caller, ls, base_addr, true) || !is_exec_code(caller + 4, ls, base_addr, true);
}), calls.end());
branches.erase(std::remove_if(branches.begin(), branches.end(), [&](u32 caller)
{
// Check the validity of the callee code
return !is_exec_code(caller, ls, base_addr, true);
}), branches.end());
std::vector<u32> addrs;
for (u32 addr : calls)
{
const spu_opcode_t op{read_from_ptr<be_t<u32>>(ls, addr - base_addr)};
const u32 func = op_branch_targets(addr, op)[0];
if (func == umax || addr + 4 == func || func == addr || std::count(addrs.begin(), addrs.end(), func))
{
continue;
}
if (std::count(calls.begin(), calls.end(), func))
{
// Cannot call another call instruction (link is overwritten)
continue;
}
addrs.push_back(func);
// Detect an "arguments passing" block, possibly queue another function
for (u32 next = func, it = 10; it && next >= base_addr && next < std::min<u32>(base_addr + ::size32(ls), 0x3FFF0); it--, next += 4)
{
const spu_opcode_t test_op{read_from_ptr<be_t<u32>>(ls, next - base_addr)};
const auto type = g_spu_itype.decode(test_op.opcode);
if (type & spu_itype::branch && type != spu_itype::BR)
{
break;
}
if (type == spu_itype::UNK || !test_op.opcode)
{
break;
}
if (type != spu_itype::BR)
{
continue;
}
const u32 target = op_branch_targets(next, test_op)[0];
if (target == umax || addr + 4 == target || target == addr || std::count(addrs.begin(), addrs.end(), target))
{
break;
}
// Detect a backwards branch to the block under examination
if (target >= func && target <= next)
{
break;
}
if (!is_exec_code(target, ls, base_addr, true))
{
break;
}
addrs.push_back(target);
break;
}
}
for (u32 addr : branches)
{
const spu_opcode_t op{read_from_ptr<be_t<u32>>(ls, addr - base_addr)};
const u32 func = op_branch_targets(addr, op)[0];
if (func == umax || addr + 4 == func || func == addr || !addr)
{
continue;
}
// Search for AI R1, -x in the called code
// Reasoning: AI R1, -x means stack frame creation, so this is likely a function
for (u32 next = func, it = 10; it && next >= base_addr && next < std::min<u32>(base_addr + ::size32(ls), 0x3FFF0); it--, next += 4)
{
const spu_opcode_t test_op{read_from_ptr<be_t<u32>>(ls, next - base_addr)};
const auto type = g_spu_itype.decode(test_op.opcode);
if (type & spu_itype::branch)
{
break;
}
if (type == spu_itype::UNK || !test_op.opcode)
{
break;
}
bool is_func = false;
if (type == spu_itype::AI && test_op.rt == 1u && test_op.ra == 1u)
{
if (test_op.si10 >= 0)
{
break;
}
is_func = true;
}
if (!is_func)
{
continue;
}
addr = SPU_LS_SIZE + 4; // Make the next loop's condition fail, no further checks needed
if (std::count(addrs.begin(), addrs.end(), func))
{
break;
}
addrs.push_back(func);
break;
}
// Search for AI R1, +x or OR R3/4, Rx, 0 before the branch
// Reasoning: AI R1, +x means stack pointer restoration, branch after that is likely a tail call
// R3 and R4 are common function arguments because they are the first two
for (u32 back = addr - 4, it = 10; it && back >= base_addr && back < std::min<u32>(base_addr + ::size32(ls), 0x3FFF0); it--, back -= 4)
{
const spu_opcode_t test_op{read_from_ptr<be_t<u32>>(ls, back - base_addr)};
const auto type = g_spu_itype.decode(test_op.opcode);
if (type & spu_itype::branch)
{
break;
}
bool is_tail = false;
if (type == spu_itype::AI && test_op.rt == 1u && test_op.ra == 1u)
{
if (test_op.si10 <= 0)
{
break;
}
is_tail = true;
}
else if (!(type & spu_itype::zregmod))
{
const u32 op_rt = type & spu_itype::_quadrop ? +test_op.rt4 : +test_op.rt;
if (op_rt >= 80u && (type != spu_itype::LQD || test_op.ra != 1u))
{
// Modifying non-volatile registers, not a call (and not context restoration)
break;
}
//is_tail = op_rt == 3u || op_rt == 4u;
}
if (!is_tail)
{
continue;
}
if (std::count(addrs.begin(), addrs.end(), func))
{
break;
}
addrs.push_back(func);
break;
}
}
std::sort(addrs.begin(), addrs.end());
return addrs;
}
using reg_state_t = spu_recompiler_base::reg_state_t;
using vf = spu_recompiler_base::vf;
bool reg_state_t::is_const() const
{
return !!(flag & vf::is_const);
}
bool reg_state_t::operator&(vf to_test) const
{
return this->flag.all_of(to_test);
}
bool reg_state_t::is_less_than(u32 imm) const
{
if (flag & vf::is_const && value < imm)
{
return true;
}
if (~known_zeroes < imm)
{
// The highest number possible within the mask's limit is less than imm
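// e.g. with known_zeroes = 0xffff0000 the value is at most 0x0000ffff,
// so is_less_than(0x10000) holds even though the exact value is unknown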
return true;
}
return false;
}
bool reg_state_t::operator==(const reg_state_t& r) const
{
if ((flag ^ r.flag) - (vf::is_null + vf::is_mask))
{
return false;
}
return (flag & vf::is_const ? value == r.value : (tag == r.tag && known_ones == r.known_ones && known_zeroes == r.known_zeroes));
}
bool reg_state_t::operator==(u32 imm) const
{
return flag == vf::is_const && value == imm;
}
// Compare for equality but try to ignore changes in unmasked bits
bool reg_state_t::compare_with_mask_indifference(const reg_state_t& r, u32 mask_bits) const
{
if (!mask_bits)
{
return true;
}
if ((r.flag & flag) & vf::is_const)
{
// Simplified path for consts
if (((value ^ r.value) & mask_bits) == 0)
{
return true;
}
return false;
}
const bool is_equal = *this == r;
if (is_equal)
{
return true;
}
const auto _this = this->downgrade();
const auto _r = r.downgrade();
const bool is_mask_equal = (_this.tag == _r.tag && _this.flag == _r.flag && !((_this.known_ones ^ _r.known_ones) & mask_bits) && !((_this.known_zeroes ^ _r.known_zeroes) & mask_bits));
return is_mask_equal;
}
bool reg_state_t::compare_with_mask_indifference(u32 imm, u32 mask_bits) const
{
if (!mask_bits)
{
return true;
}
if (flag & vf::is_const)
{
if (((value ^ imm) & mask_bits) == 0)
{
return true;
}
}
return false;
}
// Ensure inequality but try to ignore changes in unmasked bits
bool reg_state_t::unequal_with_mask_indifference(const reg_state_t& r, u32 mask_bits) const
{
if (!mask_bits)
{
return true;
}
if ((r.flag & flag) & vf::is_const)
{
// Simplified path for consts
if ((value ^ r.value) & mask_bits)
{
return true;
}
return false;
}
const bool is_equal = *this == r;
if (is_equal)
{
return false;
}
// Full path
const auto _this = this->downgrade();
const auto _r = r.downgrade();
const bool is_base_value_equal = (_this.tag == _r.tag && _this.flag == _r.flag);
if (!is_base_value_equal)
{
// Cannot ascertain inequality if the value origin is different
return false;
}
// Find at least one bit that is known to be in state X at value 'r', and known to be in state X^1 at the object's value
return (((_this.known_ones ^ _r.known_ones) & mask_bits) & ((_this.known_zeroes ^ _r.known_zeroes) & mask_bits)) != 0;
}
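// Weaken a register state to its known-bits (mask) form: a constant has every bit
// known (known_ones = value, known_zeroes = ~value), while a plain tagged value
// keeps only its tag with no known bits.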
reg_state_t reg_state_t::downgrade() const
{
if (flag & vf::is_const)
{
return reg_state_t{vf::is_mask, 0, umax, this->value, ~this->value};
}
if (!(flag - vf::is_null))
{
return reg_state_t{vf::is_mask, 0, this->tag, 0, 0};
}
return *this;
}
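// Worked example: merging the constant 0x10 with a mask state whose known_ones are
// 0x30 (known_zeroes = ~0x30) keeps only the bits both paths agree on: known_ones
// becomes 0x10, known_zeroes becomes ~0x30, and bit 5 is left unknown under a fresh
// tag. Two distinct constants, by contrast, fall through to make_unknown().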
reg_state_t reg_state_t::merge(const reg_state_t& rhs) const
{
if (rhs == *this)
{
// Perfect state: no conflicts
return rhs;
}
if ((rhs.flag + flag).all_of(vf::is_const + vf::is_mask))
{
// Try to downgrade to a known-bits type value
const reg_state_t _rhs = rhs.downgrade();
const reg_state_t _this = this->downgrade();
if ((_rhs.flag & _this.flag) & vf::is_mask)
{
// Now it is possible to merge the two values
reg_state_t res{vf::is_mask, 0, 0, _rhs.known_ones & _this.known_ones, _rhs.known_zeroes & _this.known_zeroes};
if (res.known_zeroes | res.known_ones)
{
// Success (create new value tag)
res.tag = reg_state_t::alloc_tag();
return res;
}
}
}
return make_unknown();
}
reg_state_t reg_state_t::build_on_top_of(const reg_state_t& rhs) const
{
if (flag & vf::is_null)
{
// Value unmodified
return rhs;
}
if (rhs == *this)
{
// Perfect state: no conflicts
return rhs;
}
return *this;
}
u32 reg_state_t::get_known_zeroes() const
{
if (flag & vf::is_const)
{
return ~value;
}
return known_zeroes;
}
u32 reg_state_t::get_known_ones() const
{
if (flag & vf::is_const)
{
return value;
}
return known_ones;
}
reg_state_t reg_state_t::from_value(u32 value) noexcept
{
reg_state_t v{};
v.value = value;
v.flag = vf::is_const;
return v;
}
u32 reg_state_t::alloc_tag(bool reset) noexcept
{
static thread_local u32 g_tls_tag = 0;
if (reset)
{
g_tls_tag = 0;
return 0;
}
return ++g_tls_tag;
}
// Converge two register states where separate execution paths merge
template <usz N>
static void merge(std::array<reg_state_t, N>& result, const std::array<reg_state_t, N>& lhs, const std::array<reg_state_t, N>& rhs)
{
usz index = umax;
for (reg_state_t& state : result)
{
index++;
state = lhs[index].merge(rhs[index]);
}
}
// Override RHS state with the newer LHS state
template <usz N>
static void build_on_top_of(std::array<reg_state_t, N>& result, const std::array<reg_state_t, N>& lhs, const std::array<reg_state_t, N>& rhs)
{
usz index = umax;
for (reg_state_t& state : result)
{
index++;
state = lhs[index].build_on_top_of(rhs[index]);
}
}
struct block_reg_info
{
u32 pc = SPU_LS_SIZE; // Address
std::array<reg_state_t, s_reg_max> local_state{};
bool has_true_state = false;
std::array<reg_state_t, s_reg_max> start_reg_state{};
std::array<reg_state_t, s_reg_max> end_reg_state{};
std::array<reg_state_t, s_reg_max> addend_reg_state{};
std::array<reg_state_t, s_reg_max> walkby_state{}; // State made by merging predecessor states and iterating over instructions for the final instruction walk
usz next_nodes_count = 0;
struct node_t
{
u32 prev_pc = umax;
};
std::vector<node_t> prev_nodes;
static std::unique_ptr<block_reg_info> create(u32 pc) noexcept
{
auto ptr = new block_reg_info{ pc, reg_state_t::make_unknown<s_reg_max>() };
for (reg_state_t& f : ptr->local_state)
{
f.flag += vf::is_null;
}
ptr->start_reg_state = ptr->local_state;
return std::unique_ptr<block_reg_info>(ptr);
}
// Evaluate the registers' start state
std::array<reg_state_t, s_reg_max>& evaluate_start_state(const std::map<u32, std::unique_ptr<block_reg_info>>& map);
// This function creates a new node if not found and links it to the preceding (parent) node
// in a manner in which no duplicate paths are formed
static void create_node(u32 pc_rhs, u32 parent_pc, std::map<u32, std::unique_ptr<block_reg_info>>& map)
{
//ensure(parent_pc != pc_rhs);
ensure(map[parent_pc]);
if (!map[pc_rhs])
{
map[pc_rhs] = create(pc_rhs);
}
node_t prev_node{parent_pc};
map[parent_pc]->next_nodes_count++;
map[pc_rhs]->prev_nodes.emplace_back(prev_node);
}
};
spu_program spu_recompiler_base::analyse(const be_t<u32>* ls, u32 entry_point, std::map<u32, std::basic_string<u32>>* out_target_list)
{
// Result: addr + raw instruction data
spu_program result;
result.data.reserve(10000);
result.entry_point = entry_point;
result.lower_bound = entry_point;
// Initialize block entries
m_block_info.reset();
m_block_info.set(entry_point / 4);
m_entry_info.reset();
m_entry_info.set(entry_point / 4);
m_ret_info.reset();
// Simple block entry workload list
workload.clear();
workload.push_back(entry_point);
std::memset(m_regmod.data(), 0xff, sizeof(m_regmod));
m_use_ra.reset();
m_use_rb.reset();
m_use_rc.reset();
m_targets.clear();
m_preds.clear();
m_preds[entry_point];
m_bbs.clear();
m_chunks.clear();
m_funcs.clear();
m_inst_attrs.clear();
m_patterns.clear();
// SYNC instruction found
bool sync = false;
u32 hbr_loc = 0;
u32 hbr_tg = -1;
// Result bounds
u32 lsa = entry_point;
u32 limit = 0x40000;
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
}
// Weak constant propagation context (for guessing branch targets)
std::array<bs_t<vf>, 128> vflags{};
// Associated constant values for 32-bit preferred slot
std::array<u32, 128> values;
for (u32 wi = 0, wa = workload[0]; wi < workload.size();)
{
const auto next_block = [&]
{
// Reset value information
vflags.fill({});
sync = false;
hbr_loc = 0;
hbr_tg = -1;
wi++;
if (wi < workload.size())
{
wa = workload[wi];
}
};
const u32 pos = wa;
const auto add_block = [&](u32 target)
{
// Validate new target (TODO)
if (target >= lsa && target < limit)
{
// Check for redundancy
if (!m_block_info[target / 4])
{
m_block_info[target / 4] = true;
workload.push_back(target);
}
// Add predecessor
if (m_preds[target].find_first_of(pos) + 1 == 0)
{
m_preds[target].push_back(pos);
}
}
};
if (pos < lsa || pos >= limit)
{
// Don't analyse if already beyond the limit
next_block();
continue;
}
const u32 data = ls[pos / 4];
const auto op = spu_opcode_t{data};
wa += 4;
m_targets.erase(pos);
// Fill register access info
if (auto iflags = g_spu_iflag.decode(data))
{
if (+iflags & +spu_iflag::use_ra)
m_use_ra.set(pos / 4);
if (+iflags & +spu_iflag::use_rb)
m_use_rb.set(pos / 4);
if (+iflags & +spu_iflag::use_rc)
m_use_rc.set(pos / 4);
}
// Analyse instruction
switch (const auto type = g_spu_itype.decode(data))
{
case spu_itype::UNK:
case spu_itype::DFCEQ:
case spu_itype::DFCMEQ:
case spu_itype::DFCGT:
case spu_itype::DFCMGT:
case spu_itype::DFTSV:
{
next_block();
continue;
}
case spu_itype::SYNC:
case spu_itype::STOP:
case spu_itype::STOPD:
{
if (data == 0)
{
// Stop before null data
next_block();
continue;
}
if (g_cfg.core.spu_block_size == spu_block_size_type::safe)
{
// Stop on special instructions (TODO)
m_targets[pos];
next_block();
break;
}
if (type == spu_itype::SYNC)
{
// Remember
sync = true;
}
break;
}
case spu_itype::IRET:
{
if (op.d && op.e)
{
spu_log.error("[0x%x] Invalid interrupt flags (DE)", pos);
}
m_targets[pos];
next_block();
break;
}
case spu_itype::BI:
case spu_itype::BISL:
case spu_itype::BISLED:
case spu_itype::BIZ:
case spu_itype::BINZ:
case spu_itype::BIHZ:
case spu_itype::BIHNZ:
{
if (op.d && op.e)
{
spu_log.error("[0x%x] Invalid interrupt flags (DE)", pos);
}
const auto af = vflags[op.ra];
const auto av = values[op.ra];
const bool sl = type == spu_itype::BISL || type == spu_itype::BISLED;
if (sl)
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = pos + 4;
}
if (af & vf::is_const)
{
const u32 target = spu_branch_target(av);
spu_log.warning("[0x%x] At 0x%x: indirect branch to 0x%x%s", entry_point, pos, target, op.d ? " (D)" : op.e ? " (E)" : "");
if (type == spu_itype::BI && target == pos + 4 && op.d)
{
// Disable interrupts idiom
break;
}
m_targets[pos].push_back(target);
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
if (sync)
{
spu_log.notice("[0x%x] At 0x%x: ignoring %scall to 0x%x (SYNC)", entry_point, pos, sl ? "" : "tail ", target);
if (target > entry_point)
{
limit = std::min<u32>(limit, target);
}
}
else
{
m_entry_info[target / 4] = true;
add_block(target);
}
}
else if (target > entry_point)
{
limit = std::min<u32>(limit, target);
}
if (sl && g_cfg.core.spu_block_size != spu_block_size_type::safe)
{
m_ret_info[pos / 4 + 1] = true;
m_entry_info[pos / 4 + 1] = true;
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
}
else if (type == spu_itype::BI && g_cfg.core.spu_block_size != spu_block_size_type::safe && !op.d && !op.e && !sync)
{
// Analyse jump table (TODO)
std::basic_string<u32> jt_abs;
std::basic_string<u32> jt_rel;
const u32 start = pos + 4;
u64 dabs = 0;
u64 drel = 0;
for (u32 i = start; i < limit; i += 4)
{
const u32 target = ls[i / 4];
if (target == 0 || target % 4)
{
// Address cannot be misaligned: abort
break;
}
if (target >= lsa && target < 0x40000)
{
// Possible jump table entry (absolute)
jt_abs.push_back(target);
}
if (target + start >= lsa && target + start < 0x40000)
{
// Possible jump table entry (relative)
jt_rel.push_back(target + start);
}
if (std::max(jt_abs.size(), jt_rel.size()) * 4 + start <= i)
{
// Neither type of jump table completes
jt_abs.clear();
jt_rel.clear();
break;
}
}
// Choose position after the jt as an anchor and compute the average distance
for (u32 target : jt_abs)
{
dabs += std::abs(static_cast<s32>(target - start - jt_abs.size() * 4));
}
for (u32 target : jt_rel)
{
drel += std::abs(static_cast<s32>(target - start - jt_rel.size() * 4));
}
// Add detected jump table blocks
if (jt_abs.size() >= 3 || jt_rel.size() >= 3)
{
if (jt_abs.size() == jt_rel.size())
{
if (dabs < drel)
{
jt_rel.clear();
}
if (dabs > drel)
{
jt_abs.clear();
}
ensure(jt_abs.size() != jt_rel.size());
}
if (jt_abs.size() >= jt_rel.size())
{
const u32 new_size = (start - lsa) / 4 + ::size32(jt_abs);
if (result.data.size() < new_size)
{
result.data.resize(new_size);
}
for (u32 i = 0; i < jt_abs.size(); i++)
{
add_block(jt_abs[i]);
result.data[(start - lsa) / 4 + i] = std::bit_cast<u32, be_t<u32>>(jt_abs[i]);
m_targets[start + i * 4];
}
m_targets.emplace(pos, std::move(jt_abs));
}
if (jt_rel.size() >= jt_abs.size())
{
const u32 new_size = (start - lsa) / 4 + ::size32(jt_rel);
if (result.data.size() < new_size)
{
result.data.resize(new_size);
}
for (u32 i = 0; i < jt_rel.size(); i++)
{
add_block(jt_rel[i]);
result.data[(start - lsa) / 4 + i] = std::bit_cast<u32, be_t<u32>>(jt_rel[i] - start);
m_targets[start + i * 4];
}
m_targets.emplace(pos, std::move(jt_rel));
}
}
else if (start + 12 * 4 < limit &&
ls[start / 4 + 0] == 0x1ce00408u &&
ls[start / 4 + 1] == 0x24000389u &&
ls[start / 4 + 2] == 0x24004809u &&
ls[start / 4 + 3] == 0x24008809u &&
ls[start / 4 + 4] == 0x2400c809u &&
ls[start / 4 + 5] == 0x24010809u &&
ls[start / 4 + 6] == 0x24014809u &&
ls[start / 4 + 7] == 0x24018809u &&
ls[start / 4 + 8] == 0x1c200807u &&
ls[start / 4 + 9] == 0x2401c809u)
{
spu_log.warning("[0x%x] Pattern 1 detected (hbr=0x%x:0x%x)", pos, hbr_loc, hbr_tg);
// Add 8 targets (TODO)
for (u32 addr = start + 4; addr < start + 36; addr += 4)
{
m_targets[pos].push_back(addr);
add_block(addr);
}
}
else if (hbr_loc > start && hbr_loc < limit && hbr_tg == start)
{
spu_log.warning("[0x%x] No patterns detected (hbr=0x%x:0x%x)", pos, hbr_loc, hbr_tg);
}
}
else if (type == spu_itype::BI && sync)
{
spu_log.notice("[0x%x] At 0x%x: ignoring indirect branch (SYNC)", entry_point, pos);
}
if (type == spu_itype::BI || sl)
{
if (type == spu_itype::BI || g_cfg.core.spu_block_size == spu_block_size_type::safe)
{
m_targets[pos];
}
else
{
m_ret_info[pos / 4 + 1] = true;
m_entry_info[pos / 4 + 1] = true;
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
}
else
{
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
next_block();
break;
}
case spu_itype::BRSL:
case spu_itype::BRASL:
{
const u32 target = spu_branch_target(type == spu_itype::BRASL ? 0 : pos, op.i16);
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = pos + 4;
if (type == spu_itype::BRSL && target == pos + 4)
{
// Get next instruction address idiom
break;
}
m_targets[pos].push_back(target);
if (g_cfg.core.spu_block_size != spu_block_size_type::safe)
{
m_ret_info[pos / 4 + 1] = true;
m_entry_info[pos / 4 + 1] = true;
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
if (g_cfg.core.spu_block_size == spu_block_size_type::giga && !sync)
{
m_entry_info[target / 4] = true;
add_block(target);
}
else
{
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
spu_log.notice("[0x%x] At 0x%x: ignoring fixed call to 0x%x (SYNC)", entry_point, pos, target);
}
if (target > entry_point)
{
limit = std::min<u32>(limit, target);
}
}
next_block();
break;
}
case spu_itype::BRA:
{
const u32 target = spu_branch_target(0, op.i16);
if (g_cfg.core.spu_block_size == spu_block_size_type::giga && !sync)
{
m_entry_info[target / 4] = true;
}
else
{
m_targets[pos].push_back(target);
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
spu_log.notice("[0x%x] At 0x%x: ignoring fixed tail call to 0x%x (SYNC)", entry_point, pos, target);
}
}
add_block(target);
next_block();
break;
}
case spu_itype::BR:
case spu_itype::BRZ:
case spu_itype::BRNZ:
case spu_itype::BRHZ:
case spu_itype::BRHNZ:
{
const u32 target = spu_branch_target(pos, op.i16);
if (target == pos + 4)
{
// Nop
break;
}
m_targets[pos].push_back(target);
add_block(target);
if (type != spu_itype::BR)
{
m_targets[pos].push_back(pos + 4);
add_block(pos + 4);
}
next_block();
break;
}
case spu_itype::DSYNC:
case spu_itype::HEQ:
case spu_itype::HEQI:
case spu_itype::HGT:
case spu_itype::HGTI:
case spu_itype::HLGT:
case spu_itype::HLGTI:
case spu_itype::LNOP:
case spu_itype::NOP:
case spu_itype::MTSPR:
case spu_itype::FSCRWR:
case spu_itype::STQA:
case spu_itype::STQD:
case spu_itype::STQR:
case spu_itype::STQX:
{
// Do nothing
break;
}
case spu_itype::WRCH:
{
switch (op.ra)
{
case MFC_EAL:
{
m_regmod[pos / 4] = s_reg_mfc_eal;
break;
}
case MFC_LSA:
{
m_regmod[pos / 4] = s_reg_mfc_lsa;
break;
}
case MFC_TagID:
{
m_regmod[pos / 4] = s_reg_mfc_tag;
break;
}
case MFC_Size:
{
m_regmod[pos / 4] = s_reg_mfc_size;
break;
}
default: break;
}
break;
}
case spu_itype::LQA:
case spu_itype::LQD:
case spu_itype::LQR:
case spu_itype::LQX:
{
// Unconst
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = {};
break;
}
case spu_itype::HBR:
{
hbr_loc = spu_branch_target(pos, op.roh << 7 | op.rt);
hbr_tg = vflags[op.ra] & vf::is_const && !op.c ? values[op.ra] & 0x3fffc : -1;
break;
}
case spu_itype::HBRA:
{
hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt);
hbr_tg = spu_branch_target(0x0, op.i16);
break;
}
case spu_itype::HBRR:
{
hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt);
hbr_tg = spu_branch_target(pos, op.i16);
break;
}
case spu_itype::IL:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = op.si16;
break;
}
case spu_itype::ILA:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = op.i18;
break;
}
case spu_itype::ILH:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = op.i16 << 16 | op.i16;
break;
}
case spu_itype::ILHU:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = +vf::is_const;
values[op.rt] = op.i16 << 16;
break;
}
case spu_itype::IOHL:
{
m_regmod[pos / 4] = op.rt;
values[op.rt] = values[op.rt] | op.i16;
break;
}
case spu_itype::ORI:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] | op.si10;
break;
}
case spu_itype::OR:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.ra] | values[op.rb];
break;
}
case spu_itype::ANDI:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] & op.si10;
break;
}
case spu_itype::AND:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.ra] & values[op.rb];
break;
}
case spu_itype::AI:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] + op.si10;
break;
}
case spu_itype::A:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.ra] + values[op.rb];
break;
}
case spu_itype::SFI:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = op.si10 - values[op.ra];
break;
}
case spu_itype::SF:
{
m_regmod[pos / 4] = op.rt;
vflags[op.rt] = vflags[op.ra] & vflags[op.rb] & vf::is_const;
values[op.rt] = values[op.rb] - values[op.ra];
break;
}
case spu_itype::ROTMI:
{
m_regmod[pos / 4] = op.rt;
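// ROTMI encodes a logical right shift by (0 - i7) mod 64: counts of 32..63 (bit 5
// set) clear the word entirely, so the result stays a known constant (zero)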
if ((0 - op.i7) & 0x20)
{
vflags[op.rt] = +vf::is_const;
values[op.rt] = 0;
break;
}
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] >> ((0 - op.i7) & 0x1f);
break;
}
case spu_itype::SHLI:
{
m_regmod[pos / 4] = op.rt;
if (op.i7 & 0x20)
{
vflags[op.rt] = +vf::is_const;
values[op.rt] = 0;
break;
}
vflags[op.rt] = vflags[op.ra] & vf::is_const;
values[op.rt] = values[op.ra] << (op.i7 & 0x1f);
break;
}
default:
{
// Unconst
const u32 op_rt = type & spu_itype::_quadrop ? +op.rt4 : +op.rt;
m_regmod[pos / 4] = op_rt;
vflags[op_rt] = {};
break;
}
}
// Insert raw instruction value
const u32 new_size = (pos - lsa) / 4;
if (result.data.size() <= new_size)
{
if (result.data.size() < new_size)
{
result.data.resize(new_size);
}
result.data.emplace_back(std::bit_cast<u32, be_t<u32>>(data));
}
else if (u32& raw_val = result.data[new_size])
{
ensure(raw_val == std::bit_cast<u32, be_t<u32>>(data));
}
else
{
raw_val = std::bit_cast<u32, be_t<u32>>(data);
}
}
while (lsa > 0 || limit < 0x40000)
{
const u32 initial_size = ::size32(result.data);
// Check unreachable blocks
limit = std::min<u32>(limit, lsa + initial_size * 4);
for (auto& pair : m_preds)
{
bool reachable = false;
if (pair.first >= limit)
{
continue;
}
// All (direct and indirect) predecessors to check
std::basic_string<u32> workload;
// Bit array used to deduplicate workload list
workload.push_back(pair.first);
m_bits[pair.first / 4] = true;
for (usz i = 0; !reachable && i < workload.size(); i++)
{
for (u32 j = workload[i];; j -= 4)
{
// Go backward from an address until the entry point is reached
if (j == entry_point)
{
reachable = true;
break;
}
const auto found = m_preds.find(j);
bool had_fallthrough = false;
if (found != m_preds.end())
{
for (u32 new_pred : found->second)
{
// Check whether the predecessor is previous instruction
if (new_pred == j - 4)
{
had_fallthrough = true;
continue;
}
// Check whether in range and not already added
if (new_pred >= lsa && new_pred < limit && !m_bits[new_pred / 4])
{
workload.push_back(new_pred);
m_bits[new_pred / 4] = true;
}
}
}
// Check for possible fallthrough predecessor
if (!had_fallthrough)
{
if (::at32(result.data, (j - lsa) / 4 - 1) == 0 || m_targets.count(j - 4))
2018-06-01 23:33:48 +02:00
{
break;
}
}
if (i == 0)
{
// TODO
}
}
}
for (u32 pred : workload)
{
m_bits[pred / 4] = false;
}
if (!reachable && pair.first < limit)
{
limit = pair.first;
}
}
result.data.resize((limit - lsa) / 4);
// Check holes in safe mode (TODO)
u32 valid_size = 0;
for (u32 i = 0; i < result.data.size(); i++)
{
if (result.data[i] == 0)
{
const u32 pos = lsa + i * 4;
const u32 data = ls[pos / 4];
// Allow only NOP or LNOP instructions in holes
if (data == 0x200000 || (data & 0xffffff80) == 0x40200000)
{
continue;
}
if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
{
result.data.resize(valid_size);
break;
}
}
else
{
valid_size = i + 1;
}
}
// Even NOP or LNOP holes should be removed at the end
result.data.resize(valid_size);
// Repeat if blocks were removed
if (result.data.size() == initial_size)
{
break;
}
}
limit = std::min<u32>(limit, lsa + ::size32(result.data) * 4);
m_inst_attrs.resize(result.data.size());
// Cleanup block info
for (u32 i = 0; i < workload.size(); i++)
{
const u32 addr = workload[i];
if (addr < lsa || addr >= limit || !result.data[(addr - lsa) / 4])
{
m_block_info[addr / 4] = false;
m_entry_info[addr / 4] = false;
m_ret_info[addr / 4] = false;
m_preds.erase(addr);
}
}
// Complete m_preds and associated m_targets for adjacent blocks
for (auto it = m_preds.begin(); it != m_preds.end();)
{
if (it->first < lsa || it->first >= limit)
{
it = m_preds.erase(it);
continue;
}
// Erase impossible predecessors
const auto new_end = std::remove_if(it->second.begin(), it->second.end(), [&](u32 addr)
{
return addr < lsa || addr >= limit;
});
it->second.erase(new_end, it->second.end());
// Don't add fallthrough target if all predecessors are removed
if (it->second.empty() && !m_entry_info[it->first / 4])
{
// If not an entry point, remove the block completely
m_block_info[it->first / 4] = false;
it = m_preds.erase(it);
continue;
}
// Previous instruction address
const u32 prev = (it->first - 4) & 0x3fffc;
// TODO: check the correctness
if (m_targets.count(prev) == 0 && prev >= lsa && prev < limit && result.data[(prev - lsa) / 4])
{
// Add target and the predecessor
m_targets[prev].push_back(it->first);
it->second.push_back(prev);
}
it++;
}
if (out_target_list)
{
out_target_list->insert(m_targets.begin(), m_targets.end());
}
// Remove unnecessary target lists
for (auto it = m_targets.begin(); it != m_targets.end();)
{
if (it->first < lsa || it->first >= limit)
{
it = m_targets.erase(it);
continue;
}
it++;
}
// Fill holes which contain only NOP and LNOP instructions (TODO: compile)
for (u32 i = 0, nnop = 0, vsize = 0; i <= result.data.size(); i++)
{
if (i >= result.data.size() || result.data[i])
{
if (nnop && nnop == i - vsize)
{
// Write only complete NOP sequence
for (u32 j = vsize; j < i; j++)
{
result.data[j] = std::bit_cast<u32, be_t<u32>>(ls[lsa / 4 + j]);
}
}
nnop = 0;
vsize = i + 1;
}
else
{
const u32 pos = lsa + i * 4;
const u32 data = ls[pos / 4];
if (data == 0x200000 || (data & 0xffffff80) == 0x40200000)
{
nnop++;
}
}
}
// Fill block info
for (auto& pred : m_preds)
{
auto& block = m_bbs[pred.first];
// Copy predecessors (wrong at this point, needs a fixup later)
block.preds = pred.second;
// Fill register usage info
for (u32 ia = pred.first; ia < limit; ia += 4)
{
block.size++;
// Decode instruction
const spu_opcode_t op{std::bit_cast<be_t<u32>>(result.data[(ia - lsa) / 4])};
const auto type = g_spu_itype.decode(op.opcode);
u8 reg_save = 255;
if (type == spu_itype::STQD && op.ra == s_reg_sp && !block.reg_mod[op.rt] && !block.reg_use[op.rt])
{
// Register saved onto the stack before use
block.reg_save_dom[op.rt] = true;
reg_save = op.rt;
}
for (auto _use : std::initializer_list<std::pair<u32, bool>>{{op.ra, m_use_ra.test(ia / 4)}
, {op.rb, m_use_rb.test(ia / 4)}, {op.rc, m_use_rc.test(ia / 4)}})
{
if (_use.second)
{
const u32 reg = _use.first;
// Register reg use only if it happens before reg mod
if (!block.reg_mod[reg])
{
block.reg_use.set(reg);
if (reg_save != reg && block.reg_save_dom[reg])
{
// Register is still used after saving; probably not eligible for optimization
block.reg_save_dom[reg] = false;
}
}
}
}
if (type == spu_itype::WRCH && op.ra == MFC_Cmd)
{
// Expand MFC_Cmd reg use
for (u8 reg : {s_reg_mfc_lsa, s_reg_mfc_tag, s_reg_mfc_size})
{
if (!block.reg_mod[reg])
block.reg_use.set(reg);
}
}
// Register reg modification
if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max)
{
block.reg_mod.set(reg);
block.reg_mod_xf.set(reg, type & spu_itype::xfloat);
if (type == spu_itype::SELB && (block.reg_mod_xf[op.ra] || block.reg_mod_xf[op.rb]))
block.reg_mod_xf.set(reg);
// Possible post-dominating register load
if (type == spu_itype::LQD && op.ra == s_reg_sp)
block.reg_load_mod[reg] = ia + 1;
else
block.reg_load_mod[reg] = 0;
}
// Find targets (also means end of the block)
const auto tfound = m_targets.find(ia);
if (tfound != m_targets.end())
{
// Copy targets
block.targets = tfound->second;
// Assume that the call reads and modifies all volatile registers (TODO)
bool is_call = false;
bool is_tail = false;
switch (type)
{
case spu_itype::BRSL:
is_call = spu_branch_target(ia, op.i16) != ia + 4;
break;
case spu_itype::BRASL:
is_call = spu_branch_target(0, op.i16) != ia + 4;
break;
case spu_itype::BISL:
case spu_itype::BISLED:
is_call = true;
break;
default:
break;
}
if (is_call)
{
for (u32 i = 0; i < s_reg_max; ++i)
{
if (i == s_reg_lr || (i >= 2 && i < s_reg_80) || i > s_reg_127)
{
if (!block.reg_mod[i])
block.reg_use.set(i);
if (!is_tail)
{
block.reg_mod.set(i);
block.reg_mod_xf[i] = false;
}
}
}
}
break;
}
}
}
// Fixup block predecessors to point to basic blocks, not last instructions
for (auto& bb : m_bbs)
{
const u32 addr = bb.first;
for (u32& pred : bb.second.preds)
{
pred = std::prev(m_bbs.upper_bound(pred))->first;
}
if (m_entry_info[addr / 4] && g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
// Register empty chunk
m_chunks.push_back(addr);
// Register function if necessary
if (!m_ret_info[addr / 4])
{
m_funcs[addr];
}
}
}
// Ensure there is a function at the lowest address
if (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
if (auto emp = m_funcs.try_emplace(m_bbs.begin()->first); emp.second)
{
const u32 addr = emp.first->first;
spu_log.error("[0x%05x] Fixed first function at 0x%05x", entry_point, addr);
m_entry_info[addr / 4] = true;
m_ret_info[addr / 4] = false;
}
}
// Split functions
while (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
bool need_repeat = false;
u32 start = 0;
u32 limit = 0x40000;
// Walk block list in ascending order
for (auto& block : m_bbs)
{
const u32 addr = block.first;
if (m_entry_info[addr / 4] && !m_ret_info[addr / 4])
{
const auto upper = m_funcs.upper_bound(addr);
start = addr;
limit = upper == m_funcs.end() ? 0x40000 : upper->first;
}
// Find targets that exceed [start; limit) range and make new functions from them
for (u32 target : block.second.targets)
{
const auto tfound = m_bbs.find(target);
if (tfound == m_bbs.end())
{
continue;
}
if (target < start || target >= limit)
{
if (!m_entry_info[target / 4] || m_ret_info[target / 4])
{
// Create new function entry (likely a tail call)
m_entry_info[target / 4] = true;
m_ret_info[target / 4] = false;
m_funcs.try_emplace(target);
if (target < limit)
{
need_repeat = true;
}
}
}
}
block.second.func = start;
}
if (!need_repeat)
{
break;
}
}
if (!m_bbs.count(entry_point))
{
// Invalid code
spu_log.error("[0x%x] Invalid code", entry_point);
return {};
}
// Fill entry map
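// Propagate chunk ownership from entry points through the control flow graph; any
// block reachable from two different chunks is promoted to an artificial (reversible)
// entry point of its own, and the pass restarts until the assignment is stable.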
while (true)
{
workload.clear();
workload.push_back(entry_point);
ensure(m_bbs.count(entry_point));
std::basic_string<u32> new_entries;
for (u32 wi = 0; wi < workload.size(); wi++)
{
const u32 addr = workload[wi];
auto& block = ::at32(m_bbs, addr);
const u32 _new = block.chunk;
if (!m_entry_info[addr / 4])
{
// Check block predecessors
for (u32 pred : block.preds)
{
const u32 _old = ::at32(m_bbs, pred).chunk;
if (_old < 0x40000 && _old != _new)
{
// If block has multiple 'entry' points, it becomes an entry point itself
new_entries.push_back(addr);
}
}
}
// Update chunk address
block.chunk = m_entry_info[addr / 4] ? addr : _new;
// Process block targets
for (u32 target : block.targets)
{
const auto tfound = m_bbs.find(target);
if (tfound == m_bbs.end())
{
continue;
}
auto& tb = tfound->second;
const u32 value = m_entry_info[target / 4] ? target : block.chunk;
if (u32& tval = tb.chunk; tval < 0x40000)
{
// TODO: fix condition
if (tval != value && !m_entry_info[target / 4])
{
new_entries.push_back(target);
}
}
else
{
tval = value;
workload.emplace_back(target);
}
}
}
if (new_entries.empty())
{
break;
}
for (u32 entry : new_entries)
{
m_entry_info[entry / 4] = true;
// Acknowledge artificial (reversible) chunk entry point
m_ret_info[entry / 4] = true;
}
for (auto& bb : m_bbs)
{
// Reset chunk info
bb.second.chunk = 0x40000;
}
}
workload.clear();
workload.push_back(entry_point);
// Fill workload adding targets
for (u32 wi = 0; wi < workload.size(); wi++)
{
const u32 addr = workload[wi];
auto& block = ::at32(m_bbs, addr);
block.analysed = true;
for (u32 target : block.targets)
{
const auto tfound = m_bbs.find(target);
if (tfound == m_bbs.end())
{
continue;
}
auto& tb = tfound->second;
if (!tb.analysed)
{
workload.push_back(target);
tb.analysed = true;
}
// Limited xfloat hint propagation (possibly TODO)
if (tb.chunk == block.chunk)
{
tb.reg_maybe_xf &= block.reg_mod_xf;
}
else
{
tb.reg_maybe_xf.reset();
}
}
block.reg_origin.fill(0x80000000);
block.reg_origin_abs.fill(0x80000000);
}
// Fill register origin info
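// Sentinel values used below: 0x80000000 = not yet computed, 0x40000 = external or
// unknown origin (requires load), -1 = multiple origins merged (needs a PHI node),
// -2 = sticky variant of -1 that may also include the external origin.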
while (true)
{
bool must_repeat = false;
for (u32 wi = 0; wi < workload.size(); wi++)
{
const u32 addr = workload[wi];
auto& block = ::at32(m_bbs, addr);
// Initialize entry point with default value: unknown origin (requires load)
if (m_entry_info[addr / 4])
{
for (u32 i = 0; i < s_reg_max; i++)
{
if (block.reg_origin[i] == 0x80000000)
block.reg_origin[i] = 0x40000;
}
}
if (g_cfg.core.spu_block_size == spu_block_size_type::giga && m_entry_info[addr / 4] && !m_ret_info[addr / 4])
{
for (u32 i = 0; i < s_reg_max; i++)
{
if (block.reg_origin_abs[i] == 0x80000000)
block.reg_origin_abs[i] = 0x40000;
else if (block.reg_origin_abs[i] + 1 == 0)
block.reg_origin_abs[i] = -2;
}
}
for (u32 target : block.targets)
{
const auto tfound = m_bbs.find(target);
if (tfound == m_bbs.end())
{
continue;
}
auto& tb = tfound->second;
for (u32 i = 0; i < s_reg_max; i++)
{
if (tb.chunk == block.chunk && tb.reg_origin[i] + 1)
{
const u32 expected = block.reg_mod[i] ? addr : block.reg_origin[i];
if (tb.reg_origin[i] == 0x80000000)
{
tb.reg_origin[i] = expected;
}
else if (tb.reg_origin[i] != expected)
{
// Set -1 if multiple origins merged (requires PHI node)
tb.reg_origin[i] = -1;
must_repeat |= !tb.targets.empty();
}
}
if (g_cfg.core.spu_block_size == spu_block_size_type::giga && tb.func == block.func && tb.reg_origin_abs[i] + 2)
{
const u32 expected = block.reg_mod[i] ? addr : block.reg_origin_abs[i];
if (tb.reg_origin_abs[i] == 0x80000000)
{
tb.reg_origin_abs[i] = expected;
}
else if (tb.reg_origin_abs[i] != expected)
{
if (tb.reg_origin_abs[i] == 0x40000 || expected + 2 == 0 || expected == 0x40000)
{
// Set -2: sticky value indicating possible external reg origin (0x40000)
tb.reg_origin_abs[i] = -2;
must_repeat |= !tb.targets.empty();
}
else if (tb.reg_origin_abs[i] + 1)
{
tb.reg_origin_abs[i] = -1;
must_repeat |= !tb.targets.empty();
}
}
}
}
}
}
if (!must_repeat)
{
break;
}
for (u32 wi = 0; wi < workload.size(); wi++)
{
const u32 addr = workload[wi];
auto& block = ::at32(m_bbs, addr);
// Reset values for the next attempt (keep negative values)
for (u32 i = 0; i < s_reg_max; i++)
{
if (block.reg_origin[i] <= 0x40000)
block.reg_origin[i] = 0x80000000;
if (block.reg_origin_abs[i] <= 0x40000)
block.reg_origin_abs[i] = 0x80000000;
}
}
}
// Fill more block info
for (u32 wi = 0; wi < workload.size(); wi++)
{
if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
{
break;
}
const u32 addr = workload[wi];
auto& bb = ::at32(m_bbs, addr);
auto& func = ::at32(m_funcs, bb.func);
// Update function size
func.size = std::max<u16>(func.size, bb.size + (addr - bb.func) / 4);
// Copy constants according to reg origin info
for (u32 i = 0; i < s_reg_max; i++)
{
const u32 orig = bb.reg_origin_abs[i];
if (orig < 0x40000)
{
auto& src = ::at32(m_bbs, orig);
bb.reg_const[i] = src.reg_const[i];
bb.reg_val32[i] = src.reg_val32[i];
}
if (!bb.reg_save_dom[i] && bb.reg_use[i] && (orig == 0x40000 || orig + 2 == 0))
{
// Destroy offset if external reg value is used
func.reg_save_off[i] = -1;
}
}
if (u32 orig = bb.reg_origin_abs[s_reg_sp]; orig < 0x40000)
{
auto& prologue = ::at32(m_bbs, orig);
// Copy stack offset (from the assumed prologue)
bb.stack_sub = prologue.stack_sub;
}
else if (orig > 0x40000)
{
// Unpredictable stack
bb.stack_sub = 0x80000000;
}
spu_opcode_t op{};
auto last_inst = spu_itype::UNK;
for (u32 ia = addr; ia < addr + bb.size * 4; ia += 4)
{
// Decode instruction again
op.opcode = std::bit_cast<be_t<u32>>(result.data[(ia - lsa) / 4]);
last_inst = g_spu_itype.decode(op.opcode);
// Propagate some constants
switch (last_inst)
{
case spu_itype::IL:
{
bb.reg_const[op.rt] = true;
bb.reg_val32[op.rt] = op.si16;
break;
}
case spu_itype::ILA:
{
bb.reg_const[op.rt] = true;
bb.reg_val32[op.rt] = op.i18;
break;
}
case spu_itype::ILHU:
{
bb.reg_const[op.rt] = true;
bb.reg_val32[op.rt] = op.i16 << 16;
break;
}
case spu_itype::ILH:
{
bb.reg_const[op.rt] = true;
bb.reg_val32[op.rt] = op.i16 << 16 | op.i16;
break;
}
case spu_itype::IOHL:
{
bb.reg_val32[op.rt] = bb.reg_val32[op.rt] | op.i16;
break;
}
case spu_itype::ORI:
{
bb.reg_const[op.rt] = bb.reg_const[op.ra];
bb.reg_val32[op.rt] = bb.reg_val32[op.ra] | op.si10;
break;
}
case spu_itype::OR:
{
bb.reg_const[op.rt] = bb.reg_const[op.ra] && bb.reg_const[op.rb];
bb.reg_val32[op.rt] = bb.reg_val32[op.ra] | bb.reg_val32[op.rb];
break;
}
case spu_itype::AI:
{
bb.reg_const[op.rt] = bb.reg_const[op.ra];
bb.reg_val32[op.rt] = bb.reg_val32[op.ra] + op.si10;
break;
}
case spu_itype::A:
{
bb.reg_const[op.rt] = bb.reg_const[op.ra] && bb.reg_const[op.rb];
bb.reg_val32[op.rt] = bb.reg_val32[op.ra] + bb.reg_val32[op.rb];
break;
}
case spu_itype::SFI:
{
bb.reg_const[op.rt] = bb.reg_const[op.ra];
bb.reg_val32[op.rt] = op.si10 - bb.reg_val32[op.ra];
break;
}
case spu_itype::SF:
{
bb.reg_const[op.rt] = bb.reg_const[op.ra] && bb.reg_const[op.rb];
bb.reg_val32[op.rt] = bb.reg_val32[op.rb] - bb.reg_val32[op.ra];
break;
}
case spu_itype::STQD:
{
if (op.ra == s_reg_sp && bb.stack_sub != 0x80000000 && bb.reg_save_dom[op.rt])
{
const u32 offset = 0x80000000 + op.si10 * 16 - bb.stack_sub;
if (func.reg_save_off[op.rt] == 0)
{
// Store reg save offset
func.reg_save_off[op.rt] = offset;
}
else if (func.reg_save_off[op.rt] != offset)
{
// Conflict of different offsets
func.reg_save_off[op.rt] = -1;
}
}
break;
}
case spu_itype::LQD:
{
if (op.ra == s_reg_sp && bb.stack_sub != 0x80000000 && bb.reg_load_mod[op.rt] == ia + 1)
{
// Adjust reg load offset
bb.reg_load_mod[op.rt] = 0x80000000 + op.si10 * 16 - bb.stack_sub;
}
// Clear const
bb.reg_const[op.rt] = false;
break;
}
default:
{
// Clear const if reg is modified here
if (u8 reg = m_regmod[ia / 4]; reg < s_reg_max)
bb.reg_const[reg] = false;
break;
}
}
// $SP is modified
if (m_regmod[ia / 4] == s_reg_sp)
{
if (bb.reg_const[s_reg_sp])
{
// Making $SP a constant is a funny thing too.
bb.stack_sub = 0x80000000;
}
if (bb.stack_sub != 0x80000000)
{
switch (last_inst)
{
case spu_itype::AI:
{
if (op.ra == s_reg_sp)
bb.stack_sub -= op.si10;
else
bb.stack_sub = 0x80000000;
break;
}
case spu_itype::A:
{
if (op.ra == s_reg_sp && bb.reg_const[op.rb])
bb.stack_sub -= bb.reg_val32[op.rb];
else if (op.rb == s_reg_sp && bb.reg_const[op.ra])
bb.stack_sub -= bb.reg_val32[op.ra];
else
bb.stack_sub = 0x80000000;
break;
}
case spu_itype::SF:
{
if (op.rb == s_reg_sp && bb.reg_const[op.ra])
bb.stack_sub += bb.reg_val32[op.ra];
else
bb.stack_sub = 0x80000000;
break;
}
default:
{
bb.stack_sub = 0x80000000;
break;
}
}
}
// Check for funny values.
if (bb.stack_sub >= 0x40000 || bb.stack_sub % 16)
{
bb.stack_sub = 0x80000000;
}
}
}
// Analyse terminator instruction
const u32 tia = addr + bb.size * 4 - 4;
switch (last_inst)
{
case spu_itype::BR:
case spu_itype::BRNZ:
case spu_itype::BRZ:
case spu_itype::BRHNZ:
case spu_itype::BRHZ:
case spu_itype::BRSL:
{
const u32 target = spu_branch_target(tia, op.i16);
if (target == tia + 4)
{
bb.terminator = term_type::fallthrough;
}
else if (last_inst != spu_itype::BRSL)
{
// No-op terminator or simple branch instruction
bb.terminator = term_type::br;
if (target == bb.func)
{
// Recursive tail call
bb.terminator = term_type::ret;
}
}
else if (op.rt == s_reg_lr)
{
bb.terminator = term_type::call;
}
else
{
bb.terminator = term_type::interrupt_call;
}
break;
}
case spu_itype::BRA:
case spu_itype::BRASL:
{
bb.terminator = term_type::indirect_call;
break;
}
case spu_itype::BI:
{
if (op.d || op.e || bb.targets.size() == 1)
{
bb.terminator = term_type::interrupt_call;
}
else if (bb.targets.size() > 1)
{
// Jump table
bb.terminator = term_type::br;
}
else if (op.ra == s_reg_lr)
{
// Return (TODO)
bb.terminator = term_type::ret;
}
else
{
// Indirect tail call (TODO)
bb.terminator = term_type::interrupt_call;
}
break;
}
case spu_itype::BISLED:
case spu_itype::IRET:
{
bb.terminator = term_type::interrupt_call;
break;
}
case spu_itype::BISL:
case spu_itype::BIZ:
case spu_itype::BINZ:
case spu_itype::BIHZ:
case spu_itype::BIHNZ:
{
if (op.d || op.e || bb.targets.size() != 1)
{
bb.terminator = term_type::interrupt_call;
}
else if (last_inst != spu_itype::BISL && bb.targets[0] == tia + 4 && op.ra == s_reg_lr)
{
// Conditional return (TODO)
bb.terminator = term_type::ret;
}
else if (last_inst == spu_itype::BISL)
{
// Indirect call
bb.terminator = term_type::indirect_call;
}
else
{
// TODO
bb.terminator = term_type::interrupt_call;
}
break;
}
default:
{
// Normal instruction
bb.terminator = term_type::fallthrough;
break;
}
}
}
// Check function blocks, verify and print some reasons
for (auto& f : m_funcs)
{
if (g_cfg.core.spu_block_size != spu_block_size_type::giga)
{
break;
}
bool is_ok = true;
u32 used_stack = 0;
for (auto it = m_bbs.lower_bound(f.first); it != m_bbs.end() && it->second.func == f.first; ++it)
{
auto& bb = it->second;
auto& func = ::at32(m_funcs, bb.func);
const u32 addr = it->first;
const u32 flim = bb.func + func.size * 4;
used_stack |= bb.stack_sub;
if (is_ok && bb.terminator >= term_type::indirect_call)
{
is_ok = false;
}
if (is_ok && bb.terminator == term_type::ret)
{
// Check $LR (alternative return registers are currently not supported)
if (u32 lr_orig = bb.reg_mod[s_reg_lr] ? addr : bb.reg_origin_abs[s_reg_lr]; lr_orig < 0x40000)
{
auto& src = ::at32(m_bbs, lr_orig);
if (src.reg_load_mod[s_reg_lr] != func.reg_save_off[s_reg_lr])
{
spu_log.error("Function 0x%05x: [0x%05x] $LR mismatch (src=0x%x; 0x%x vs 0x%x)", f.first, addr, lr_orig, src.reg_load_mod[0], func.reg_save_off[0]);
is_ok = false;
}
else if (src.reg_load_mod[s_reg_lr] == 0)
{
spu_log.error("Function 0x%05x: [0x%05x] $LR modified (src=0x%x)", f.first, addr, lr_orig);
is_ok = false;
}
}
else if (lr_orig > 0x40000)
{
spu_log.todo("Function 0x%05x: [0x%05x] $LR unpredictable (src=0x%x)", f.first, addr, lr_orig);
is_ok = false;
}
// Check $80..$127 (should be restored or unmodified)
for (u32 i = s_reg_80; is_ok && i <= s_reg_127; i++)
{
if (u32 orig = bb.reg_mod[i] ? addr : bb.reg_origin_abs[i]; orig < 0x40000)
{
auto& src = ::at32(m_bbs, orig);
if (src.reg_load_mod[i] != func.reg_save_off[i])
{
spu_log.error("Function 0x%05x: [0x%05x] $%u mismatch (src=0x%x; 0x%x vs 0x%x)", f.first, addr, i, orig, src.reg_load_mod[i], func.reg_save_off[i]);
is_ok = false;
}
}
else if (orig > 0x40000)
{
spu_log.todo("Function 0x%05x: [0x%05x] $%u unpredictable (src=0x%x)", f.first, addr, i, orig);
is_ok = false;
}
if (func.reg_save_off[i] + 1 == 0)
{
spu_log.error("Function 0x%05x: [0x%05x] $%u used incorrectly", f.first, addr, i);
is_ok = false;
}
}
// Check $SP (should be restored or unmodified)
if (bb.stack_sub != 0 && bb.stack_sub != 0x80000000)
{
spu_log.error("Function 0x%05x: [0x%05x] return with stack frame 0x%x", f.first, addr, bb.stack_sub);
is_ok = false;
}
}
if (is_ok && bb.terminator == term_type::call)
{
// Check call instruction (TODO)
if (bb.stack_sub == 0)
{
// Call without a stack frame
spu_log.error("Function 0x%05x: [0x%05x] frameless call", f.first, addr);
is_ok = false;
}
}
if (is_ok && bb.terminator == term_type::fallthrough)
{
// Can't just fall out of the function
if (bb.targets.size() != 1 || bb.targets[0] >= flim)
{
spu_log.error("Function 0x%05x: [0x%05x] bad fallthrough to 0x%x", f.first, addr, bb.targets[0]);
is_ok = false;
}
}
if (is_ok && bb.stack_sub == 0x80000000)
{
spu_log.error("Function 0x%05x: [0x%05x] bad stack frame", f.first, addr);
is_ok = false;
}
// Fill external function targets (calls, possibly tail calls)
for (u32 target : bb.targets)
{
if (target < bb.func || target >= flim || (bb.terminator == term_type::call && target == bb.func))
{
if (func.calls.find_first_of(target) + 1 == 0)
{
func.calls.push_back(target);
}
}
}
}
if (is_ok && used_stack && f.first == entry_point)
{
spu_log.error("Function 0x%05x: considered possible chunk", f.first);
is_ok = false;
}
// if (is_ok && f.first > 0x1d240 && f.first < 0x1e000)
// {
// spu_log.error("Function 0x%05x: manually disabled", f.first);
// is_ok = false;
// }
f.second.good = is_ok;
}
// Check function call graph
while (g_cfg.core.spu_block_size == spu_block_size_type::giga)
{
bool need_repeat = false;
for (auto& f : m_funcs)
{
if (!f.second.good)
{
continue;
}
for (u32 call : f.second.calls)
{
const auto ffound = std::as_const(m_funcs).find(call);
if (ffound == m_funcs.cend() || ffound->second.good == false)
{
need_repeat = true;
if (f.second.good)
{
spu_log.error("Function 0x%05x: calls bad function (0x%05x)", f.first, call);
f.second.good = false;
}
}
}
}
if (!need_repeat)
{
break;
}
}
struct putllc16_statistics_t
{
atomic_t<u64> all = 0;
atomic_t<u64> single = 0;
atomic_t<u64> nowrite = 0;
std::array<atomic_t<u64>, 128> breaking_reason{};
std::vector<std::pair<u32, u64>> get_reasons()
{
std::vector<std::pair<u32, u64>> map;
for (usz i = 0; i < breaking_reason.size(); i++)
{
if (u64 v = breaking_reason[i])
{
map.emplace_back(i, v);
}
}
std::stable_sort(map.begin(), map.end(), FN(x.second > y.second));
return map;
}
};
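// PUTLLC16 pattern tracker: records the loads and stores observed between GETLLAR
// and PUTLLC to prove that at most one 16-byte slot of the 128-byte reservation line
// is actually accessed (or none at all, see select_16_or_0_at_runtime), in which
// case the loop does not need a full 128-byte reservation (see set_invalid_ls below).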
struct atomic16_t
{
bool active = false; // GETLLAR happened
u32 lsa_pc = SPU_LS_SIZE; // PC of first LSA write
u32 lsa_last_pc = SPU_LS_SIZE; // PC of latest LSA write
u32 get_pc = SPU_LS_SIZE; // PC of GETLLAR
u32 put_pc = SPU_LS_SIZE; // PC of PUTLLC
reg_state_t ls{}; // state of LS load/store address register
reg_state_t ls_offs = reg_state_t::from_value(0); // Added value to ls
reg_state_t lsa{}; // state of LSA register on GETLLAR
reg_state_t ls_reg[8]{}; // stores/loads using register bundles with offset
reg_state_t ls_abs[8]{}; // stores/loads using absolute address
u32 reg = s_reg_max; // Source of address register of LS load/store
u32 reg2 = s_reg_max; // Source 2 of address register of LS load/store (STQX/LQX)
//u32 ls_offs[8]{}; // LS offset from register (0 if const)
bool ls_pc_rel = false; // For STQR/LQR
bool ls_access = false; // LS accessed
bool ls_write = false; // LS written
bool ls_invalid = false; // From this point and on, any store will cancel the optimization
bool select_16_or_0_at_runtime = false;
bool put_active = false; // PUTLLC happened
bool get_rdatomic = false; // True if MFC_RdAtomicStat was read after GETLLAR
u32 mem_count = 0;
// Return old state for error reporting
atomic16_t discard()
{
const u32 pc = lsa_pc;
const u32 last_pc = lsa_last_pc;
const atomic16_t old = *this;
*this = atomic16_t{};
// Keep some members
lsa_pc = pc;
lsa_last_pc = last_pc;
return old;
}
// Conditional breakage (break if a full 128-byte reservation is needed)
atomic16_t set_invalid_ls(bool write)
{
ls_invalid = true;
ls_write |= write;
if (write)
{
return discard();
}
return atomic16_t{};
}
};
// Reset tags
reg_state_t::alloc_tag(true);
std::map<u32, std::unique_ptr<block_reg_info>> infos;
infos.emplace(entry_point, block_reg_info::create(entry_point));
struct block_reg_state_iterator
{
u32 pc{};
usz parent_iterator_index = umax;
usz parent_target_index = 0;
usz iterator_id = 0;
usz temp_child_index = umax;
usz temp_list_index = umax;
// PUTLLC16 optimization analysis tracker
atomic16_t atomic16{};
block_reg_state_iterator(u32 _pc, usz _parent_iterator_index = umax, usz _parent_target_index = 0) noexcept
: pc(_pc)
, parent_iterator_index(_parent_iterator_index)
, parent_target_index(_parent_target_index)
{
}
};
std::vector<block_reg_state_iterator> reg_state_it;
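// reg_state_it is an explicit DFS stack over basic blocks: each frame records its
// parent frame and which of the parent's targets it came from, so the walk can
// resume at the next sibling target once a path is exhausted or popped.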
std::map<u32, atomic16_t> atomic16_all; // RdAtomicStat location -> atomic loop optimization state
std::map<u32, bool> getllar_starts; // True for failed loops
std::map<u32, bool> run_on_block;
std::map<u32, bool> logged_block;
std::array<reg_state_t, s_reg_max>* true_state_walkby = nullptr;
atomic16_t dummy16{};
bool likely_putllc_loop = false;
bool had_putllc_evaluation = false;
for (u32 i = 0, count = 0; i < result.data.size(); i++)
{
const u32 inst = std::bit_cast<be_t<u32>>(result.data[i]);
if (spu_opcode_t{inst}.ra == MFC_RdAtomicStat && g_spu_itype.decode(inst) == spu_itype::RDCH)
{
count++;
if (count == 2)
{
likely_putllc_loop = true;
break;
}
}
}
usz target_count = 0;
for (auto& [pc, loc] : m_targets)
{
target_count += loc.size();
}
const bool should_search_patterns = target_count < 300u;
// Treat start of function as an unknown value with tag (because it is)
const reg_state_t start_program_count = reg_state_t::make_unknown();
// Initialize
reg_state_it.emplace_back(entry_point);
run_on_block[entry_point / 4] = true;
enum spu_addr_mask_t : u32
{
SPU_LS_MASK_128 = (SPU_LS_SIZE - 1) & -128,
SPU_LS_MASK_16 = (SPU_LS_SIZE - 1) & -16,
SPU_LS_MASK_4 = (SPU_LS_SIZE - 1) & -4,
SPU_LS_MASK_1 = (SPU_LS_SIZE - 1),
};
u32 iterator_id_alloc = 0;
for (u32 wf = 0, wi = 0, wa = entry_point, bpc = wa; wf <= 1;)
{
const bool is_form_block = wf == 0;
const bool is_pattern_match = wf == 1;
dummy16.active = false;
if (!is_form_block && wa == bpc)
{
if (wi == 0)
{
for (auto& [addr, block] : infos)
{
// Evaluate state for all blocks
block->evaluate_start_state(infos);
}
}
if (!should_search_patterns)
{
break;
}
if (!infos[bpc])
{
std::string out = fmt::format("Blocks:");
for (auto& [pc, _] : infos)
{
if (!_) continue;
fmt::append(out, " [0x%x]", pc);
}
out += '\n';
for (auto& [pc, bb] : m_bbs)
{
if (!m_block_info[pc / 4])
{
continue;
}
out += fmt::format("\nTargets 0x%x:", pc);
for (auto addr : bb.targets)
{
fmt::append(out, " [0x%x]", addr);
}
}
spu_log.fatal("%s", out);
}
true_state_walkby = &ensure(infos[bpc])->evaluate_start_state(infos);
for (reg_state_t& f : *true_state_walkby)
{
if (f.flag & vf::is_null)
{
// Evaluate locally
//f.tag = reg_state_t::alloc_tag();
f.flag -= vf::is_null;
}
}
}
auto& vregs = is_form_block ? infos[bpc]->local_state : *true_state_walkby;
auto& atomic16 = is_pattern_match ? ::at32(reg_state_it, wi).atomic16 : dummy16;
const u32 pos = wa;
wa += 4;
const auto break_putllc16 = [&](u32 cause, atomic16_t previous)
{
if (previous.active && likely_putllc_loop && getllar_starts.contains(previous.lsa_pc))
{
const bool is_first = !std::exchange(getllar_starts[previous.lsa_pc], true);
if (!is_first)
{
return;
}
had_putllc_evaluation = true;
g_fxo->get<putllc16_statistics_t>().breaking_reason[cause]++;
if (!spu_log.notice)
{
return;
}
std::string break_error = fmt::format("PUTLLC pattern breakage [%x mem=%d lsa_const=%d cause=%u] (lsa_pc=0x%x)", pos, previous.mem_count, u32{!previous.ls_offs.is_const()} * 2 + previous.lsa.is_const(), cause, previous.lsa_pc);
const auto values = g_fxo->get<putllc16_statistics_t>().get_reasons();
std::string tracing = "Top Breaking Reasons:";
usz i = 0;
usz fail_count = 0;
bool switched_to_minimal = false;
for (auto it = values.begin(); it != values.end(); i++, it++)
{
fail_count += it->second;
if (i >= 12)
{
continue;
}
if (i < 8 && it->second > 1)
{
fmt::append(tracing, " [cause=%u, n=%d]", it->first, it->second);
}
else
{
if (!std::exchange(switched_to_minimal, true))
{
fmt::append(tracing, "; More:");
}
fmt::append(tracing, " %u", it->first);
}
}
fmt::append(tracing, " of %d failures", fail_count);
spu_log.notice("%s\n%s", break_error, tracing);
if (cause == 17 || cause == 20)
{
}
}
};
const auto calculate_absolute_ls_difference = [](u32 addr1, u32 addr2)
{
addr1 &= SPU_LS_MASK_1;
addr2 &= SPU_LS_MASK_1;
const u32 abs_diff = (addr1 >= addr2 ? addr1 - addr2 : addr2 - addr1);
// Because memory wraps around, take the smaller of the two gaps
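// e.g. addresses 0x3ff00 and 0x00100 are 0x200 apart, not 0x3fe00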
return abs_diff >= SPU_LS_SIZE / 2 ? SPU_LS_SIZE - abs_diff : abs_diff;
};
bool called_next = false;
const auto next_block = [&]()
{
if (called_next)
{
// Guard against multiple calls to next_block()
return;
}
called_next = true;
if (wf == 0)
{
wi++;
auto& block = infos[bpc];
if (g_cfg.core.spu_block_size != spu_block_size_type::safe && (m_ret_info[bpc / 4] || m_entry_info[bpc / 4] || pos == entry_point))
{
// Do not allow value passthrough
for (reg_state_t& f : block->start_reg_state)
{
f.flag -= vf::is_null;
}
for (reg_state_t& f : block->local_state)
{
f.flag -= vf::is_null;
}
// Block has an external origin, discard all previous information
block->end_reg_state = block->local_state;
block->has_true_state = true;
}
block->addend_reg_state = block->local_state;
}
else
{
std::vector<u32> to_pop;
usz stackframe_it = wi;
u32 stackframe_pc = SPU_LS_SIZE;
usz entry_index = umax;
auto get_block_targets = [&](u32 pc) -> std::basic_string_view<u32>
{
if (m_block_info[pc / 4] && m_bbs.count(pc))
{
return m_bbs.at(pc).targets;
}
return {};
};
u32 target_pc = SPU_LS_SIZE;
bool insert_entry = false;
bool is_code_backdoor = false;
while (true)
{
const auto state_it = &reg_state_it[stackframe_it];
stackframe_pc = state_it->pc;
entry_index = state_it->parent_target_index;
const auto targets = get_block_targets(stackframe_pc);
const usz target_size = targets.size();
while (entry_index < target_size && (targets[entry_index] < lsa || targets[entry_index] >= limit || !m_block_info[targets[entry_index] / 4]))
{
state_it->parent_target_index++;
entry_index = state_it->parent_target_index;
}
if (entry_index == target_size)
{
const usz parent_index = state_it->parent_iterator_index;
to_pop.emplace_back(stackframe_it);
if (parent_index != umax)
{
stackframe_it = parent_index;
}
else
{
// Final
wi = 0;
break;
}
}
else
{
target_pc = ::at32(targets, entry_index);
usz occurence_count = 0;
std::array<usz, 16> duplicate_positions;
// Virtual concept (there is really no such thing as a loop connector at the compiled-code level)
// But it helps to simplify this process
bool is_loop_connector = false;
bool is_too_extensive = false;
bool is_skipable = false;
// Hack to avoid exhaustively analysing every possible code path:
// Allow only a limited number of occurrences of the upper-most block
// because loop "connectors" are usually backward in direction
// The proper solution would be to add a precursory function analysis stage which identifies all loop "connectors" and allows duplicates based on it
for (usz i = stackframe_it, count = 0;; count++)
{
auto& entry = ::at32(reg_state_it, i);
const u32 entry_pc = entry.pc;
if (count == (state_it->atomic16.active ? 40 : 12))
{
if (state_it->atomic16.active && !std::exchange(logged_block[target_pc / 4], true))
{
spu_log.notice("SPU Blcok Analysis is too extensive at 0x%x", entry_pc);
}
is_too_extensive = true;
break;
}
if (entry_pc == target_pc)
{
duplicate_positions[occurence_count++] = i;
if (occurence_count == duplicate_positions.size())
{
is_loop_connector = true;
break;
}
}
const usz parent_idx = entry.parent_iterator_index;
if (parent_idx == umax)
{
break;
}
ensure(i != parent_idx);
// Fill info for later
auto& parent = ::at32(reg_state_it, parent_idx);
parent.temp_child_index = i;
parent.temp_list_index = count;
i = parent_idx;
}
// Scan the code for "code flow" repetitions (entire sequences of blocks equal to each other)
// If found, this is 100% a loop; should it start a third time, ignore it
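// e.g. a traversal whose block pc sequence ends in A,B,C,A,B,C is treated as a
// repeated "code flow" here and skipped instead of being unrolled a third time.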
if (occurence_count >= 2)
{
for (usz it_begin = 0; !is_skipable && it_begin < occurence_count - 1; it_begin++)
{
const usz block_start = duplicate_positions[it_begin + 1];
for (usz it_tail = 0; it_tail < it_begin + 1; it_tail++)
{
const usz block_tail = duplicate_positions[it_begin - it_tail];
// Check if the distance from the end is precisely two times larger (rol64 by one bit doubles the value here)
if (reg_state_it.size() - block_start != utils::rol64(reg_state_it.size() - block_tail, 1))
{
continue;
}
bool is_equal = true;
for (usz j = 1; j < reg_state_it.size() - block_tail; j++)
{
if (reg_state_it[block_start + j].pc != reg_state_it[block_tail + j].pc)
{
is_equal = false;
break;
}
}
if (is_equal)
{
is_skipable = true;
break;
}
}
}
}
if (is_skipable)
{
if (!std::exchange(logged_block[target_pc / 4], true))
{
spu_log.notice("SPU block is a loop at [0x%05x -> 0x%05x]", state_it->pc, target_pc);
}
state_it->parent_target_index++;
continue;
}
if (is_loop_connector && !std::exchange(logged_block[target_pc / 4], true))
{
spu_log.notice("SPU block analysis is too repetitive at [0x%05x -> 0x%05x]", state_it->pc, target_pc);
}
insert_entry = true;
// Test if the code is an opening to external code (start of the function is always respected because it is already assumed to have no origin)
is_code_backdoor = m_ret_info[target_pc / 4] || (m_entry_info[target_pc / 4] && target_pc != entry_point);
if (run_on_block[target_pc / 4])
{
insert_entry = false;
}
else if (is_code_backdoor || is_too_extensive || is_loop_connector)
{
if (reg_state_it[stackframe_it].atomic16.active)
{
break_putllc16(40, reg_state_it[stackframe_it].atomic16.discard());
}
// Allow the block to run only once, to avoid unnecessary iterations
run_on_block[target_pc / 4] = true;
}
state_it->parent_target_index++;
if (!insert_entry)
{
continue;
}
break;
}
}
// Backup analyser information
const auto atomic16_info = reg_state_it[stackframe_it].atomic16;
// Cleaning from the back is possible because it does not affect old indices
// Technically, a full cleanup should always happen at the moment
// TODO: Proper cleanup while keeping old indices valid
for (u32 it : to_pop)
{
if (it == reg_state_it.size() - 1)
{
reg_state_it.pop_back();
}
else
{
// Should not be reachable at the moment
//ensure(false);
spu_log.error("Failed to clean block analyis steps at block_id %d", reg_state_it[it].iterator_id);
}
}
if (insert_entry)
{
const usz target_size = get_block_targets(stackframe_pc).size();
spu_log.trace("Emplacing: block_id=%d, pc=0x%x, target_it=%d/%d, new_pc=0x%x (has_it=%d)", reg_state_it[stackframe_it].iterator_id, stackframe_pc, entry_index + 1, target_size, target_pc, atomic16_info.active);
auto& next = reg_state_it.emplace_back(target_pc, stackframe_it, 0);
if (!is_code_backdoor)
{
// Restore analyser information (if not an entry)
next.atomic16 = atomic16_info;
}
else if (atomic16_info.active)
{
break_putllc16(39, atomic16_info);
}
next.iterator_id = iterator_id_alloc++;
wi = static_cast<u32>(stackframe_it + 1);
ensure(stackframe_it + 1 == reg_state_it.size() - 1);
}
}
if (wi >= reg_state_it.size())
{
wf++;
wi = 0;
run_on_block.clear();
if (wf == 1)
{
reg_state_it.clear();
if (!infos.empty())
{
reg_state_it.emplace_back(::at32(infos, entry_point)->pc).iterator_id = iterator_id_alloc++;
}
}
}
if (wi < reg_state_it.size())
{
wa = ::at32(reg_state_it, wi).pc;
bpc = wa;
}
};
const auto get_reg = [&](u32 reg) -> const reg_state_t&
{
return vregs[reg];
};
const auto move_reg = [&](u32 dst, u32 src)
{
if (dst == src || vregs[src] == vregs[dst])
{
return;
}
vregs[dst] = vregs[src];
// Register storage has changed
vregs[dst].flag -= vf::is_null;
};
const auto set_const_value = [&](u32 reg, u32 value)
{
vregs[reg] = reg_state_t::from_value(value);
};
const auto inherit_const_value = [&](u32 reg, bs_t<vf> flag, u32 value)
{
flag -= vf::is_null;
vregs[reg] = reg_state_t{flag, value, flag & vf::is_const ? u32{umax} : reg_state_t::alloc_tag()};
};
const auto inherit_const_mask_value = [&](u32 reg, reg_state_t state, u32 mask_ones, u32 mask_zeroes)
{
if ((mask_ones | mask_zeroes) == 0)
{
state.flag -= vf::is_null;
vregs[reg] = state;
return;
}
if (state.flag & vf::is_const)
{
vregs[reg] = reg_state_t::from_value((state.value | mask_ones) & ~mask_zeroes);
return;
}
const u32 ones = (state.known_ones | mask_ones) & ~mask_zeroes;
const u32 zeroes = (state.known_zeroes | mask_zeroes) & ~mask_ones;
if ((ones ^ zeroes) == umax)
{
// Special case: create a constant from full masks
vregs[reg] = reg_state_t::from_value(ones);
return;
}
ensure(state.tag != umax);
vregs[reg] = reg_state_t{vf::is_mask, 0, state.tag, ones, zeroes};
};
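// Worked example: merging mask_ones=0x0000ffff with mask_zeroes=0xffff0000 gives
// ones ^ zeroes == umax, so the state collapses into the constant 0x0000ffff even
// if the source register value itself was unknown.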
const auto unconst = [&](u32 reg)
{
vregs[reg] = {{}, {}, reg_state_t::alloc_tag()};
};
const auto add_block = [&](u32 target)
{
if (!is_form_block)
{
return;
}
// Validate new target (TODO)
if (target >= lsa && target < limit)
{
if (!infos[target])
{
infos[target] = block_reg_info::create(target);
}
block_reg_info::create_node(target, bpc, infos);
if (!run_on_block[target / 4])
{
reg_state_it.emplace_back(target).iterator_id = iterator_id_alloc++;
run_on_block[target / 4] = true;
}
}
};
if (pos < lsa || pos >= limit)
{
// Don't analyse if already beyond the limit
next_block();
continue;
}
if (bpc != pos && m_preds.count(pos))
{
// End of block reached
next_block();
continue;
}
if (g_cfg.core.spu_block_size != spu_block_size_type::safe && (m_ret_info[pos / 4] || m_entry_info[pos / 4] || pos == entry_point))
{
ensure(bpc == pos);
// Block has an external origin, discard all previous information
// TODO: Make the optimizations conditional at runtime instead
if (!is_form_block)
{
// Call for external code
break_putllc16(25, atomic16.discard());
}
}
const u32 data = std::bit_cast<be_t<u32>>(::at32(result.data, (pos - lsa) / 4));
const auto op = spu_opcode_t{data};
const auto type = g_spu_itype.decode(data);
// For debugging
if (false && likely_putllc_loop && is_pattern_match)
{
SPUDisAsm dis_asm(cpu_disasm_mode::dump, reinterpret_cast<const u8*>(result.data.data()), result.lower_bound);
dis_asm.disasm(pos);
std::string consts;
for (auto _use : std::initializer_list<std::pair<u32, bool>>{{op.ra, m_use_ra.test(pos / 4)}
, {op.rb, m_use_rb.test(pos / 4)}, {op.rc, m_use_rc.test(pos / 4)}})
{
if (!_use.second)
{
continue;
}
if (!consts.empty())
{
consts += ',';
}
const u32 reg_file = _use.first;
const auto& reg = get_reg(reg_file);
if (reg.is_const())
{
fmt::append(consts, " r%d=0x%x", reg_file, reg.value);
}
else
{
if (u32 mask = reg.known_zeroes | reg.known_ones)
{
fmt::append(consts, " r%d=#%d-&|0x%x", reg_file, reg.tag, mask);
}
else
{
fmt::append(consts, " r%d=#%d", reg_file, reg.tag);
}
}
}
if (!consts.empty())
{
consts = " {" + consts + " }";
}
if (dis_asm.last_opcode.ends_with('\n'))
{
dis_asm.last_opcode.pop_back();
}
spu_log.always()("[SPU=0%x, it=%d] %s%s [%d]", pos, reg_state_it[wi].iterator_id, dis_asm.last_opcode, consts, atomic16.active);
}
// Analyse instruction
switch (type)
{
case spu_itype::UNK:
case spu_itype::DFCEQ:
case spu_itype::DFCMEQ:
case spu_itype::DFCGT:
case spu_itype::DFCMGT:
case spu_itype::DFTSV:
{
// Stop before invalid instructions (TODO)
next_block();
continue;
}
case spu_itype::SYNC:
case spu_itype::STOP:
case spu_itype::STOPD:
{
if (data == 0)
{
// Stop before null data
next_block();
continue;
}
if (g_cfg.core.spu_block_size == spu_block_size_type::safe)
{
// Stop on special instructions (TODO)
next_block();
break;
}
if (type == spu_itype::SYNC)
{
// Remember
sync = true;
}
break;
}
case spu_itype::IRET:
case spu_itype::BI:
case spu_itype::BISL:
case spu_itype::BISLED:
case spu_itype::BIZ:
case spu_itype::BINZ:
case spu_itype::BIHZ:
case spu_itype::BIHNZ:
{
if (op.e || op.d)
{
break_putllc16(27, atomic16.discard());
}
break;
}
case spu_itype::BRSL:
case spu_itype::BRASL:
{
break;
}
case spu_itype::BRA:
{
break;
}
case spu_itype::BR:
case spu_itype::BRZ:
case spu_itype::BRNZ:
case spu_itype::BRHZ:
case spu_itype::BRHNZ:
{
break;
}
case spu_itype::DSYNC:
case spu_itype::HEQ:
case spu_itype::HEQI:
case spu_itype::HGT:
case spu_itype::HGTI:
case spu_itype::HLGT:
case spu_itype::HLGTI:
case spu_itype::LNOP:
case spu_itype::NOP:
case spu_itype::MTSPR:
case spu_itype::FSCRWR:
{
// Do nothing
break;
}
case spu_itype::WRCH:
{
switch (op.ra)
{
case MFC_EAL:
{
move_reg(s_reg_mfc_eal, op.rt);
break;
}
case MFC_LSA:
{
auto rt = get_reg(op.rt);
inherit_const_mask_value(s_reg_mfc_lsa, rt, 0, ~SPU_LS_MASK_1);
if (is_pattern_match)
{
atomic16.lsa_last_pc = pos;
}
break;
}
case MFC_TagID:
{
break;
}
case MFC_Size:
{
break;
}
case MFC_Cmd:
{
const auto [af, av, atagg, _3, _5] = get_reg(op.rt);
if (!is_pattern_match)
{
//
}
else if (af & vf::is_const)
{
switch (av)
{
case MFC_GETLLAR_CMD:
{
auto lsa = get_reg(s_reg_mfc_lsa);
inherit_const_mask_value(s_reg_mfc_lsa, lsa, 0, ~SPU_LS_MASK_128);
lsa = get_reg(s_reg_mfc_lsa);
const u32 lsa_pc = atomic16.lsa_last_pc == SPU_LS_SIZE ? bpc : atomic16.lsa_last_pc;
if (atomic16.active)
{
if (atomic16.lsa_pc != lsa_pc || atomic16.get_pc != pos || atomic16.lsa != lsa)
{
break_putllc16(30, atomic16.discard());
}
}
// If LSA write has not happened, use block start
atomic16.lsa_pc = lsa_pc;
atomic16.get_pc = pos;
atomic16.active = true;
atomic16.lsa = lsa;
if (likely_putllc_loop)
{
// Register loop entry
if (getllar_starts.emplace(atomic16.lsa_pc, false).second)
{
g_fxo->get<putllc16_statistics_t>().all++;
spu_log.notice("[0x%05x] GETLLAR pattern entry point", pos);
}
}
break;
}
case MFC_PUTLLC_CMD:
{
if (atomic16.active)
{
const auto _lsa = get_reg(s_reg_mfc_lsa);
// Search for the value of the LS address store/load in the latest register file
if (atomic16.ls_access && atomic16.ls_write && !atomic16.ls_pc_rel && !atomic16.ls.is_const())
{
usz reg_it = umax;
u32 regs[2]{s_reg_max, s_reg_max};
for (auto val : {&atomic16.ls, &atomic16.ls_offs})
{
reg_it++;
if (val->is_const())
{
regs[reg_it] = 0;
continue;
}
if (vregs[s_reg_mfc_lsa].compare_with_mask_indifference(*val, SPU_LS_MASK_1))
{
regs[reg_it] = s_reg_mfc_lsa;
continue;
}
for (u32 i = 0; i <= s_reg_127; i++)
{
const auto& _reg = vregs[i];
if (_reg == *val)
{
regs[reg_it] = i;
break;
}
}
}
if (regs[0] == s_reg_max || regs[1] == s_reg_max)
{
break_putllc16(3, atomic16.discard());
break;
}
atomic16.reg = regs[0];
if (!atomic16.ls_offs.is_const())
{
atomic16.reg2 = regs[1];
}
}
if (atomic16.ls_access && atomic16.ls_write && !atomic16.lsa.compare_with_mask_indifference(_lsa, SPU_LS_MASK_128))
{
// LSA latest value mismatches with the one written with GETLLAR
if (atomic16.lsa.flag != _lsa.flag)
{
break_putllc16(1, atomic16.discard());
}
else
{
break_putllc16(2, atomic16.discard());
}
break;
}
if (atomic16.ls_access && atomic16.ls_write)
{
atomic16.select_16_or_0_at_runtime = false;
bool ok = false;
if (atomic16.ls_pc_rel || !atomic16.ls_offs.is_const())
{
//
}
else if (atomic16.lsa.is_const())
{
if (atomic16.ls.is_const())
{
if (atomic16.ls_offs.value != 0)
{
// Rebase constant so we can get rid of ls_offs
atomic16.ls.value = spu_ls_target(atomic16.ls_offs.value + atomic16.ls.value);
atomic16.ls_offs = reg_state_t::from_value(0);
}
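// e.g. ls=0x1000 with ls_offs=0x20 folds into ls=0x1020, ls_offs=0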
if (atomic16.ls.compare_with_mask_indifference(atomic16.lsa, SPU_LS_MASK_128))
{
ok = true;
}
}
else if (atomic16.ls_offs.compare_with_mask_indifference(atomic16.lsa, SPU_LS_MASK_128) && atomic16.ls.is_less_than(128 - (atomic16.ls_offs.value & 127)))
{
// Relative memory access with offset less than 128 bytes
// Common around SPU utilities which have less strict restrictions about memory alignment
ok = true;
}
}
else if (atomic16.lsa.compare_with_mask_indifference(atomic16.ls, SPU_LS_MASK_128) && atomic16.ls_offs == 0)
{
// Unknown value with known offset of less than 128 bytes
ok = true;
}
if (!ok)
{
// This is quite common... let's try to select between putllc16 and putllc0 at runtime!
// break_putllc16(100);
// atomic16.discard();
// break;
atomic16.select_16_or_0_at_runtime = true;
}
}
if (!atomic16.get_rdatomic)
{
// MFC_RdAtomicStat must have been read, otherwise GETLLAR may not be executed (according to HW tests)
break_putllc16(21, atomic16.discard());
}
atomic16.put_pc = pos;
atomic16.put_active = true;
}
break;
}
default:
{
break_putllc16(4, atomic16.discard());
break;
}
}
}
else
{
break_putllc16(5, atomic16.discard());
}
if (!atomic16.active)
{
// Propagate failure
for (auto& atm : atomic16_all)
{
if (atm.second.active && atm.second.put_pc == pos)
{
break_putllc16(31, atm.second.discard());
}
}
}
break;
}
case MFC_EAH:
case SPU_WrDec:
case SPU_WrSRR0:
case SPU_WrEventAck:
case SPU_Set_Bkmk_Tag:
case SPU_PM_Start_Ev:
case SPU_PM_Stop_Ev:
case MFC_WrTagMask:
//case MFC_WrTagUpdate: // Technically correct to ignore but risky
break;
default:
{
break_putllc16(6, atomic16.discard());
break;
}
}
break;
}
case spu_itype::RCHCNT:
case spu_itype::RDCH:
{
const bool is_read = type == spu_itype::RDCH;
bool invalidate = is_read;
switch (op.ra)
{
case MFC_RdAtomicStat:
{
if (!is_read)
{
break;
}
if (atomic16.active)
{
if (atomic16.put_active)
{
if (getllar_starts.contains(atomic16.lsa_pc) && getllar_starts[atomic16.lsa_pc])
{
break_putllc16(24, atomic16.discard());
break;
}
const auto it = atomic16_all.find(pos);
if (it == atomic16_all.end())
{
// Fresh new pattern detected in a single code path
atomic16_all.emplace(pos, atomic16);
}
else if (it->second.active)
{
// Merge pattern attributes between different code paths, may cause detection of failures
atomic16_t& existing = it->second;
if (existing.lsa_pc != atomic16.lsa_pc || existing.put_pc != atomic16.put_pc || existing.lsa != atomic16.lsa)
{
// Registered twice
break_putllc16(22, atomic16.discard());
break_putllc16(22, existing.discard());
}
if (existing.active && existing.ls_access && atomic16.ls_access && (!existing.ls.compare_with_mask_indifference(atomic16.ls, SPU_LS_MASK_1) || existing.ls_offs != atomic16.ls_offs))
{
// Conflicting loads with stores in more than one code path
break_putllc16(27, atomic16.set_invalid_ls(existing.ls_write || atomic16.ls_write));
if (!atomic16.active)
{
existing.active = false;
}
}
if (existing.active && (existing.ls_write || atomic16.ls_write) && (existing.ls_invalid || atomic16.ls_invalid))
{
// Conflicting loads with stores in more than one code path
break_putllc16(33, atomic16.discard());
existing.active = false;
existing.ls_invalid = true;
}
if (existing.active && !existing.ls_access && atomic16.ls_access)
{
// Propagate LS access
existing.ls = atomic16.ls;
existing.ls_offs = atomic16.ls_offs;
}
existing.ls_write |= atomic16.ls_write;
existing.ls_invalid |= atomic16.ls_invalid;
existing.ls_access |= atomic16.ls_access;
existing.mem_count = std::max<u32>(existing.mem_count, atomic16.mem_count);
existing.select_16_or_0_at_runtime |= atomic16.select_16_or_0_at_runtime;
}
atomic16.discard();
}
else if (!atomic16.get_rdatomic)
{
atomic16.get_rdatomic = true;
// Go above and beyond and also set the constant for it
set_const_value(op.rt, MFC_GETLLAR_SUCCESS);
invalidate = false;
}
}
break;
}
// Let's be safe here and not allow multi-threaded communications
case SPU_WrOutMbox:
case SPU_WrOutIntrMbox:
case SPU_RdSigNotify1:
case SPU_RdSigNotify2:
case SPU_RdInMbox:
//case SPU_RdEventStat:
{
if (is_read)
{
break_putllc16(28, atomic16.discard());
}
else
{
break_putllc16(29, atomic16.discard());
}
break;
}
default:
{
break;
}
}
if (invalidate)
{
unconst(op.rt);
}
break;
}
case spu_itype::STQR:
case spu_itype::LQR:
{
const bool is_store = type == spu_itype::STQR;
if (atomic16.active)
{
atomic16.mem_count++;
// Do not clear the lower 16 bytes of addressing because the program can move on a 4-byte basis
const u32 offs = spu_branch_target(pos - result.lower_bound, op.si16);
if (atomic16.lsa.is_const() && [&]()
{
bool hack = false;
if (offs % 16 == 0 && (pos - result.lower_bound + op.si16 * 4) == offs)
{
const u32 reservation_bound = (atomic16.lsa.value | 127);
const u32 min_offs = offs;
// Hack: assume there is no overflow in relative instruction offset
// Thus, use instruction position + offset as a lower bound for reservation access
if (min_offs > reservation_bound)
{
spu_log.success("STQR/LQR Atomic Loop Hack: abs_pos=0x%x, abs=0x%x, i16*4=0x%x, ls_bound=0x%x", offs, pos + op.si16 * 4, op.si16 * 4, reservation_bound);
hack = true;
}
}
return hack;
}())
{
// Ignore memory access in this case
}
else if (atomic16.ls_invalid && is_store)
{
break_putllc16(35, atomic16.set_invalid_ls(is_store));
}
else if (atomic16.ls_access && atomic16.ls != start_program_count)
{
break_putllc16(7, atomic16.set_invalid_ls(is_store));
}
else if (atomic16.ls_access && offs != atomic16.ls_offs)
{
if (atomic16.ls_offs.compare_with_mask_indifference(offs, SPU_LS_MASK_1))
{
atomic16.ls_write |= is_store;
}
else
{
// Sad
break_putllc16(8, atomic16.set_invalid_ls(is_store));
}
}
else
{
atomic16.ls = start_program_count;
atomic16.ls_offs = reg_state_t::from_value(offs);
atomic16.ls_pc_rel = true;
atomic16.ls_write |= is_store;
atomic16.ls_access = true;
}
// atomic16.ls_reg[offs % 128 / 16] = start_program_count;
// atomic16.ls_offs[offs % 128 / 16] = offs;
}
if (is_store)
{
break;
}
// Unconst
unconst(op.rt);
break;
}
case spu_itype::STQX:
case spu_itype::LQX:
{
const bool is_store = type == spu_itype::STQX;
if (atomic16.active)
{
atomic16.mem_count++;
auto ra = get_reg(op.ra);
ra.value &= SPU_LS_MASK_1;
auto rb = get_reg(op.rb);
rb.value &= SPU_LS_MASK_1;
const u32 const_flags = u32{ra.is_const()} + u32{rb.is_const()};
switch (const_flags)
{
case 2:
{
auto add_res = ra;
add_res.value += rb.value;
add_res.value &= SPU_LS_MASK_16;
add_res.tag = umax;
if (atomic16.lsa.unequal_with_mask_indifference(add_res, SPU_LS_MASK_128))
{
// Unrelated, ignore
}
else if (atomic16.ls_invalid && is_store)
{
break_putllc16(20, atomic16.set_invalid_ls(is_store));
}
else if (atomic16.ls_access && add_res != atomic16.ls)
{
if (atomic16.ls.unequal_with_mask_indifference(add_res, SPU_LS_MASK_128) && atomic16.ls_offs == 0)
{
// Ok
}
else if (atomic16.ls_pc_rel)
{
break_putllc16(8, atomic16.set_invalid_ls(is_store));
}
else
{
// Sad
break_putllc16(9, atomic16.set_invalid_ls(is_store));
}
}
else
{
atomic16.ls = reg_state_t::from_value(add_res.value);
atomic16.ls_offs = reg_state_t::from_value(0);
atomic16.ls_pc_rel = false;
atomic16.ls_write |= is_store;
atomic16.ls_access = true;
}
break;
}
case 1:
{
const auto& state = ra.is_const() ? rb : ra;
const auto& _lsa = atomic16.lsa;
const u32 offs = (ra.is_const() ? ra.value : rb.value) & SPU_LS_MASK_1;
const u32 abs_diff = calculate_absolute_ls_difference(offs, 0);
if ((_lsa.unequal_with_mask_indifference(state, SPU_LS_MASK_128) && offs == 0) ||
(_lsa.compare_with_mask_indifference(state, SPU_LS_MASK_1) && abs_diff >= 128u) ||
(_lsa.compare_with_mask_indifference(state, SPU_LS_MASK_128) && abs_diff >= 256u)
)
{
// We already know it's an unrelated load/store
// The reason for SPU_LS_SIZE - 128 check is that in case LSA is not aligned, it detects the possible wraparound
}
else if (atomic16.ls_invalid && is_store)
{
break_putllc16(23, atomic16.set_invalid_ls(is_store));
}
else if (atomic16.ls_access && atomic16.ls != state)
{
if (atomic16.ls.unequal_with_mask_indifference(state, SPU_LS_MASK_128) && offs == 0)
{
// Ok
}
else if (atomic16.ls_pc_rel)
{
break_putllc16(36, atomic16.set_invalid_ls(is_store));
}
else
{
// Sad
break_putllc16(11, atomic16.set_invalid_ls(is_store));
}
}
else if (atomic16.ls_access)
{
ensure(!atomic16.ls.is_const());
if (atomic16.ls_offs.compare_with_mask_indifference(offs, SPU_LS_MASK_1))
{
// Ok
atomic16.ls_write |= is_store;
}
else if (atomic16.ls_offs.is_const() && atomic16.ls_offs.value / 16 == offs / 16 && state.get_known_zeroes() % 16 >= std::max<u32>(offs % 16, atomic16.ls_offs.value % 16))
{
// For special case observed in games (offset cannot cause the address to roll over the next 16 bytes)
atomic16.ls_write |= is_store;
}
else
{
break_putllc16(12, atomic16.set_invalid_ls(is_store));
}
}
else
{
atomic16.ls = state;
atomic16.ls_offs = reg_state_t::from_value(offs);
atomic16.ls_pc_rel = false;
atomic16.ls_write |= is_store;
atomic16.ls_access = true;
}
break;
}
case 0:
{
const bool is_ra_first = atomic16.ls_access ? ra == atomic16.ls : op.ra <= op.rb;
const auto& state1 = is_ra_first ? ra : rb;
const auto& state2 = is_ra_first ? rb : ra;
if (atomic16.ls_access && (atomic16.ls != state1 || atomic16.ls_offs != state2))
{
if (atomic16.ls_pc_rel)
{
break_putllc16(32, atomic16.set_invalid_ls(is_store));
}
else
{
// Sad
break_putllc16(13, atomic16.set_invalid_ls(is_store));
}
}
else
{
atomic16.ls = state1;
atomic16.ls_offs = state2;
atomic16.ls_pc_rel = false;
atomic16.ls_write |= is_store;
atomic16.ls_access = true;
}
break;
}
default: fmt::throw_exception("Unreachable!");
}
}
if (is_store)
{
break;
}
// Unconst
unconst(op.rt);
break;
}
case spu_itype::STQA:
case spu_itype::LQA:
{
const bool is_store = type == spu_itype::STQA;
if (atomic16.active)
{
atomic16.mem_count++;
const reg_state_t ca = reg_state_t::from_value(spu_ls_target(0, op.i16));
if (atomic16.lsa.unequal_with_mask_indifference(ca, SPU_LS_MASK_128))
{
// We already know it's an unrelated load/store
}
else if (atomic16.ls_invalid && is_store)
{
break_putllc16(37, atomic16.set_invalid_ls(is_store));
}
else if (atomic16.ls_access && ca != atomic16.ls)
{
if (atomic16.ls.unequal_with_mask_indifference(ca, SPU_LS_MASK_128) && atomic16.ls_offs == 0)
{
// Ok
}
else if (atomic16.ls_pc_rel)
{
break_putllc16(14, atomic16.set_invalid_ls(is_store));
}
else
{
// Sad
break_putllc16(15, atomic16.set_invalid_ls(is_store));
}
}
else
{
atomic16.ls = ca;
atomic16.ls_offs = reg_state_t::from_value(0);
atomic16.ls_pc_rel = false;
atomic16.ls_write |= is_store;
atomic16.ls_access = true;
}
}
if (is_store)
{
break;
}
// Unconst
unconst(op.rt);
break;
}
case spu_itype::STQD:
case spu_itype::LQD:
{
const bool is_store = type == spu_itype::STQD;
if (atomic16.active)
{
atomic16.mem_count++;
auto ra = get_reg(op.ra);
const auto& _lsa = atomic16.lsa;
ra.value = ra.is_const() ? spu_ls_target(ra.value, op.si10 * 4) : 0;
const u32 offs = ra.is_const() ? 0 : spu_ls_target(0, op.si10 * 4);
const u32 abs_diff = calculate_absolute_ls_difference(offs, 0);
if ((_lsa.unequal_with_mask_indifference(ra, SPU_LS_MASK_128) && offs == 0) ||
(_lsa.compare_with_mask_indifference(ra, SPU_LS_MASK_1) && abs_diff >= 128u) ||
(_lsa.compare_with_mask_indifference(ra, SPU_LS_MASK_128) && abs_diff >= 256u)
)
{
// We already know it's an unrelated load/store
// The reason for SPU_LS_SIZE - 128 check is that in case LSA is not aligned, it detects the possible wraparound
}
else if (atomic16.ls_invalid && is_store)
{
break_putllc16(34, atomic16.set_invalid_ls(is_store));
}
else if (atomic16.ls_access && atomic16.ls != ra)
{
if (atomic16.ls.unequal_with_mask_indifference(ra, SPU_LS_MASK_128) && (offs == 0 && atomic16.ls_offs == 0))
{
// Ok
}
else if (atomic16.ls_pc_rel)
{
break_putllc16(16, atomic16.set_invalid_ls(is_store));
}
else
{
// Sad
break_putllc16(17, atomic16.set_invalid_ls(is_store));
}
}
else if (atomic16.ls_access)
{
if (atomic16.ls_offs.compare_with_mask_indifference(offs, SPU_LS_MASK_1))
{
atomic16.ls_write |= is_store;
}
else if (atomic16.ls_offs.is_const() && atomic16.ls_offs.value / 16 == offs / 16 && ra.get_known_zeroes() % 16 >= std::max<u32>(offs % 16, atomic16.ls_offs.value % 16))
{
// For special case observed in games (offset cannot cause the address to roll over the next 16 bytes)
atomic16.ls_write |= is_store;
}
else
{
break_putllc16(18, atomic16.set_invalid_ls(is_store));
}
}
else
{
atomic16.ls = ra;
atomic16.ls_offs = reg_state_t::from_value(offs);
atomic16.ls_pc_rel = false;
atomic16.ls_write |= is_store;
atomic16.ls_access = true;
}
}
if (type == spu_itype::STQD)
{
break;
}
// Unconst
unconst(op.rt);
break;
}
case spu_itype::HBR:
{
hbr_loc = spu_branch_target(pos, op.roh << 7 | op.rt);
const auto [af, av, at, ao, az] = get_reg(op.ra);
hbr_tg = af & vf::is_const && !op.c ? av & 0x3fffc : -1;
break;
}
case spu_itype::HBRA:
{
hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt);
hbr_tg = spu_branch_target(0x0, op.i16);
break;
}
case spu_itype::HBRR:
{
hbr_loc = spu_branch_target(pos, op.r0h << 7 | op.rt);
hbr_tg = spu_branch_target(pos, op.i16);
break;
}
case spu_itype::IL:
{
set_const_value(op.rt, op.si16);
break;
}
case spu_itype::ILA:
{
set_const_value(op.rt, op.i18);
break;
}
case spu_itype::ILH:
{
set_const_value(op.rt, op.i16 << 16 | op.i16);
break;
}
case spu_itype::ILHU:
{
set_const_value(op.rt, op.i16 << 16);
break;
}
case spu_itype::IOHL:
{
const auto rt = get_reg(op.rt);
inherit_const_mask_value(op.rt, rt, op.i16, 0);
break;
}
case spu_itype::ORI:
{
if (!op.si10)
{
move_reg(op.rt, op.ra);
break;
}
const auto ra = get_reg(op.ra);
inherit_const_mask_value(op.rt, ra, op.si10, 0);
break;
}
case spu_itype::OR:
{
if (op.ra == op.rb)
{
move_reg(op.rt, op.ra);
break;
}
const auto [af, av, at, ao, az] = get_reg(op.ra);
const auto [bf, bv, _2, _4, _6] = get_reg(op.rb);
inherit_const_value(op.rt, af & bf, bv | av);
break;
}
case spu_itype::XORI:
{
if (!op.si10)
{
move_reg(op.rt, op.ra);
break;
}
const auto [af, av, at, ao, az] = get_reg(op.ra);
inherit_const_value(op.rt, af, av ^ op.si10);
break;
}
case spu_itype::XOR:
{
if (op.ra == op.rb)
{
set_const_value(op.rt, 0);
break;
}
const auto [af, av, at, ao, az] = get_reg(op.ra);
const auto [bf, bv, _2, _4, _6] = get_reg(op.rb);
inherit_const_value(op.rt, af & bf, bv ^ av);
break;
}
case spu_itype::NOR:
{
const auto [af, av, at, ao, az] = get_reg(op.ra);
const auto [bf, bv, _2, _4, _6] = get_reg(op.rb);
inherit_const_value(op.rt, af & bf, ~(bv | av));
break;
}
case spu_itype::ANDI:
{
const auto ra = get_reg(op.ra);
inherit_const_mask_value(op.rt, ra, 0, ~op.si10);
break;
}
case spu_itype::AND:
{
if (op.ra == op.rb)
{
move_reg(op.rt, op.ra);
break;
}
const auto [af, av, at, ao, az] = get_reg(op.ra);
const auto [bf, bv, _2, _4, _6] = get_reg(op.rb);
inherit_const_value(op.rt, af & bf, bv & av);
break;
}
case spu_itype::AI:
{
if (!op.si10)
{
move_reg(op.rt, op.ra);
break;
}
const auto ra = get_reg(op.ra);
const auto [af, av, at, ao, az] = ra;
inherit_const_value(op.rt, af, av + op.si10);
if (u32 mask = ra.get_known_zeroes() & ~op.si10; mask & 1)
{
// Added zeroes are always zeroes which comes in handy later
inherit_const_mask_value(op.rt, vregs[op.rt], 0, (1u << std::countr_one(mask)) - 1);
}
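// e.g. adding si10=0x10 to a register whose low 4 bits are known zeroes keeps the
// low 4 bits zero: mask & 0xf == 0xf, so countr_one(mask) >= 4 and bits 0..3 get
// marked as known zeroes in the result.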
break;
}
case spu_itype::A:
{
const auto ra = get_reg(op.ra);
const auto rb = get_reg(op.rb);
const auto [af, av, at, ao, az] = ra;
const auto [bf, bv, bt, bo, bz] = rb;
inherit_const_value(op.rt, af & bf, bv + av);
if (u32 mask = ra.get_known_zeroes() & rb.get_known_zeroes(); mask & 1)
{
// Added zeroes are always zeroes which comes in handy later
inherit_const_mask_value(op.rt, vregs[op.rt], 0, (1u << std::countr_one(mask)) - 1);
}
break;
}
case spu_itype::SFI:
{
const auto [af, av, at, ao, az] = get_reg(op.ra);
inherit_const_value(op.rt, af, op.si10 - av);
break;
}
case spu_itype::SF:
{
const auto ra = get_reg(op.ra);
const auto rb = get_reg(op.rb);
const auto [af, av, at, ao, az] = ra;
const auto [bf, bv, bt, bo, bz] = rb;
inherit_const_value(op.rt, af & bf, bv - av);
if (u32 mask = ra.get_known_zeroes() & rb.get_known_zeroes(); mask & 1)
{
// Subtracted zeroes are always zeroes which comes in handy later
inherit_const_mask_value(op.rt, vregs[op.rt], 0, (1u << std::countr_one(mask)) - 1);
}
break;
}
case spu_itype::FSMBI:
{
const u32 mask = (op.i16 >> 12);
const u32 value = (mask & 1 ? 0xff : 0) |
(mask & 2 ? 0xff00 : 0) |
(mask & 4 ? 0xff0000 : 0) |
(mask & 8 ? 0xff000000u : 0);
set_const_value(op.rt, value);
break;
}
case spu_itype::ROTMI:
{
if ((0 - op.i7) & 0x20)
{
set_const_value(op.rt, 0);
break;
}
if (!op.i7)
{
move_reg(op.rt, op.ra);
break;
}
const auto [af, av, at, ao, az] = get_reg(op.ra);
inherit_const_value(op.rt, af, av >> ((0 - op.i7) & 0x1f));
break;
}
case spu_itype::SHLI:
{
if (op.i7 & 0x20)
{
set_const_value(op.rt, 0);
break;
}
if (!op.i7)
{
move_reg(op.rt, op.ra);
break;
}
const auto [af, av, at, ao, az] = get_reg(op.ra);
inherit_const_value(op.rt, af, av << (op.i7 & 0x1f));
break;
}
case spu_itype::SELB:
{
const auto ra = get_reg(op.ra);
const auto rb = get_reg(op.rb);
// Ignore RC, perform a value merge which also respect bitwise information
vregs[op.rt4] = ra.merge(rb);
break;
}
case spu_itype::SHLQBYI:
{
if (op.i7 & 0x10)
{
set_const_value(op.rt, 0);
break;
}
if (!op.i7)
{
move_reg(op.rt, op.ra);
break;
}
[[fallthrough]];
}
default:
{
// Make unknown value
if (!(type & spu_itype::zregmod))
{
const u32 op_rt = type & spu_itype::_quadrop ? +op.rt4 : +op.rt;
unconst(op_rt);
}
break;
}
}
if (m_targets.count(pos))
{
for (u32 next_target : ::at32(m_targets, pos))
{
add_block(next_target);
}
next_block();
}
}
std::string func_hash;
if (!result.data.empty())
{
sha1_context ctx;
u8 output[20]{};
sha1_starts(&ctx);
sha1_update(&ctx, reinterpret_cast<const u8*>(result.data.data()), result.data.size() * 4);
sha1_finish(&ctx, output);
fmt::append(func_hash, "%s", fmt::base57(output));
}
for (const auto& [pc_commited, pattern] : atomic16_all)
{
if (!pattern.active)
{
continue;
}
if (getllar_starts.contains(pattern.lsa_pc) && getllar_starts[pattern.lsa_pc])
{
continue;
}
auto& stats = g_fxo->get<putllc16_statistics_t>();
had_putllc_evaluation = true;
if (!pattern.ls_write)
{
spu_log.success("PUTLLC0 Pattern Detected! (put_pc=0x%x, %s) (putllc0=%d, putllc16+0=%d, all=%d)", pattern.put_pc, func_hash, ++stats.nowrite, ++stats.single, +stats.all);
add_pattern(false, inst_attr::putllc0, pattern.put_pc - lsa);
continue;
}
union putllc16_info
{
u32 data;
bf_t<u32, 30, 2> type;
bf_t<u32, 29, 1> runtime16_select;
bf_t<u32, 28, 1> no_notify;
bf_t<u32, 18, 8> reg;
bf_t<u32, 0, 18> off18;
bf_t<u32, 0, 8> reg2;
} value{};
enum : u32
{
v_const = 0,
v_relative = 1,
v_reg_offs = 2,
v_reg2 = 3,
};
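// Encoding sketch (hypothetical values, for illustration only): type=v_reg_offs (2),
// reg=5, off18=0x120 pack as (2 << 30) | (5 << 18) | 0x120 == 0x80140120.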
for (auto it = infos.lower_bound(utils::sub_saturate<u32>(pattern.put_pc, 512)); it != infos.end() && it->first < pattern.put_pc + 512; it++)
{
for (auto& state : it->second->end_reg_state)
{
if (state.is_const() && (state.value & -0x20) == (CELL_SYNC_ERROR_ALIGN & -0x20))
{
// Do not notify if it is a cellSync function
value.no_notify = 1;
spu_log.success("Detected cellSync function at 0x%x, disabling reservation notification.", pattern.put_pc);
break;
}
}
if (value.no_notify)
{
break;
}
}
value.runtime16_select = pattern.select_16_or_0_at_runtime;
value.reg = s_reg_max;
if (pattern.ls.is_const())
{
ensure(pattern.reg == s_reg_max && pattern.reg2 == s_reg_max && pattern.ls_offs.is_const(), "Unexpected register usage");
value.type = v_const;
value.off18 = pattern.ls.value & SPU_LS_MASK_1;
}
else if (pattern.ls == start_program_count)
{
ensure(pattern.ls_offs.is_const(), "Unexpected register2 usage");
value.type = v_relative;
value.off18 = pattern.ls_offs.value & SPU_LS_MASK_1;
}
else if (pattern.ls_offs.is_const())
{
ensure(pattern.reg != s_reg_max, "Not found register usage");
value.type = v_reg_offs;
value.reg = pattern.reg;
value.off18 = pattern.ls_offs.value;
}
else
{
ensure(pattern.reg != s_reg_max, "Not found register usage");
ensure(pattern.reg2 != s_reg_max, "Not found register2 usage");
value.type = v_reg2;
value.reg = pattern.reg;
value.reg2 = pattern.reg2;
}
add_pattern(false, inst_attr::putllc16, pattern.put_pc - result.entry_point, value.data);
spu_log.success("PUTLLC16 Pattern Detected! (mem_count=%d, put_pc=0x%x, pc_rel=%d, offset=0x%x, const=%u, two_regs=%d, reg=%u, runtime=%d, 0x%x-%s) (putllc0=%d, putllc16+0=%d, all=%d)"
, pattern.mem_count, pattern.put_pc, value.type == v_relative, value.off18, value.type == v_const, value.type == v_reg2, value.reg, value.runtime16_select, entry_point, func_hash, +stats.nowrite, ++stats.single, +stats.all);
}
if (likely_putllc_loop && !had_putllc_evaluation)
{
spu_log.notice("Likely missed PUTLLC16 patterns. (entry=0x%x)", entry_point);
}
if (result.data.empty())
{
// Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback
}
return result;
}
void spu_recompiler_base::dump(const spu_program& result, std::string& out)
{
SPUDisAsm dis_asm(cpu_disasm_mode::dump, reinterpret_cast<const u8*>(result.data.data()), result.lower_bound);
std::string hash;
if (!result.data.empty())
{
sha1_context ctx;
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, reinterpret_cast<const u8*>(result.data.data()), result.data.size() * 4);
sha1_finish(&ctx, output);
fmt::append(hash, "%s", fmt::base57(output));
}
else
{
hash = "N/A";
}
fmt::append(out, "========== SPU BLOCK 0x%05x (size %u, %s) ==========\n\n", result.entry_point, result.data.size(), hash);
for (auto& bb : m_bbs)
{
for (u32 pos = bb.first, end = bb.first + bb.second.size * 4; pos < end; pos += 4)
{
dis_asm.disasm(pos);
if (!dis_asm.last_opcode.ends_with('\n'))
{
dis_asm.last_opcode += '\n';
}
fmt::append(out, ">%s", dis_asm.last_opcode);
}
out += '\n';
if (m_block_info[bb.first / 4])
{
fmt::append(out, "A: [0x%05x] %s\n", bb.first, m_entry_info[bb.first / 4] ? (m_ret_info[bb.first / 4] ? "Chunk" : "Entry") : "Block");
fmt::append(out, "\tF: 0x%05x\n", bb.second.func);
for (u32 pred : bb.second.preds)
{
fmt::append(out, "\t<- 0x%05x\n", pred);
}
for (u32 target : bb.second.targets)
{
fmt::append(out, "\t-> 0x%05x%s\n", target, m_bbs.count(target) ? "" : " (null)");
}
}
else
{
fmt::append(out, "A: [0x%05x] ?\n", bb.first);
}
out += '\n';
}
for (auto& f : m_funcs)
{
fmt::append(out, "F: [0x%05x]%s\n", f.first, f.second.good ? " (good)" : " (bad)");
fmt::append(out, "\tN: 0x%05x\n", f.second.size * 4 + f.first);
for (u32 call : f.second.calls)
{
fmt::append(out, "\t>> 0x%05x%s\n", call, m_funcs.count(call) ? "" : " (null)");
}
}
out += '\n';
}
struct spu_llvm_worker
{
lf_queue<std::pair<u64, const spu_program*>> registered;
void operator()()
{
// SPU LLVM Recompiler instance
std::unique_ptr<spu_recompiler_base> compiler;
// Fake LS
std::vector<be_t<u32>> ls;
bool set_relax_flag = false;
for (auto slice = registered.pop_all();; [&]
{
if (slice)
{
slice.pop_front();
}
if (slice || thread_ctrl::state() == thread_state::aborting)
{
return;
}
if (set_relax_flag)
{
spu_thread::g_spu_work_count--;
set_relax_flag = false;
}
thread_ctrl::wait_on(utils::bless<atomic_t<u32>>(&registered)[1], 0);
slice = registered.pop_all();
}())
{
auto* prog = slice.get();
if (thread_ctrl::state() == thread_state::aborting)
{
break;
}
if (!prog)
{
continue;
}
if (!prog->second)
{
break;
}
if (!compiler)
{
// Postponed initialization
compiler = spu_recompiler_base::make_llvm_recompiler();
compiler->init();
ls.resize(SPU_LS_SIZE / sizeof(be_t<u32>));
}
if (!set_relax_flag)
{
spu_thread::g_spu_work_count++;
set_relax_flag = true;
}
const auto& func = *prog->second;
// Get data start
const u32 start = func.lower_bound;
const u32 size0 = ::size32(func.data);
// Initialize LS with function data only
for (u32 i = 0, pos = start; i < size0; i++, pos += 4)
{
ls[pos / 4] = std::bit_cast<be_t<u32>>(func.data[i]);
}
// Call analyser
spu_program func2 = compiler->analyse(ls.data(), func.entry_point);
if (func2 != func)
{
spu_log.error("[0x%05x] SPU Analyser failed, %u vs %u", func2.entry_point, func2.data.size(), size0);
}
else if (const auto target = compiler->compile(std::move(func2)))
{
// Redirect old function (TODO: patch in multiple places)
const s64 rel = reinterpret_cast<u64>(target) - prog->first - 5;
union
{
u8 bytes[8];
u64 result;
};
bytes[0] = 0xe9; // jmp rel32
std::memcpy(bytes + 1, &rel, 4);
bytes[5] = 0x90;
bytes[6] = 0x90;
bytes[7] = 0x90;
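// The 8 bytes now hold "jmp rel32" padded with three NOPs; a single release store
// is assumed to be enough here because the patch site was pre-filled with an
// 8-byte long NOP (emitted in spu_fast::compile) at an aligned allocation.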
atomic_storage<u64>::release(*reinterpret_cast<u64*>(prog->first), result);
}
else
{
spu_log.fatal("[0x%05x] Compilation failed.", func.entry_point);
break;
}
// Clear fake LS
std::memset(ls.data() + start / 4, 0, 4 * (size0 - 1));
}
if (set_relax_flag)
{
spu_thread::g_spu_work_count--;
set_relax_flag = false;
}
}
};
// SPU LLVM recompiler thread context
struct spu_llvm
{
// Workload
lf_queue<std::pair<const u64, spu_item*>> registered;
atomic_ptr<named_thread_group<spu_llvm_worker>> m_workers;
spu_llvm()
{
// Dependency
g_fxo->init<spu_cache>();
}
void operator()()
{
if (g_cfg.core.spu_decoder != spu_decoder_type::llvm)
{
return;
}
while (!registered && thread_ctrl::state() != thread_state::aborting)
{
// Wait for the first SPU block before launching any thread
thread_ctrl::wait_on(utils::bless<atomic_t<u32>>(&registered)[1], 0);
}
if (thread_ctrl::state() == thread_state::aborting)
{
return;
}
// To compile (hash -> item)
std::unordered_multimap<u64, spu_item*, value_hash<u64>> enqueued;
// Mini-profiler (hash -> number of occurrences)
std::unordered_map<u64, atomic_t<u64>, value_hash<u64>> samples;
// For synchronization with profiler thread
stx::init_mutex prof_mutex;
named_thread profiler("SPU LLVM Profiler"sv, [&]()
{
while (thread_ctrl::state() != thread_state::aborting)
{
{
// Lock if enabled
const auto lock = prof_mutex.access();
if (!lock)
{
// Wait when the profiler is disabled
prof_mutex.wait_for_initialized();
continue;
}
// Collect profiling samples
idm::select<named_thread<spu_thread>>([&](u32 /*id*/, spu_thread& spu)
{
const u64 name = atomic_storage<u64>::load(spu.block_hash);
if (auto state = +spu.state; !::is_paused(state) && !::is_stopped(state) && cpu_flag::wait - state)
{
const auto found = std::as_const(samples).find(name);
if (found != std::as_const(samples).end())
{
const_cast<atomic_t<u64>&>(found->second)++;
}
}
});
}
// Sleep for a short period if enabled
thread_ctrl::wait_for(20, false);
}
});
u32 worker_count = 1;
if (uint hc = utils::get_thread_count(); hc >= 12)
{
worker_count = hc - 12 + 3;
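// e.g. 16 hardware threads -> 7 workers, 24 -> 15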
}
else if (hc >= 6)
{
worker_count = 2;
}
u32 worker_index = 0;
u32 notify_compile_count = 0;
u32 compile_pending = 0;
std::vector<u8> notify_compile(worker_count);
m_workers = make_single<named_thread_group<spu_llvm_worker>>("SPUW.", worker_count);
auto workers_ptr = m_workers.load();
auto& workers = *workers_ptr;
while (thread_ctrl::state() != thread_state::aborting)
{
for (const auto& pair : registered.pop_all())
{
enqueued.emplace(pair);
// Interrupt and kick profiler thread
const auto lock = prof_mutex.init_always([&]{});
// Register new blocks to collect samples
samples.emplace(pair.first, 0);
}
if (enqueued.empty())
{
// Send pending notifications
if (notify_compile_count)
{
for (usz i = 0; i < worker_count; i++)
{
if (notify_compile[i])
{
(workers.begin() + i)->registered.notify();
}
}
}
// Interrupt profiler thread and put it to sleep
static_cast<void>(prof_mutex.reset());
thread_ctrl::wait_on(utils::bless<atomic_t<u32>>(&registered)[1], 0);
std::fill(notify_compile.begin(), notify_compile.end(), 0); // Reset notification flags
notify_compile_count = 0;
compile_pending = 0;
continue;
}
// Find the most used enqueued item
u64 sample_max = 0;
auto found_it = enqueued.begin();
for (auto it = enqueued.begin(), end = enqueued.end(); it != end; ++it)
{
const u64 cur = ::at32(std::as_const(samples), it->first);
if (cur > sample_max)
{
sample_max = cur;
found_it = it;
}
}
// Start compiling
const spu_program& func = found_it->second->data;
// Old function pointer (pre-recompiled)
const spu_function_t _old = found_it->second->compiled;
// Remove item from the queue
enqueued.erase(found_it);
// Prefer using an inactive thread
for (usz i = 0; i < worker_count && !!(workers.begin() + (worker_index % worker_count))->registered; i++)
{
worker_index++;
}
// Push the workload
const bool notify = (workers.begin() + (worker_index % worker_count))->registered.template push<false>(reinterpret_cast<u64>(_old), &func);
if (notify && !notify_compile[worker_index % worker_count])
{
notify_compile[worker_index % worker_count] = 1;
notify_compile_count++;
}
compile_pending++;
// Notify all before the queue runs out if there is considerable excess
// Optimization: if there are many workers, it acts sooner
// If there are only a few workers, it postpones notifications until there is some more workload
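// e.g. with worker_count=4: aligned_div(8, 3) + 2 == 5 pending jobs trigger the
// batch notification; the threshold is capped at 7 for larger worker counts.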
if (notify_compile_count && std::min<u32>(7, utils::aligned_div<u32>(worker_count * 2, 3) + 2) <= compile_pending)
{
for (usz i = 0; i < worker_count; i++)
{
if (notify_compile[i])
{
(workers.begin() + i)->registered.notify();
}
}
std::fill(notify_compile.begin(), notify_compile.end(), 0); // Reset notification flags
notify_compile_count = 0;
compile_pending = 0;
}
worker_index++;
}
static_cast<void>(prof_mutex.init_always([&]{ samples.clear(); }));
m_workers.reset();
for (u32 i = 0; i < worker_count; i++)
{
(workers.begin() + i)->operator=(thread_state::aborting);
}
}
spu_llvm& operator=(thread_state)
{
if (const auto workers = m_workers.load())
{
for (u32 i = 0; i < workers->size(); i++)
{
(workers->begin() + i)->operator=(thread_state::aborting);
}
}
return *this;
}
static constexpr auto thread_name = "SPU LLVM"sv;
};
using spu_llvm_thread = named_thread<spu_llvm>;
struct spu_fast : public spu_recompiler_base
{
virtual void init() override
{
if (!m_spurt)
{
m_spurt = &g_fxo->get<spu_runtime>();
}
}
virtual spu_function_t compile(spu_program&& _func) override
{
const auto add_loc = m_spurt->add_empty(std::move(_func));
if (!add_loc)
{
return nullptr;
}
if (add_loc->compiled)
{
return add_loc->compiled;
}
const spu_program& func = add_loc->data;
if (g_cfg.core.spu_debug && !add_loc->logged.exchange(1))
{
std::string log;
this->dump(func, log);
fs::write_file(m_spurt->get_cache_path() + "spu.log", fs::create + fs::write + fs::append, log);
}
// Allocate executable area with necessary size
const auto result = jit_runtime::alloc(22 + 1 + 9 + ::size32(func.data) * (16 + 16) + 36 + 47, 16);
if (!result)
{
return nullptr;
}
m_pos = func.lower_bound;
m_size = ::size32(func.data) * 4;
{
sha1_context ctx;
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data.data()), func.data.size() * 4);
sha1_finish(&ctx, output);
be_t<u64> hash_start;
std::memcpy(&hash_start, output, sizeof(hash_start));
m_hash_start = hash_start;
}
u8* raw = result;
// 8-byte instruction for patching (long NOP)
*raw++ = 0x0f;
*raw++ = 0x1f;
*raw++ = 0x84;
*raw++ = 0;
*raw++ = 0;
*raw++ = 0;
*raw++ = 0;
*raw++ = 0;
// mov rax, m_hash_start
*raw++ = 0x48;
*raw++ = 0xb8;
std::memcpy(raw, &m_hash_start, sizeof(m_hash_start));
raw += 8;
// Update block_hash: mov [r13 + spu_thread::m_block_hash], rax
*raw++ = 0x49;
*raw++ = 0x89;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::block_hash));
// Load PC: mov eax, [r13 + spu_thread::pc]
*raw++ = 0x41;
*raw++ = 0x8b;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
// Get LS address starting from PC: lea rcx, [rbp + rax]
*raw++ = 0x48;
*raw++ = 0x8d;
*raw++ = 0x4c;
*raw++ = 0x05;
*raw++ = 0x00;
// Verification (slow)
for (u32 i = 0; i < func.data.size(); i++)
{
if (!func.data[i])
{
continue;
}
// cmp dword ptr [rcx + off], opc
*raw++ = 0x81;
*raw++ = 0xb9;
const u32 off = i * 4;
const u32 opc = func.data[i];
std::memcpy(raw + 0, &off, 4);
std::memcpy(raw + 4, &opc, 4);
raw += 8;
// jne tr_dispatch
const s64 rel = reinterpret_cast<u64>(spu_runtime::tr_dispatch) - reinterpret_cast<u64>(raw) - 6;
*raw++ = 0x0f;
*raw++ = 0x85;
std::memcpy(raw + 0, &rel, 4);
raw += 4;
}
// trap
//*raw++ = 0xcc;
// Secondary prologue: sub rsp,0x28
*raw++ = 0x48;
*raw++ = 0x83;
*raw++ = 0xec;
*raw++ = 0x28;
// Fix args: xchg r13,rbp
*raw++ = 0x49;
*raw++ = 0x87;
*raw++ = 0xed;
// mov r12d, eax
*raw++ = 0x41;
*raw++ = 0x89;
*raw++ = 0xc4;
// mov esi, 0x7f0
*raw++ = 0xbe;
*raw++ = 0xf0;
*raw++ = 0x07;
*raw++ = 0x00;
*raw++ = 0x00;
// lea rdi, [rbp + spu_thread::gpr]
*raw++ = 0x48;
*raw++ = 0x8d;
*raw++ = 0x7d;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::gpr));
// Save base pc: mov [rbp + spu_thread::base_pc], eax
*raw++ = 0x89;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::base_pc));
// inc block_counter
*raw++ = 0x48;
*raw++ = 0xff;
*raw++ = 0x85;
const u32 blc_off = ::offset32(&spu_thread::block_counter);
std::memcpy(raw, &blc_off, 4);
raw += 4;
// lea r14, [local epilogue]
*raw++ = 0x4c;
*raw++ = 0x8d;
*raw++ = 0x35;
const u32 epi_off = ::size32(func.data) * 16;
std::memcpy(raw, &epi_off, 4);
raw += 4;
// Instructions (each instruction occupies a fixed number of bytes)
for (u32 i = 0; i < func.data.size(); i++)
{
const u32 pos = m_pos + i * 4;
if (!func.data[i])
{
// Save pc: mov [rbp + spu_thread::pc], r12d
*raw++ = 0x44;
*raw++ = 0x89;
*raw++ = 0x65;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
// Epilogue: add rsp,0x28
*raw++ = 0x48;
*raw++ = 0x83;
*raw++ = 0xc4;
*raw++ = 0x28;
// ret (TODO)
*raw++ = 0xc3;
std::memset(raw, 0xcc, 16 - 9);
raw += 16 - 9;
continue;
}
// Fix endianness
const spu_opcode_t op{std::bit_cast<be_t<u32>>(func.data[i])};
switch (auto type = g_spu_itype.decode(op.opcode))
{
case spu_itype::BRZ:
case spu_itype::BRHZ:
case spu_itype::BRNZ:
case spu_itype::BRHNZ:
{
const u32 target = spu_branch_target(pos, op.i16);
if (0 && target >= m_pos && target < m_pos + m_size)
{
*raw++ = type == spu_itype::BRHZ || type == spu_itype::BRHNZ ? 0x66 : 0x90;
*raw++ = 0x83;
*raw++ = 0xbd;
const u32 off = ::offset32(&spu_thread::gpr, op.rt) + 12;
std::memcpy(raw, &off, 4);
raw += 4;
*raw++ = 0x00;
*raw++ = 0x0f;
*raw++ = type == spu_itype::BRZ || type == spu_itype::BRHZ ? 0x84 : 0x85;
const u32 dif = (target - (pos + 4)) / 4 * 16 + 2;
std::memcpy(raw, &dif, 4);
raw += 4;
*raw++ = 0x66;
*raw++ = 0x90;
break;
}
[[fallthrough]];
}
default:
{
// Ballast: mov r15d, pos
*raw++ = 0x41;
*raw++ = 0xbf;
std::memcpy(raw, &pos, 4);
raw += 4;
// mov ebx, opc
*raw++ = 0xbb;
std::memcpy(raw, &op, 4);
raw += 4;
// call spu_* (specially built interpreter function)
const s64 rel = spu_runtime::g_interpreter_table[static_cast<usz>(type)] - reinterpret_cast<u64>(raw) - 5;
*raw++ = 0xe8;
std::memcpy(raw, &rel, 4);
raw += 4;
break;
}
}
}
// Local dispatcher/epilogue: fix stack after branch instruction, then dispatch or return
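// Rough dispatch math (a sketch of the code below): each SPU instruction maps to a
// 16-byte slot, so the handler for a pc inside this function lives at
// r14 + (pc - base_pc - size) * 4, where r14 points at the local epilogue placed
// right after size * 4 bytes of slots (m_size counts bytes of SPU code; 4 bytes of
// code become 16 bytes of slots).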
// add rsp, 8
*raw++ = 0x48;
*raw++ = 0x83;
*raw++ = 0xc4;
*raw++ = 0x08;
// and rsp, -16
*raw++ = 0x48;
*raw++ = 0x83;
*raw++ = 0xe4;
*raw++ = 0xf0;
// lea rax, [r12 - size]
*raw++ = 0x49;
*raw++ = 0x8d;
*raw++ = 0x84;
*raw++ = 0x24;
const u32 msz = 0u - m_size;
std::memcpy(raw, &msz, 4);
raw += 4;
// sub eax, [rbp + spu_thread::base_pc]
*raw++ = 0x2b;
*raw++ = 0x45;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::base_pc));
// cmp eax, (0 - size)
*raw++ = 0x3d;
std::memcpy(raw, &msz, 4);
raw += 4;
// jb epilogue
*raw++ = 0x72;
*raw++ = +12;
// movsxd rax, eax
*raw++ = 0x48;
*raw++ = 0x63;
*raw++ = 0xc0;
// shl rax, 2
*raw++ = 0x48;
*raw++ = 0xc1;
*raw++ = 0xe0;
*raw++ = 0x02;
// add rax, r14
*raw++ = 0x4c;
*raw++ = 0x01;
*raw++ = 0xf0;
// jmp rax
*raw++ = 0xff;
*raw++ = 0xe0;
// Save pc: mov [rbp + spu_thread::pc], r12d
*raw++ = 0x44;
*raw++ = 0x89;
*raw++ = 0x65;
*raw++ = ::narrow<s8>(::offset32(&spu_thread::pc));
// Epilogue: add rsp,0x28 ; ret
*raw++ = 0x48;
*raw++ = 0x83;
*raw++ = 0xc4;
*raw++ = 0x28;
*raw++ = 0xc3;
const auto fn = reinterpret_cast<spu_function_t>(result);
// Install pointer carefully
const bool added = !add_loc->compiled && add_loc->compiled.compare_and_swap_test(nullptr, fn);
// Check hash against allowed bounds
const bool inverse_bounds = g_cfg.core.spu_llvm_lower_bound > g_cfg.core.spu_llvm_upper_bound;
if ((!inverse_bounds && (m_hash_start < g_cfg.core.spu_llvm_lower_bound || m_hash_start > g_cfg.core.spu_llvm_upper_bound)) ||
(inverse_bounds && (m_hash_start < g_cfg.core.spu_llvm_lower_bound && m_hash_start > g_cfg.core.spu_llvm_upper_bound)))
{
spu_log.error("[Debug] Skipped function %s", fmt::base57(be_t<u64>{m_hash_start}));
}
else if (added)
{
// Send work to LLVM compiler thread
g_fxo->get<spu_llvm_thread>().registered.push(m_hash_start, add_loc);
}
// Rebuild trampoline if necessary
if (!m_spurt->rebuild_ubertrampoline(func.data[0]))
{
return nullptr;
}
if (added)
{
add_loc->compiled.notify_all();
}
return fn;
}
};
std::unique_ptr<spu_recompiler_base> spu_recompiler_base::make_fast_llvm_recompiler()
{
return std::make_unique<spu_fast>();
}
std::array<reg_state_t, s_reg_max>& block_reg_info::evaluate_start_state(const std::map<u32, std::unique_ptr<block_reg_info>>& map)
{
if (!has_true_state)
{
std::array<reg_state_t, s_reg_max> temp;
std::basic_string<u32> been_there;
struct iterator_info
{
u32 block_pc = SPU_LS_SIZE;
struct state_t
{
u32 block_pc = SPU_LS_SIZE;
std::array<reg_state_t, s_reg_max> reg_state;
bool disconnected = false;
bool state_written = false;
};
std::vector<state_t> state_prev;
usz completed = 0;
usz parent_iterator_index = umax;
usz parent_state_index = umax;
};
std::vector<iterator_info> info_queue;
iterator_info first_entry{pc, {}, 0, umax, umax};
info_queue.emplace_back(std::move(first_entry));
// info_queue may grow
for (usz qi = 0; qi < info_queue.size();)
{
const auto it = std::addressof(info_queue[qi]);
ensure(qi == info_queue.size() - 1);
auto& cur_node = ::at32(map, it->block_pc);
ensure(it->parent_iterator_index == qi - 1);
if (cur_node->has_true_state)
{
// Evaluated at some point before
if (qi != 0)
{
ensure(!been_there.empty());
been_there.pop_back();
info_queue.pop_back();
qi--;
continue;
}
else
{
break;
}
}
if (it->state_prev.empty())
{
// Build the list here to avoid code duplication
const usz real_size = cur_node->prev_nodes.size();
if (real_size)
{
it->state_prev.resize(real_size);
for (usz i = 0; i < real_size; i++)
{
it->state_prev[i].block_pc = cur_node->prev_nodes[i].prev_pc;
}
}
}
const usz next_entry_idx = it->completed;
if (next_entry_idx == it->state_prev.size())
{
// Result merge from all predecessors
// Flag to mark the state as resolved
bool is_all_resolved = true;
bool has_past_state = false;
for (usz bi = 0; bi < it->state_prev.size(); bi++)
{
if (it->state_prev[bi].disconnected)
{
is_all_resolved = false;
continue;
}
has_past_state = true;
const u32 node_pc = it->state_prev[bi].block_pc;
const auto& node = ::at32(map, node_pc);
// Check if the node is resolved
if (!node->has_true_state)
{
// Assume this block cannot be resolved at the moment
is_all_resolved = false;
break;
}
}
if (qi == 0)
{
// TODO: First block is always resolved here, but this logic can be improved to detect more cases of opportunistic resolving
is_all_resolved = true;
}
auto& res_state = is_all_resolved ? cur_node->start_reg_state : temp;
for (usz bi = 0; bi < it->state_prev.size(); bi++)
{
if (it->state_prev[bi].disconnected)
{
// Loop state: even if it were processed a million more times the result would still be the same
// So ignore it
continue;
}
std::array<reg_state_t, s_reg_max>* arg_state{};
const auto& node = ::at32(map, it->state_prev[bi].block_pc);
if (node->has_true_state)
{
// State is resolved, use the entry's state
arg_state = std::addressof(node->end_reg_state);
}
else
{
// Use accumulated state from one path of code history
arg_state = std::addressof(it->state_prev[bi].reg_state);
ensure(it->state_prev[bi].state_written);
}
if (bi == 0)
{
res_state = *arg_state;
}
else
{
merge(res_state, res_state, *arg_state);
}
}
std::array<reg_state_t, s_reg_max>* result_storage{};
if (is_all_resolved)
{
// Complete state of this block
result_storage = std::addressof(cur_node->end_reg_state);
cur_node->has_true_state = true;
}
else
{
// Patch incomplete state into saved state entry of parent block
ensure(it->parent_iterator_index != qi);
ensure(it->parent_iterator_index != umax);
auto& state_vec = ::at32(info_queue, it->parent_iterator_index).state_prev;
auto& state = ::at32(state_vec, it->parent_state_index);
ensure(state.block_pc == it->block_pc);
result_storage = std::addressof(state.reg_state);
ensure(!state.state_written);
state.state_written = true;
}
// Stack the newer state on top of the old (if exists)
if (has_past_state)
{
build_on_top_of(*result_storage, cur_node->addend_reg_state, res_state);
}
else
{
*result_storage = cur_node->addend_reg_state;
}
if (qi != 0)
{
ensure(!been_there.empty());
been_there.pop_back();
info_queue.pop_back();
qi--;
}
else
{
ensure(cur_node->has_true_state);
break;
}
}
else
{
const u32 prev_pc = cur_node->prev_nodes[it->completed++].prev_pc;
const auto& prev_node = ::at32(map, prev_pc);
// Queue for resolving if needed
if (!prev_node->has_true_state)
{
// TODO: The true maximum occurrence count needs to depend on the number of branch-outs passed through
// Currently allow 2 for short-term code and 1 for long-term code
const bool loop_terminator_detected = std::count(been_there.begin(), been_there.end(), prev_pc) >= (qi < 20 ? 2u : 1u);
const bool avoid_extensive_analysis = qi >= 25;
if (!loop_terminator_detected && !avoid_extensive_analysis)
{
info_queue.emplace_back(iterator_info{prev_pc, {}, 0, qi, it->completed - 1});
been_there.push_back(prev_pc);
qi++;
}
else
{
auto& state = ::at32(it->state_prev, it->completed - 1);
// Loop state: even if it were processed a million more times the result would be the same
// This is similar to multiplying by zero a million times
// This holds at least for now, since any register difference is considered an unknown state change
// So ignore it
ensure(state.block_pc == prev_pc);
ensure(!state.disconnected);
state.disconnected = true;
// Repeat
// qi += 0;
}
}
else
{
// Repeat
// qi += 0;
}
}
}
ensure(has_true_state);
}
walkby_state = start_reg_state;
return walkby_state;
}
void spu_recompiler_base::add_pattern(bool fill_all, inst_attr attr, u32 start, u32 end)
{
if (end == umax)
{
end = start;
}
m_patterns[start] = pattern_info{utils::address_range::start_end(start, end)};
for (u32 i = start; i <= (fill_all ? end : start); i += 4)
{
m_inst_attrs[i / 4] = attr;
}
}
extern std::string format_spu_func_info(u32 addr, cpu_thread* spu)
{
spu_thread* _spu = static_cast<spu_thread*>(spu);
std::unique_ptr<spu_recompiler_base> compiler = spu_recompiler_base::make_asmjit_recompiler();
compiler->init();
auto func = compiler->analyse(reinterpret_cast<const be_t<u32>*>(_spu->ls), addr);
std::string info;
{
sha1_context ctx;
u8 output[20];
sha1_starts(&ctx);
sha1_update(&ctx, reinterpret_cast<const u8*>(func.data.data()), func.data.size() * 4);
sha1_finish(&ctx, output);
fmt::append(info, "size=%d, end=0x%x, hash=%s", func.data.size(), addr + func.data.size() * 4, fmt::base57(output));
}
return info;
}