#include "stdafx.h" #include "AArch64JIT.h" #include "../Hypervisor.h" LOG_CHANNEL(jit_log, "JIT"); #define STDOUT_DEBUG 0 #define DPRINT1(...)\ do {\ printf(__VA_ARGS__);\ printf("\n");\ fflush(stdout);\ } while (0) #if STDOUT_DEBUG #define DPRINT DPRINT1 #else #define DPRINT jit_log.trace #endif namespace aarch64 { // FIXME: This really should be part of fmt static std::string join_strings(const std::vector& v, const char* delim) { std::string result; for (const auto& s : v) { if (!result.empty()) { result += delim; } result += s; } return result; } using instruction_info_t = GHC_frame_preservation_pass::instruction_info_t; using function_info_t = GHC_frame_preservation_pass::function_info_t; GHC_frame_preservation_pass::GHC_frame_preservation_pass(const config_t& configuration) : m_config(configuration) {} void GHC_frame_preservation_pass::reset() { m_visited_functions.clear(); } void GHC_frame_preservation_pass::force_tail_call_terminators(llvm::Function& f) { // GHC functions are not call-stack preserving and can therefore never return if they make any external calls at all. // Replace every terminator clause with a tail call explicitly. This is already done for X64 to work, but better safe than sorry. for (auto& bb : f) { auto bit = bb.begin(), prev = bb.end(); for (; bit != bb.end(); prev = bit, ++bit) { if (prev == bb.end()) { continue; } if (auto ri = llvm::dyn_cast(&*bit)) { if (auto ci = llvm::dyn_cast(&*prev)) { // This is a "ret" that is coming after a "call" to another funciton. // Enforce that it must be a tail call. if (!ci->isTailCall()) { ci->setTailCall(); } } } } } } function_info_t GHC_frame_preservation_pass::preprocess_function(llvm::Function& f) { function_info_t result{}; result.instruction_count = f.getInstructionCount(); // Blanket exclusions. Stubs or dispatchers that do not compute anything themselves. if (f.getName() == "__spu-null") { // Don't waste the effort processing this stub. It has no points of concern result.num_external_calls = 1; return result; } if (m_config.use_stack_frames) { // Stack frame estimation. SPU code can be very long and consumes several KB of stack. u32 stack_frame_size = 128u; // Actual ratio is usually around 1:4 const u32 expected_compiled_instr_count = f.getInstructionCount() * 4; // Because GHC doesn't preserve stack (all stack is scratch), we know we'll start to spill once we go over the number of actual regs. // We use a naive allocator that just assumes each instruction consumes a register slot. We "spill" every 32 instructions. // FIXME: Aggressive spill is only really a thing with vector operations. We can detect those instead. // A proper fix is to port this to a MF pass, but I have PTSD from working at MF level. const u32 spill_pages = (expected_compiled_instr_count + 127u) / 128u; stack_frame_size *= std::min(spill_pages, 32u); // 128 to 4k dynamic. It is unlikely that any frame consumes more than 4096 bytes result.stack_frame_size = stack_frame_size; } result.instruction_count = f.getInstructionCount(); result.num_external_calls = 0; // The LR is not spared by LLVM in cases where there is a lot of spilling. // This is much easier to manage with a custom LLVM branch as we can just mark X30 as off-limits as a GPR. // This is another thing to be moved to a MachineFunction pass. Ideally we should check the instruction stream for writes to LR and reload it on exit. // For now, assume it is dirtied if the function is of any reasonable length. 

        result.num_external_calls = 0;

        // The LR is not spared by LLVM in cases where there is a lot of spilling.
        // This is much easier to manage with a custom LLVM branch as we can just mark X30 as off-limits as a GPR.
        // This is another thing to be moved to a MachineFunction pass. Ideally we should check the instruction stream for writes to LR and reload it on exit.
        // For now, assume it is dirtied if the function is of any reasonable length.
        result.clobbers_x30 = result.instruction_count > 32;

        for (auto& bb : f)
        {
            for (auto& inst : bb)
            {
                if (auto ci = llvm::dyn_cast<llvm::CallInst>(&inst))
                {
                    result.num_external_calls++;
                    result.clobbers_x30 |= (!ci->isTailCall());
                }
            }
        }

        return result;
    }
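
    // Overview of the decoder below: anything that transfers control out of the
    // current function is reported as a call site. That covers real call
    // instructions (direct or indirect) as well as branch-like terminators
    // (br, indirectbr, callbr, invoke) whose target block belongs to another
    // function, so that run() can patch every exit path uniformly.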
    instruction_info_t GHC_frame_preservation_pass::decode_instruction(llvm::Function& f, llvm::Instruction* i)
    {
        instruction_info_t result{};
        if (auto ci = llvm::dyn_cast<llvm::CallInst>(i))
        {
            // Watch out for injected ASM blocks...
            if (llvm::isa<llvm::InlineAsm>(ci->getCalledOperand()))
            {
                // Not a real call. This is just an insert of inline asm
                return result;
            }

            result.is_call_inst = true;
            result.is_returning = true;
            result.preserve_stack = !ci->isTailCall();
            result.callee = ci->getCalledFunction();
            result.is_tail_call = ci->isTailCall();

            if (!result.callee)
            {
                // Indirect call (call from raw value).
                result.is_indirect = true;
                result.callee_is_GHC = ci->getCallingConv() == llvm::CallingConv::GHC;
                result.callee_name = "__indirect_call";
            }
            else
            {
                result.callee_is_GHC = result.callee->getCallingConv() == llvm::CallingConv::GHC;
                result.callee_name = result.callee->getName().str();
            }
            return result;
        }

        if (auto bi = llvm::dyn_cast<llvm::BranchInst>(i))
        {
            // More likely to jump out via an unconditional...
            if (!bi->isConditional())
            {
                ensure(bi->getNumSuccessors() == 1);
                auto targetbb = bi->getSuccessor(0);

                result.callee = targetbb->getParent();
                result.callee_name = result.callee->getName().str();
                result.is_call_inst = result.callee_name != f.getName();
            }
            return result;
        }

        if (auto bi = llvm::dyn_cast<llvm::IndirectBrInst>(i))
        {
            // Very unlikely to be the same function. Can be considered a function exit.
            ensure(bi->getNumDestinations() == 1);
            auto targetbb = ensure(bi->getSuccessor(0)); // This is guaranteed to fail but I've yet to encounter this

            result.callee = targetbb->getParent();
            result.callee_name = result.callee->getName().str();
            result.is_call_inst = result.callee_name != f.getName();
            return result;
        }

        if (auto bi = llvm::dyn_cast<llvm::CallBrInst>(i))
        {
            ensure(bi->getNumSuccessors() == 1);
            auto targetbb = bi->getSuccessor(0);

            result.callee = targetbb->getParent();
            result.callee_name = result.callee->getName().str();
            result.is_call_inst = result.callee_name != f.getName();
            return result;
        }

        if (auto bi = llvm::dyn_cast<llvm::InvokeInst>(i))
        {
            ensure(bi->getNumSuccessors() == 2);
            auto targetbb = bi->getSuccessor(0);

            result.callee = targetbb->getParent();
            result.callee_name = result.callee->getName().str();
            result.is_call_inst = result.callee_name != f.getName();
            return result;
        }

        return result;
    }

    gpr GHC_frame_preservation_pass::get_base_register_for_call(const std::string& callee_name)
    {
        // We go over the base_register_lookup table and find the first matching pattern
        for (const auto& pattern : m_config.base_register_lookup)
        {
            if (callee_name.starts_with(pattern.first))
            {
                return pattern.second;
            }
        }

        // Default is x19
        return aarch64::x19;
    }

    void GHC_frame_preservation_pass::run(llvm::IRBuilder<>* irb, llvm::Function& f)
    {
        if (f.getCallingConv() != llvm::CallingConv::GHC)
        {
            // If we're not doing GHC, the calling convention will handle stack fixup on its own via prologue/epilogue
            return;
        }

        if (f.getInstructionCount() == 0)
        {
            // Nothing to do. Happens with placeholder functions such as branch patchpoints
            return;
        }

        const auto this_name = f.getName().str();
        if (m_visited_functions.find(this_name) != m_visited_functions.end())
        {
            // Already processed. Only useful when recursing, which is currently not used.
            DPRINT("Function %s was already processed. Skipping.", this_name.c_str());
            return;
        }

        if (this_name != "__spu-null") // This name is meaningless and doesn't uniquely identify a function
        {
            m_visited_functions.insert(this_name);
        }

        if (m_config.exclusion_callback && m_config.exclusion_callback(this_name))
        {
            // Function is explicitly excluded
            return;
        }

        // Preprocessing.
        auto function_info = preprocess_function(f);
        if (function_info.num_external_calls == 0 && function_info.stack_frame_size == 0)
        {
            // No stack frame injection and no external calls to patch up. This is a leaf function, nothing to do.
            DPRINT("Ignoring function %s", this_name.c_str());
            return;
        }

        // Force tail calls on all terminators
        force_tail_call_terminators(f);

        // Asm snippets for patching the stack frame
        std::string frame_prologue, frame_epilogue;

        if (function_info.stack_frame_size > 0)
        {
            // NOTE: The stack frame here is purely optional, we can pre-allocate scratch on the gateway.
            // However, that is an optimization for another time; this helps make debugging easier.
            frame_prologue = fmt::format("sub sp, sp, #%u;", function_info.stack_frame_size);
            frame_epilogue = fmt::format("add sp, sp, #%u;", function_info.stack_frame_size);

            // Emit the frame prologue. We use a BB here for extra safety as it solves the problem of backwards jumps re-executing the prologue.
            auto functionStart = &f.front();
            auto prologueBB = llvm::BasicBlock::Create(f.getContext(), "", &f, functionStart);
            irb->SetInsertPoint(prologueBB, prologueBB->begin());
            LLVM_ASM_VOID(frame_prologue, irb, f.getContext());
            irb->CreateBr(functionStart);
        }

        // Now we start processing
        bool terminator_found = false;
        for (auto& bb : f)
        {
            for (auto bit = bb.begin(); bit != bb.end();)
            {
                const auto instruction_info = decode_instruction(f, &(*bit));
                if (!instruction_info.is_call_inst)
                {
                    ++bit;
                    continue;
                }

                std::string callee_name = "__unknown";
                if (const auto cf = instruction_info.callee)
                {
                    callee_name = cf->getName().str();
                    if (cf->hasFnAttribute(llvm::Attribute::AlwaysInline) || callee_name.starts_with("llvm."))
                    {
                        // Always-inlined call. Likely inline asm. Skip.
                        ++bit;
                        continue;
                    }

                    // Technically we should also ignore any host functions linked in, usually starting with a ppu_ or spu_ prefix.
                    // However, there is no real guarantee that those are safe, with only rare exceptions, and patching the frame around them doesn't hurt much anyway.
                }

                terminator_found |= instruction_info.is_tail_call;

                if (!instruction_info.preserve_stack)
                {
                    // Now we patch the call if required. For normal calls that 'return' (i.e. calls to C/C++ ABI), we do not patch them as they will manage the stack themselves (callee-managed).
                    llvm::Instruction* original_inst = llvm::dyn_cast<llvm::Instruction>(bit);
                    irb->SetInsertPoint(ensure(llvm::dyn_cast<llvm::Instruction>(bit)));

                    // We're about to make a tail call. This means after this call, we're supposed to return immediately. In that case, don't link, lower to branch only.
                    // Note that branches have some undesirable side-effects. For one, we lose the argument inputs, which the callee is expecting.
                    // This means we burn some cycles on every exit, but in return we do not require one instruction on the prologue + the ret chain is eliminated.
                    // No ret-chain also means two BBs can call each other indefinitely without running out of stack, without relying on llvm to optimize that away.
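
                    // Illustrative shape of the exit blob assembled below, for a direct
                    // GHC-to-GHC tail call with three arguments, a clobbered LR and a
                    // 256-byte frame (registers and offsets are example values only):
                    //   mov x21, $0; mov x20, $1; mov x19, $2;  // reload callee args, last to first
                    //   ldr x30, [x19, #<hv_ctx_offset>];       // restore LR from the thread context
                    //   add sp, sp, #256;                       // tear down the scratch frame
                    //   b $3;                                   // branch out; never returns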
                    std::string exit_fn;
                    auto ci = ensure(llvm::dyn_cast<llvm::CallInst>(original_inst));
                    auto operand_count = ci->getNumOperands() - 1; // The last operand is the callee, not a real operand
                    std::vector<std::string> constraints;
                    std::vector<llvm::Value*> args;

                    // We now load the callee args in reverse order to avoid self-clobbering of dependencies.
                    // FIXME: This is often redundant and wastes cycles; we'll clean this up in a MachineFunction pass later.
                    int args_base_reg = instruction_info.callee_is_GHC ? aarch64::x19 : aarch64::x0; // GHC args are always x19..x25
                    for (auto i = static_cast<int>(operand_count) - 1; i >= 0; --i)
                    {
                        args.push_back(ci->getOperand(i));
                        exit_fn += fmt::format("mov x%d, $%u;\n", (args_base_reg + i), ::size32(args) - 1);
                        constraints.push_back("r");
                    }

                    // Restore LR to the exit gate if we think it may have been trampled.
                    if (function_info.clobbers_x30)
                    {
                        // Load the context "base" thread register to restore the link register from
                        auto context_base_reg = get_base_register_for_call(instruction_info.callee_name);
                        if (!instruction_info.callee_is_GHC)
                        {
                            // For non-GHC calls, we have to remap the arguments to x0...
                            context_base_reg = static_cast<gpr>(context_base_reg - 19);
                        }

                        // We want to do this after loading the arguments in case there was any spilling involved.
                        DPRINT("Patching call from %s to %s on register %d...",
                            this_name.c_str(),
                            instruction_info.callee_name.c_str(),
                            static_cast<int>(context_base_reg));

                        const auto x30_tail_restore = fmt::format(
                            "ldr x30, [x%u, #%u];\n", // Load x30 from thread context
                            static_cast<u32>(context_base_reg),
                            m_config.hypervisor_context_offset);

                        exit_fn += x30_tail_restore;
                    }

                    // Stack cleanup. We need to do this last to allow the spiller to find its own spilled variables.
                    if (function_info.stack_frame_size > 0)
                    {
                        exit_fn += frame_epilogue;
                    }

                    if (m_config.debug_info)
                    {
                        // Store x27 as our current address, taking the place of LR (for debugging, since bt is now useless).
                        // x28 and x29 are used as breadcrumb registers in this mode to form a pseudo-backtrace.
                        exit_fn +=
                            "mov x29, x28;\n"
                            "mov x28, x27;\n"
                            "adr x27, .;\n";
                    }

                    auto target = ensure(ci->getCalledOperand());
                    args.push_back(target);

                    if (instruction_info.is_indirect)
                    {
                        // NOTE: For indirect calls, we read the callee register before we load the operands.
                        // If we don't do that, the operands will overwrite our callee address if it lies in the x19-x25 range.
                        // There is no safe temp register to stuff the call address into either; you just have to stuff it below sp and load it after the operands are all assigned.
                        constraints.push_back("r");
                        exit_fn = fmt::format("str $%u, [sp, #-8];\n", operand_count) + exit_fn;
                        exit_fn +=
                            "ldr x15, [sp, #-8];\n"
                            "br x15;\n";
                    }
                    else
                    {
                        constraints.push_back("i");
                        exit_fn += fmt::format("b $%u;\n", operand_count);
                    }

                    // Emit the branch
                    llvm_asm(irb, exit_fn, args, join_strings(constraints, ","), f.getContext());

                    // Delete the original call instruction
                    bit = ci->eraseFromParent();
                }

                // Next
                if (bit != bb.end())
                {
                    ++bit;
                }
            }
        }

        ensure(terminator_found, "Could not find terminator for function!");
    }
}