xenia/src/xenia/cpu/backend/x64/x64_backend.cc
chss95cs@gmail.com 457296850e Add OPCODE_NEGATED_MUL_ADD/OPCODE_NEGATED_MUL_SUB
Proper handling of NaNs for VMX max/min on x64 (minps/maxps have operand-order-dependent special behavior that VMX's vminfp/vmaxfp do not)
Add an extremely unintrusive guest code profiler utilizing KUSER_SHARED system time. This profiler is disabled on platforms other than Windows, and on Windows it is disabled by default by a cvar
Repurpose the GUEST_SCRATCH64 stack offset to instead store guest function profile times, and define GUEST_SCRATCH as 0, since that's already meant to be a scratch area
Fix Xenia silently closing on config errors/other fatal errors by setting has_console_attached_'s default to false
Add an alternative code path for the guest clock that uses KUSER_SHARED system time instead of QueryPerformanceCounter. This is way faster and I have tested it and found it to be working, but I have disabled it because I do not know how well it works on Wine or on processors other than mine
Significantly reduce log spam by setting XELOGAPU and XELOGGPU to be LogLevel::Debug
Changed some LOGI to LOGD in places to reduce log spam
Mark VdSwap as kHighFrequency; it was spamming the logs
Make logging calls less intrusive for the caller by forcing the test of log level inline and moving the format/AppendLogLine stuff to an outlined cold function
Add swcache namespace for software cache operations like prefetches, streaming stores and streaming loads.
Add XE_MSVC_REORDER_BARRIER for preventing MSVC from propagating a value too close to its store or from its load
Add xe_unlikely_mutex for locks we know have very little contention
Add XE_HOST_CACHE_LINE_SIZE and XE_RESTRICT to platform.h
Micro-optimization: changed most uses of size_t to ring_size_t in RingBuffer; this slightly reduces the size of the inlined RingBuffer operations by eliminating REX prefixes, depending on register allocation
Add BeginPrefetchedRead to ringbuffer, which prefetches the second range if there is one according to the provided PrefetchTag
Added the inline_loadclock cvar, which will directly use the value of the guest clock from clock.cc in jitted guest code. Off by default
Change uses of GUEST_SCRATCH64 to GUEST_SCRATCH
Add fast vectorized xenos_half_to_float/xenos_float_to_half (currently resides in x64_seq_vector, move to gpu code maybe at some point)
Add fast x64 codegen for PackFloat16_4/UnpackFloat16_4. Same code can be used for Float16_2 in future commit. This should speed up some games that use these functions heavily
Remove cvar for toggling old float16 behavior
Add VRSAVE register, support mfspr/mtspr vrsave
Add cvar for toggling off codegen for trap instructions and set it to true by default.
Add specialized methods to CommandProcessor: WriteRegistersFromMem, WriteRegisterRangeFromRing, and WriteOneRegisterFromRing. These reduce the overall cost of WriteRegister
Use a fixed-size vmem vector for upload ranges; realloc/memsetting on resize in the inner loop of requestranges was showing up on the profiler (the search in requestranges itself needs work)
Rename fixed_vmem_vector to better fit xenia's naming convention
Only log unknown register writes in WriteRegister if DEBUG :/. We're stuck on MSVC with C++17, so we have no way of influencing the branch ordering for that function without profile-guided optimization
Remove the binding stride assert in shader_translator.cc; triangle told me it's leftover OpenGL stuff
Mark xe::FatalError as noreturn
If a controller is not connected, delay 1.1 seconds before checking whether it has been reconnected. Asking XInput about an unused controller slot is extremely slow, and XInputGetState/SetState were taking up an enormous amount of time in profiles. This may have caused a bit of input lag
Protect accesses to input_system with a lock
Add proper handling for user_index >= 4 in XamInputGetState/SetState, and properly return zeroed state in GetState
Add missing argument to NtQueryVirtualMemory_entry
Fixed RtlCompareMemoryUlong_entry: it actually does not care whether the source is misaligned, and it aligns the length down
Fixed RtlUpperChar and RtlLowerChar, adding a table with their correct return values precomputed
2022-08-20 11:40:19 -07:00


/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2019 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
#include "xenia/cpu/backend/x64/x64_backend.h"
#include <stddef.h>
#include <algorithm>
#include "third_party/capstone/include/capstone/capstone.h"
#include "third_party/capstone/include/capstone/x86.h"
#include "xenia/base/exception_handler.h"
#include "xenia/base/logging.h"
#include "xenia/cpu/backend/x64/x64_assembler.h"
#include "xenia/cpu/backend/x64/x64_code_cache.h"
#include "xenia/cpu/backend/x64/x64_emitter.h"
#include "xenia/cpu/backend/x64/x64_function.h"
#include "xenia/cpu/backend/x64/x64_sequences.h"
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
#include "xenia/cpu/breakpoint.h"
#include "xenia/cpu/processor.h"
#include "xenia/cpu/stack_walker.h"
#include "xenia/cpu/xex_module.h"
DEFINE_int32(x64_extension_mask, -1,
"Allow the detection and utilization of specific instruction set "
"features.\n"
" 0 = x86_64 + AVX1\n"
" 1 = AVX2\n"
" 2 = FMA\n"
" 4 = LZCNT\n"
" 8 = BMI1\n"
" 16 = BMI2\n"
" 32 = F16C\n"
" 64 = Movbe\n"
" 128 = GFNI\n"
" 256 = AVX512F\n"
" 512 = AVX512VL\n"
" 1024 = AVX512BW\n"
" 2048 = AVX512DQ\n"
" -1 = Detect and utilize all possible processor features\n",
"x64");
DEFINE_bool(record_mmio_access_exceptions, true,
"For guest addresses records whether we caught any mmio accesses "
"for them. This info can then be used on a subsequent run to "
"instruct the recompiler to emit checks",
"CPU");
#if XE_X64_PROFILER_AVAILABLE == 1
DECLARE_bool(instrument_call_times);
#endif
namespace xe {
namespace cpu {
namespace backend {
namespace x64 {
class X64ThunkEmitter : public X64Emitter {
public:
X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator);
~X64ThunkEmitter() override;
HostToGuestThunk EmitHostToGuestThunk();
GuestToHostThunk EmitGuestToHostThunk();
ResolveFunctionThunk EmitResolveFunctionThunk();
private:
// The following four functions provide save/load functionality for registers.
// They assume at least StackLayout::THUNK_STACK_SIZE bytes have been
// allocated on the stack.
void EmitSaveVolatileRegs();
void EmitLoadVolatileRegs();
void EmitSaveNonvolatileRegs();
void EmitLoadNonvolatileRegs();
};
X64Backend::X64Backend() : Backend(), code_cache_(nullptr) {
if (cs_open(CS_ARCH_X86, CS_MODE_64, &capstone_handle_) != CS_ERR_OK) {
assert_always("Failed to initialize capstone");
}
cs_option(capstone_handle_, CS_OPT_SYNTAX, CS_OPT_SYNTAX_INTEL);
cs_option(capstone_handle_, CS_OPT_DETAIL, CS_OPT_ON);
cs_option(capstone_handle_, CS_OPT_SKIPDATA, CS_OPT_OFF);
}
X64Backend::~X64Backend() {
if (capstone_handle_) {
cs_close(&capstone_handle_);
}
X64Emitter::FreeConstData(emitter_data_);
ExceptionHandler::Uninstall(&ExceptionCallbackThunk, this);
}
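// Trampoline registered with the memory system that forwards the host address
// of a recorded MMIO access back to the backend so it can be attributed to the
// guest instruction that performed it.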
static void ForwardMMIOAccessForRecording(void* context, void* hostaddr) {
reinterpret_cast<X64Backend*>(context)
->RecordMMIOExceptionForGuestInstruction(hostaddr);
}
#if XE_X64_PROFILER_AVAILABLE == 1
// TODO: find a better way of passing this to atexit; doing it in the
// destructor doesn't work because the destructor is never called.
static GuestProfilerData* backend_profiler_data = nullptr;
static uint64_t nanosecond_lifetime_start = 0;
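// Writes the accumulated per-function guest times (KUSER_SHARED system time,
// i.e. 100ns units) to profile_times.txt, plus a profile_print_times.py
// script that resolves the guest addresses to names when run inside IDA.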
static void WriteGuestProfilerData() {
if (cvars::instrument_call_times) {
uint64_t end = Clock::QueryHostSystemTime();
uint64_t total = end - nanosecond_lifetime_start;
double totaltime_divisor = static_cast<double>(total);
FILE* output_file = nullptr;
std::vector<std::pair<uint32_t, uint64_t>> unsorted_profile{};
for (auto&& entry : *backend_profiler_data) {
if (entry.second) { // skip times of 0
unsorted_profile.emplace_back(entry.first, entry.second);
}
}
std::sort(unsorted_profile.begin(), unsorted_profile.end(),
[](auto& x, auto& y) { return x.second < y.second; });
fopen_s(&output_file, "profile_times.txt", "w");
FILE* idapy_file = nullptr;
fopen_s(&idapy_file, "profile_print_times.py", "w");
for (auto&& sorted_entry : unsorted_profile) {
// double time_in_seconds =
// static_cast<double>(sorted_entry.second) / 10000000.0;
double time_in_milliseconds =
static_cast<double>(sorted_entry.second) / (10000000.0 / 1000.0);
double slice = static_cast<double>(sorted_entry.second) /
static_cast<double>(totaltime_divisor);
fprintf(output_file,
"%X took %.20f milliseconds, totaltime slice percentage %.20f \n",
sorted_entry.first, time_in_milliseconds, slice);
fprintf(idapy_file,
"print(get_name(0x%X) + ' took %.20f ms, %.20f percent')\n",
sorted_entry.first, time_in_milliseconds, slice);
}
fclose(output_file);
fclose(idapy_file);
}
}
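// Low-priority background thread: flushes the profiler data to disk every 30
// seconds for the lifetime of the process.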
static void GuestProfilerUpdateThreadProc() {
nanosecond_lifetime_start = Clock::QueryHostSystemTime();
do {
xe::threading::Sleep(std::chrono::seconds(30));
WriteGuestProfilerData();
} while (true);
}
static std::unique_ptr<xe::threading::Thread> g_profiler_update_thread{};
#endif
bool X64Backend::Initialize(Processor* processor) {
if (!Backend::Initialize(processor)) {
return false;
}
Xbyak::util::Cpu cpu;
if (!cpu.has(Xbyak::util::Cpu::tAVX)) {
XELOGE("This CPU does not support AVX. The emulator will now crash.");
return false;
}
// Need movbe to do advanced LOAD/STORE tricks.
if (cvars::x64_extension_mask & kX64EmitMovbe) {
machine_info_.supports_extended_load_store =
cpu.has(Xbyak::util::Cpu::tMOVBE);
} else {
machine_info_.supports_extended_load_store = false;
}
auto& gprs = machine_info_.register_sets[0];
gprs.id = 0;
std::strcpy(gprs.name, "gpr");
gprs.types = MachineInfo::RegisterSet::INT_TYPES;
gprs.count = X64Emitter::GPR_COUNT;
auto& xmms = machine_info_.register_sets[1];
xmms.id = 1;
std::strcpy(xmms.name, "xmm");
xmms.types = MachineInfo::RegisterSet::FLOAT_TYPES |
MachineInfo::RegisterSet::VEC_TYPES;
xmms.count = X64Emitter::XMM_COUNT;
code_cache_ = X64CodeCache::Create();
Backend::code_cache_ = code_cache_.get();
if (!code_cache_->Initialize()) {
return false;
}
// Generate thunks used to transition between jitted code and host code.
XbyakAllocator allocator;
X64ThunkEmitter thunk_emitter(this, &allocator);
host_to_guest_thunk_ = thunk_emitter.EmitHostToGuestThunk();
guest_to_host_thunk_ = thunk_emitter.EmitGuestToHostThunk();
resolve_function_thunk_ = thunk_emitter.EmitResolveFunctionThunk();
// Set the code cache to use the ResolveFunction thunk for default
// indirections.
assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);
code_cache_->set_indirection_default(
uint32_t(uint64_t(resolve_function_thunk_)));
// Allocate some special indirections.
code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF);
// Allocate emitter constant data.
emitter_data_ = X64Emitter::PlaceConstData();
// Setup exception callback
ExceptionHandler::Install(&ExceptionCallbackThunk, this);
processor->memory()->SetMMIOExceptionRecordingCallback(
ForwardMMIOAccessForRecording, (void*)this);
#if XE_X64_PROFILER_AVAILABLE == 1
if (cvars::instrument_call_times) {
backend_profiler_data = &profiler_data_;
xe::threading::Thread::CreationParameters slimparams;
slimparams.create_suspended = false;
slimparams.initial_priority = xe::threading::ThreadPriority::kLowest;
slimparams.stack_size = 65536 * 4;
g_profiler_update_thread = std::move(xe::threading::Thread::Create(
slimparams, GuestProfilerUpdateThreadProc));
}
#endif
return true;
}
void X64Backend::CommitExecutableRange(uint32_t guest_low,
uint32_t guest_high) {
code_cache_->CommitExecutableRange(guest_low, guest_high);
}
std::unique_ptr<Assembler> X64Backend::CreateAssembler() {
return std::make_unique<X64Assembler>(this);
}
std::unique_ptr<GuestFunction> X64Backend::CreateGuestFunction(
Module* module, uint32_t address) {
return std::make_unique<X64Function>(module, address);
}
uint64_t ReadCapstoneReg(HostThreadContext* context, x86_reg reg) {
switch (reg) {
case X86_REG_RAX:
return context->rax;
case X86_REG_RCX:
return context->rcx;
case X86_REG_RDX:
return context->rdx;
case X86_REG_RBX:
return context->rbx;
case X86_REG_RSP:
return context->rsp;
case X86_REG_RBP:
return context->rbp;
case X86_REG_RSI:
return context->rsi;
case X86_REG_RDI:
return context->rdi;
case X86_REG_R8:
return context->r8;
case X86_REG_R9:
return context->r9;
case X86_REG_R10:
return context->r10;
case X86_REG_R11:
return context->r11;
case X86_REG_R12:
return context->r12;
case X86_REG_R13:
return context->r13;
case X86_REG_R14:
return context->r14;
case X86_REG_R15:
return context->r15;
default:
assert_unhandled_case(reg);
return 0;
}
}
#define X86_EFLAGS_CF 0x00000001 // Carry Flag
#define X86_EFLAGS_PF 0x00000004 // Parity Flag
#define X86_EFLAGS_ZF 0x00000040 // Zero Flag
#define X86_EFLAGS_SF 0x00000080 // Sign Flag
#define X86_EFLAGS_OF 0x00000800 // Overflow Flag
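// Evaluates the predicate of a conditional jump (Jcc) instruction against the
// EFLAGS value captured in the faulting thread's host context.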
bool TestCapstoneEflags(uint32_t eflags, uint32_t insn) {
// https://www.felixcloutier.com/x86/Jcc.html
switch (insn) {
case X86_INS_JAE:
// CF=0
return (eflags & X86_EFLAGS_CF) == 0;
case X86_INS_JA:
// CF=0 && ZF=0
return ((eflags & X86_EFLAGS_CF) == 0) && ((eflags & X86_EFLAGS_ZF) == 0);
case X86_INS_JBE:
// CF=1 || ZF=1
return ((eflags & X86_EFLAGS_CF) == X86_EFLAGS_CF) ||
((eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF);
case X86_INS_JB:
// CF=1
return (eflags & X86_EFLAGS_CF) == X86_EFLAGS_CF;
case X86_INS_JE:
// ZF=1
return (eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF;
case X86_INS_JGE:
// SF=OF
return ((eflags & X86_EFLAGS_SF) != 0) == ((eflags & X86_EFLAGS_OF) != 0);
case X86_INS_JG:
// ZF=0 && SF=OF
return ((eflags & X86_EFLAGS_ZF) == 0) &&
(((eflags & X86_EFLAGS_SF) != 0) == ((eflags & X86_EFLAGS_OF) != 0));
case X86_INS_JLE:
// ZF=1 || SF!=OF
return ((eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF) ||
(((eflags & X86_EFLAGS_SF) != 0) != ((eflags & X86_EFLAGS_OF) != 0));
case X86_INS_JL:
// SF!=OF
return ((eflags & X86_EFLAGS_SF) != 0) != ((eflags & X86_EFLAGS_OF) != 0);
case X86_INS_JNE:
// ZF=0
return (eflags & X86_EFLAGS_ZF) == 0;
case X86_INS_JNO:
// OF=0
return (eflags & X86_EFLAGS_OF) == 0;
case X86_INS_JNP:
// PF=0
return (eflags & X86_EFLAGS_PF) == 0;
case X86_INS_JNS:
// SF=0
return (eflags & X86_EFLAGS_SF) == 0;
case X86_INS_JO:
// OF=1
return (eflags & X86_EFLAGS_OF) == X86_EFLAGS_OF;
case X86_INS_JP:
// PF=1
return (eflags & X86_EFLAGS_PF) == X86_EFLAGS_PF;
case X86_INS_JS:
// SF=1
return (eflags & X86_EFLAGS_SF) == X86_EFLAGS_SF;
default:
assert_unhandled_case(insn);
return false;
}
}
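// Disassembles the instruction at current_pc with Capstone and computes the
// host address of the next instruction that will execute, following calls,
// returns and (conditional) jumps via the saved thread context. Used when
// single-stepping over host code for breakpoints.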
uint64_t X64Backend::CalculateNextHostInstruction(ThreadDebugInfo* thread_info,
uint64_t current_pc) {
auto machine_code_ptr = reinterpret_cast<const uint8_t*>(current_pc);
size_t remaining_machine_code_size = 64;
uint64_t host_address = current_pc;
cs_insn insn = {0};
cs_detail all_detail = {0};
insn.detail = &all_detail;
cs_disasm_iter(capstone_handle_, &machine_code_ptr,
&remaining_machine_code_size, &host_address, &insn);
auto& detail = all_detail.x86;
switch (insn.id) {
default:
// Not a branching instruction - just move over it.
return current_pc + insn.size;
case X86_INS_CALL: {
assert_true(detail.op_count == 1);
assert_true(detail.operands[0].type == X86_OP_REG);
uint64_t target_pc =
ReadCapstoneReg(&thread_info->host_context, detail.operands[0].reg);
return target_pc;
} break;
case X86_INS_RET: {
assert_zero(detail.op_count);
auto stack_ptr =
reinterpret_cast<uint64_t*>(thread_info->host_context.rsp);
uint64_t target_pc = stack_ptr[0];
return target_pc;
} break;
case X86_INS_JMP: {
assert_true(detail.op_count == 1);
if (detail.operands[0].type == X86_OP_IMM) {
uint64_t target_pc = static_cast<uint64_t>(detail.operands[0].imm);
return target_pc;
} else if (detail.operands[0].type == X86_OP_REG) {
uint64_t target_pc =
ReadCapstoneReg(&thread_info->host_context, detail.operands[0].reg);
return target_pc;
} else {
// TODO(benvanik): find some more uses of this.
assert_always("jmp branch emulation not yet implemented");
return current_pc + insn.size;
}
} break;
case X86_INS_JCXZ:
case X86_INS_JECXZ:
case X86_INS_JRCXZ:
assert_always("j*cxz branch emulation not yet implemented");
return current_pc + insn.size;
case X86_INS_JAE:
case X86_INS_JA:
case X86_INS_JBE:
case X86_INS_JB:
case X86_INS_JE:
case X86_INS_JGE:
case X86_INS_JG:
case X86_INS_JLE:
case X86_INS_JL:
case X86_INS_JNE:
case X86_INS_JNO:
case X86_INS_JNP:
case X86_INS_JNS:
case X86_INS_JO:
case X86_INS_JP:
case X86_INS_JS: {
assert_true(detail.op_count == 1);
assert_true(detail.operands[0].type == X86_OP_IMM);
uint64_t target_pc = static_cast<uint64_t>(detail.operands[0].imm);
bool test_passed =
TestCapstoneEflags(thread_info->host_context.eflags, insn.id);
if (test_passed) {
return target_pc;
} else {
return current_pc + insn.size;
}
} break;
}
}
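// Breakpoints are implemented by patching a ud2 (0x0F0B) over the first two
// bytes of each host instruction that maps to the breakpoint; the original
// bytes are stashed in the breakpoint's backend data so UninstallBreakpoint
// can restore them.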
void X64Backend::InstallBreakpoint(Breakpoint* breakpoint) {
breakpoint->ForEachHostAddress([breakpoint](uint64_t host_address) {
auto ptr = reinterpret_cast<void*>(host_address);
auto original_bytes = xe::load_and_swap<uint16_t>(ptr);
assert_true(original_bytes != 0x0F0B);
xe::store_and_swap<uint16_t>(ptr, 0x0F0B);
breakpoint->backend_data().emplace_back(host_address, original_bytes);
});
}
void X64Backend::InstallBreakpoint(Breakpoint* breakpoint, Function* fn) {
assert_true(breakpoint->address_type() == Breakpoint::AddressType::kGuest);
assert_true(fn->is_guest());
auto guest_function = reinterpret_cast<cpu::GuestFunction*>(fn);
auto host_address =
guest_function->MapGuestAddressToMachineCode(breakpoint->guest_address());
if (!host_address) {
assert_always();
return;
}
// Assume we haven't already installed a breakpoint in this spot.
auto ptr = reinterpret_cast<void*>(host_address);
auto original_bytes = xe::load_and_swap<uint16_t>(ptr);
assert_true(original_bytes != 0x0F0B);
xe::store_and_swap<uint16_t>(ptr, 0x0F0B);
breakpoint->backend_data().emplace_back(host_address, original_bytes);
}
void X64Backend::UninstallBreakpoint(Breakpoint* breakpoint) {
for (auto& pair : breakpoint->backend_data()) {
auto ptr = reinterpret_cast<uint8_t*>(pair.first);
auto instruction_bytes = xe::load_and_swap<uint16_t>(ptr);
assert_true(instruction_bytes == 0x0F0B);
xe::store_and_swap<uint16_t>(ptr, static_cast<uint16_t>(pair.second));
}
breakpoint->backend_data().clear();
}
bool X64Backend::ExceptionCallbackThunk(Exception* ex, void* data) {
auto backend = reinterpret_cast<X64Backend*>(data);
return backend->ExceptionCallback(ex);
}
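// Maps a faulting host address back to the guest instruction it was generated
// from and marks that instruction's info-cache entry as having accessed MMIO,
// so a later recompile can emit the appropriate checks (see
// record_mmio_access_exceptions).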
void X64Backend::RecordMMIOExceptionForGuestInstruction(void* host_address) {
uint64_t host_addr_u64 = (uint64_t)host_address;
auto fnfor = code_cache()->LookupFunction(host_addr_u64);
if (fnfor) {
uint32_t guestaddr = fnfor->MapMachineCodeToGuestAddress(host_addr_u64);
Module* guest_module = fnfor->module();
if (guest_module) {
XexModule* xex_guest_module = dynamic_cast<XexModule*>(guest_module);
if (xex_guest_module) {
cpu::InfoCacheFlags* icf =
xex_guest_module->GetInstructionAddressFlags(guestaddr);
if (icf) {
icf->accessed_mmio = true;
}
}
}
}
}
bool X64Backend::ExceptionCallback(Exception* ex) {
if (ex->code() != Exception::Code::kIllegalInstruction) {
// We only care about illegal instructions. Other things will be handled by
// other handlers (probably). If nothing else picks it up we'll be called
// with OnUnhandledException to do real crash handling.
return false;
}
// processor_->memory()->LookupVirtualMappedRange()
// Verify an expected illegal instruction.
auto instruction_bytes =
xe::load_and_swap<uint16_t>(reinterpret_cast<void*>(ex->pc()));
if (instruction_bytes != 0x0F0B) {
// Not our ud2 - not us.
return false;
}
// Let the processor handle things.
return processor()->OnThreadBreakpointHit(ex);
}
X64ThunkEmitter::X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator)
: X64Emitter(backend, allocator) {}
X64ThunkEmitter::~X64ThunkEmitter() {}
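// Thunk used to enter jitted guest code from the host: spills the incoming
// arguments, saves the host nonvolatile registers, loads the PPC context into
// rsi and the virtual membase into rdi, then calls the target guest function.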
HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
// rcx = target
// rdx = arg0 (context)
// r8 = arg1 (guest return address)
struct _code_offsets {
size_t prolog;
size_t prolog_stack_alloc;
size_t body;
size_t epilog;
size_t tail;
} code_offsets = {};
const size_t stack_size = StackLayout::THUNK_STACK_SIZE;
code_offsets.prolog = getSize();
// rsp + 0 = return address
mov(qword[rsp + 8 * 3], r8);
mov(qword[rsp + 8 * 2], rdx);
mov(qword[rsp + 8 * 1], rcx);
sub(rsp, stack_size);
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
// Save nonvolatile registers.
EmitSaveNonvolatileRegs();
mov(rax, rcx);
mov(rsi, rdx); // context
mov(rdi, ptr[rdx + offsetof(ppc::PPCContext, virtual_membase)]); // membase
mov(rcx, r8); // return address
call(rax);
vzeroupper();
EmitLoadNonvolatileRegs();
code_offsets.epilog = getSize();
add(rsp, stack_size);
mov(rcx, qword[rsp + 8 * 1]);
mov(rdx, qword[rsp + 8 * 2]);
mov(r8, qword[rsp + 8 * 3]);
ret();
code_offsets.tail = getSize();
assert_zero(code_offsets.prolog);
EmitFunctionInfo func_info = {};
func_info.code_size.total = getSize();
func_info.code_size.prolog = code_offsets.body - code_offsets.prolog;
func_info.code_size.body = code_offsets.epilog - code_offsets.body;
func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog;
func_info.code_size.tail = getSize() - code_offsets.tail;
func_info.prolog_stack_alloc_offset =
code_offsets.prolog_stack_alloc - code_offsets.prolog;
func_info.stack_size = stack_size;
void* fn = Emplace(func_info);
return (HostToGuestThunk)fn;
}
GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
// rcx = target function
// rdx = arg0
// r8 = arg1
// r9 = arg2
struct _code_offsets {
size_t prolog;
size_t prolog_stack_alloc;
size_t body;
size_t epilog;
size_t tail;
} code_offsets = {};
const size_t stack_size = StackLayout::THUNK_STACK_SIZE;
code_offsets.prolog = getSize();
// rsp + 0 = return address
sub(rsp, stack_size);
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
// chrispy: added this for proper vmsum impl, avx2 bitshifts
vzeroupper();
// Save off volatile registers.
EmitSaveVolatileRegs();
mov(rax, rcx); // function
mov(rcx, GetContextReg()); // context
call(rax);
EmitLoadVolatileRegs();
code_offsets.epilog = getSize();
add(rsp, stack_size);
ret();
code_offsets.tail = getSize();
assert_zero(code_offsets.prolog);
EmitFunctionInfo func_info = {};
func_info.code_size.total = getSize();
func_info.code_size.prolog = code_offsets.body - code_offsets.prolog;
func_info.code_size.body = code_offsets.epilog - code_offsets.body;
func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog;
func_info.code_size.tail = getSize() - code_offsets.tail;
func_info.prolog_stack_alloc_offset =
code_offsets.prolog_stack_alloc - code_offsets.prolog;
func_info.stack_size = stack_size;
void* fn = Emplace(func_info);
return (GuestToHostThunk)fn;
}
// X64Emitter handles actually resolving functions.
uint64_t ResolveFunction(void* raw_context, uint64_t target_address);
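// Thunk installed as the default indirection target: saves the volatile
// registers, calls ResolveFunction to look up (and JIT if needed) the guest
// function, then tail-jumps into the resolved host code left in rax.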
ResolveFunctionThunk X64ThunkEmitter::EmitResolveFunctionThunk() {
// ebx = target PPC address
// rcx = context
struct _code_offsets {
size_t prolog;
size_t prolog_stack_alloc;
size_t body;
size_t epilog;
size_t tail;
} code_offsets = {};
const size_t stack_size = StackLayout::THUNK_STACK_SIZE;
code_offsets.prolog = getSize();
// rsp + 0 = return address
sub(rsp, stack_size);
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
// Save volatile registers
EmitSaveVolatileRegs();
mov(rcx, rsi); // context
mov(rdx, rbx);
mov(rax, reinterpret_cast<uint64_t>(&ResolveFunction));
call(rax);
EmitLoadVolatileRegs();
code_offsets.epilog = getSize();
add(rsp, stack_size);
jmp(rax);
code_offsets.tail = getSize();
assert_zero(code_offsets.prolog);
EmitFunctionInfo func_info = {};
func_info.code_size.total = getSize();
func_info.code_size.prolog = code_offsets.body - code_offsets.prolog;
func_info.code_size.body = code_offsets.epilog - code_offsets.body;
func_info.code_size.epilog = code_offsets.tail - code_offsets.epilog;
func_info.code_size.tail = getSize() - code_offsets.tail;
func_info.prolog_stack_alloc_offset =
code_offsets.prolog_stack_alloc - code_offsets.prolog;
func_info.stack_size = stack_size;
void* fn = Emplace(func_info);
return (ResolveFunctionThunk)fn;
}
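// Saves the registers that are volatile (caller-saved) across a host call;
// rsi/rdi are only volatile in the SysV ABI, hence the XE_PLATFORM_LINUX
// guard, and rax/xmm0 are left out (see the commented-out lines) since they
// hold the call target/return values.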
void X64ThunkEmitter::EmitSaveVolatileRegs() {
// Save off volatile registers.
// mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax);
mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rcx);
mov(qword[rsp + offsetof(StackLayout::Thunk, r[2])], rdx);
#if XE_PLATFORM_LINUX
mov(qword[rsp + offsetof(StackLayout::Thunk, r[3])], rsi);
mov(qword[rsp + offsetof(StackLayout::Thunk, r[4])], rdi);
#endif
mov(qword[rsp + offsetof(StackLayout::Thunk, r[5])], r8);
mov(qword[rsp + offsetof(StackLayout::Thunk, r[6])], r9);
mov(qword[rsp + offsetof(StackLayout::Thunk, r[7])], r10);
mov(qword[rsp + offsetof(StackLayout::Thunk, r[8])], r11);
// vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[0])], xmm0);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[1])], xmm1);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[2])], xmm2);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[3])], xmm3);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[4])], xmm4);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm5);
}
void X64ThunkEmitter::EmitLoadVolatileRegs() {
// mov(rax, qword[rsp + offsetof(StackLayout::Thunk, r[0])]);
mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[1])]);
mov(rdx, qword[rsp + offsetof(StackLayout::Thunk, r[2])]);
#if XE_PLATFORM_LINUX
mov(rsi, qword[rsp + offsetof(StackLayout::Thunk, r[3])]);
mov(rdi, qword[rsp + offsetof(StackLayout::Thunk, r[4])]);
#endif
mov(r8, qword[rsp + offsetof(StackLayout::Thunk, r[5])]);
mov(r9, qword[rsp + offsetof(StackLayout::Thunk, r[6])]);
mov(r10, qword[rsp + offsetof(StackLayout::Thunk, r[7])]);
mov(r11, qword[rsp + offsetof(StackLayout::Thunk, r[8])]);
// vmovaps(xmm0, qword[rsp + offsetof(StackLayout::Thunk, xmm[0])]);
vmovaps(xmm1, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]);
vmovaps(xmm2, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]);
vmovaps(xmm3, qword[rsp + offsetof(StackLayout::Thunk, xmm[3])]);
vmovaps(xmm4, qword[rsp + offsetof(StackLayout::Thunk, xmm[4])]);
vmovaps(xmm5, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]);
}
void X64ThunkEmitter::EmitSaveNonvolatileRegs() {
mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rbx);
mov(qword[rsp + offsetof(StackLayout::Thunk, r[1])], rbp);
#if XE_PLATFORM_WIN32
mov(qword[rsp + offsetof(StackLayout::Thunk, r[2])], rcx);
mov(qword[rsp + offsetof(StackLayout::Thunk, r[3])], rsi);
mov(qword[rsp + offsetof(StackLayout::Thunk, r[4])], rdi);
#endif
mov(qword[rsp + offsetof(StackLayout::Thunk, r[5])], r12);
mov(qword[rsp + offsetof(StackLayout::Thunk, r[6])], r13);
mov(qword[rsp + offsetof(StackLayout::Thunk, r[7])], r14);
mov(qword[rsp + offsetof(StackLayout::Thunk, r[8])], r15);
// SysV does not have nonvolatile XMM registers.
#if XE_PLATFORM_WIN32
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[0])], xmm6);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[1])], xmm7);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[2])], xmm8);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[3])], xmm9);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[4])], xmm10);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[5])], xmm11);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[6])], xmm12);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[7])], xmm13);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[8])], xmm14);
vmovaps(qword[rsp + offsetof(StackLayout::Thunk, xmm[9])], xmm15);
#endif
}
void X64ThunkEmitter::EmitLoadNonvolatileRegs() {
mov(rbx, qword[rsp + offsetof(StackLayout::Thunk, r[0])]);
mov(rbp, qword[rsp + offsetof(StackLayout::Thunk, r[1])]);
#if XE_PLATFORM_WIN32
mov(rcx, qword[rsp + offsetof(StackLayout::Thunk, r[2])]);
mov(rsi, qword[rsp + offsetof(StackLayout::Thunk, r[3])]);
mov(rdi, qword[rsp + offsetof(StackLayout::Thunk, r[4])]);
#endif
mov(r12, qword[rsp + offsetof(StackLayout::Thunk, r[5])]);
mov(r13, qword[rsp + offsetof(StackLayout::Thunk, r[6])]);
mov(r14, qword[rsp + offsetof(StackLayout::Thunk, r[7])]);
mov(r15, qword[rsp + offsetof(StackLayout::Thunk, r[8])]);
#if XE_PLATFORM_WIN32
vmovaps(xmm6, qword[rsp + offsetof(StackLayout::Thunk, xmm[0])]);
vmovaps(xmm7, qword[rsp + offsetof(StackLayout::Thunk, xmm[1])]);
vmovaps(xmm8, qword[rsp + offsetof(StackLayout::Thunk, xmm[2])]);
vmovaps(xmm9, qword[rsp + offsetof(StackLayout::Thunk, xmm[3])]);
vmovaps(xmm10, qword[rsp + offsetof(StackLayout::Thunk, xmm[4])]);
vmovaps(xmm11, qword[rsp + offsetof(StackLayout::Thunk, xmm[5])]);
vmovaps(xmm12, qword[rsp + offsetof(StackLayout::Thunk, xmm[6])]);
vmovaps(xmm13, qword[rsp + offsetof(StackLayout::Thunk, xmm[7])]);
vmovaps(xmm14, qword[rsp + offsetof(StackLayout::Thunk, xmm[8])]);
vmovaps(xmm15, qword[rsp + offsetof(StackLayout::Thunk, xmm[9])]);
#endif
}
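// Initializes the per-thread X64BackendContext associated with the guest
// PPCContext: the ResolveFunction pointer used by indirection stubs, the
// MXCSR images to swap between FPU and VMX code, and a pointer to the guest
// tick counter.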
void X64Backend::InitializeBackendContext(void* ctx) {
X64BackendContext* bctx = BackendContextForGuestContext(ctx);
bctx->ResolveFunction_Ptr = reinterpret_cast<void*>(&ResolveFunction);
bctx->mxcsr_fpu =
DEFAULT_FPU_MXCSR;  // TODO: verify on real hardware (RGH) what the PPC
// rounding mode is at startup
bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR;
bctx->flags = 0;
// https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
bctx->Ox1000 = 0x1000;
bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
}
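// MXCSR values indexed by the three guest rounding-control bits: the low two
// bits select the rounding mode (nearest, toward zero, +infinity, -infinity,
// matching FPSCR[RN]), and bit 2 additionally sets flush-to-zero (0x8000).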
const uint32_t mxcsr_table[8] = {
0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,
};
void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
X64BackendContext* bctx = BackendContextForGuestContext(ctx);
uint32_t control = mode & 7;
_mm_setcsr(mxcsr_table[control]);
bctx->mxcsr_fpu = mxcsr_table[control];
((ppc::PPCContext*)ctx)->fpscr.bits.rn = control;
}
#if XE_X64_PROFILER_AVAILABLE == 1
uint64_t* X64Backend::GetProfilerRecordForFunction(uint32_t guest_address) {
// who knows, we might want to compile different versions of a function one
// day
auto entry = profiler_data_.find(guest_address);
if (entry != profiler_data_.end()) {
return &entry->second;
} else {
profiler_data_[guest_address] = 0;
return &profiler_data_[guest_address];
}
}
#endif
} // namespace x64
} // namespace backend
} // namespace cpu
} // namespace xe