From 43fd396db7c2d8d933f80822fee0ef186a7fe4be Mon Sep 17 00:00:00 2001
From: disjtqz
Date: Wed, 11 Oct 2023 11:58:15 -0400
Subject: [PATCH] implement dynamically allocatable guest to host callbacks

---
 src/xenia/base/bit_map.cc                |  70 ++++++++-----
 src/xenia/base/bit_map.h                 |   3 +-
 src/xenia/cpu/backend/backend.h          |  67 ++++++++++++-
 src/xenia/cpu/backend/x64/x64_backend.cc | 122 ++++++++++++++++++++---
 src/xenia/cpu/backend/x64/x64_backend.h  |  24 +++++
 5 files changed, 243 insertions(+), 43 deletions(-)

diff --git a/src/xenia/base/bit_map.cc b/src/xenia/base/bit_map.cc
index 0cbb4c2d0..66a7a4923 100644
--- a/src/xenia/base/bit_map.cc
+++ b/src/xenia/base/bit_map.cc
@@ -25,37 +25,57 @@ BitMap::BitMap(uint64_t* data, size_t size_bits) {
   data_.resize(size_bits / kDataSizeBits);
   std::memcpy(data_.data(), data, size_bits / kDataSizeBits);
 }
+inline size_t BitMap::TryAcquireAt(size_t i) {
+  uint64_t entry = 0;
+  uint64_t new_entry = 0;
+  int64_t acquired_idx = -1LL;
+  do {
+    entry = data_[i];
+    uint8_t index = lzcnt(entry);
+    if (index == kDataSizeBits) {
+      // None free.
+      acquired_idx = -1;
+      break;
+    }
+
+    // Entry has a free bit. Acquire it.
+    uint64_t bit = 1ull << (kDataSizeBits - index - 1);
+    new_entry = entry & ~bit;
+    assert_not_zero(entry & bit);
+
+    acquired_idx = index;
+  } while (!atomic_cas(entry, new_entry, &data_[i]));
+
+  if (acquired_idx != -1) {
+    // Acquired.
+    return (i * kDataSizeBits) + acquired_idx;
+  }
+  return -1LL;
+}

 size_t BitMap::Acquire() {
   for (size_t i = 0; i < data_.size(); i++) {
-    uint64_t entry = 0;
-    uint64_t new_entry = 0;
-    int64_t acquired_idx = -1;
-
-    do {
-      entry = data_[i];
-      uint8_t index = lzcnt(entry);
-      if (index == kDataSizeBits) {
-        // None free.
-        acquired_idx = -1;
-        break;
-      }
-
-      // Entry has a free bit. Acquire it.
-      uint64_t bit = 1ull << (kDataSizeBits - index - 1);
-      new_entry = entry & ~bit;
-      assert_not_zero(entry & bit);
-
-      acquired_idx = index;
-    } while (!atomic_cas(entry, new_entry, &data_[i]));
-
-    if (acquired_idx != -1) {
-      // Acquired.
-      return (i * kDataSizeBits) + acquired_idx;
+    size_t attempt_result = TryAcquireAt(i);
+    if (attempt_result != -1LL) {
+      return attempt_result;
     }
   }
-  return -1;
+  return -1LL;
+}
+
+size_t BitMap::AcquireFromBack() {
+  if (!data_.size()) {
+    return -1LL;
+  }
+  for (ptrdiff_t i = data_.size() - 1; i >= 0; i--) {
+    size_t attempt_result = TryAcquireAt(static_cast<size_t>(i));
+    if (attempt_result != -1LL) {
+      return attempt_result;
+    }
+  }
+
+  return -1LL;
 }

 void BitMap::Release(size_t index) {
diff --git a/src/xenia/base/bit_map.h b/src/xenia/base/bit_map.h
index 29bbb3925..0509cff7d 100644
--- a/src/xenia/base/bit_map.h
+++ b/src/xenia/base/bit_map.h
@@ -32,7 +32,7 @@ class BitMap {
   // (threadsafe) Acquires an entry and returns its index. Returns -1 if there
   // are no more free entries.
   size_t Acquire();
-
+  size_t AcquireFromBack();
   // (threadsafe) Releases an entry by an index.
   void Release(size_t index);
@@ -49,6 +49,7 @@ class BitMap {
   const static size_t kDataSize = 8;
   const static size_t kDataSizeBits = kDataSize * 8;
   std::vector<uint64_t> data_;
+  inline size_t TryAcquireAt(size_t i);
 };

 }  // namespace xe
diff --git a/src/xenia/cpu/backend/backend.h b/src/xenia/cpu/backend/backend.h
index 565dde35d..2e247fc55 100644
--- a/src/xenia/cpu/backend/backend.h
+++ b/src/xenia/cpu/backend/backend.h
@@ -38,7 +38,9 @@ struct GuestPseudoStackTrace {
 };
 class Assembler;
 class CodeCache;
-
+using GuestTrampolineProc = void (*)(ppc::PPCContext* context, void* userarg1,
+                                     void* userarg2);
+using SimpleGuestTrampolineProc = void (*)(ppc::PPCContext*);
 class Backend {
  public:
  explicit Backend();
@@ -95,11 +97,74 @@ class Backend {
   virtual bool PopulatePseudoStacktrace(GuestPseudoStackTrace* st) {
     return false;
   }
+
+  virtual uint32_t CreateGuestTrampoline(GuestTrampolineProc proc,
+                                         void* userdata1, void* userdata2,
+                                         bool long_term = false) {
+    return 0;
+  }
+  uint32_t CreateGuestTrampoline(void (*func)(ppc::PPCContext*),
+                                 bool long_term = false) {
+    return CreateGuestTrampoline(
+        reinterpret_cast<GuestTrampolineProc>(reinterpret_cast<void*>(func)),
+        nullptr, nullptr, long_term);
+  }
+  // If long-term, allocate toward the back of the bitset so that short-term
+  // allocations stay fast.
+  uint32_t CreateLongTermGuestTrampoline(void (*func)(ppc::PPCContext*)) {
+    return CreateGuestTrampoline(
+        reinterpret_cast<GuestTrampolineProc>(reinterpret_cast<void*>(func)),
+        nullptr, nullptr, true);
+  }
+  virtual void FreeGuestTrampoline(uint32_t trampoline_addr) {}
+
 protected:
   Processor* processor_ = nullptr;

   MachineInfo machine_info_;
   CodeCache* code_cache_ = nullptr;
 };
+/*
+ * A set of guest trampolines with shared ownership: one trampoline per unique
+ * callback, cached in the map and freed together when the group is destroyed.
+ */
+struct GuestTrampolineGroup
+    : public std::map<SimpleGuestTrampolineProc, uint32_t> {
+  Backend* const m_backend;
+  xe_mutex m_mutex;
+
+  uint32_t _NewTrampoline(SimpleGuestTrampolineProc proc, bool longterm) {
+    uint32_t result;
+    m_mutex.lock();
+    auto iter = this->find(proc);
+    if (iter == this->end()) {
+      uint32_t new_entry = longterm
+                               ? m_backend->CreateLongTermGuestTrampoline(proc)
+                               : m_backend->CreateGuestTrampoline(proc);
+      this->emplace_hint(iter, proc, new_entry);
+      result = new_entry;
+    } else {
+      result = iter->second;
+    }
+    m_mutex.unlock();
+    return result;
+  }
+
+ public:
+  GuestTrampolineGroup(Backend* backend) : m_backend(backend) {}
+  ~GuestTrampolineGroup() {
+    m_mutex.lock();
+    for (auto&& entry : *this) {
+      m_backend->FreeGuestTrampoline(entry.second);
+    }
+    m_mutex.unlock();
+  }
+
+  uint32_t NewLongtermTrampoline(SimpleGuestTrampolineProc proc) {
+    return _NewTrampoline(proc, true);
+  }
+  uint32_t NewTrampoline(SimpleGuestTrampolineProc proc) {
+    return _NewTrampoline(proc, false);
+  }
+};
 }  // namespace backend
 }  // namespace cpu
diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc
index 1f525c38e..53c54b1b0 100644
--- a/src/xenia/cpu/backend/x64/x64_backend.cc
+++ b/src/xenia/cpu/backend/x64/x64_backend.cc
@@ -90,6 +90,25 @@ class X64HelperEmitter : public X64Emitter {
   void EmitLoadNonvolatileRegs();
 };

+#if XE_PLATFORM_WIN32
+static constexpr unsigned char guest_trampoline_template[] = {
+    0x48, 0xBA, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x49,
+    0xB8, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48, 0xB9,
+    0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48, 0xB8, 0x99,
+    0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0xFF, 0xE0};
+
+#else
+// SysV x64 ABI; the argument immediates sit at the same offsets as on Win32.
+static constexpr unsigned char guest_trampoline_template[] = {
+    0x48, 0xBF, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48,
+    0xBE, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48, 0xB9,
+    0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0x48, 0xB8, 0x99,
+    0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x00, 0xFF, 0xE0};
+#endif
+static constexpr uint32_t guest_trampoline_template_offset_arg1 = 2,
+                          guest_trampoline_template_offset_arg2 = 0xC,
+                          guest_trampoline_template_offset_rcx = 0x16,
+                          guest_trampoline_template_offset_rax = 0x20;
 X64Backend::X64Backend() : Backend(), code_cache_(nullptr) {
   if (cs_open(CS_ARCH_X86, CS_MODE_64, &capstone_handle_) != CS_ERR_OK) {
     assert_always("Failed to initialize capstone");
   }
@@ -97,6 +116,23 @@ X64Backend::X64Backend() : Backend(), code_cache_(nullptr) {
   cs_option(capstone_handle_, CS_OPT_SYNTAX, CS_OPT_SYNTAX_INTEL);
   cs_option(capstone_handle_, CS_OPT_DETAIL, CS_OPT_ON);
   cs_option(capstone_handle_, CS_OPT_SKIPDATA, CS_OPT_OFF);
+  uint32_t base_address = 0x10000;
+  void* buf_trampoline_code = nullptr;
+  while (base_address < 0x80000000) {
+    buf_trampoline_code = memory::AllocFixed(
+        (void*)(uintptr_t)base_address,
+        sizeof(guest_trampoline_template) * MAX_GUEST_TRAMPOLINES,
+        xe::memory::AllocationType::kReserveCommit,
+        xe::memory::PageAccess::kExecuteReadWrite);
+    if (!buf_trampoline_code) {
+      base_address += 65536;
+    } else {
+      break;
+    }
+  }
+  xenia_assert(buf_trampoline_code);
+  guest_trampoline_memory_ = (uint8_t*)buf_trampoline_code;
+  guest_trampoline_address_bitmap_.Resize(MAX_GUEST_TRAMPOLINES);
 }

 X64Backend::~X64Backend() {
@@ -106,6 +142,13 @@ X64Backend::~X64Backend() {
   X64Emitter::FreeConstData(emitter_data_);
   ExceptionHandler::Uninstall(&ExceptionCallbackThunk, this);
+  if (guest_trampoline_memory_) {
+    memory::DeallocFixed(
+        guest_trampoline_memory_,
+        sizeof(guest_trampoline_template) * MAX_GUEST_TRAMPOLINES,
+        memory::DeallocationType::kRelease);
+    guest_trampoline_memory_ = nullptr;
+  }
 }

 static void ForwardMMIOAccessForRecording(void* context, void* hostaddr) {
@@ -212,6 +255,9 @@ bool X64Backend::Initialize(Processor* processor) {
   if (!code_cache_->Initialize()) {
     return false;
   }
+  // HV range: commit the guest trampoline indirection range up front.
+  code_cache()->CommitExecutableRange(GUEST_TRAMPOLINE_BASE,
+                                      GUEST_TRAMPOLINE_END);

   // Allocate emitter constant data.
   emitter_data_ = X64Emitter::PlaceConstData();
@@ -241,7 +287,8 @@ bool X64Backend::Initialize(Processor* processor) {
   reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false);
   reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);
   vrsqrtefp_scalar_helper = thunk_emitter.EmitScalarVRsqrteHelper();
-  vrsqrtefp_vector_helper = thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper);
+  vrsqrtefp_vector_helper =
+      thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper);
   frsqrtefp_helper = thunk_emitter.EmitFrsqrteHelper();
   // Set the code cache to use the ResolveFunction thunk for default
   // indirections.
@@ -850,7 +897,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
   _code_offsets code_offsets = {};
   code_offsets.prolog = getSize();
   pop(r8);  // return address
-
+
   switch (stack_element_size) {
     case 4:
       mov(r11d, ptr[r8]);
@@ -919,11 +966,11 @@ void* X64HelperEmitter::EmitScalarVRsqrteHelper() {
   bt(GetBackendFlagsPtr(), kX64BackendNJMOn);
   jnc(handle_denormal_input, CodeGenerator::T_NEAR);
-
-  // handle denormal input with NJM on
+
+  // handle denormal input with NJM on
   // denorms get converted to zero w/ input sign, jump to our label
   // that handles inputs of 0 for this
-
+
   jmp(convert_to_signed_inf_and_ret);

   L(L35);
@@ -1038,7 +1085,6 @@ void* X64HelperEmitter::EmitScalarVRsqrteHelper() {

   L(L1);
   ret();
-
   L(handle_denormal_input);
   mov(r9d, r8d);
   and_(r9d, 0x7FFFFFFF);
@@ -1089,7 +1135,6 @@ void* X64HelperEmitter::EmitScalarVRsqrteHelper() {
   dd(0x7FC00000);
   dd(0x5F34FD00);
-
   code_offsets.prolog_stack_alloc = getSize();
   code_offsets.body = getSize();
   code_offsets.prolog = getSize();
@@ -1126,18 +1171,16 @@ void* X64HelperEmitter::EmitVectorVRsqrteHelper(void* scalar_helper) {
   jnz(actual_vector_version);
   vshufps(xmm0, xmm0,xmm0, _MM_SHUFFLE(3, 3, 3, 3));
   call(scalar_helper);
-  // this->DebugBreak();
+  // this->DebugBreak();
   vinsertps(xmm0, xmm0, (3 << 4) | (0 << 6));
   vblendps(xmm0, xmm0, ptr[backend()->LookupXMMConstantAddress(XMMFloatInf)],
            0b0111);
-
+
   ret();
-
   L(actual_vector_version);
-
   xor_(ecx, ecx);
   vmovaps(result_ptr, xmm0);
@@ -1172,7 +1215,7 @@ void* X64HelperEmitter::EmitFrsqrteHelper() {
   code_offsets.epilog = getSize();
   code_offsets.tail = getSize();
   code_offsets.prolog = getSize();
-
+
   Xbyak::Label L2, L7, L6, L9, L1, L12, L24, L3, L25, frsqrte_table2, LC1;
   bt(GetBackendFlagsPtr(), kX64BackendNonIEEEMode);
   vmovq(rax, xmm0);
@@ -1190,7 +1233,7 @@ void* X64HelperEmitter::EmitFrsqrteHelper() {
     not_(rcx);
     and_(rcx, rdx);
   }
-
+
   jne(L6);
   cmp(rax, rdx);
   je(L1, CodeGenerator::T_NEAR);
@@ -1199,7 +1242,7 @@
   jne(L7);
   vcomisd(xmm0, xmm1);
   jb(L12, CodeGenerator::T_NEAR);
-
+
   L(L7);
   mov(rdx, 0x7ff8000000000000ULL);
   or_(rax, rdx);
@@ -1236,7 +1279,7 @@
   sal(rax, 44);
   or_(rax, rdx);
   vmovq(xmm1, rax);
-
+
   L(L1);
   vmovapd(xmm0, xmm1);
   ret();
@@ -1255,7 +1298,7 @@
   jne(L2);
   mov(rdx, 0x8000000000000000ULL);
   and_(rax, rdx);
-
+
   L(L3);
   mov(rdx, 0x8000000000000000ULL);
   and_(rax, rdx);
@@ -1617,6 +1660,53 @@ uint64_t* X64Backend::GetProfilerRecordForFunction(uint32_t guest_address) {
 }
 #endif
+
+// TODO: flush the instruction cache after writing the trampoline bytes.
+uint32_t X64Backend::CreateGuestTrampoline(GuestTrampolineProc proc,
+                                           void* userdata1, void* userdata2,
+                                           bool longterm) {
+  size_t new_index;
+  if (longterm) {
+    new_index = guest_trampoline_address_bitmap_.AcquireFromBack();
+  } else {
+    new_index = guest_trampoline_address_bitmap_.Acquire();
+  }
+
+  xenia_assert(new_index != (size_t)-1);
+
+  uint8_t* write_pos =
+      &guest_trampoline_memory_[sizeof(guest_trampoline_template) * new_index];
+
+  memcpy(write_pos, guest_trampoline_template,
+         sizeof(guest_trampoline_template));
+
+  *reinterpret_cast<void**>(&write_pos[guest_trampoline_template_offset_arg1]) =
+      userdata1;
+  *reinterpret_cast<void**>(&write_pos[guest_trampoline_template_offset_arg2]) =
+      userdata2;
+  *reinterpret_cast<GuestTrampolineProc*>(
+      &write_pos[guest_trampoline_template_offset_rcx]) = proc;
+  *reinterpret_cast<GuestToHostThunk*>(
+      &write_pos[guest_trampoline_template_offset_rax]) = guest_to_host_thunk_;
+
+  uint32_t indirection_guest_addr =
+      GUEST_TRAMPOLINE_BASE +
+      (static_cast<uint32_t>(new_index) * GUEST_TRAMPOLINE_MIN_LEN);
+
+  code_cache()->AddIndirection(
+      indirection_guest_addr,
+      static_cast<uint32_t>(reinterpret_cast<uintptr_t>(write_pos)));
+
+  return indirection_guest_addr;
+}
+
+void X64Backend::FreeGuestTrampoline(uint32_t trampoline_addr) {
+  xenia_assert(trampoline_addr >= GUEST_TRAMPOLINE_BASE &&
+               trampoline_addr < GUEST_TRAMPOLINE_END);
+  size_t index =
+      (trampoline_addr - GUEST_TRAMPOLINE_BASE) / GUEST_TRAMPOLINE_MIN_LEN;
+  guest_trampoline_address_bitmap_.Release(index);
+}
 }  // namespace x64
 }  // namespace backend
 }  // namespace cpu
diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h
index a06d18b4a..665b337b0 100644
--- a/src/xenia/cpu/backend/x64/x64_backend.h
+++ b/src/xenia/cpu/backend/x64/x64_backend.h
@@ -13,6 +13,7 @@
 #include
 #include "xenia/base/cvar.h"
+#include "xenia/base/bit_map.h"
 #include "xenia/cpu/backend/backend.h"

 #if XE_PLATFORM_WIN32 == 1
@@ -42,6 +43,19 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
 typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
 typedef void (*ResolveFunctionThunk)();

+/*
+  Place guest trampolines in the memory range that the HV normally occupies.
+  This way guests can call in via the indirection table, and we do not have
+  to clobber or reuse an existing memory range. The xboxkrnl range is already
+  used by export trampolines (see kernel/kernel_module.cc).
+*/
+static constexpr uint32_t GUEST_TRAMPOLINE_BASE = 0x80000000;
+static constexpr uint32_t GUEST_TRAMPOLINE_END = 0x80040000;
+
+static constexpr uint32_t GUEST_TRAMPOLINE_MIN_LEN = 8;
+
+static constexpr uint32_t MAX_GUEST_TRAMPOLINES =
+    (GUEST_TRAMPOLINE_END - GUEST_TRAMPOLINE_BASE) / GUEST_TRAMPOLINE_MIN_LEN;
+
 #define RESERVE_BLOCK_SHIFT 16

 #define RESERVE_NUM_ENTRIES \
@@ -155,6 +169,11 @@ class X64Backend : public Backend {
     return reinterpret_cast<X64BackendContext*>(
         reinterpret_cast<uintptr_t>(ctx) - sizeof(X64BackendContext));
   }
+  virtual uint32_t CreateGuestTrampoline(GuestTrampolineProc proc,
+                                         void* userdata1, void* userdata2,
+                                         bool long_term) override;
+
+  virtual void FreeGuestTrampoline(uint32_t trampoline_addr) override;
   virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
   virtual bool PopulatePseudoStacktrace(GuestPseudoStackTrace* st) override;
   void RecordMMIOExceptionForGuestInstruction(void* host_address);
@@ -200,6 +219,11 @@ class X64Backend : public Backend {
 #endif
   alignas(64) ReserveHelper reserve_helper_;
+  // Allocates 8-byte-aligned addresses in a guest address range that is
+  // normally not executable; calls into that range dispatch to host code.
+  BitMap guest_trampoline_address_bitmap_;
+  uint8_t* guest_trampoline_memory_;
 };

 }  // namespace x64
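Note for reviewers (not part of the diff): the Win32 guest_trampoline_template
decodes to the following x64 sequence. Each 99 88 77 66 55 44 33 00 run is an
imm64 placeholder that X64Backend::CreateGuestTrampoline() patches in place:

    48 BA imm64    mov rdx, userdata1             ; offset_arg1 == 0x02
    49 B8 imm64    mov r8,  userdata2             ; offset_arg2 == 0x0C
    48 B9 imm64    mov rcx, proc                  ; offset_rcx  == 0x16
    48 B8 imm64    mov rax, guest_to_host_thunk_  ; offset_rax  == 0x20
    FF E0          jmp rax

Under the Win64 convention this lines up with GuestToHostThunk(void* target,
void* arg0, void* arg1): rcx = target = proc, rdx = arg0, r8 = arg1. The SysV
template differs only in its first two opcodes (48 BF = mov rdi, 48 BE =
mov rsi), so the immediate offsets are byte-for-byte identical, which is why a
single set of offset constants serves both builds.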
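Usage sketch (hypothetical, not part of the patch): how a caller holding a
Backend* might expose a host function at a guest-callable address. MyHostHook
and InstallHostHook are illustrative names only:

    // Illustrative only; these names do not exist in the tree.
    #include "xenia/cpu/backend/backend.h"

    // Runs on the host whenever a guest thread calls the returned address.
    static void MyHostHook(xe::cpu::ppc::PPCContext* context) {
      // Inspect or modify guest state through `context` here.
    }

    uint32_t InstallHostHook(xe::cpu::backend::Backend* backend) {
      // Short-term trampoline, allocated from the front of the bitmap.
      uint32_t guest_addr = backend->CreateGuestTrampoline(MyHostHook);
      // ... publish guest_addr into a guest-visible function pointer ...
      // Later, once the guest can no longer reach the callback:
      //   backend->FreeGuestTrampoline(guest_addr);
      return guest_addr;
    }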
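Where several callers may register the same callback, GuestTrampolineGroup
deduplicates: one trampoline per unique proc, cached in the map and freed
together in the destructor. A sketch under the same assumptions as above:

    // Illustrative only.
    xe::cpu::backend::GuestTrampolineGroup group(backend);

    uint32_t a = group.NewTrampoline(MyHostHook);
    uint32_t b = group.NewTrampoline(MyHostHook);
    // a == b: the second call returns the cached guest address.

    // Long-term entries come from the back of the trampoline bitmap so that
    // short-term allocations near the front stay fast.
    uint32_t c = group.NewLongtermTrampoline(MyOtherHook);  // hypothetical hook
    // Everything is freed in ~GuestTrampolineGroup().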
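On the BitMap side, AcquireFromBack() walks the 64-bit words last-to-first but
still takes the highest free bit within a word, so "back" allocations land at
the start of the last free word rather than at the very last index. A small
illustration, assuming Resize() marks every entry free (which the backend's
constructor usage implies):

    // Illustrative only.
    xe::BitMap bitmap;
    bitmap.Resize(128);  // two 64-bit words, all free (assumed)
    size_t front = bitmap.Acquire();         // 0: highest bit of word 0
    size_t back = bitmap.AcquireFromBack();  // 64: highest bit of word 1
    bitmap.Release(front);
    bitmap.Release(back);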