diff --git a/.gitmodules b/.gitmodules
index a73061e22..3e780194f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -48,7 +48,7 @@
 	url = https://github.com/fmtlib/fmt.git
 [submodule "third_party/disruptorplus"]
 	path = third_party/disruptorplus
-	url = https://github.com/xenia-project/disruptorplus.git
+	url = https://github.com/chrisps/disruptorpus.git
 [submodule "third_party/DirectXShaderCompiler"]
 	path = third_party/DirectXShaderCompiler
 	url = https://github.com/microsoft/DirectXShaderCompiler.git
@@ -63,7 +63,7 @@
 	url = https://github.com/Cyan4973/xxHash.git
 [submodule "third_party/FFmpeg"]
 	path = third_party/FFmpeg
-	url = https://github.com/xenia-project/FFmpeg.git
+	url = https://github.com/chrisps/FFmpeg_radixsplit.git
 [submodule "third_party/premake-androidndk"]
 	path = third_party/premake-androidndk
 	url = https://github.com/Triang3l/premake-androidndk.git
diff --git a/src/xenia/base/byte_order.h b/src/xenia/base/byte_order.h
index 1a3c63b2f..5a076f319 100644
--- a/src/xenia/base/byte_order.h
+++ b/src/xenia/base/byte_order.h
@@ -46,7 +46,9 @@ static_assert((std::endian::native == std::endian::big) ||
 namespace xe {
 
-#if XE_COMPILER_MSVC
+// chrispy: added workaround for clang; otherwise _byteswap_ulong becomes
+// calls into ucrtbase
+#if XE_COMPILER_MSVC == 1 && !defined(__clang__)
 #define XENIA_BASE_BYTE_SWAP_16 _byteswap_ushort
 #define XENIA_BASE_BYTE_SWAP_32 _byteswap_ulong
 #define XENIA_BASE_BYTE_SWAP_64 _byteswap_uint64
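Review note (hedged sketch, not part of the patch): what the byte_order.h guard above selects between. Under MSVC proper the _byteswap_* intrinsics compile down to single bswap instructions, but under clang-cl they can lower to real calls into ucrtbase, while clang's __builtin_bswap32 is always an intrinsic. A stand-alone equivalent:

    #include <cstdint>
    #include <cstdlib>  // _byteswap_ulong on MSVC

    inline uint32_t byte_swap32(uint32_t v) {
    #if defined(_MSC_VER) && !defined(__clang__)
      return _byteswap_ulong(v);    // MSVC intrinsic
    #else
      return __builtin_bswap32(v);  // clang/gcc builtin, one bswap instruction
    #endif
    }
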
diff --git a/src/xenia/base/mapped_memory_win.cc b/src/xenia/base/mapped_memory_win.cc
index f0af5ee7b..6e12954b6 100644
--- a/src/xenia/base/mapped_memory_win.cc
+++ b/src/xenia/base/mapped_memory_win.cc
@@ -28,7 +28,8 @@ namespace xe {
 class Win32MappedMemory : public MappedMemory {
  public:
   // CreateFile returns INVALID_HANDLE_VALUE in case of failure.
-  static constexpr HANDLE kFileHandleInvalid = INVALID_HANDLE_VALUE;
+  // chrispy: made inline const to get around a clang error
+  static inline const HANDLE kFileHandleInvalid = INVALID_HANDLE_VALUE;
   // CreateFileMapping returns nullptr in case of failure.
   static constexpr HANDLE kMappingHandleInvalid = nullptr;
diff --git a/src/xenia/base/memory_win.cc b/src/xenia/base/memory_win.cc
index cbed1b362..807e3911c 100644
--- a/src/xenia/base/memory_win.cc
+++ b/src/xenia/base/memory_win.cc
@@ -15,7 +15,15 @@
     WINAPI_PARTITION_SYSTEM | WINAPI_PARTITION_GAMES)
 #define XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
 #endif
-
+/*
+  these two don't bypass much MS garbage compared to the threading ones, but
+  Protect is used by PhysicalHeap::EnableAccessCallbacks, which eats a lot of
+  cpu time, so every bit counts
+*/
+XE_NTDLL_IMPORT(NtProtectVirtualMemory, cls_NtProtectVirtualMemory,
+                NtProtectVirtualMemoryPointer);
+XE_NTDLL_IMPORT(NtQueryVirtualMemory, cls_NtQueryVirtualMemory,
+                NtQueryVirtualMemoryPointer);
 namespace xe {
 namespace memory {
@@ -139,6 +147,18 @@ bool Protect(void* base_address, size_t length, PageAccess access,
     *out_old_access = PageAccess::kNoAccess;
   }
   DWORD new_protect = ToWin32ProtectFlags(access);
+
+#if XE_USE_NTDLL_FUNCTIONS == 1
+
+  DWORD old_protect = 0;
+  SIZE_T MemoryLength = length;
+  PVOID MemoryCache = base_address;
+
+  BOOL result = NtProtectVirtualMemoryPointer.invoke(
+                    (HANDLE)0xFFFFFFFFFFFFFFFFLL, &MemoryCache, &MemoryLength,
+                    new_protect, &old_protect) >= 0;
+
+#else
 #ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
   DWORD old_protect = 0;
   BOOL result = VirtualProtect(base_address, length, new_protect, &old_protect);
@@ -146,6 +166,7 @@ bool Protect(void* base_address, size_t length, PageAccess access,
   ULONG old_protect = 0;
   BOOL result = VirtualProtectFromApp(base_address, length,
                                       ULONG(new_protect), &old_protect);
+#endif
 #endif
   if (!result) {
     return false;
@@ -161,8 +182,17 @@ bool QueryProtect(void* base_address, size_t& length, PageAccess& access_out) {
   MEMORY_BASIC_INFORMATION info;
   ZeroMemory(&info, sizeof(info));
+#if XE_USE_NTDLL_FUNCTIONS == 1
+  ULONG_PTR ResultLength;
+  NTSTATUS query_result = NtQueryVirtualMemoryPointer.invoke(
+      (HANDLE)0xFFFFFFFFFFFFFFFFLL, (PVOID)base_address,
+      0 /* MemoryBasicInformation */, &info, length, &ResultLength);
+  SIZE_T result = query_result >= 0 ? ResultLength : 0;
+#else
   SIZE_T result = VirtualQuery(base_address, &info, length);
+
+#endif
   if (!result) {
     return false;
   }
diff --git a/src/xenia/base/mutex.cc b/src/xenia/base/mutex.cc
index 80bdb8411..322985594 100644
--- a/src/xenia/base/mutex.cc
+++ b/src/xenia/base/mutex.cc
@@ -10,10 +10,9 @@
 #include "xenia/base/mutex.h"
 
 namespace xe {
-
-std::recursive_mutex& global_critical_region::mutex() {
-  static std::recursive_mutex global_mutex;
-  return global_mutex;
-}
+// chrispy: moved this out of the function body to eliminate the
+// initialization guards
+static std::recursive_mutex global_mutex;
+std::recursive_mutex& global_critical_region::mutex() { return global_mutex; }
 
 }  // namespace xe
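Review note (hedged sketch): what the mutex.cc change above eliminates. A function-local static is lazily initialized, so every call re-checks a guard (__cxa_guard_acquire or the MSVC equivalent); a namespace-scope static is initialized during static initialization and the accessor becomes a bare address load. Illustrative only:

    #include <mutex>

    std::recursive_mutex& slow_mutex() {
      static std::recursive_mutex m;  // compiler emits a thread-safe
      return m;                       // "already initialized?" check per call
    }

    static std::recursive_mutex g_mutex;  // no per-call guard needed
    std::recursive_mutex& fast_mutex() { return g_mutex; }  // plain load
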
diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h
index 439d0c467..6d1a6d5f9 100644
--- a/src/xenia/base/platform.h
+++ b/src/xenia/base/platform.h
@@ -41,19 +41,33 @@
 #error Unsupported target OS.
 #endif
 
-#if defined(__clang__)
+#if defined(__clang__) && !defined(_MSC_VER)  // chrispy: support clang-cl
 #define XE_COMPILER_CLANG 1
+#define XE_COMPILER_HAS_CLANG_EXTENSIONS 1
 #elif defined(__GNUC__)
 #define XE_COMPILER_GNUC 1
+#define XE_COMPILER_HAS_GNU_EXTENSIONS 1
 #elif defined(_MSC_VER)
 #define XE_COMPILER_MSVC 1
+#define XE_COMPILER_HAS_MSVC_EXTENSIONS 1
 #elif defined(__MINGW32)
 #define XE_COMPILER_MINGW32 1
+#define XE_COMPILER_HAS_GNU_EXTENSIONS 1
 #elif defined(__INTEL_COMPILER)
 #define XE_COMPILER_INTEL 1
 #else
 #define XE_COMPILER_UNKNOWN 1
 #endif
 
+// chrispy: had to place this here.
+#if defined(__clang__) && defined(_MSC_VER)
+#define XE_COMPILER_CLANG_CL 1
+#define XE_COMPILER_HAS_CLANG_EXTENSIONS 1
+#endif
+
+// clang extensions == superset of gnu extensions
+#if XE_COMPILER_HAS_CLANG_EXTENSIONS == 1
+#define XE_COMPILER_HAS_GNU_EXTENSIONS 1
+#endif
+
 #if defined(_M_AMD64) || defined(__amd64__)
 #define XE_ARCH_AMD64 1
@@ -93,6 +107,29 @@
 #define XEPACKEDSTRUCTANONYMOUS(value) _XEPACKEDSCOPE(struct value)
 #define XEPACKEDUNION(name, value) _XEPACKEDSCOPE(union name value)
 
+#if XE_COMPILER_HAS_MSVC_EXTENSIONS == 1
+#define XE_FORCEINLINE __forceinline
+#define XE_NOINLINE __declspec(noinline)
+// can't properly emulate "cold" in msvc, but can still segregate the function
+// into its own seg
+#define XE_COLD __declspec(code_seg(".cold"))
+#define XE_LIKELY(...) (!!(__VA_ARGS__))
+#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
+
+#elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
+#define XE_FORCEINLINE __attribute__((always_inline))
+#define XE_NOINLINE __attribute__((noinline))
+#define XE_COLD __attribute__((cold))
+#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
+#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
+#else
+#define XE_FORCEINLINE inline
+#define XE_NOINLINE
+#define XE_COLD
+#define XE_LIKELY(...) (!!(__VA_ARGS__))
+#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
+#endif
+
 namespace xe {
 
 #if XE_PLATFORM_WIN32
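Review note (hedged sketch): how the new annotation macros above are meant to be used at call sites. XE_LIKELY/XE_UNLIKELY wrap a condition (lowering to __builtin_expect where GNU extensions exist, a plain truth test elsewhere), and XE_NOINLINE/XE_COLD push a rarely executed path out of the hot code. Hypothetical usage:

    static XE_NOINLINE XE_COLD void ReportBadValue(int value) {
      // rarely executed; segregated into the .cold segment under MSVC
    }

    int ProcessValue(int value) {
      if (XE_UNLIKELY(value < 0)) {  // hint: branch almost never taken
        ReportBadValue(value);
        return 0;
      }
      return value * 2;
    }
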
diff --git a/src/xenia/base/platform_win.h b/src/xenia/base/platform_win.h
index 22cad5d93..3013a9c14 100644
--- a/src/xenia/base/platform_win.h
+++ b/src/xenia/base/platform_win.h
@@ -34,4 +34,31 @@
 #undef DeleteFile
 #undef GetFirstChild
 
+#define XE_USE_NTDLL_FUNCTIONS 1
+#if XE_USE_NTDLL_FUNCTIONS == 1
+/*
+  ntdll versions of functions often skip through a lot of extra garbage in
+  KernelBase
+*/
+#define XE_NTDLL_IMPORT(name, cls, clsvar)                        \
+  static class cls {                                              \
+   public:                                                        \
+    FARPROC fn;                                                   \
+    cls() : fn(nullptr) {                                         \
+      auto ntdll = GetModuleHandleA("ntdll.dll");                 \
+      if (ntdll) {                                                \
+        fn = GetProcAddress(ntdll, #name);                        \
+      }                                                           \
+    }                                                             \
+    template <typename TRet = NTSTATUS, typename... TArgs>        \
+    inline TRet invoke(TArgs... args) {                           \
+      return reinterpret_cast<TRet (*)(TArgs...)>(fn)(args...);   \
+    }                                                             \
+    inline operator bool() const { return fn != nullptr; }        \
+  } clsvar
+#else
+#define XE_NTDLL_IMPORT(name, cls, clsvar) static constexpr bool clsvar = false
+
+#endif
 #endif  // XENIA_BASE_PLATFORM_WIN_H_
diff --git a/src/xenia/base/profiling.h b/src/xenia/base/profiling.h
index 0e45b6cc2..b754bcf31 100644
--- a/src/xenia/base/profiling.h
+++ b/src/xenia/base/profiling.h
@@ -20,7 +20,7 @@
 #include "xenia/ui/virtual_key.h"
 #include "xenia/ui/window_listener.h"
 
-#if XE_PLATFORM_WIN32
+#if XE_PLATFORM_WIN32 && 0
 #define XE_OPTION_PROFILING 1
 #define XE_OPTION_PROFILING_UI 1
 #else
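Review note (hedged sketch): what an XE_NTDLL_IMPORT expansion from platform_win.h above gives its users. The static object resolves the ntdll export once during static initialization; invoke() casts the stored FARPROC to a function taking the supplied argument types, with NTSTATUS as the default return type; operator bool reports whether the export was found. Usage mirroring the patch:

    XE_NTDLL_IMPORT(NtYieldExecution, cls_NtYieldExecution,
                    NtYieldExecutionPointer);

    void YieldOnce() {
      if (NtYieldExecutionPointer) {       // export was resolved?
        NtYieldExecutionPointer.invoke();  // NTSTATUS NtYieldExecution(void)
      }
    }
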
diff --git a/src/xenia/base/ring_buffer.h b/src/xenia/base/ring_buffer.h
index 3165d6b7d..9925622de 100644
--- a/src/xenia/base/ring_buffer.h
+++ b/src/xenia/base/ring_buffer.h
@@ -19,7 +19,26 @@
 #include "xenia/base/byte_order.h"
 
 namespace xe {
+/*
+  todo: this class is CRITICAL to the performance of the entire emulator.
+  currently, about 0.74% of cpu time is still taken up by ReadAndSwap, and
+  0.23% is used by read_count. I believe that part of the issue is that
+  smaller ringbuffers are kicking off an automatic prefetcher stream that
+  ends up reading ahead of the end of the ring, because it can only go in a
+  straight line; it then gets a cache miss when it eventually wraps around
+  to the start of the ring? really hard to tell what's going on there
+  honestly. maybe we can occasionally prefetch the first line of the ring to
+  L1? for the automatic prefetching I don't think there are any good
+  options. I don't know if we have any control over where these buffers will
+  be (they seem to be in guest memory :/), but if we did we could
+  right-justify the buffer so that the final byte of the ring ends at the
+  end of a page; I think most automatic prefetchers cannot cross page
+  boundaries. it does feel like something isn't right here though.
+  todo: microoptimization, we can change our size members to be uint32 so
+  that the registers no longer need the rex prefix, shrinking the generated
+  code a bit.. like I said, every bit helps in this class
+*/
 class RingBuffer {
  public:
  RingBuffer(uint8_t* buffer, size_t capacity);
@@ -32,6 +51,8 @@ class RingBuffer {
   uintptr_t read_ptr() const { return uintptr_t(buffer_) + read_offset_; }
   void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; }
   size_t read_count() const {
+// chrispy: these branches are unpredictable
+#if 0
     if (read_offset_ == write_offset_) {
       return 0;
     } else if (read_offset_ < write_offset_) {
@@ -39,6 +60,33 @@ class RingBuffer {
     } else {
       return (capacity_ - read_offset_) + write_offset_;
     }
+#else
+    size_t read_offs = read_offset_;
+    size_t write_offs = write_offset_;
+    size_t cap = capacity_;
+
+    size_t offset_delta = write_offs - read_offs;
+    size_t wrap_read_count = (cap - read_offs) + write_offs;
+
+    size_t comparison_value = read_offs <= write_offs;
+#if 0
+    size_t selector =
+        static_cast<size_t>(-static_cast<ptrdiff_t>(comparison_value));
+    offset_delta &= selector;
+
+    wrap_read_count &= ~selector;
+    return offset_delta | wrap_read_count;
+#else
+
+    if (XE_LIKELY(read_offs <= write_offs)) {
+      return offset_delta;  // will be 0 if they are equal, semantically
+                            // identical to old code (i checked the asm, msvc
+                            // does not automatically do this)
+    } else {
+      return wrap_read_count;
+    }
+#endif
+#endif
   }
 
   size_t write_offset() const { return write_offset_; }
@@ -113,6 +161,28 @@ class RingBuffer {
   size_t write_offset_ = 0;
 };
 
+template <>
+inline uint32_t RingBuffer::ReadAndSwap<uint32_t>() {
+  size_t read_offset = this->read_offset_;
+  xenia_assert(this->capacity_ >= 4);
+
+  size_t next_read_offset = read_offset + 4;
+#if 0
+  size_t zerotest = next_read_offset - this->capacity_;
+  // unpredictable branch, use bit arith instead
+  // todo: it would be faster to use lzcnt, but we need to figure out if all
+  // machines we support support it
+  next_read_offset &= -static_cast<size_t>(!!zerotest);
+#else
+  if (XE_UNLIKELY(next_read_offset == this->capacity_)) {
+    next_read_offset = 0;
+    // todo: maybe prefetch next? or should that happen much earlier?
+  }
+#endif
+  this->read_offset_ = next_read_offset;
+  unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset];
+  return xe::byte_swap(ring_value);
+}
 }  // namespace xe
 
 #endif  // XENIA_BASE_RING_BUFFER_H_
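Review note (hedged sketch): the branchless select the disabled #if 0 path in read_count() above encodes, as a stand-alone function. `read_offs <= write_offs` yields 0 or 1; negating it produces an all-zeros or all-ones mask, so the two candidate results can be merged without a branch (the enabled path instead leaves the choice to the compiler, which typically emits cmov):

    #include <cstddef>

    inline std::size_t ring_read_count(std::size_t read_offs,
                                       std::size_t write_offs,
                                       std::size_t cap) {
      std::size_t offset_delta = write_offs - read_offs;             // no wrap
      std::size_t wrap_read_count = (cap - read_offs) + write_offs;  // wrapped
      std::size_t mask = static_cast<std::size_t>(
          -static_cast<std::ptrdiff_t>(read_offs <= write_offs));
      return (offset_delta & mask) | (wrap_read_count & ~mask);
    }
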
diff --git a/src/xenia/base/threading_timer_queue.cc b/src/xenia/base/threading_timer_queue.cc
index 79546b9d6..7d6e612cf 100644
--- a/src/xenia/base/threading_timer_queue.cc
+++ b/src/xenia/base/threading_timer_queue.cc
@@ -10,12 +10,12 @@
 #include
 #include
+#include "third_party/disruptorplus/include/disruptorplus/blocking_wait_strategy.hpp"
 #include "third_party/disruptorplus/include/disruptorplus/multi_threaded_claim_strategy.hpp"
 #include "third_party/disruptorplus/include/disruptorplus/ring_buffer.hpp"
 #include "third_party/disruptorplus/include/disruptorplus/sequence_barrier.hpp"
 #include "third_party/disruptorplus/include/disruptorplus/spin_wait.hpp"
 #include "third_party/disruptorplus/include/disruptorplus/spin_wait_strategy.hpp"
-
 #include "xenia/base/assert.h"
 #include "xenia/base/threading.h"
 #include "xenia/base/threading_timer_queue.h"
@@ -26,6 +26,17 @@ namespace xe {
 namespace threading {
 
 using WaitItem = TimerQueueWaitItem;
+/*
+  chrispy: changed this to a blocking wait from a spin-wait; the spin was
+  monopolizing a ton of cpu time (depending on the game, 2-4% of total cpu
+  time) on my 3990X. no complaints since that change.
+*/
+
+/*
+  edit: actually had to change it back. when I was testing, it only worked
+  because I had fixed disruptorplus' code to compile (it gives the wrong args
+  to condition_variable::wait_until), but it now builds unmodified.
+*/
+using WaitStrat = dp::spin_wait_strategy;  // dp::blocking_wait_strategy;
 
 class TimerQueue {
  public:
@@ -147,9 +158,10 @@ class TimerQueue {
   // This ring buffer will be used to introduce timers queued by the public API
   static constexpr size_t kWaitCount = 512;
   dp::ring_buffer<WaitItem*> buffer_;
-  dp::spin_wait_strategy wait_strategy_;
-  dp::multi_threaded_claim_strategy<dp::spin_wait_strategy> claim_strategy_;
-  dp::sequence_barrier<dp::spin_wait_strategy> consumed_;
+
+  WaitStrat wait_strategy_;
+  dp::multi_threaded_claim_strategy<WaitStrat> claim_strategy_;
+  dp::sequence_barrier<WaitStrat> consumed_;
 
   // This is a _sorted_ (ascending due_) list of active timers managed by a
   // dedicated thread
diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc
index 8f6087b05..5c00400e2 100644
--- a/src/xenia/base/threading_win.cc
+++ b/src/xenia/base/threading_win.cc
@@ -7,19 +7,49 @@
  ******************************************************************************
  */
+#include
 #include "xenia/base/assert.h"
 #include "xenia/base/chrono_steady_cast.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/platform_win.h"
 #include "xenia/base/threading.h"
 #include "xenia/base/threading_timer_queue.h"
-
-#define LOG_LASTERROR() \
-  { XELOGI("Win32 Error 0x{:08X} in " __FUNCTION__ "(...)", GetLastError()); }
-
+#if defined(__clang__)
+// chrispy: i do not understand why this is an error for clang here;
+// something about the quoted __FUNCTION__ freaks it out (clang 14.0.1)
+#define LOG_LASTERROR()                                                       \
+  do {                                                                        \
+    XELOGI("Win32 Error 0x{:08X} in {} (...)", GetLastError(), __FUNCTION__); \
+  } while (false)
+#else
+#define LOG_LASTERROR()                                                       \
+  do {                                                                        \
+    XELOGI("Win32 Error 0x{:08X} in " __FUNCTION__ "(...)", GetLastError());  \
+  } while (false)
+#endif
 typedef HANDLE (*SetThreadDescriptionFn)(HANDLE hThread,
                                          PCWSTR lpThreadDescription);
+// sys function for NtYieldExecution; by calling it directly we sidestep
+// RtlGetCurrentUmsThread
+XE_NTDLL_IMPORT(NtYieldExecution, cls_NtYieldExecution,
+                NtYieldExecutionPointer);
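Review note (hedged sketch): why the LOG_LASTERROR rewrite above wraps the macro body in do { ... } while (false) rather than bare braces -- it turns the expansion into a single statement that requires a trailing semicolon, so the macro composes safely with if/else:

    #include <cstdio>

    #define LOG_FAILURE() \
      do {                \
        puts("failed");   \
      } while (false)

    void Demo(bool ok) {
      if (!ok)
        LOG_FAILURE();  // with a bare { } block, the ';' here would
      else              // terminate the if and orphan this else
        puts("ok");
    }
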
+// sidestep the activation context / remapping of special windows handles
+// like stdout
+XE_NTDLL_IMPORT(NtWaitForSingleObject, cls_NtWaitForSingleObject,
+                NtWaitForSingleObjectPointer);
+
+XE_NTDLL_IMPORT(NtSetEvent, cls_NtSetEvent, NtSetEventPointer);
+// the difference between NtClearEvent and NtResetEvent is that NtResetEvent
+// returns the event's state prior to the call, but we don't need that. might
+// need to check whether one or the other is faster in the kernel though --
+// yeah, just checked, the code in ntoskrnl is way simpler for clearevent than
+// resetevent
+XE_NTDLL_IMPORT(NtClearEvent, cls_NtClearEvent, NtClearEventPointer);
+XE_NTDLL_IMPORT(NtPulseEvent, cls_NtPulseEvent, NtPulseEventPointer);
+
+// heavily called; we don't skip much garbage by calling this, but every bit
+// counts
+XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
+                NtReleaseSemaphorePointer);
 namespace xe {
 namespace threading {
@@ -80,7 +110,13 @@ void set_name(const std::string_view name) {
 }
 
 void MaybeYield() {
+#if defined(XE_USE_NTDLL_FUNCTIONS)
+  NtYieldExecutionPointer.invoke();
+#else
   SwitchToThread();
+#endif
+
+  // memorybarrier is really not necessary here...
   MemoryBarrier();
 }
@@ -134,8 +170,26 @@ class Win32Handle : public T {
 WaitResult Wait(WaitHandle* wait_handle, bool is_alertable,
                 std::chrono::milliseconds timeout) {
   HANDLE handle = wait_handle->native_handle();
-  DWORD result = WaitForSingleObjectEx(handle, DWORD(timeout.count()),
-                                       is_alertable ? TRUE : FALSE);
+  DWORD result;
+  DWORD timeout_dw = DWORD(timeout.count());
+  BOOL bAlertable = is_alertable ? TRUE : FALSE;
+  // todo: we might actually be able to use NtWaitForSingleObject even if it's
+  // alertable, just need to study whether
+  // RtlDeactivateActivationContextUnsafeFast/RtlActivateActivationContext are
+  // actually needed for us
+#if XE_USE_NTDLL_FUNCTIONS == 1
+  if (bAlertable) {
+    result = WaitForSingleObjectEx(handle, timeout_dw, bAlertable);
+  } else {
+    LARGE_INTEGER timeout_big;
+    timeout_big.QuadPart = -10000LL * static_cast<int64_t>(timeout_dw);
+
+    result = NtWaitForSingleObjectPointer.invoke(
+        handle, bAlertable, timeout_dw == INFINITE ? nullptr : &timeout_big);
+  }
+#else
+  result = WaitForSingleObjectEx(handle, timeout_dw, bAlertable);
+#endif
   switch (result) {
     case WAIT_OBJECT_0:
       return WaitResult::kSuccess;
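Review note (hedged sketch): the timeout conversion inside Wait() above. NtWaitForSingleObject takes a LARGE_INTEGER in 100-nanosecond ticks, where a negative value means "relative to now" and a null pointer means wait forever:

    #include <windows.h>
    #include <cstdint>

    inline LARGE_INTEGER MillisecondsToNtRelativeTimeout(DWORD milliseconds) {
      LARGE_INTEGER timeout;
      // 1 ms = 10,000 * 100ns; negating selects a relative wait.
      timeout.QuadPart = -10000LL * static_cast<int64_t>(milliseconds);
      return timeout;
    }
    // Callers pass nullptr instead of &timeout for INFINITE, as the patch does.
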
@@ -178,7 +232,9 @@ std::pair<WaitResult, size_t> WaitMultiple(WaitHandle* wait_handles[],
                                            size_t wait_handle_count,
                                            bool wait_all, bool is_alertable,
                                            std::chrono::milliseconds timeout) {
-  std::vector<HANDLE> handles(wait_handle_count);
+  std::vector<HANDLE> handles(
+      wait_handle_count);  // max handles is like 64, so it would make more
+                           // sense to just do a fixed size array here
   for (size_t i = 0; i < wait_handle_count; ++i) {
     handles[i] = wait_handles[i]->native_handle();
   }
@@ -208,9 +264,16 @@ class Win32Event : public Win32Handle<Event> {
  public:
  explicit Win32Event(HANDLE handle) : Win32Handle(handle) {}
  ~Win32Event() override = default;
+#if XE_USE_NTDLL_FUNCTIONS == 1
+  void Set() override { NtSetEventPointer.invoke(handle_, nullptr); }
+  void Reset() override { NtClearEventPointer.invoke(handle_); }
+  void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
+#else
   void Set() override { SetEvent(handle_); }
   void Reset() override { ResetEvent(handle_); }
   void Pulse() override { PulseEvent(handle_); }
+
+#endif
 };
 
 std::unique_ptr<Event> Event::CreateManualResetEvent(bool initial_state) {
@@ -220,6 +283,7 @@ std::unique_ptr<Event> Event::CreateManualResetEvent(bool initial_state) {
     return std::make_unique<Win32Event>(handle);
   } else {
     LOG_LASTERROR();
+    return nullptr;
   }
 }
@@ -240,10 +304,15 @@ class Win32Semaphore : public Win32Handle<Semaphore> {
   explicit Win32Semaphore(HANDLE handle) : Win32Handle(handle) {}
   ~Win32Semaphore() override = default;
   bool Release(int release_count, int* out_previous_count) override {
+#if XE_USE_NTDLL_FUNCTIONS == 1
+    return NtReleaseSemaphorePointer.invoke(handle_, release_count,
+                                            out_previous_count) >= 0;
+#else
     return ReleaseSemaphore(handle_, release_count,
                             reinterpret_cast<LPLONG>(out_previous_count))
               ? true
              : false;
+#endif
   }
 };
diff --git a/src/xenia/base/utf8.cc b/src/xenia/base/utf8.cc
index 6405aa2f8..65f798f54 100644
--- a/src/xenia/base/utf8.cc
+++ b/src/xenia/base/utf8.cc
@@ -82,8 +82,9 @@ std::string upper_ascii(const std::string_view view) {
 template
 inline size_t hash_fnv1a(const std::string_view view) {
   const size_t offset_basis = 0xCBF29CE484222325ull;
-  const size_t prime = 0x00000100000001B3ull;
-  auto work = [&prime](size_t hash, uint8_t byte_of_data) {
+  // chrispy: constant capture errors on clang
+  auto work = [](size_t hash, uint8_t byte_of_data) {
+    const size_t prime = 0x00000100000001B3ull;
     hash ^= byte_of_data;
     hash *= prime;
     return hash;
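Review note (hedged sketch): the hash the utf8.cc lambda above computes. FNV-1a xors each byte into the hash before multiplying by the prime; moving the prime into the lambda body only avoids capturing a constant (which clang rejected here) -- the result is unchanged:

    #include <cstdint>
    #include <string_view>

    inline uint64_t hash_fnv1a(std::string_view view) {
      uint64_t hash = 0xCBF29CE484222325ull;         // offset basis
      const uint64_t prime = 0x00000100000001B3ull;  // FNV prime
      for (unsigned char byte_of_data : view) {
        hash ^= byte_of_data;  // xor first...
        hash *= prime;         // ...then multiply: that order is FNV-1a
      }
      return hash;
    }
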
diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc
index 7d15d0e63..c3711f239 100644
--- a/src/xenia/cpu/backend/x64/x64_backend.cc
+++ b/src/xenia/cpu/backend/x64/x64_backend.cc
@@ -25,7 +25,7 @@
 #include "xenia/cpu/breakpoint.h"
 #include "xenia/cpu/processor.h"
 #include "xenia/cpu/stack_walker.h"
-
+#include "xenia/cpu/xex_module.h"
 DEFINE_int32(x64_extension_mask, -1,
             "Allow the detection and utilization of specific instruction set "
             "features.\n"
@@ -45,6 +45,12 @@ DEFINE_int32(x64_extension_mask, -1,
             " -1 = Detect and utilize all possible processor features\n",
             "x64");
 
+DEFINE_bool(record_mmio_access_exceptions, true,
+            "Records, for each guest address, whether we caught any mmio "
+            "accesses there. This info can then be used on a subsequent run "
+            "to instruct the recompiler to emit checks",
+            "CPU");
+
 namespace xe {
 namespace cpu {
 namespace backend {
@@ -86,6 +92,11 @@ X64Backend::~X64Backend() {
   ExceptionHandler::Uninstall(&ExceptionCallbackThunk, this);
 }
 
+static void ForwardMMIOAccessForRecording(void* context, void* hostaddr) {
+  reinterpret_cast<X64Backend*>(context)
+      ->RecordMMIOExceptionForGuestInstruction(hostaddr);
+}
+
 bool X64Backend::Initialize(Processor* processor) {
   if (!Backend::Initialize(processor)) {
     return false;
@@ -146,6 +157,8 @@ bool X64Backend::Initialize(Processor* processor) {
   // Setup exception callback
   ExceptionHandler::Install(&ExceptionCallbackThunk, this);
 
+  processor->memory()->SetMMIOExceptionRecordingCallback(
+      ForwardMMIOAccessForRecording, (void*)this);
   return true;
 }
@@ -390,7 +403,28 @@ bool X64Backend::ExceptionCallbackThunk(Exception* ex, void* data) {
   auto backend = reinterpret_cast<X64Backend*>(data);
   return backend->ExceptionCallback(ex);
 }
+void X64Backend::RecordMMIOExceptionForGuestInstruction(void* host_address) {
+  uint64_t host_addr_u64 = (uint64_t)host_address;
+  auto fnfor = code_cache()->LookupFunction(host_addr_u64);
+  if (fnfor) {
+    uint32_t guestaddr = fnfor->MapMachineCodeToGuestAddress(host_addr_u64);
+
+    Module* guest_module = fnfor->module();
+    if (guest_module) {
+      XexModule* xex_guest_module = dynamic_cast<XexModule*>(guest_module);
+
+      if (xex_guest_module) {
+        cpu::InfoCacheFlags* icf =
+            xex_guest_module->GetInstructionAddressFlags(guestaddr);
+
+        if (icf) {
+          icf->accessed_mmio = true;
+        }
+      }
+    }
+  }
+}
 bool X64Backend::ExceptionCallback(Exception* ex) {
   if (ex->code() != Exception::Code::kIllegalInstruction) {
     // We only care about illegal instructions. Other things will be handled
     // by other handlers.
     return false;
   }
 
+  // processor_->memory()->LookupVirtualMappedRange()
+
   // Verify an expected illegal instruction.
   auto instruction_bytes =
       xe::load_and_swap<uint32_t>(reinterpret_cast<const uint32_t*>(ex->pc()));
diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h
index a87cdc102..4ec930698 100644
--- a/src/xenia/cpu/backend/x64/x64_backend.h
+++ b/src/xenia/cpu/backend/x64/x64_backend.h
@@ -92,6 +92,8 @@ class X64Backend : public Backend {
   }
   virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
 
+  void RecordMMIOExceptionForGuestInstruction(void* host_address);
+
  private:
   static bool ExceptionCallbackThunk(Exception* ex, void* data);
   bool ExceptionCallback(Exception* ex);
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index e481788c3..dc435c39f 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -156,7 +156,7 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
                       void** out_code_address, size_t* out_code_size,
                       std::vector<SourceMapEntry>* out_source_map) {
   SCOPE_profile_cpu_f("cpu");
-
+  guest_module_ = dynamic_cast<XexModule*>(function->module());
   // Reset.
   debug_info_ = debug_info;
   debug_info_flags_ = debug_info_flags;
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h
index 93a7babaf..93ac9915f 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@@ -18,8 +18,8 @@
 #include "xenia/cpu/hir/hir_builder.h"
 #include "xenia/cpu/hir/instr.h"
 #include "xenia/cpu/hir/value.h"
+#include "xenia/cpu/xex_module.h"
 #include "xenia/memory.h"
-
 // NOTE: must be included last as it expects windows.h to already be included.
 #include "third_party/xbyak/xbyak/xbyak.h"
 #include "third_party/xbyak/xbyak/xbyak_util.h"
@@ -65,11 +65,7 @@ enum class SimdDomain : uint32_t {
                // CONFLICTING means its used in multiple domains)
 };
 
-enum class MXCSRMode : uint32_t {
-  Unknown,
-  Fpu,
-  Vmx
-};
+enum class MXCSRMode : uint32_t { Unknown, Fpu, Vmx };
 
 static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
   if (dom1 == dom2) {
@@ -326,16 +322,21 @@ class X64Emitter : public Xbyak::CodeGenerator {
   size_t stack_size() const { return stack_size_; }
   SimdDomain DeduceSimdDomain(const hir::Value* for_value);
 
-  void ForgetMxcsrMode() {
-    mxcsr_mode_ = MXCSRMode::Unknown;
-  }
+  void ForgetMxcsrMode() { mxcsr_mode_ = MXCSRMode::Unknown; }
   /*
-      returns true if had to load mxcsr. DOT_PRODUCT can use this to skip clearing the overflow flag, as it will never be set in the vmx fpscr
+      returns true if had to load mxcsr. DOT_PRODUCT can use this to skip
+      clearing the overflow flag, as it will never be set in the vmx fpscr
   */
-  bool ChangeMxcsrMode(MXCSRMode new_mode, bool already_set=false);//already_set means that the caller already did vldmxcsr, used for SET_ROUNDING_MODE
+  bool ChangeMxcsrMode(
+      MXCSRMode new_mode,
+      bool already_set = false);  // already_set means that the caller already
+                                  // did vldmxcsr, used for SET_ROUNDING_MODE
+
+  void LoadFpuMxcsrDirect();  // unsafe, does not change mxcsr_mode_
+  void LoadVmxMxcsrDirect();  // unsafe, does not change mxcsr_mode_
+
+  XexModule* GuestModule() { return guest_module_; }
 
-  void LoadFpuMxcsrDirect(); //unsafe, does not change mxcsr_mode_
-  void LoadVmxMxcsrDirect(); //unsafe, does not change mxcsr_mode_
  protected:
   void* Emplace(const EmitFunctionInfo& func_info,
                 GuestFunction* function = nullptr);
@@ -348,6 +349,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
   X64Backend* backend_ = nullptr;
   X64CodeCache* code_cache_ = nullptr;
   XbyakAllocator* allocator_ = nullptr;
+  XexModule* guest_module_ = nullptr;
   Xbyak::util::Cpu cpu_;
   uint32_t feature_flags_ = 0;
diff --git a/src/xenia/cpu/backend/x64/x64_op.h b/src/xenia/cpu/backend/x64/x64_op.h
index 745603032..b9257f179 100644
--- a/src/xenia/cpu/backend/x64/x64_op.h
+++ b/src/xenia/cpu/backend/x64/x64_op.h
@@ -60,23 +60,46 @@ union InstrKey {
   InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); }
   InstrKey(uint32_t v) : value(v) {}
+
+  // this used to take about 1% cpu while precompiling
+  // it kept reloading opcode, and also constantly repacking and unpacking the
+  // bitfields. instead, we pack the fields at the very end
   InstrKey(const Instr* i) : value(0) {
-    opcode = i->opcode->num;
-    uint32_t sig = i->opcode->signature;
-    dest =
-        GET_OPCODE_SIG_TYPE_DEST(sig) ? OPCODE_SIG_TYPE_V + i->dest->type : 0;
-    src1 = GET_OPCODE_SIG_TYPE_SRC1(sig);
-    if (src1 == OPCODE_SIG_TYPE_V) {
-      src1 += i->src1.value->type;
+    const OpcodeInfo* info = i->GetOpcodeInfo();
+
+    uint32_t sig = info->signature;
+
+    OpcodeSignatureType dest_type, src1_type, src2_type, src3_type;
+
+    UnpackOpcodeSig(sig, dest_type, src1_type, src2_type, src3_type);
+
+    uint32_t out_desttype = (uint32_t)dest_type;
+    uint32_t out_src1type = (uint32_t)src1_type;
+    uint32_t out_src2type = (uint32_t)src2_type;
+    uint32_t out_src3type = (uint32_t)src3_type;
+
+    Value* destv = i->dest;
+    // pre-deref, even if not value
+    Value* src1v = i->src1.value;
+    Value* src2v = i->src2.value;
+    Value* src3v = i->src3.value;
+
+    if (out_src1type == OPCODE_SIG_TYPE_V) {
+      out_src1type += src1v->type;
     }
-    src2 = GET_OPCODE_SIG_TYPE_SRC2(sig);
-    if (src2 == OPCODE_SIG_TYPE_V) {
-      src2 += i->src2.value->type;
+
+    if (out_src2type == OPCODE_SIG_TYPE_V) {
+      out_src2type += src2v->type;
     }
-    src3 = GET_OPCODE_SIG_TYPE_SRC3(sig);
-    if (src3 == OPCODE_SIG_TYPE_V) {
-      src3 += i->src3.value->type;
+
+    if (out_src3type == OPCODE_SIG_TYPE_V) {
+      out_src3type += src3v->type;
     }
+    opcode = info->num;
+    dest = out_desttype ? OPCODE_SIG_TYPE_V + destv->type : 0;
+    src1 = out_src1type;
+    src2 = out_src2type;
+    src3 = out_src3type;
   }
 
   template
diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc
+static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) {
+  uint32_t guestaddr = i->GuestAddressFor();
+  if (!guestaddr) {
+    return false;
+  }
+
+  auto flags = e.GuestModule()->GetInstructionAddressFlags(guestaddr);
+
+  return flags && flags->accessed_mmio;
+}
 // ============================================================================
 // OPCODE_LOAD_OFFSET
 // ============================================================================
@@ -1030,6 +1049,28 @@ struct LOAD_OFFSET_I64
 EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16,
                      LOAD_OFFSET_I32, LOAD_OFFSET_I64);
 
+template <typename T, bool swap>
+static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) {
+  if (swap) {
+    value = xe::byte_swap(value);
+  }
+  if (guestaddr >= 0xE0000000) {
+    guestaddr += 0x1000;
+  }
+
+  auto ctx = reinterpret_cast<ppc::PPCContext*>(_ctx);
+
+  auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr);
+  if (!gaddr) {
+    *reinterpret_cast<T*>(ctx->virtual_membase + guestaddr) = value;
+  } else {
+    value = xe::byte_swap(value); /*
+      was having issues; found by comparing the values used by the exception
+      path with these that ours were byte-reversed...
+    */
+    gaddr->write(nullptr, gaddr->callback_context, guestaddr, value);
+  }
+}
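Review note (hedged sketch, illustrative names rather than xenia's real API): the dispatch MMIOAwareStore above performs, in isolation. A store first asks whether the guest address is covered by a registered MMIO range; plain RAM is written through the host mapping, while MMIO is routed through the range's write callback:

    #include <cstdint>
    #include <cstring>

    struct MmioRange {
      void (*write)(void* callback_context, uint32_t addr, uint32_t value);
      void* callback_context;
    };

    // stand-in for Memory::LookupVirtualMappedRange(); nullptr means plain RAM
    inline MmioRange* LookupRange(uint32_t guest_addr) { return nullptr; }

    inline void StoreU32(uint8_t* virtual_membase, uint32_t guest_addr,
                         uint32_t value) {
      if (MmioRange* range = LookupRange(guest_addr)) {
        range->write(range->callback_context, guest_addr, value);  // device reg
      } else {
        std::memcpy(virtual_membase + guest_addr, &value, sizeof(value));
      }
    }
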
 // ============================================================================
 // OPCODE_STORE_OFFSET
 // ============================================================================
@@ -1038,6 +1079,7 @@ struct STORE_OFFSET_I8
     : Sequence<STORE_OFFSET_I8,
                I<OPCODE_STORE_OFFSET, VoidOp, I64Op, I64Op, I8Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2);
+
     if (i.src3.is_constant) {
       e.mov(e.byte[addr], i.src3.constant());
     } else {
@@ -1076,23 +1118,48 @@ struct STORE_OFFSET_I32
     : Sequence<STORE_OFFSET_I32,
                I<OPCODE_STORE_OFFSET, VoidOp, I64Op, I64Op, I32Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2);
-    if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
-      assert_false(i.src3.is_constant);
-      if (e.IsFeatureEnabled(kX64EmitMovbe)) {
-        e.movbe(e.dword[addr], i.src3);
-      } else {
-        assert_always("not implemented");
+    if (IsPossibleMMIOInstruction(e, i.instr)) {
+      void* addrptr = (void*)&MMIOAwareStore<uint32_t, false>;
+
+      if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
+        addrptr = (void*)&MMIOAwareStore<uint32_t, true>;
+      }
+      if (i.src1.is_constant) {
+        e.mov(e.GetNativeParam(0).cvt32(), i.src1.constant());
+      } else {
+        e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32());
+      }
+      if (i.src2.is_constant) {
+        e.add(e.GetNativeParam(0).cvt32(), (uint32_t)i.src2.constant());
+      } else {
+        e.add(e.GetNativeParam(0).cvt32(), i.src2);
       }
       if (i.src3.is_constant) {
+        e.mov(e.GetNativeParam(1).cvt32(), i.src3.constant());
+      } else {
+        e.mov(e.GetNativeParam(1).cvt32(), i.src3);
+      }
+      e.CallNativeSafe(addrptr);
+
+    } else {
+      auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2);
+      if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
+        assert_false(i.src3.is_constant);
+        if (e.IsFeatureEnabled(kX64EmitMovbe)) {
+          e.movbe(e.dword[addr], i.src3);
        } else {
-          e.mov(e.dword[addr], i.src3.constant());
+          assert_always("not implemented");
         }
       } else {
-        e.mov(e.dword[addr], i.src3);
+        if (i.src3.is_constant) {
+          if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) {
+            e.mov(e.dword[addr], e.GetMembaseReg().cvt32());
+          } else {
+            e.mov(e.dword[addr], i.src3.constant());
+          }
+        } else {
+          e.mov(e.dword[addr], i.src3);
+        }
       }
     }
   }
@@ -1290,23 +1357,43 @@ struct STORE_I16 : Sequence<STORE_I16, I<OPCODE_STORE, VoidOp, I64Op, I16Op>> {
 };
 struct STORE_I32 : Sequence<STORE_I32, I<OPCODE_STORE, VoidOp, I64Op, I32Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    auto addr = ComputeMemoryAddress(e, i.src1);
-    if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
-      assert_false(i.src2.is_constant);
-      if (e.IsFeatureEnabled(kX64EmitMovbe)) {
-        e.movbe(e.dword[addr], i.src2);
-      } else {
-        assert_always("not implemented");
+    if (IsPossibleMMIOInstruction(e, i.instr)) {
+      void* addrptr = (void*)&MMIOAwareStore<uint32_t, false>;
+
+      if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
+        addrptr = (void*)&MMIOAwareStore<uint32_t, true>;
      }
+      if (i.src1.is_constant) {
+        e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant());
+      } else {
+        e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32());
+      }
+      if (i.src2.is_constant) {
+        e.mov(e.GetNativeParam(1).cvt32(), i.src2.constant());
+      } else {
+        e.mov(e.GetNativeParam(1).cvt32(), i.src2);
+      }
+      e.CallNativeSafe(addrptr);
+
+    } else {
+      auto addr = ComputeMemoryAddress(e, i.src1);
+      if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
+        assert_false(i.src2.is_constant);
+        if (e.IsFeatureEnabled(kX64EmitMovbe)) {
+          e.movbe(e.dword[addr], i.src2);
+        } else {
+          assert_always("not implemented");
+        }
+      } else {
+        if (i.src2.is_constant) {
+          e.mov(e.dword[addr], i.src2.constant());
+        } else {
+          e.mov(e.dword[addr], i.src2);
+        }
       }
     }
     if (IsTracingData()) {
-      addr = ComputeMemoryAddress(e, i.src1);
+      auto addr = ComputeMemoryAddress(e, i.src1);
       e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]);
       e.lea(e.GetNativeParam(0), e.ptr[addr]);
       e.CallNative(reinterpret_cast<void*>(TraceMemoryStoreI32));
diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index e99628728..3fe52857b 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -1683,6 +1683,9 @@ struct DIV_I16 : Sequence<DIV_I16, I<OPCODE_DIV, I16Op, I16Op, I16Op>> {
     assert_impossible_sequence(DIV_I16);
   }
 };
+/*
+  TODO: hoist the overflow/zero checks into HIR
+*/
 struct DIV_I32 : Sequence<DIV_I32, I<OPCODE_DIV, I32Op, I32Op, I32Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     Xbyak::Label skip;
@@ -1766,6 +1769,9 @@ struct DIV_I32 : Sequence<DIV_I32, I<OPCODE_DIV, I32Op, I32Op, I32Op>> {
     e.mov(i.dest, e.eax);
   }
 };
+/*
+  TODO: hoist the overflow/zero checks into HIR
+*/
 struct DIV_I64 : Sequence<DIV_I64, I<OPCODE_DIV, I64Op, I64Op, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     Xbyak::Label skip;
@@ -1811,7 +1817,7 @@ struct DIV_I64 : Sequence<DIV_I64, I<OPCODE_DIV, I64Op, I64Op, I64Op>> {
     } else {
       // check for signed overflow
       if (i.src1.is_constant) {
-        if (i.src1.constant() != (1 << 31)) {
+        if (i.src1.constant() != (1ll << 63)) {
           // we're good, overflow is impossible
         } else {
           e.cmp(i.src2, -1);  // otherwise, if src2 is -1 then we have
diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
index f7d882279..a4e39e78c 100644
--- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
+++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
@@ -149,7 +149,20 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
           i->Remove();
         }
         result = true;
+      } else if (i->src2.value->IsConstant()) {  // chrispy: fix h3 bug from a
+                                                 // const CALL_INDIRECT_TRUE
+        auto function = processor_->LookupFunction(
+            uint32_t(i->src2.value->constant.i32));
+        if (!function) {
+          break;
+        }
+        // i->Replace(&OPCODE_CALL_TRUE_info, i->flags);
+        i->opcode = &OPCODE_CALL_TRUE_info;
+        i->set_src2(nullptr);
+        i->src2.symbol = function;
+        result = true;
       }
+
       break;
 
     case OPCODE_BRANCH_TRUE:
diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc
index 894c4423b..a5100cff6 100644
--- a/src/xenia/cpu/compiler/passes/simplification_pass.cc
+++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc
@@ -796,10 +796,13 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
   if (var_definition) {
     var_definition = var_definition->GetDestDefSkipAssigns();
-    if (var_definition != NULL)
-    {
-      def_opcode = var_definition->opcode->num;
+    if (!var_definition) {
+      return false;
     }
+    def_opcode = var_definition->opcode->num;
+  }
+  if (!var_definition) {
+    return false;
   }
   // x == 0 -> !x
   if (cmpop == OPCODE_COMPARE_EQ && constant_unpacked == 0) {
@@ -1231,13 +1234,12 @@ Value* SimplificationPass::CheckValue(Value* value, bool& result) {
   result = false;
   return value;
 }
-
-bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
-                                          hir::HIRBuilder* builder) {
+bool SimplificationPass::SimplifyAddWithSHL(hir::Instr* i,
+                                            hir::HIRBuilder* builder) {
   /*
-      example: (x <<1 ) + x == (x*3)
+      example: (x << 1) + x == (x * 3)
 
-  */
+*/
   auto [shlinsn, addend] =
       i->BinaryValueArrangeByDefiningOpcode(&OPCODE_SHL_info);
 
   if (!shlinsn) {
     return false;
   }
@@ -1278,11 +1280,81 @@ bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
   return true;
 }
 
+bool SimplificationPass::SimplifyAddToSelf(hir::Instr* i,
+                                           hir::HIRBuilder* builder) {
+  /*
+    here's a super easy one
+  */
+
+  if (i->src1.value != i->src2.value) {
+    return false;
+  }
+
+  i->opcode = &OPCODE_SHL_info;
+
+  i->set_src2(builder->LoadConstantUint8(1));
+
+  return true;
+}
+bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
+                                          hir::HIRBuilder* builder) {
+  if (SimplifyAddWithSHL(i, builder)) {
+    return true;
+  }
+  if (SimplifyAddToSelf(i, builder)) {
+    return true;
+  }
+  return false;
+}
 bool SimplificationPass::SimplifySubArith(hir::Instr* i,
                                           hir::HIRBuilder* builder) {
+  /*
+    todo: handle expressions like (x*8) - (x*5) == (x*3)... if these can even
+    happen, of course
   */
   return false;
 }
+bool SimplificationPass::SimplifySHLArith(hir::Instr* i,
+                                          hir::HIRBuilder* builder) {
+  Value* sh = i->src2.value;
+
+  Value* shifted = i->src1.value;
+
+  if (!sh->IsConstant()) {
+    return false;
+  }
+
+  hir::Instr* definition = shifted->GetDefSkipAssigns();
+
+  if (!definition) {
+    return false;
+  }
+
+  if (definition->GetOpcodeNum() != OPCODE_MUL) {
+    return false;
+  }
+
+  if (definition->flags != ARITHMETIC_UNSIGNED) {
+    return false;
+  }
+
+  auto [mulconst, mulnonconst] = definition->BinaryValueArrangeAsConstAndVar();
+
+  if (!mulconst) {
+    return false;
+  }
+
+  auto newmul = builder->AllocValue(mulconst->type);
+  newmul->set_from(mulconst);
+
+  newmul->Shl(sh);
+
+  i->Replace(&OPCODE_MUL_info, ARITHMETIC_UNSIGNED);
+  i->set_src1(mulnonconst);
+  i->set_src2(newmul);
+
+  return true;
+}
 bool SimplificationPass::SimplifyBasicArith(hir::Instr* i,
                                             hir::HIRBuilder* builder) {
   if (!i->dest) {
@@ -1301,6 +1373,9 @@ bool SimplificationPass::SimplifyBasicArith(hir::Instr* i,
     case OPCODE_SUB: {
       return SimplifySubArith(i, builder);
     }
+    case OPCODE_SHL: {
+      return SimplifySHLArith(i, builder);
+    }
   }
   return false;
 }
@@ -1317,6 +1392,97 @@ bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) {
   }
   return result;
 }
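Review note (hedged sketch): the algebraic identities the three add/shift rewrites above rely on, checked exhaustively for 8-bit values; they hold in any power-of-two modular arithmetic, hence for all the HIR integer widths:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x = 0; x < 256; ++x) {
        // SimplifyAddToSelf: x + x == x << 1
        assert(uint8_t(x + x) == uint8_t(x << 1));
        for (uint32_t c = 0; c < 8; ++c) {
          // SimplifyAddWithSHL: (x << c) + x == x * ((1 << c) + 1)
          assert(uint8_t((x << c) + x) == uint8_t(x * ((1u << c) + 1)));
          // SimplifySHLArith: (x * m) << c == x * (m << c), m = 3 here
          assert(uint8_t((x * 3u) << c) == uint8_t(x * (3u << c)));
        }
      }
      return 0;
    }
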
+
+/*
+  todo: add a load-store simplification pass
+
+  do things like load-store byteswap elimination. for instance, if a value is
+  loaded, ORed with a constant mask, and then stored, we simply have to
+  byteswap the mask it will be ORed with, and then we can eliminate the two
+  byteswaps.
+
+  the same can be done for and, or, xor, andn with constant masks.
+
+  this can also be done for comparisons with 0 for equality and not-equal.
+
+  another optimization: with ppc you cannot move a floating point register
+  directly to a gp one, a gp one directly to a floating point register, or a
+  vmx one to either. so guest code will store the result to the stack and
+  then load it to the register it needs; in HIR we can sidestep this. we will
+  still need to byteswap and store the result for correctness, but we can
+  eliminate the load and byteswap by grabbing the original value from the
+  store.
+
+  skyth's sanic idb, 0x824D7724
+      lis r11,
+      lfs f0, flt_8200CBCC@l(r11)
+      fmuls f0, time, f0
+      fctidz f0, f0 # vcvttss2si
+      stfd f0, 0x190+var_138(r1)
+      lwz r30, 0x190+var_138+4(r1)
+      cmplwi cr6, r30, 0x63 # 'c'
+      ble cr6, counter_op
+*/
+
+/*
+  todo: simple loop unrolling
+  skyth sanic 0x831D9908
+
+      mr r30, r4
+      mr r29, r5
+      mr r11, r7
+      li r31, 0
+
+  loc_831D9928:
+      slwi r9, r11, 1
+      addi r10, r11, 1
+      addi r8, r1, 0xD0+var_80
+      clrlwi r11, r10, 16
+      cmplwi cr6, r11, 0x10
+      sthx r31, r9, r8
+      ble cr6, loc_831D9928
+
+  v5 = 1;
+  do {
+    v6 = 2 * v5;
+    v5 = (unsigned __int16)(v5 + 1);
+    *(_WORD *)&v24[v6] = 0;
+  } while ( v5 <= 0x10 );
+  v7 = 0;
+  do {
+    v8 = __ROL4__(*(unsigned __int8 *)(v7 + a2), 1);
+    v7 = (unsigned __int16)(v7 + 1);
+    ++*(_WORD *)&v24[v8];
+  } while ( v7 < 8 );
+  v9 = 1;
+  v25[0] = 0;
+  do {
+    v10 = 2 * v9;
+    v11 = 16 - v9;
+    v9 = (unsigned __int16)(v9 + 1);
+    v25[v10 / 2] = (*(_WORD *)&v24[v10] << v11) + *(_WORD *)&v24[v10 + 48];
+  } while ( v9 <= 0x10 );
+
+  skyth sanic:
+  sub_831BBAE0
+
+  sub_831A41A8
+*/
 }  // namespace passes
 }  // namespace compiler
 }  // namespace cpu
 }  // namespace xe
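Review note (hedged sketch): the identity behind the proposed load/byteswap/op/byteswap/store elimination in the TODO above -- bytewise-uniform bitwise ops commute with byte swaps, so both swaps can be dropped by swapping the constant mask once at compile time:

    #include <cassert>
    #include <cstdint>

    static uint32_t bswap(uint32_t v) {
      return (v >> 24) | ((v >> 8) & 0xFF00u) | ((v << 8) & 0xFF0000u) |
             (v << 24);
    }

    int main() {
      const uint32_t value = 0x12345678u, mask = 0x00FF00FFu;
      assert((bswap(value) | bswap(mask)) == bswap(value | mask));
      assert((bswap(value) & bswap(mask)) == bswap(value & mask));
      assert((bswap(value) ^ bswap(mask)) == bswap(value ^ mask));
      return 0;
    }
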
diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h
index 8a5d3ee4c..078187eb1 100644
--- a/src/xenia/cpu/compiler/passes/simplification_pass.h
+++ b/src/xenia/cpu/compiler/passes/simplification_pass.h
@@ -36,9 +36,11 @@ class SimplificationPass : public ConditionalGroupSubpass {
   // handles simple multiplication/addition rules
   bool SimplifyBasicArith(hir::HIRBuilder* builder);
   bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder);
-
+  bool SimplifyAddWithSHL(hir::Instr* i, hir::HIRBuilder* builder);
+  bool SimplifyAddToSelf(hir::Instr* i, hir::HIRBuilder* builder);
   bool SimplifyAddArith(hir::Instr* i, hir::HIRBuilder* builder);
   bool SimplifySubArith(hir::Instr* i, hir::HIRBuilder* builder);
+  bool SimplifySHLArith(hir::Instr* i, hir::HIRBuilder* builder);
   // handle either or or xor with 0
   bool CheckOrXorZero(hir::Instr* i);
   bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);
diff --git a/src/xenia/cpu/hir/instr.cc b/src/xenia/cpu/hir/instr.cc
index 92e2848f8..149103d43 100644
--- a/src/xenia/cpu/hir/instr.cc
+++ b/src/xenia/cpu/hir/instr.cc
@@ -200,6 +200,20 @@ const Instr* Instr::GetNonFakePrev() const {
   }
   return curr;
 }
+
+uint32_t Instr::GuestAddressFor() const {
+  Instr* srch = prev;
+
+  while (srch) {
+    if (srch->GetOpcodeNum() == OPCODE_SOURCE_OFFSET) {
+      return (uint32_t)srch->src1.offset;
+    }
+    srch = srch->prev;
+  }
+
+  return 0;  // eek.
+}
+
 }  // namespace hir
 }  // namespace cpu
 }  // namespace xe
diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h
index 47f629227..38afef241 100644
--- a/src/xenia/cpu/hir/instr.h
+++ b/src/xenia/cpu/hir/instr.h
@@ -169,6 +169,8 @@
 if both are constant, return nullptr, nullptr
   // gets previous instr, skipping instrs like COMMENT, OPCODE_CONTEXT_BARRIER,
   // OPCODE_SOURCE_OFFSET
   const hir::Instr* GetNonFakePrev() const;
+
+  uint32_t GuestAddressFor() const;
 };
 
 }  // namespace hir
diff --git a/src/xenia/cpu/mmio_handler.cc b/src/xenia/cpu/mmio_handler.cc
index eb28703d1..61f420eaa 100644
--- a/src/xenia/cpu/mmio_handler.cc
+++ b/src/xenia/cpu/mmio_handler.cc
@@ -30,7 +30,8 @@ std::unique_ptr<MMIOHandler> MMIOHandler::Install(
     HostToGuestVirtual host_to_guest_virtual,
     const void* host_to_guest_virtual_context,
     AccessViolationCallback access_violation_callback,
-    void* access_violation_callback_context) {
+    void* access_violation_callback_context,
+    MmioAccessRecordCallback record_mmio_callback, void* record_mmio_context) {
   // There can be only one handler at a time.
   assert_null(global_handler_);
   if (global_handler_) {
@@ -40,7 +41,8 @@ std::unique_ptr<MMIOHandler> MMIOHandler::Install(
   auto handler = std::unique_ptr<MMIOHandler>(new MMIOHandler(
       virtual_membase, physical_membase, membase_end, host_to_guest_virtual,
       host_to_guest_virtual_context, access_violation_callback,
-      access_violation_callback_context));
+      access_violation_callback_context, record_mmio_callback,
+      record_mmio_context));
 
   // Install the exception handler directed at the MMIOHandler.
   ExceptionHandler::Install(ExceptionCallbackThunk, handler.get());
@@ -54,14 +56,18 @@ MMIOHandler::MMIOHandler(uint8_t* virtual_membase, uint8_t* physical_membase,
                          HostToGuestVirtual host_to_guest_virtual,
                          const void* host_to_guest_virtual_context,
                          AccessViolationCallback access_violation_callback,
-                         void* access_violation_callback_context)
+                         void* access_violation_callback_context,
+                         MmioAccessRecordCallback record_mmio_callback,
+                         void* record_mmio_context)
     : virtual_membase_(virtual_membase),
       physical_membase_(physical_membase),
       memory_end_(membase_end),
       host_to_guest_virtual_(host_to_guest_virtual),
       host_to_guest_virtual_context_(host_to_guest_virtual_context),
       access_violation_callback_(access_violation_callback),
-      access_violation_callback_context_(access_violation_callback_context) {}
+      access_violation_callback_context_(access_violation_callback_context),
+      record_mmio_callback_(record_mmio_callback),
+      record_mmio_context_(record_mmio_context) {}
 
 MMIOHandler::~MMIOHandler() {
   ExceptionHandler::Uninstall(ExceptionCallbackThunk, this);
@@ -412,6 +418,8 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
     // Quick kill anything outside our mapping.
     return false;
   }
+  uint64_t hostip = ex->pc();
+
   void* fault_host_address = reinterpret_cast<void*>(ex->fault_address());
 
   // Access violations are pretty rare, so we can do a linear search here.
@@ -561,6 +569,13 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
   }
 #endif  // XE_ARCH_ARM64
 
+  if (record_mmio_callback_) {
+    // record that the guest address corresponding to the faulting
+    // instruction's host address reads/writes mmio; we can backpropagate this
+    // info on future compilations
+    record_mmio_callback_(record_mmio_context_, (void*)ex->pc());
+  }
+
   // Advance RIP to the next instruction so that we resume properly.
   ex->set_resume_pc(rip + decoded_load_store.length);
diff --git a/src/xenia/cpu/mmio_handler.h b/src/xenia/cpu/mmio_handler.h
index 6240544e0..d9f6dc04c 100644
--- a/src/xenia/cpu/mmio_handler.h
+++ b/src/xenia/cpu/mmio_handler.h
@@ -29,7 +29,8 @@ typedef uint32_t (*MMIOReadCallback)(void* ppc_context, void* callback_context,
                                      uint32_t addr);
 typedef void (*MMIOWriteCallback)(void* ppc_context, void* callback_context,
                                   uint32_t addr, uint32_t value);
-
+typedef void (*MmioAccessRecordCallback)(void* context,
+                                         void* host_insn_address);
 struct MMIORange {
   uint32_t address;
   uint32_t mask;
@@ -58,7 +59,8 @@ class MMIOHandler {
       HostToGuestVirtual host_to_guest_virtual,
       const void* host_to_guest_virtual_context,
       AccessViolationCallback access_violation_callback,
-      void* access_violation_callback_context);
+      void* access_violation_callback_context,
+      MmioAccessRecordCallback record_mmio_callback, void* record_mmio_context);
   static MMIOHandler* global_handler() { return global_handler_; }
 
   bool RegisterRange(uint32_t virtual_address, uint32_t mask, uint32_t size,
@@ -68,13 +70,20 @@ class MMIOHandler {
   bool CheckLoad(uint32_t virtual_address, uint32_t* out_value);
   bool CheckStore(uint32_t virtual_address, uint32_t value);
 
+  void SetMMIOExceptionRecordingCallback(MmioAccessRecordCallback callback,
+                                         void* context) {
+    record_mmio_context_ = context;
+    record_mmio_callback_ = callback;
+  }
+
  protected:
   MMIOHandler(uint8_t* virtual_membase, uint8_t* physical_membase,
               uint8_t* membase_end, HostToGuestVirtual host_to_guest_virtual,
               const void* host_to_guest_virtual_context,
               AccessViolationCallback access_violation_callback,
-              void* access_violation_callback_context);
+              void* access_violation_callback_context,
+              MmioAccessRecordCallback record_mmio_callback,
+              void* record_mmio_context);
 
   static bool ExceptionCallbackThunk(Exception* ex, void* data);
   bool ExceptionCallback(Exception* ex);
@@ -90,7 +99,9 @@ class MMIOHandler {
 
   AccessViolationCallback access_violation_callback_;
   void* access_violation_callback_context_;
+  MmioAccessRecordCallback record_mmio_callback_;
+  void* record_mmio_context_;
 
   static MMIOHandler* global_handler_;
   xe::global_critical_region global_critical_region_;
diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc
index 5719357a4..6274dfb71 100644
--- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc
@@ -1439,11 +1439,23 @@ int InstrEmit_vsel(PPCHIRBuilder& f, const InstrData& i) {
 int InstrEmit_vsel128(PPCHIRBuilder& f, const InstrData& i) {
   return InstrEmit_vsel_(f, VX128_VD128, VX128_VA128, VX128_VB128, VX128_VD128);
 }
+// chrispy: this is test code for checking whether a game takes advantage of
+// the VSR/VSL undocumented/undefined variable shift behavior
+static void AssertShiftElementsOk(PPCHIRBuilder& f, Value* v) {
+#if 0
+  Value* splatted = f.Splat(f.Extract(v, (uint8_t)0, INT8_TYPE), VEC128_TYPE);
+  Value* checkequal = f.Xor(splatted, v);
+  f.DebugBreakTrue(f.IsTrue(checkequal));
+#endif
+}
 
 int InstrEmit_vsl(PPCHIRBuilder& f, const InstrData& i) {
-  Value* v = f.Shl(f.LoadVR(i.VX.VA),
-                   f.And(f.Extract(f.LoadVR(i.VX.VB), 15, INT8_TYPE),
-                         f.LoadConstantInt8(0b111)));
+  Value* va = f.LoadVR(i.VX.VA);
+  Value* vb = f.LoadVR(i.VX.VB);
+
+  AssertShiftElementsOk(f, vb);
+  Value* v =
+      f.Shl(va, f.And(f.Extract(vb, 15, INT8_TYPE), f.LoadConstantInt8(0b111)));
   f.StoreVR(i.VX.VD, v);
   return 0;
 }
@@ -1623,9 +1635,13 @@ int InstrEmit_vspltisw128(PPCHIRBuilder& f, const InstrData& i) {
 }
 int InstrEmit_vsr(PPCHIRBuilder& f, const InstrData& i) {
-  Value* v = f.Shr(f.LoadVR(i.VX.VA),
-                   f.And(f.Extract(f.LoadVR(i.VX.VB), 15, INT8_TYPE),
-                         f.LoadConstantInt8(0b111)));
+  Value* va = f.LoadVR(i.VX.VA);
+  Value* vb = f.LoadVR(i.VX.VB);
+
+  AssertShiftElementsOk(f, vb);
+
+  Value* v =
+      f.Shr(va, f.And(f.Extract(vb, 15, INT8_TYPE), f.LoadConstantInt8(0b111)));
   f.StoreVR(i.VX.VD, v);
   return 0;
 }
diff --git a/src/xenia/cpu/ppc/ppc_emit_control.cc b/src/xenia/cpu/ppc/ppc_emit_control.cc
index c990237a7..0fe8e2d54 100644
--- a/src/xenia/cpu/ppc/ppc_emit_control.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_control.cc
@@ -769,8 +769,14 @@ int InstrEmit_mfmsr(PPCHIRBuilder& f, const InstrData& i) {
   // bit 62 = RI; recoverable interrupt
   // return 8000h if unlocked (interrupts enabled), else 0
   f.MemoryBarrier();
-  f.CallExtern(f.builtins()->check_global_lock);
-  f.StoreGPR(i.X.RT, f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE));
+  if (cvars::disable_global_lock || true) {
+    f.StoreGPR(i.X.RT, f.LoadConstantUint64(0));
+
+  } else {
+    f.CallExtern(f.builtins()->check_global_lock);
+    f.StoreGPR(i.X.RT,
+               f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE));
+  }
   return 0;
 }
@@ -782,6 +788,7 @@ int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) {
     f.StoreContext(
         offsetof(PPCContext, scratch),
         f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE));
+#if 0
     if (i.X.RT == 13) {
       // iff storing from r13 we are taking a lock (disable interrupts).
       if (!cvars::disable_global_lock) {
@@ -818,6 +827,7 @@ int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) {
         f.CallExtern(f.builtins()->leave_global_lock);
       }
     }
+#endif
     return 0;
   } else {
     // L = 0
diff --git a/src/xenia/cpu/ppc/ppc_opcode_disasm_gen.cc b/src/xenia/cpu/ppc/ppc_opcode_disasm_gen.cc
index 4323bdfac..3a2772bd5 100644
--- a/src/xenia/cpu/ppc/ppc_opcode_disasm_gen.cc
+++ b/src/xenia/cpu/ppc/ppc_opcode_disasm_gen.cc
@@ -5406,6 +5406,7 @@ PPCOpcodeDisasmInfo ppc_opcode_disasm_table[] = {
   INSTRUCTION(0x6c000000, "xoris" , kD , kI, kGeneral, "XOR Immediate Shifted" , (PPCOpcodeField::kRS,PPCOpcodeField::kUIMM), (PPCOpcodeField::kRA), PrintDisasm_xoris),
   INSTRUCTION(0x7c000278, "xorx" , kX , kI, kGeneral, "XOR" , (PPCOpcodeField::kRS,PPCOpcodeField::kRB), (PPCOpcodeField::kRA,PPCOpcodeField::kCRcond), PrintDisasm_xorx),
 };
+#undef INSTRUCTION
 static_assert(sizeof(ppc_opcode_disasm_table) / sizeof(PPCOpcodeDisasmInfo) ==
                   static_cast<int>(PPCOpcode::kInvalid),
               "PPC table mismatch - rerun ppc-table-gen");
 const PPCOpcodeDisasmInfo& GetOpcodeDisasmInfo(PPCOpcode opcode) {
diff --git a/src/xenia/cpu/ppc/ppc_opcode_table_gen.cc b/src/xenia/cpu/ppc/ppc_opcode_table_gen.cc
index 43210f5fb..22c41b270 100644
--- a/src/xenia/cpu/ppc/ppc_opcode_table_gen.cc
+++ b/src/xenia/cpu/ppc/ppc_opcode_table_gen.cc
@@ -470,6 +470,7 @@ PPCOpcodeInfo ppc_opcode_table[] = {
   INSTRUCTION(0x6c000000, "xoris" , kD , kI, kGeneral),
   INSTRUCTION(0x7c000278, "xorx" , kX , kI, kGeneral),
 };
+#undef INSTRUCTION
 static_assert(sizeof(ppc_opcode_table) / sizeof(PPCOpcodeInfo) ==
                   static_cast<int>(PPCOpcode::kInvalid),
               "PPC table mismatch - rerun ppc-table-gen");
 const PPCOpcodeInfo& GetOpcodeInfo(PPCOpcode opcode) {
diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc
index 6bd57b4f7..0fbeb30cd 100644
--- a/src/xenia/cpu/processor.cc
+++ b/src/xenia/cpu/processor.cc
@@ -257,11 +257,22 @@ Function* Processor::ResolveFunction(uint32_t address) {
     // Grab symbol declaration.
     auto function = LookupFunction(address);
+
     if (!function) {
       entry->status = Entry::STATUS_FAILED;
       return nullptr;
     }
+
+    auto module_for = function->module();
+
+    auto xexmod = dynamic_cast<XexModule*>(module_for);
+    if (xexmod) {
+      auto addr_flags = xexmod->GetInstructionAddressFlags(address);
+      if (addr_flags) {
+        addr_flags->was_resolved = 1;
+      }
+    }
+
     if (!DemandFunction(function)) {
       entry->status = Entry::STATUS_FAILED;
       return nullptr;
diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc
index b0b963467..7ccf3f71b 100644
--- a/src/xenia/cpu/xex_module.cc
+++ b/src/xenia/cpu/xex_module.cc
@@ -14,13 +14,16 @@
 #include "third_party/fmt/include/fmt/format.h"
 #include "xenia/base/byte_order.h"
+#include "xenia/base/cvar.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
 #include "xenia/base/memory.h"
+
 #include "xenia/cpu/cpu_flags.h"
 #include "xenia/cpu/export_resolver.h"
 #include "xenia/cpu/lzx.h"
 #include "xenia/cpu/processor.h"
+#include "xenia/emulator.h"
 #include "xenia/kernel/kernel_state.h"
 #include "xenia/kernel/xmodule.h"
@@ -29,6 +32,14 @@
 #include "third_party/crypto/rijndael-alg-fst.h"
 #include "third_party/pe/pe_image.h"
 
+DEFINE_bool(disable_instruction_infocache, false,
+            "Disables caching records of called instructions/mmio accesses.",
+            "CPU");
+DEFINE_bool(disable_function_precompilation, true,
+            "Disables pre-compiling guest functions that we know we've called "
+            "on previous runs",
+            "CPU");
+
 static const uint8_t xe_xex2_retail_key[16] = {
     0x20, 0xB1, 0x85, 0xA5, 0x9D, 0x28, 0xFD, 0xC3,
     0x40, 0x58, 0x3F, 0xBB, 0x08, 0x96, 0xBF, 0x91};
@@ -977,6 +988,7 @@ bool XexModule::LoadContinue() {
 
   // Scan and find the low/high addresses.
   // All code sections are continuous, so this should be easy.
+  // could use a source for the above information
   auto heap = memory()->LookupHeap(base_address_);
   auto page_size = heap->page_size();
@@ -1045,7 +1057,24 @@ bool XexModule::LoadContinue() {
       library_offset += library->size;
     }
   }
+  sha1::SHA1 final_image_sha_;
+  final_image_sha_.reset();
+
+  unsigned high_code = this->high_address_ - this->low_address_;
+
+  final_image_sha_.processBytes(memory()->TranslateVirtual(this->low_address_),
+                                high_code);
+  final_image_sha_.finalize(image_sha_bytes_);
+
+  char fmtbuf[16];
+
+  for (unsigned i = 0; i < 16; ++i) {
+    sprintf_s(fmtbuf, "%X", image_sha_bytes_[i]);
+    image_sha_str_ += &fmtbuf[0];
+  }
+
+  info_cache_.Init(this);
 
   // Find __savegprlr_* and __restgprlr_* and the others.
   // We can flag these for special handling (inlining/etc).
   if (!FindSaveRest()) {
@@ -1288,7 +1317,68 @@ std::unique_ptr<Function> XexModule::CreateFunction(uint32_t address) {
   return std::unique_ptr<Function>(
       processor_->backend()->CreateGuestFunction(this, address));
 }
+void XexInfoCache::Init(XexModule* xexmod) {
+  if (cvars::disable_instruction_infocache) {
+    return;
+  }
+
+  auto emu = xexmod->kernel_state_->emulator();
+  std::filesystem::path infocache_path = emu->cache_root();
+
+  infocache_path.append(L"modules");
+
+  infocache_path.append(xexmod->image_sha_str_);
+
+  std::filesystem::create_directories(infocache_path);
+  infocache_path.append("executable_addr_flags.bin");
+
+  unsigned num_codebytes = xexmod->high_address_ - xexmod->low_address_;
+  num_codebytes += 3;  // round up to nearest multiple of 4
+  num_codebytes &= ~3;
+
+  bool did_exist = true;
+  if (!std::filesystem::exists(infocache_path)) {
+    xe::filesystem::CreateEmptyFile(infocache_path);
+    did_exist = false;
+  }
+
+  // todo: prepopulate with stuff from pdata, dll exports
+
+  this->executable_addr_flags_ = std::move(xe::MappedMemory::Open(
+      infocache_path, xe::MappedMemory::Mode::kReadWrite, 0,
+      sizeof(InfoCacheFlagsHeader) +
+          (sizeof(InfoCacheFlags) *
+           (num_codebytes /
+            4))));  // one infocacheflags entry for each PPC instr-sized addr
+
+  if (did_exist) {
+    xexmod->PrecompileKnownFunctions();
+  }
+}
+
+InfoCacheFlags* XexModule::GetInstructionAddressFlags(uint32_t guest_addr) {
+  if (guest_addr < low_address_ || guest_addr > high_address_) {
+    return nullptr;
+  }
+
+  guest_addr -= low_address_;
+
+  return info_cache_.LookupFlags(guest_addr);
+}
+
+void XexModule::PrecompileKnownFunctions() {
+  if (cvars::disable_function_precompilation) {
+    return;
+  }
+  uint32_t start = 0;
+  uint32_t end = (high_address_ - low_address_) / 4;
+  auto flags = info_cache_.LookupFlags(0);
+  if (!flags) {
+    return;
+  }
+  for (uint32_t i = 0; i < end; i++) {
+    if (flags[i].was_resolved) {
+      processor_->ResolveFunction(low_address_ + (i * 4));
+    }
+  }
+}
 bool XexModule::FindSaveRest() {
   // Special stack save/restore functions.
   // http://research.microsoft.com/en-us/um/redmond/projects/invisible/src/crt/md/ppc/xxx.s.htm
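Review note (hedged sketch, illustrative types): the shape of the instruction info cache introduced above -- one 4-byte flags record per 4-byte-aligned guest code address, persisted in a memory-mapped file keyed by image hash so the next run can precompile known functions and emit MMIO checks up front. A vector stands in for the mapped file here:

    #include <cstdint>
    #include <vector>

    struct Flags {
      uint32_t was_resolved : 1;   // ResolveFunction() hit this address
      uint32_t accessed_mmio : 1;  // an access violation proved it's MMIO
      uint32_t reserved : 30;
    };

    class InfoCache {
     public:
      explicit InfoCache(uint32_t code_byte_count)
          : flags_(code_byte_count / 4) {}
      // offset is guest_addr - low_address, as in XexModule
      Flags* Lookup(uint32_t offset) {
        uint32_t index = offset / 4;
        return index < flags_.size() ? &flags_[index] : nullptr;
      }

     private:
      std::vector<Flags> flags_;  // the real cache maps a file instead
    };
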
+  */
+  std::unique_ptr<MappedMemory> executable_addr_flags_;
+
+  void Init(class XexModule*);
+  InfoCacheFlags* LookupFlags(unsigned offset) {
+    offset /= 4;
+    if (!executable_addr_flags_) {
+      return nullptr;
+    }
+    uint8_t* data = executable_addr_flags_->data();
+
+    if (!data) {
+      return nullptr;
+    }
+    return reinterpret_cast<InfoCacheFlagsHeader*>(data)->LookupFlags(offset);
+  }
+};
 class XexModule : public xe::cpu::Module {
  public:
@@ -174,10 +207,14 @@ class XexModule : public xe::cpu::Module {
                            XEX_MODULE_PATCH_FULL));
   }
+  InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr);
+  void PrecompileKnownFunctions();
+
  protected:
   std::unique_ptr<Function> CreateFunction(uint32_t address) override;
  private:
+  friend struct XexInfoCache;
   void ReadSecurityInfo();
   int ReadImage(const void* xex_addr, size_t xex_length, bool use_dev_key);
@@ -217,6 +254,10 @@ class XexModule : public xe::cpu::Module {
   XexFormat xex_format_ = kFormatUnknown;
   SecurityInfoContext security_info_ = {};
+
+  uint8_t image_sha_bytes_[16];
+  std::string image_sha_str_;
+  XexInfoCache info_cache_;
 };
 }  // namespace cpu
diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc
index 23c634cf2..dfc993dee 100644
--- a/src/xenia/gpu/command_processor.cc
+++ b/src/xenia/gpu/command_processor.cc
@@ -16,6 +16,7 @@
 #include "third_party/fmt/include/fmt/format.h"
 #include "xenia/base/byte_stream.h"
+#include "xenia/base/cvar.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
 #include "xenia/base/profiling.h"
@@ -28,6 +29,10 @@
 #include "xenia/kernel/kernel_state.h"
 #include "xenia/kernel/user_module.h"
+DEFINE_bool(log_unknown_register_writes, false,
+            "Log writes to unknown registers from "
+            "CommandProcessor::WriteRegister. Has significant performance hit.",
+            "GPU");
 namespace xe {
 namespace gpu {
@@ -329,19 +334,9 @@ void CommandProcessor::UpdateWritePointer(uint32_t value) {
   write_ptr_index_ = value;
   write_ptr_index_event_->Set();
 }
-
-void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
+void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
+                                                  uint32_t value) {
   RegisterFile& regs = *register_file_;
-  if (index >= RegisterFile::kRegisterCount) {
-    XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", index);
-    return;
-  }
-
-  regs.values[index].u32 = value;
-  if (!regs.GetRegisterInfo(index)) {
-    XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value);
-  }
-
   // Scratch register writeback.
   if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) {
     uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0;
@@ -469,6 +464,43 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
     }
   }
 }
+void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
+  if (XE_UNLIKELY(cvars::log_unknown_register_writes)) {
+    // chrispy: rearranged check order; store the value only after the checks
+    if (XE_UNLIKELY(!register_file_->IsValidRegister(index))) {
+      XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value);
+    check_reg_out_of_bounds:
+      if (XE_UNLIKELY(index >= RegisterFile::kRegisterCount)) {
+        XELOGW("CommandProcessor::WriteRegister index out of bounds: {}",
+               index);
+        return;
+      }
+    }
+  } else {
+    goto check_reg_out_of_bounds;  // always bounds-check, even when not logging
+  }
+  register_file_->values[index].u32 = value;
+
+  // regs with extra logic on write: XE_GPU_REG_COHER_STATUS_HOST,
+  // XE_GPU_REG_DC_LUT_RW_INDEX,
+  // XE_GPU_REG_DC_LUT_SEQ_COLOR, XE_GPU_REG_DC_LUT_PWL_DATA,
+  // XE_GPU_REG_DC_LUT_30_COLOR
+
+  // quick pre-test
+  // todo: figure out just how unlikely this is.
if very (it ought to be, theres + // a ton of registers other than these) make this predicate branchless and + // mark with unlikely, then make HandleSpecialRegisterWrite noinline yep, its + // very unlikely. these ORS here are meant to be bitwise ors, so that we do + // not do branching evaluation of the conditions (we will almost always take + // all of the branches) + if (XE_UNLIKELY( + (index - XE_GPU_REG_SCRATCH_REG0 < 8) | + (index == XE_GPU_REG_COHER_STATUS_HOST) | + ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <= + (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)))) { + HandleSpecialRegisterWrite(index, value); + } +} void CommandProcessor::MakeCoherent() { SCOPE_profile_cpu_f("gpu"); @@ -570,7 +602,7 @@ void CommandProcessor::ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) { // Return up a level if we encounter a bad packet. XELOGE("**** INDIRECT RINGBUFFER: Failed to execute packet."); assert_always(); - //break; + // break; } } while (reader.read_count()); diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index ffc8eeffa..412e8833d 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -150,7 +150,9 @@ class CommandProcessor { void WorkerThreadMain(); virtual bool SetupContext() = 0; virtual void ShutdownContext() = 0; - + // rarely needed, most register writes have no special logic here + XE_NOINLINE + void HandleSpecialRegisterWrite(uint32_t index, uint32_t value); virtual void WriteRegister(uint32_t index, uint32_t value); const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const { diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 8038b0dc2..add11e4f6 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -712,7 +712,7 @@ void D3D12CommandProcessor::SetViewport(const D3D12_VIEWPORT& viewport) { ff_viewport_update_needed_ |= ff_viewport_.Height != viewport.Height; ff_viewport_update_needed_ |= ff_viewport_.MinDepth != viewport.MinDepth; ff_viewport_update_needed_ |= ff_viewport_.MaxDepth != viewport.MaxDepth; - if (ff_viewport_update_needed_) { + if (XE_UNLIKELY(ff_viewport_update_needed_)) { ff_viewport_ = viewport; deferred_command_list_.RSSetViewport(ff_viewport_); ff_viewport_update_needed_ = false; diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc index 46f372503..0ae6c8552 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc @@ -4799,18 +4799,16 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears( if (!current_transfers.empty()) { are_current_command_list_render_targets_valid_ = false; if (dest_rt_key.is_depth) { - command_list.D3DOMSetRenderTargets( - 0, nullptr, FALSE, &dest_d3d12_rt.descriptor_draw().GetHandle()); + auto handle = dest_d3d12_rt.descriptor_draw().GetHandle(); + command_list.D3DOMSetRenderTargets(0, nullptr, FALSE, &handle); if (!use_stencil_reference_output_) { command_processor_.SetStencilReference(UINT8_MAX); } } else { - command_list.D3DOMSetRenderTargets( - 1, - &(dest_d3d12_rt.descriptor_load_separate().IsValid() - ? dest_d3d12_rt.descriptor_load_separate().GetHandle() - : dest_d3d12_rt.descriptor_draw().GetHandle()), - FALSE, nullptr); + auto handle = dest_d3d12_rt.descriptor_load_separate().IsValid() + ? 
dest_d3d12_rt.descriptor_load_separate().GetHandle() + : dest_d3d12_rt.descriptor_draw().GetHandle(); + command_list.D3DOMSetRenderTargets(1, &handle, FALSE, nullptr); } uint32_t dest_pitch_tiles = dest_rt_key.GetPitchTiles(); @@ -5425,12 +5423,12 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears( dest_d3d12_rt.SetResourceState(D3D12_RESOURCE_STATE_RENDER_TARGET), D3D12_RESOURCE_STATE_RENDER_TARGET); if (clear_via_drawing) { - command_list.D3DOMSetRenderTargets( - 1, - &(dest_d3d12_rt.descriptor_load_separate().IsValid() - ? dest_d3d12_rt.descriptor_load_separate().GetHandle() - : dest_d3d12_rt.descriptor_draw().GetHandle()), - FALSE, nullptr); + auto handle = + (dest_d3d12_rt.descriptor_load_separate().IsValid() + ? dest_d3d12_rt.descriptor_load_separate().GetHandle() + : dest_d3d12_rt.descriptor_draw().GetHandle()); + + command_list.D3DOMSetRenderTargets(1, &handle, FALSE, nullptr); are_current_command_list_render_targets_valid_ = true; D3D12_VIEWPORT clear_viewport; clear_viewport.TopLeftX = float(clear_rect.left); diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc index 24904c7e8..94e21a7e0 100644 --- a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc @@ -78,314 +78,24 @@ namespace shaders { #include "xenia/gpu/shaders/bytecode/d3d12_5_1/texture_load_r5g6b5_b5g6r5_scaled_cs.h" } // namespace shaders -const D3D12TextureCache::HostFormat D3D12TextureCache::host_formats_[64] = { - // k_1_REVERSE - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_1 - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_8 - {DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb, - DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_1_5_5_5 - // Red and blue swapped in the load shader for simplicity. - {DXGI_FORMAT_B5G5R5A1_UNORM, DXGI_FORMAT_B5G5R5A1_UNORM, - kLoadShaderIndexR5G5B5A1ToB5G5R5A1, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_5_6_5 - // Red and blue swapped in the load shader for simplicity. - {DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM, - kLoadShaderIndexR5G6B5ToB5G6R5, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_6_5_5 - // On the host, green bits in blue, blue bits in green. 
- {DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM, - kLoadShaderIndexR5G5B6ToB5G6R5WithRBGASwizzle, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, XE_GPU_MAKE_TEXTURE_SWIZZLE(R, B, G, G)}, - // k_8_8_8_8 - {DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM, - kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, kLoadShaderIndexUnknown, - false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_2_10_10_10 - {DXGI_FORMAT_R10G10B10A2_TYPELESS, DXGI_FORMAT_R10G10B10A2_UNORM, - kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_8_A - {DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb, - DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_8_B - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_8_8 - {DXGI_FORMAT_R8G8_TYPELESS, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndex16bpb, - DXGI_FORMAT_R8G8_SNORM, kLoadShaderIndexUnknown, false, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_Cr_Y1_Cb_Y0_REP - // Red and blue swapped in the load shader for simplicity. - // TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is usable for - // the signed version, separate unsigned and signed load shaders completely - // (as one doesn't need decompression for this format, while another does). - {DXGI_FORMAT_G8R8_G8B8_UNORM, DXGI_FORMAT_G8R8_G8B8_UNORM, - kLoadShaderIndexGBGR8ToGRGB8, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - true, DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexGBGR8ToRGB8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_Y1_Cr_Y0_Cb_REP - // Red and blue swapped in the load shader for simplicity. - // TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is usable for - // the signed version, separate unsigned and signed load shaders completely - // (as one doesn't need decompression for this format, while another does). - {DXGI_FORMAT_R8G8_B8G8_UNORM, DXGI_FORMAT_R8G8_B8G8_UNORM, - kLoadShaderIndexBGRG8ToRGBG8, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - true, DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexBGRG8ToRGB8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_16_16_EDRAM - // Not usable as a texture, also has -32...32 range. - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_8_8_8_8_A - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_4_4_4_4 - // Red and blue swapped in the load shader for simplicity. 
- {DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM, - kLoadShaderIndexRGBA4ToBGRA4, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_10_11_11 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, - kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, - kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_11_11_10 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, - kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, - kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_DXT1 - {DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, - DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_DXT2_3 - {DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, - DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_DXT4_5 - {DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, - DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_16_16_16_16_EDRAM - // Not usable as a texture, also has -32...32 range. - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // R32_FLOAT for depth because shaders would require an additional SRV to - // sample stencil, which we don't provide. 
- // k_24_8 - {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthUnorm, - DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_24_8_FLOAT - {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthFloat, - DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16 - {DXGI_FORMAT_R16_TYPELESS, DXGI_FORMAT_R16_UNORM, kLoadShaderIndex16bpb, - DXGI_FORMAT_R16_SNORM, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16_16 - {DXGI_FORMAT_R16G16_TYPELESS, DXGI_FORMAT_R16G16_UNORM, - kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_SNORM, kLoadShaderIndexUnknown, - false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_16_16_16_16 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, - kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_SNORM, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_16_EXPAND - {DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb, - DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16_16_EXPAND - {DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndex32bpb, - DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown, false, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_16_16_16_16_EXPAND - {DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT, - kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_16_FLOAT - {DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb, - DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16_16_FLOAT - {DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndex32bpb, - DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown, false, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_16_16_16_16_FLOAT - {DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT, - kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_32 - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_32_32 - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_32_32_32_32 - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_32_FLOAT - {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndex32bpb, - DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, 
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_32_32_FLOAT - {DXGI_FORMAT_R32G32_FLOAT, DXGI_FORMAT_R32G32_FLOAT, kLoadShaderIndex64bpb, - DXGI_FORMAT_R32G32_FLOAT, kLoadShaderIndexUnknown, false, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_32_32_32_32_FLOAT - {DXGI_FORMAT_R32G32B32A32_FLOAT, DXGI_FORMAT_R32G32B32A32_FLOAT, - kLoadShaderIndex128bpb, DXGI_FORMAT_R32G32B32A32_FLOAT, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_32_AS_8 - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_32_AS_8_8 - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_16_MPEG - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16_16_MPEG - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_8_INTERLACED - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_32_AS_8_INTERLACED - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_32_AS_8_8_INTERLACED - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_16_INTERLACED - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16_MPEG_INTERLACED - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16_16_MPEG_INTERLACED - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_DXN - {DXGI_FORMAT_BC5_UNORM, DXGI_FORMAT_BC5_UNORM, kLoadShaderIndex128bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8G8_UNORM, - kLoadShaderIndexDXNToRG8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_8_8_8_8_AS_16_16_16_16 - {DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM, - kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, kLoadShaderIndexUnknown, - false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_DXT1_AS_16_16_16_16 - {DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, - DXGI_FORMAT_R8G8B8A8_UNORM, 
kLoadShaderIndexDXT1ToRGBA8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_DXT2_3_AS_16_16_16_16 - {DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, - DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_DXT4_5_AS_16_16_16_16 - {DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, - DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_2_10_10_10_AS_16_16_16_16 - {DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM, - kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_10_11_11_AS_16_16_16_16 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, - kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, - kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_11_11_10_AS_16_16_16_16 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, - kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, - kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_32_32_32_FLOAT - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_DXT3A - // R8_UNORM has the same size as BC2, but doesn't have the 4x4 size - // alignment requirement. - {DXGI_FORMAT_R8_UNORM, DXGI_FORMAT_R8_UNORM, kLoadShaderIndexDXT3A, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_DXT5A - {DXGI_FORMAT_BC4_UNORM, DXGI_FORMAT_BC4_UNORM, kLoadShaderIndex64bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8_UNORM, - kLoadShaderIndexDXT5AToR8, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_CTX1 - {DXGI_FORMAT_R8G8_UNORM, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndexCTX1, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_DXT3A_AS_1_1_1_1 - {DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM, - kLoadShaderIndexDXT3AAs1111ToBGRA4, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_8_8_8_8_GAMMA_EDRAM - // Not usable as a texture. - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_2_10_10_10_FLOAT_EDRAM - // Not usable as a texture. 
-    {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
-     DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
-     kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
-};
+/*
+   chrispy: we're getting cache misses in GetHostFormatSwizzle; use a denser
+   array. todo: not all 65536 possible swizzles are used, this could probably
+   be one cache line
+*/
+using SwizzleArray = std::array<uint16_t, 64>;
+
+static constexpr SwizzleArray build_xenos_swizzle_for_format() {
+  SwizzleArray result{0};
+
+  for (int i = 0; i < 64; ++i) {
+    result[i] =
+        static_cast<uint16_t>(D3D12TextureCache::host_formats_[i].swizzle);
+  }
+  return result;
+}
+alignas(64) constexpr SwizzleArray xenos_swizzle_for_format =
+    build_xenos_swizzle_for_format();
 D3D12TextureCache::D3D12TextureCache(const RegisterFile& register_file,
                                      D3D12SharedMemory& shared_memory,
@@ -1544,7 +1254,8 @@ bool D3D12TextureCache::IsScaledResolveSupportedForFormat(
 }
 uint32_t D3D12TextureCache::GetHostFormatSwizzle(TextureKey key) const {
-  return host_formats_[uint32_t(key.format)].swizzle;
+  // return host_formats_[uint32_t(key.format)].swizzle;
+  return xenos_swizzle_for_format[uint32_t(key.format)];
 }
 uint32_t D3D12TextureCache::GetMaxHostTextureWidthHeight(
diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.h b/src/xenia/gpu/d3d12/d3d12_texture_cache.h
index 6a14948fe..d5aacd617 100644
--- a/src/xenia/gpu/d3d12/d3d12_texture_cache.h
+++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.h
@@ -160,29 +160,6 @@ class D3D12TextureCache final : public TextureCache {
   ID3D12Resource* RequestSwapTexture(
       D3D12_SHADER_RESOURCE_VIEW_DESC& srv_desc_out,
       xenos::TextureFormat& format_out);
-
- protected:
-  bool IsSignedVersionSeparateForFormat(TextureKey key) const override;
-  bool IsScaledResolveSupportedForFormat(TextureKey key) const override;
-  uint32_t GetHostFormatSwizzle(TextureKey key) const override;
-
-  uint32_t GetMaxHostTextureWidthHeight(
-      xenos::DataDimension dimension) const override;
-  uint32_t GetMaxHostTextureDepthOrArraySize(
-      xenos::DataDimension dimension) const override;
-
-  std::unique_ptr<Texture> CreateTexture(TextureKey key) override;
-
-  // This binds pipelines, allocates descriptors, and copies!
-  bool LoadTextureDataFromResidentMemoryImpl(Texture& texture, bool load_base,
-                                             bool load_mips) override;
-
-  void UpdateTextureBindingsImpl(uint32_t fetch_constant_mask) override;
-
- private:
-  static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2;
-  static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5;
-
   struct HostFormat {
     // Format info for the regular case.
     // DXGI format (typeless when different signedness or number representation
@@ -223,6 +200,352 @@
   // Mapping of Xenos swizzle components to DXGI format components.
uint32_t swizzle; }; + static constexpr HostFormat host_formats_[64]{ + // k_1_REVERSE + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_1 + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_8 + {DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb, + DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_1_5_5_5 + // Red and blue swapped in the load shader for simplicity. + {DXGI_FORMAT_B5G5R5A1_UNORM, DXGI_FORMAT_B5G5R5A1_UNORM, + kLoadShaderIndexR5G5B5A1ToB5G5R5A1, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_5_6_5 + // Red and blue swapped in the load shader for simplicity. + {DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM, + kLoadShaderIndexR5G6B5ToB5G6R5, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_6_5_5 + // On the host, green bits in blue, blue bits in green. + {DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM, + kLoadShaderIndexR5G5B6ToB5G6R5WithRBGASwizzle, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, XE_GPU_MAKE_TEXTURE_SWIZZLE(R, B, G, G)}, + // k_8_8_8_8 + {DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM, + kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_2_10_10_10 + {DXGI_FORMAT_R10G10B10A2_TYPELESS, DXGI_FORMAT_R10G10B10A2_UNORM, + kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_8_A + {DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb, + DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_8_B + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_8_8 + {DXGI_FORMAT_R8G8_TYPELESS, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndex16bpb, + DXGI_FORMAT_R8G8_SNORM, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_Cr_Y1_Cb_Y0_REP + // Red and blue swapped in the load shader for simplicity. + // TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is + // usable for + // the signed version, separate unsigned and signed load shaders + // completely + // (as one doesn't need decompression for this format, while another + // does). + {DXGI_FORMAT_G8R8_G8B8_UNORM, DXGI_FORMAT_G8R8_G8B8_UNORM, + kLoadShaderIndexGBGR8ToGRGB8, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM, + kLoadShaderIndexGBGR8ToRGB8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_Y1_Cr_Y0_Cb_REP + // Red and blue swapped in the load shader for simplicity. 
+ // TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is + // usable for + // the signed version, separate unsigned and signed load shaders + // completely + // (as one doesn't need decompression for this format, while another + // does). + {DXGI_FORMAT_R8G8_B8G8_UNORM, DXGI_FORMAT_R8G8_B8G8_UNORM, + kLoadShaderIndexBGRG8ToRGBG8, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM, + kLoadShaderIndexBGRG8ToRGB8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_16_16_EDRAM + // Not usable as a texture, also has -32...32 range. + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_8_8_8_8_A + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_4_4_4_4 + // Red and blue swapped in the load shader for simplicity. + {DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM, + kLoadShaderIndexRGBA4ToBGRA4, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_10_11_11 + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, + kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_11_11_10 + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, + kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_DXT1 + {DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_DXT2_3 + {DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_DXT4_5 + {DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_16_16_16_16_EDRAM + // Not usable as a texture, also has -32...32 range. + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // R32_FLOAT for depth because shaders would require an additional SRV + // to + // sample stencil, which we don't provide. 
+ // k_24_8 + {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthUnorm, + DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_24_8_FLOAT + {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthFloat, + DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16 + {DXGI_FORMAT_R16_TYPELESS, DXGI_FORMAT_R16_UNORM, kLoadShaderIndex16bpb, + DXGI_FORMAT_R16_SNORM, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16_16 + {DXGI_FORMAT_R16G16_TYPELESS, DXGI_FORMAT_R16G16_UNORM, + kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_SNORM, kLoadShaderIndexUnknown, + false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_16_16_16_16 + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_SNORM, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_16_EXPAND + {DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb, + DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16_16_EXPAND + {DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, + kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown, + false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_16_16_16_16_EXPAND + {DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT, + kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_16_FLOAT + {DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb, + DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16_16_FLOAT + {DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, + kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown, + false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_16_16_16_16_FLOAT + {DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT, + kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_32 + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_32_32 + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_32_32_32_32 + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_32_FLOAT + {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndex32bpb, + DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, 
kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_32_32_FLOAT + {DXGI_FORMAT_R32G32_FLOAT, DXGI_FORMAT_R32G32_FLOAT, + kLoadShaderIndex64bpb, DXGI_FORMAT_R32G32_FLOAT, kLoadShaderIndexUnknown, + false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_32_32_32_32_FLOAT + {DXGI_FORMAT_R32G32B32A32_FLOAT, DXGI_FORMAT_R32G32B32A32_FLOAT, + kLoadShaderIndex128bpb, DXGI_FORMAT_R32G32B32A32_FLOAT, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_32_AS_8 + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_32_AS_8_8 + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_16_MPEG + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16_16_MPEG + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_8_INTERLACED + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_32_AS_8_INTERLACED + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_32_AS_8_8_INTERLACED + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_16_INTERLACED + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16_MPEG_INTERLACED + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16_16_MPEG_INTERLACED + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_DXN + {DXGI_FORMAT_BC5_UNORM, DXGI_FORMAT_BC5_UNORM, kLoadShaderIndex128bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndexDXNToRG8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_8_8_8_8_AS_16_16_16_16 + {DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM, + kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_DXT1_AS_16_16_16_16 + {DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + 
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_DXT2_3_AS_16_16_16_16 + {DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_DXT4_5_AS_16_16_16_16 + {DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_2_10_10_10_AS_16_16_16_16 + {DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM, + kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_10_11_11_AS_16_16_16_16 + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, + kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_11_11_10_AS_16_16_16_16 + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, + kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_32_32_32_FLOAT + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_DXT3A + // R8_UNORM has the same size as BC2, but doesn't have the 4x4 size + // alignment requirement. + {DXGI_FORMAT_R8_UNORM, DXGI_FORMAT_R8_UNORM, kLoadShaderIndexDXT3A, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_DXT5A + {DXGI_FORMAT_BC4_UNORM, DXGI_FORMAT_BC4_UNORM, kLoadShaderIndex64bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8_UNORM, + kLoadShaderIndexDXT5AToR8, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_CTX1 + {DXGI_FORMAT_R8G8_UNORM, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndexCTX1, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_DXT3A_AS_1_1_1_1 + {DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM, + kLoadShaderIndexDXT3AAs1111ToBGRA4, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_8_8_8_8_GAMMA_EDRAM + // Not usable as a texture. + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_2_10_10_10_FLOAT_EDRAM + // Not usable as a texture. 
+      {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
+       DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
+       kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
+  };
+
+ protected:
+  bool IsSignedVersionSeparateForFormat(TextureKey key) const override;
+  bool IsScaledResolveSupportedForFormat(TextureKey key) const override;
+  uint32_t GetHostFormatSwizzle(TextureKey key) const override;
+
+  uint32_t GetMaxHostTextureWidthHeight(
+      xenos::DataDimension dimension) const override;
+  uint32_t GetMaxHostTextureDepthOrArraySize(
+      xenos::DataDimension dimension) const override;
+
+  std::unique_ptr<Texture> CreateTexture(TextureKey key) override;
+
+  // This binds pipelines, allocates descriptors, and copies!
+  bool LoadTextureDataFromResidentMemoryImpl(Texture& texture, bool load_base,
+                                             bool load_mips) override;
+
+  void UpdateTextureBindingsImpl(uint32_t fetch_constant_mask) override;
+
+ private:
+  static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2;
+  static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5;
   class D3D12Texture final : public Texture {
    public:
@@ -467,8 +790,6 @@ class D3D12TextureCache final : public TextureCache {
   xenos::ClampMode NormalizeClampMode(xenos::ClampMode clamp_mode) const;
-
-  static const HostFormat host_formats_[64];
-
   D3D12CommandProcessor& command_processor_;
   bool bindless_resources_used_;
diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc
index b49b9925a..fdc9cb0cf 100644
--- a/src/xenia/gpu/graphics_system.cc
+++ b/src/xenia/gpu/graphics_system.cc
@@ -198,7 +198,7 @@ uint32_t GraphicsSystem::ReadRegister(uint32_t addr) {
       // maximum [width(0x0FFF), height(0x0FFF)]
       return 0x050002D0;
     default:
-      if (!register_file_.GetRegisterInfo(r)) {
+      if (!register_file_.IsValidRegister(r)) {
        XELOGE("GPU: Read from unknown register ({:04X})", r);
       }
   }
diff --git a/src/xenia/gpu/register_file.cc b/src/xenia/gpu/register_file.cc
index f65d5d87c..5dd580e07 100644
--- a/src/xenia/gpu/register_file.cc
+++ b/src/xenia/gpu/register_file.cc
@@ -8,7 +8,7 @@
  */
 #include "xenia/gpu/register_file.h"
-
+#include <algorithm>
 #include "xenia/base/math.h"
@@ -17,6 +17,52 @@ namespace xe {
 namespace gpu {
 RegisterFile::RegisterFile() { std::memset(values, 0, sizeof(values)); }
+constexpr unsigned int GetHighestRegisterNumber() {
+  uint32_t highest = 0;
+#define XE_GPU_REGISTER(index, type, name) \
+  highest = std::max(highest, index);
+#include "xenia/gpu/register_table.inc"
+#undef XE_GPU_REGISTER
+
+  return highest;
+}
+constexpr unsigned int GetLowestRegisterNumber() {
+  uint32_t lowest = UINT_MAX;
+#define XE_GPU_REGISTER(index, type, name) \
+  lowest = std::min(lowest, index);
+#include "xenia/gpu/register_table.inc"
+#undef XE_GPU_REGISTER
+
+  return lowest;
+}
+
+static constexpr uint32_t lowest_register = GetLowestRegisterNumber();
+static constexpr uint32_t highest_register = GetHighestRegisterNumber();
+
+static constexpr uint32_t total_num_registers =
+    highest_register - lowest_register + 1;  // +1: the range is inclusive
+
+static constexpr uint32_t num_required_words_for_registers =
+    ((total_num_registers + 63) & ~63) / 64;
+// can't use std::bitset, it's not constexpr in C++17
+using ValidRegisterBitset = std::array<
+    uint64_t,
+    num_required_words_for_registers>;  // std::bitset<total_num_registers>;
+
+static constexpr ValidRegisterBitset BuildValidRegisterBitset() {
+  ValidRegisterBitset result{};
+#define XE_GPU_REGISTER(index, type, name) \
+  result[(index - lowest_register) / 64] |= \
+      1ULL << ((index - lowest_register) % 64);
+
+#include
"xenia/gpu/register_table.inc" +#undef XE_GPU_REGISTER + + return result; +} +static constexpr ValidRegisterBitset valid_register_bitset = + BuildValidRegisterBitset(); const RegisterInfo* RegisterFile::GetRegisterInfo(uint32_t index) { switch (index) { @@ -34,6 +80,18 @@ const RegisterInfo* RegisterFile::GetRegisterInfo(uint32_t index) { return nullptr; } } +/* + todo: this still uses a lot of cpu! our bitset is too large +*/ +bool RegisterFile::IsValidRegister(uint32_t index) { + if (XE_UNLIKELY(index < lowest_register) || + XE_UNLIKELY(index > highest_register)) { + return false; + } + uint32_t register_linear_index = index - lowest_register; + return (valid_register_bitset[register_linear_index / 64] & + (1ULL << (register_linear_index % 64))) != 0; +} } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/register_file.h b/src/xenia/gpu/register_file.h index e9a4f1137..11eebd8c5 100644 --- a/src/xenia/gpu/register_file.h +++ b/src/xenia/gpu/register_file.h @@ -32,7 +32,7 @@ class RegisterFile { RegisterFile(); static const RegisterInfo* GetRegisterInfo(uint32_t index); - + static bool IsValidRegister(uint32_t index); static constexpr size_t kRegisterCount = 0x5003; union RegisterValue { uint32_t u32; diff --git a/src/xenia/gpu/trace_viewer.cc b/src/xenia/gpu/trace_viewer.cc index 0d43b0a5e..85ba32c18 100644 --- a/src/xenia/gpu/trace_viewer.cc +++ b/src/xenia/gpu/trace_viewer.cc @@ -41,9 +41,6 @@ #include "xenia/ui/windowed_app_context.h" #include "xenia/xbox.h" -DEFINE_string(target_trace_file, "", "Specifies the trace file to load.", - "GPU"); - namespace xe { namespace gpu { @@ -66,7 +63,7 @@ TraceViewer::TraceViewer(xe::ui::WindowedAppContext& app_context, TraceViewer::~TraceViewer() = default; bool TraceViewer::OnInitialize() { - std::string path = cvars::target_trace_file; + std::string path = cvars::target_trace_file.u8string(); // If no path passed, ask the user. // On Android, however, there's no synchronous file picker, and the trace file diff --git a/src/xenia/gpu/trace_viewer.h b/src/xenia/gpu/trace_viewer.h index 58ab16e4e..188a6eb53 100644 --- a/src/xenia/gpu/trace_viewer.h +++ b/src/xenia/gpu/trace_viewer.h @@ -12,6 +12,7 @@ #include +#include "xenia/base/cvar.h" #include "xenia/emulator.h" #include "xenia/gpu/shader.h" #include "xenia/gpu/trace_player.h" @@ -24,7 +25,7 @@ #include "xenia/ui/window.h" #include "xenia/ui/window_listener.h" #include "xenia/ui/windowed_app.h" - +DECLARE_path(target_trace_file); namespace xe { namespace gpu { diff --git a/src/xenia/gpu/trace_writer.cc b/src/xenia/gpu/trace_writer.cc index b83e21868..bc7aadd5b 100644 --- a/src/xenia/gpu/trace_writer.cc +++ b/src/xenia/gpu/trace_writer.cc @@ -25,7 +25,7 @@ namespace xe { namespace gpu { - +#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1 TraceWriter::TraceWriter(uint8_t* membase) : membase_(membase), file_(nullptr) {} @@ -362,6 +362,6 @@ void TraceWriter::WriteGammaRamp( fwrite(gamma_ramp_pwl_rgb, 1, kPWLUncompressedLength, file_); } } - +#endif } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/trace_writer.h b/src/xenia/gpu/trace_writer.h index 407166068..0239d7f95 100644 --- a/src/xenia/gpu/trace_writer.h +++ b/src/xenia/gpu/trace_writer.h @@ -17,11 +17,22 @@ #include "xenia/gpu/registers.h" #include "xenia/gpu/trace_protocol.h" +// only enable trace writer in debug builds, measured hit from the trace +// function calls (even if they just immediately return) is 0.40-0.60% cpu time +// total. 
with inlining they just bloat the caller and negatively impact +// register allocation for the caller +#ifdef NDEBUG +#define XE_ENABLE_TRACE_WRITER_INSTRUMENTATION 0 +#else +#define XE_ENABLE_TRACE_WRITER_INSTRUMENTATION 1 +#endif + namespace xe { namespace gpu { class TraceWriter { public: +#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1 explicit TraceWriter(uint8_t* membase); ~TraceWriter(); @@ -61,6 +72,49 @@ class TraceWriter { bool compress_output_ = true; size_t compression_threshold_ = 1024; // Min. number of bytes to compress. + +#else + // this could be annoying to maintain if new methods are added or the + // signatures change + constexpr explicit TraceWriter(uint8_t* membase) {} + + static constexpr bool is_open() { return false; } + + static constexpr bool Open(const std::filesystem::path& path, + uint32_t title_id) { + return false; + } + static constexpr void Flush() {} + static constexpr void Close() {} + + static constexpr void WritePrimaryBufferStart(uint32_t base_ptr, + uint32_t count) {} + static constexpr void WritePrimaryBufferEnd() {} + static constexpr void WriteIndirectBufferStart(uint32_t base_ptr, + uint32_t count) {} + static constexpr void WriteIndirectBufferEnd() {} + static constexpr void WritePacketStart(uint32_t base_ptr, uint32_t count) {} + static constexpr void WritePacketEnd() {} + static constexpr void WriteMemoryRead(uint32_t base_ptr, size_t length, + const void* host_ptr = nullptr) {} + static constexpr void WriteMemoryReadCached(uint32_t base_ptr, + size_t length) {} + static constexpr void WriteMemoryReadCachedNop(uint32_t base_ptr, + size_t length) {} + static constexpr void WriteMemoryWrite(uint32_t base_ptr, size_t length, + const void* host_ptr = nullptr) {} + static constexpr void WriteEdramSnapshot(const void* snapshot) {} + static constexpr void WriteEvent(EventCommand::Type event_type) {} + static constexpr void WriteRegisters(uint32_t first_register, + const uint32_t* register_values, + uint32_t register_count, + bool execute_callbacks_on_play) {} + static constexpr void WriteGammaRamp( + const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table, + const reg::DC_LUT_PWL_DATA* gamma_ramp_pwl_rgb, + uint32_t gamma_ramp_rw_component) {} + +#endif }; } // namespace gpu diff --git a/src/xenia/kernel/user_module.cc b/src/xenia/kernel/user_module.cc index 5d1cb0f39..f2dc5b1d8 100644 --- a/src/xenia/kernel/user_module.cc +++ b/src/xenia/kernel/user_module.cc @@ -225,6 +225,7 @@ X_STATUS UserModule::LoadContinue() { ldr_data->xex_header_base = guest_xex_header_; ldr_data->full_image_size = security_header->image_size; ldr_data->image_base = this->xex_module()->base_address(); + ldr_data->entry_point = entry_point_; OnLoad(); diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc index 730972f25..388fefc62 100644 --- a/src/xenia/memory.cc +++ b/src/xenia/memory.cc @@ -198,7 +198,8 @@ bool Memory::Initialize() { // Add handlers for MMIO. 
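// [editor's note] SetMMIOExceptionRecordingCallback (added a few hunks below)
// pairs with the accessed_mmio bit in InfoCacheFlags from earlier in this
// diff. One plausible wiring, sketched with a hypothetical callback signature
// and body (the actual registration site is not shown in this diff):
//
//   void RecordMmioAccess(void* context, uint32_t guest_pc) {
//     auto xexmod = reinterpret_cast<xe::cpu::XexModule*>(context);
//     if (auto flags = xexmod->GetInstructionAddressFlags(guest_pc)) {
//       flags->accessed_mmio = 1;  // persisted via the mapped info cache
//     }
//   }
//   memory->SetMMIOExceptionRecordingCallback(RecordMmioAccess, xexmod);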
mmio_handler_ = cpu::MMIOHandler::Install( virtual_membase_, physical_membase_, physical_membase_ + 0x1FFFFFFF, - HostToGuestVirtualThunk, this, AccessViolationCallbackThunk, this); + HostToGuestVirtualThunk, this, AccessViolationCallbackThunk, this, + nullptr, nullptr); if (!mmio_handler_) { XELOGE("Unable to install MMIO handlers"); assert_always(); @@ -213,6 +214,11 @@ bool Memory::Initialize() { return true; } +void Memory::SetMMIOExceptionRecordingCallback( + cpu::MmioAccessRecordCallback callback, void* context) { + mmio_handler_->SetMMIOExceptionRecordingCallback(callback, context); +} + static const struct { uint64_t virtual_address_start; uint64_t virtual_address_end; @@ -1528,9 +1534,10 @@ bool PhysicalHeap::AllocRange(uint32_t low_address, uint32_t high_address, } bool PhysicalHeap::AllocSystemHeap(uint32_t size, uint32_t alignment, - uint32_t allocation_type, uint32_t protect, - bool top_down, uint32_t* out_address) { - return Alloc(size, alignment, allocation_type, protect, top_down, out_address); + uint32_t allocation_type, uint32_t protect, + bool top_down, uint32_t* out_address) { + return Alloc(size, alignment, allocation_type, protect, top_down, + out_address); } bool PhysicalHeap::Decommit(uint32_t address, uint32_t size) { diff --git a/src/xenia/memory.h b/src/xenia/memory.h index 813eb25bc..ed313a26d 100644 --- a/src/xenia/memory.h +++ b/src/xenia/memory.h @@ -498,6 +498,9 @@ class Memory { bool Save(ByteStream* stream); bool Restore(ByteStream* stream); + void SetMMIOExceptionRecordingCallback(cpu::MmioAccessRecordCallback callback, + void* context); + private: int MapViews(uint8_t* mapping_base); void UnmapViews(); diff --git a/src/xenia/ui/window_win.cc b/src/xenia/ui/window_win.cc index f2458b69d..1de3e0448 100644 --- a/src/xenia/ui/window_win.cc +++ b/src/xenia/ui/window_win.cc @@ -181,7 +181,6 @@ bool Win32Window::OpenImpl() { SetWindowPlacement(hwnd_, &initial_dpi_placement); } } - // Disable rounded corners starting with Windows 11 (or silently receive and // ignore E_INVALIDARG on Windows versions before 10.0.22000.0), primarily to // preserve all pixels of the guest output. @@ -189,7 +188,6 @@ bool Win32Window::OpenImpl() { DwmSetWindowAttribute(hwnd_, DWMWA_WINDOW_CORNER_PREFERENCE, &window_corner_preference, sizeof(window_corner_preference)); - // Disable flicks. ATOM atom = GlobalAddAtomW(L"MicrosoftTabletPenServiceProperty"); const DWORD_PTR dwHwndTabletProperty = @@ -1047,7 +1045,9 @@ LRESULT Win32Window::WndProc(HWND hWnd, UINT message, WPARAM wParam, } break; case WM_MOVE: { - OnMonitorUpdate(MonitorUpdateEvent(this, false)); + // chrispy: fix clang use of temporary error + MonitorUpdateEvent update_event{this, false}; + OnMonitorUpdate(update_event); } break; case WM_SIZE: { @@ -1084,7 +1084,9 @@ LRESULT Win32Window::WndProc(HWND hWnd, UINT message, WPARAM wParam, } break; case WM_DISPLAYCHANGE: { - OnMonitorUpdate(MonitorUpdateEvent(this, true)); + // chrispy: fix clang use of temporary error + MonitorUpdateEvent update_event{this, true}; + OnMonitorUpdate(update_event); } break; case WM_DPICHANGED: {
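// [editor's note] on the WM_MOVE/WM_DISPLAYCHANGE changes above: assuming
// OnMonitorUpdate() takes its event by non-const reference (consistent with
// the error the comment describes), MSVC's permissive mode accepts binding a
// temporary to it while clang rejects it. A minimal standalone reproduction
// with stand-in declarations, and the named-lvalue fix:
//
//   struct MonitorUpdateEvent { /* ... */ };
//   void OnMonitorUpdate(MonitorUpdateEvent& e);
//
//   OnMonitorUpdate(MonitorUpdateEvent{});  // error under clang
//   MonitorUpdateEvent e{};
//   OnMonitorUpdate(e);                     // ok: named lvalue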