From cb85fe401c25f91a41739b2a91254c3341527db9 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Sat, 13 Aug 2022 12:59:00 -0700
Subject: [PATCH 1/4] Huge set of performance improvements

Combined with an architecture-specific build and clang-cl, users have
reported absurd gains over master for some games, in the range of 50%-90%;
for normal MSVC builds I would put it at around 30-50%.

- Added per-XexModule caching of information per instruction; this can be
  used to remember what code needs compiling at startup.
- Record which guest addresses wrote MMIO and backpropagate that to future
  runs, eliminating the dependence on exception trapping. This makes many
  games, like Halo 3, actually tolerable to run under a debugger.
- Fixed a number of errors where temporaries were being passed by
  reference/pointer.
- Can now be compiled with clang-cl 14.0.1; it requires -Werror off,
  though, and some other solution/project changes.
- Added macros wrapping compiler extensions like noinline, forceinline,
  __expect, and cold.
- Removed the "global lock" in guest code completely. It does not properly
  emulate the behavior of mfmsrd/mtmsr, and it seriously cripples AMD CPUs.
  Removing it yielded around a 3x speedup in Halo Reach for me.
- Disabled the microprofiler for now; it has a huge performance cost
  associated with it. Developers can re-enable it in the base/profiling
  header if they really need it.
- Disabled the trace writer in release builds. Despite just returning after
  checking whether the file was open, the trace functions were consuming
  about 0.60% of total CPU time.
- Added IsValidReg. GetRegisterInfo is a huge (about 45k) branching
  function, and using it to check whether a register was valid consumed a
  significant chunk of time.
- Optimized RingBuffer::ReadAndSwap and RingBuffer::read_count. This gave
  us the largest overall boost in performance; the memcpys were
  unnecessary, and one of them was always a no-op.
- Added simplification rules for multiplicative patterns like (x+x) and
  (x<<1)+x (see the sketches after this list).
- For the most frequently called Win32 functions, added code to call their
  underlying NT implementations, which lets us skip a lot of MS code we
  don't care about or that isn't relevant to our use cases (also sketched
  after this list; this can be toggled off in the platform_win header).
- Handle indirect-call-true with a constant function pointer; this was
  occurring in Halo 3.
- Look up host format swizzles in a denser array.
- By default, don't check whether a GPU register is unknown; instead, just
  check whether it is out of range (controlled by a cvar). Looking up
  whether it was known or not took approx. 0.3% of CPU time.
- Changed some things in /cpu to make the project UNITYBUILD friendly.
- The timer thread was spinning way too much and consuming a ton of CPU;
  changed it to use a blocking wait instead.
- Tagged some conditions as XE_UNLIKELY/XE_LIKELY based on profiler
  feedback (will only affect clang builds).
- Shifted around some code in CommandProcessor::WriteRegister based on how
  frequently it was executed.
- Added support for docdecaduple-precision floating point so that we can
  represent our performance gains numerically.
- Tons of other stuff I'm probably forgetting.
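To make the multiplicative-pattern rules concrete, here is a minimal plain
C++ sketch of the arithmetic identities they rely on. This is illustration
only, not the HIR pass itself; the actual rules land in SimplifyAddToSelf
and SimplifyAddWithSHL in simplification_pass.cc further down.

  #include <cassert>
  #include <cstdint>

  uint64_t add_to_self(uint64_t x) { return x + x; }           // rewritten to x << 1
  uint64_t shl_plus_self(uint64_t x) { return (x << 1) + x; }  // rewritten to x * 3

  int main() {
    uint64_t samples[] = {0, 1, 0x12345678, ~uint64_t(0)};
    for (uint64_t x : samples) {
      assert(add_to_self(x) == (x << 1));  // holds for all x; unsigned math is modular,
      assert(shl_plus_self(x) == x * 3);   // so wraparound preserves both identities
    }
    return 0;
  }

Both rewrites replace an add chain with a single canonical shift or
multiply, which the backend can usually lower more cheaply.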
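The NT-implementation change is likewise easy to picture. The patch wraps
the pattern in an XE_NTDLL_IMPORT macro in platform_win.h; the sketch below
only shows the underlying technique with a plain GetProcAddress, with the
names and error handling simplified.

  #include <windows.h>
  #include <winternl.h>  // NTSTATUS

  typedef NTSTATUS(NTAPI* NtYieldExecutionFn)();

  int main() {
    // ntdll.dll is always loaded, so GetModuleHandleA is sufficient here.
    HMODULE ntdll = GetModuleHandleA("ntdll.dll");
    if (!ntdll) return 1;
    // Bind the ntdll export once, up front, and call it directly afterwards,
    // skipping the kernel32/KernelBase wrapper (SwitchToThread) on the hot path.
    auto yield = reinterpret_cast<NtYieldExecutionFn>(
        GetProcAddress(ntdll, "NtYieldExecution"));
    if (!yield) return 1;
    yield();  // same effect as SwitchToThread()
    return 0;
  }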
---
 src/xenia/base/byte_order.h                   |   4 +-
 src/xenia/base/mapped_memory_win.cc           |   3 +-
 src/xenia/base/memory_win.cc                  |  32 +-
 src/xenia/base/mutex.cc                       |   9 +-
 src/xenia/base/platform.h                     |  39 +-
 src/xenia/base/platform_win.h                 |  27 ++
 src/xenia/base/profiling.h                    |   2 +-
 src/xenia/base/ring_buffer.h                  |  70 ++++
 src/xenia/base/threading_timer_queue.cc       |  15 +-
 src/xenia/base/threading_win.cc               |  83 +++-
 src/xenia/base/utf8.cc                        |   5 +-
 src/xenia/cpu/backend/x64/x64_backend.cc      |  38 +-
 src/xenia/cpu/backend/x64/x64_backend.h       |   2 +
 src/xenia/cpu/backend/x64/x64_emitter.cc      |   2 +-
 src/xenia/cpu/backend/x64/x64_emitter.h       |  28 +-
 src/xenia/cpu/backend/x64/x64_op.h            |  49 ++-
 src/xenia/cpu/backend/x64/x64_seq_memory.cc   | 137 +++++--
 src/xenia/cpu/backend/x64/x64_sequences.cc    |   8 +-
 .../passes/constant_propagation_pass.cc       |  13 +
 .../compiler/passes/simplification_pass.cc    | 182 ++++++++-
 .../cpu/compiler/passes/simplification_pass.h |   4 +-
 src/xenia/cpu/hir/instr.cc                    |  14 +
 src/xenia/cpu/hir/instr.h                     |   2 +
 src/xenia/cpu/mmio_handler.cc                 |  23 +-
 src/xenia/cpu/mmio_handler.h                  |  17 +-
 src/xenia/cpu/ppc/ppc_emit_altivec.cc         |  28 +-
 src/xenia/cpu/ppc/ppc_emit_control.cc         |  14 +-
 src/xenia/cpu/ppc/ppc_opcode_disasm_gen.cc    |   1 +
 src/xenia/cpu/ppc/ppc_opcode_table_gen.cc     |   1 +
 src/xenia/cpu/processor.cc                    |  11 +
 src/xenia/cpu/xex_module.cc                   |  90 +++++
 src/xenia/cpu/xex_module.h                    |  43 +-
 src/xenia/gpu/command_processor.cc            |  58 ++-
 src/xenia/gpu/command_processor.h             |   4 +-
 .../gpu/d3d12/d3d12_command_processor.cc      |   2 +-
 .../gpu/d3d12/d3d12_render_target_cache.cc    |  26 +-
 src/xenia/gpu/d3d12/d3d12_texture_cache.cc    | 329 +---------------
 src/xenia/gpu/d3d12/d3d12_texture_cache.h     | 371 ++++++++++++++++--
 src/xenia/gpu/graphics_system.cc              |   2 +-
 src/xenia/gpu/register_file.cc                |  60 ++-
 src/xenia/gpu/register_file.h                 |   2 +-
 src/xenia/gpu/trace_viewer.cc                 |   5 +-
 src/xenia/gpu/trace_viewer.h                  |   3 +-
 src/xenia/gpu/trace_writer.cc                 |   4 +-
 src/xenia/gpu/trace_writer.h                  |  54 +++
 src/xenia/kernel/user_module.cc               |   1 +
 src/xenia/memory.cc                           |  15 +-
 src/xenia/memory.h                            |   3 +
 src/xenia/ui/window_win.cc                    |  10 +-
 49 files changed, 1462 insertions(+), 483 deletions(-)

diff --git a/src/xenia/base/byte_order.h b/src/xenia/base/byte_order.h
index 1a3c63b2f..5a076f319 100644
--- a/src/xenia/base/byte_order.h
+++ b/src/xenia/base/byte_order.h
@@ -46,7 +46,9 @@ static_assert((std::endian::native == std::endian::big) ||
 namespace xe {
 
-#if XE_COMPILER_MSVC
+// chrispy: added workaround for clang, otherwise byteswap_ulong becomes calls
+// to ucrtbase
+#if XE_COMPILER_MSVC == 1 && !defined(__clang__)
 #define XENIA_BASE_BYTE_SWAP_16 _byteswap_ushort
 #define XENIA_BASE_BYTE_SWAP_32 _byteswap_ulong
 #define XENIA_BASE_BYTE_SWAP_64 _byteswap_uint64
 
diff --git a/src/xenia/base/mapped_memory_win.cc b/src/xenia/base/mapped_memory_win.cc
index f0af5ee7b..6e12954b6 100644
--- a/src/xenia/base/mapped_memory_win.cc
+++ b/src/xenia/base/mapped_memory_win.cc
@@ -28,7 +28,8 @@
namespace xe { class Win32MappedMemory : public MappedMemory { public: // CreateFile returns INVALID_HANDLE_VALUE in case of failure. - static constexpr HANDLE kFileHandleInvalid = INVALID_HANDLE_VALUE; + // chrispy: made inline const to get around clang error + static inline const HANDLE kFileHandleInvalid = INVALID_HANDLE_VALUE; // CreateFileMapping returns nullptr in case of failure. static constexpr HANDLE kMappingHandleInvalid = nullptr; diff --git a/src/xenia/base/memory_win.cc b/src/xenia/base/memory_win.cc index cbed1b362..807e3911c 100644 --- a/src/xenia/base/memory_win.cc +++ b/src/xenia/base/memory_win.cc @@ -15,7 +15,15 @@ WINAPI_PARTITION_SYSTEM | WINAPI_PARTITION_GAMES) #define XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS #endif - +/* + these two dont bypass much ms garbage compared to the threading ones, + but Protect is used by PhysicalHeap::EnableAccessCallbacks which eats a lot + of cpu time, so every bit counts +*/ +XE_NTDLL_IMPORT(NtProtectVirtualMemory, cls_NtProtectVirtualMemory, + NtProtectVirtualMemoryPointer); +XE_NTDLL_IMPORT(NtQueryVirtualMemory, cls_NtQueryVirtualMemory, + NtQueryVirtualMemoryPointer); namespace xe { namespace memory { @@ -139,6 +147,18 @@ bool Protect(void* base_address, size_t length, PageAccess access, *out_old_access = PageAccess::kNoAccess; } DWORD new_protect = ToWin32ProtectFlags(access); + +#if XE_USE_NTDLL_FUNCTIONS == 1 + + DWORD old_protect = 0; + SIZE_T MemoryLength = length; + PVOID MemoryCache = base_address; + + BOOL result = NtProtectVirtualMemoryPointer.invoke( + (HANDLE)0xFFFFFFFFFFFFFFFFLL, &MemoryCache, &MemoryLength, + new_protect, &old_protect) >= 0; + +#else #ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS DWORD old_protect = 0; BOOL result = VirtualProtect(base_address, length, new_protect, &old_protect); @@ -146,6 +166,7 @@ bool Protect(void* base_address, size_t length, PageAccess access, ULONG old_protect = 0; BOOL result = VirtualProtectFromApp(base_address, length, ULONG(new_protect), &old_protect); +#endif #endif if (!result) { return false; @@ -161,8 +182,17 @@ bool QueryProtect(void* base_address, size_t& length, PageAccess& access_out) { MEMORY_BASIC_INFORMATION info; ZeroMemory(&info, sizeof(info)); +#if XE_USE_NTDLL_FUNCTIONS == 1 + ULONG_PTR ResultLength; + NTSTATUS query_result = NtQueryVirtualMemoryPointer.invoke( + (HANDLE)0xFFFFFFFFFFFFFFFFLL, (PVOID)base_address, + 0 /* MemoryBasicInformation*/, &info, length, &ResultLength); + SIZE_T result = query_result >= 0 ? ResultLength : 0; +#else SIZE_T result = VirtualQuery(base_address, &info, length); + +#endif if (!result) { return false; } diff --git a/src/xenia/base/mutex.cc b/src/xenia/base/mutex.cc index 80bdb8411..322985594 100644 --- a/src/xenia/base/mutex.cc +++ b/src/xenia/base/mutex.cc @@ -10,10 +10,9 @@ #include "xenia/base/mutex.h" namespace xe { - -std::recursive_mutex& global_critical_region::mutex() { - static std::recursive_mutex global_mutex; - return global_mutex; -} +// chrispy: moved this out of body of function to eliminate the initialization +// guards +static std::recursive_mutex global_mutex; +std::recursive_mutex& global_critical_region::mutex() { return global_mutex; } } // namespace xe diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h index 439d0c467..6d1a6d5f9 100644 --- a/src/xenia/base/platform.h +++ b/src/xenia/base/platform.h @@ -41,19 +41,33 @@ #error Unsupported target OS. 
#endif -#if defined(__clang__) +#if defined(__clang__) && !defined(_MSC_VER) // chrispy: support clang-cl #define XE_COMPILER_CLANG 1 +#define XE_COMPILER_HAS_CLANG_EXTENSIONS 1 #elif defined(__GNUC__) #define XE_COMPILER_GNUC 1 +#define XE_COMPILER_HAS_GNU_EXTENSIONS 1 #elif defined(_MSC_VER) #define XE_COMPILER_MSVC 1 +#define XE_COMPILER_HAS_MSVC_EXTENSIONS 1 #elif defined(__MINGW32) #define XE_COMPILER_MINGW32 1 +#define XE_COMPILER_HAS_GNU_EXTENSIONS 1 #elif defined(__INTEL_COMPILER) #define XE_COMPILER_INTEL 1 #else #define XE_COMPILER_UNKNOWN 1 #endif +// chrispy: had to place this here. +#if defined(__clang__) && defined(_MSC_VER) +#define XE_COMPILER_CLANG_CL 1 +#define XE_COMPILER_HAS_CLANG_EXTENSIONS 1 +#endif + +// clang extensions == superset of gnu extensions +#if XE_COMPILER_HAS_CLANG_EXTENSIONS == 1 +#define XE_COMPILER_HAS_GNU_EXTENSIONS 1 +#endif #if defined(_M_AMD64) || defined(__amd64__) #define XE_ARCH_AMD64 1 @@ -93,6 +107,29 @@ #define XEPACKEDSTRUCTANONYMOUS(value) _XEPACKEDSCOPE(struct value) #define XEPACKEDUNION(name, value) _XEPACKEDSCOPE(union name value) +#if XE_COMPILER_HAS_MSVC_EXTENSIONS == 1 +#define XE_FORCEINLINE __forceinline +#define XE_NOINLINE __declspec(noinline) +// can't properly emulate "cold" in msvc, but can still segregate the function +// into its own seg +#define XE_COLD __declspec(code_seg(".cold")) +#define XE_LIKELY(...) (!!(__VA_ARGS__)) +#define XE_UNLIKELY(...) (!!(__VA_ARGS__)) + +#elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1 +#define XE_FORCEINLINE __attribute__((always_inline)) +#define XE_NOINLINE __attribute__((noinline)) +#define XE_COLD __attribute__((cold)) +#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true) +#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false) +#else +#define XE_FORCEINLINE inline +#define XE_NOINLINE +#define XE_COLD +#define XE_LIKELY(...) (!!(__VA_ARGS__)) +#define XE_UNLIKELY(...) (!!(__VA_ARGS__)) +#endif + namespace xe { #if XE_PLATFORM_WIN32 diff --git a/src/xenia/base/platform_win.h b/src/xenia/base/platform_win.h index 22cad5d93..3013a9c14 100644 --- a/src/xenia/base/platform_win.h +++ b/src/xenia/base/platform_win.h @@ -34,4 +34,31 @@ #undef DeleteFile #undef GetFirstChild +#define XE_USE_NTDLL_FUNCTIONS 1 +#if XE_USE_NTDLL_FUNCTIONS==1 +/* + ntdll versions of functions often skip through a lot of extra garbage in KernelBase +*/ +#define XE_NTDLL_IMPORT(name, cls, clsvar) \ + static class cls { \ + public: \ + FARPROC fn;\ + cls() : fn(nullptr) {\ + auto ntdll = GetModuleHandleA("ntdll.dll");\ + if (ntdll) { \ + fn = GetProcAddress(ntdll, #name );\ + }\ + } \ + template \ + inline TRet invoke(TArgs... 
args) {\ + return reinterpret_cast(fn)(args...);\ + }\ + inline operator bool() const {\ + return fn!=nullptr;\ + }\ + } clsvar +#else +#define XE_NTDLL_IMPORT(name, cls, clsvar) static constexpr bool clsvar = false + +#endif #endif // XENIA_BASE_PLATFORM_WIN_H_ diff --git a/src/xenia/base/profiling.h b/src/xenia/base/profiling.h index 0e45b6cc2..b754bcf31 100644 --- a/src/xenia/base/profiling.h +++ b/src/xenia/base/profiling.h @@ -20,7 +20,7 @@ #include "xenia/ui/virtual_key.h" #include "xenia/ui/window_listener.h" -#if XE_PLATFORM_WIN32 +#if XE_PLATFORM_WIN32 && 0 #define XE_OPTION_PROFILING 1 #define XE_OPTION_PROFILING_UI 1 #else diff --git a/src/xenia/base/ring_buffer.h b/src/xenia/base/ring_buffer.h index 3165d6b7d..9925622de 100644 --- a/src/xenia/base/ring_buffer.h +++ b/src/xenia/base/ring_buffer.h @@ -19,7 +19,26 @@ #include "xenia/base/byte_order.h" namespace xe { +/* + todo: this class is CRITICAL to the performance of the entire emulator + currently, about 0.74% cpu time is still taken up by ReadAndSwap, 0.23 + is used by read_count I believe that part of the issue is that smaller + ringbuffers are kicking off an automatic prefetcher stream, that ends up + reading ahead of the end of the ring because it can only go in a straight + line it then gets a cache miss when it eventually wraps around to the start + of the ring? really hard to tell whats going on there honestly, maybe we can + occasionally prefetch the first line of the ring to L1? For the automatic + prefetching i don't think there are any good options. I don't know if we have + any control over where these buffers will be (they seem to be in guest memory + :/), but if we did we could right-justify the buffer so that the final byte + of the ring ends at the end of a page. i think most automatic prefetchers + cannot cross page boundaries it does feel like something isnt right here + though + todo: microoptimization, we can change our size members to be uint32 so + that the registers no longer need the rex prefix, shrinking the generated + code a bit.. 
like i said, every bit helps in this class +*/ class RingBuffer { public: RingBuffer(uint8_t* buffer, size_t capacity); @@ -32,6 +51,8 @@ class RingBuffer { uintptr_t read_ptr() const { return uintptr_t(buffer_) + read_offset_; } void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; } size_t read_count() const { +// chrispy: these branches are unpredictable +#if 0 if (read_offset_ == write_offset_) { return 0; } else if (read_offset_ < write_offset_) { @@ -39,6 +60,33 @@ class RingBuffer { } else { return (capacity_ - read_offset_) + write_offset_; } +#else + size_t read_offs = read_offset_; + size_t write_offs = write_offset_; + size_t cap = capacity_; + + size_t offset_delta = write_offs - read_offs; + size_t wrap_read_count = (cap - read_offs) + write_offs; + + size_t comparison_value = read_offs <= write_offs; +#if 0 + size_t selector = + static_cast(-static_cast(comparison_value)); + offset_delta &= selector; + + wrap_read_count &= ~selector; + return offset_delta | wrap_read_count; +#else + + if (XE_LIKELY(read_offs <= write_offs)) { + return offset_delta; // will be 0 if they are equal, semantically + // identical to old code (i checked the asm, msvc + // does not automatically do this) + } else { + return wrap_read_count; + } +#endif +#endif } size_t write_offset() const { return write_offset_; } @@ -113,6 +161,28 @@ class RingBuffer { size_t write_offset_ = 0; }; +template <> +inline uint32_t RingBuffer::ReadAndSwap() { + size_t read_offset = this->read_offset_; + xenia_assert(this->capacity_ >= 4); + + size_t next_read_offset = read_offset + 4; + #if 0 + size_t zerotest = next_read_offset - this->capacity_; + // unpredictable branch, use bit arith instead + // todo: it would be faster to use lzcnt, but we need to figure out if all + // machines we support support it + next_read_offset &= -static_cast(!!zerotest); + #else + if (XE_UNLIKELY(next_read_offset == this->capacity_)) { + next_read_offset = 0; + //todo: maybe prefetch next? or should that happen much earlier? 
+ } + #endif + this->read_offset_ = next_read_offset; + unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset]; + return xe::byte_swap(ring_value); +} } // namespace xe #endif // XENIA_BASE_RING_BUFFER_H_ diff --git a/src/xenia/base/threading_timer_queue.cc b/src/xenia/base/threading_timer_queue.cc index 79546b9d6..b55b618ae 100644 --- a/src/xenia/base/threading_timer_queue.cc +++ b/src/xenia/base/threading_timer_queue.cc @@ -10,12 +10,12 @@ #include #include +#include "third_party/disruptorplus/include/disruptorplus/blocking_wait_strategy.hpp" #include "third_party/disruptorplus/include/disruptorplus/multi_threaded_claim_strategy.hpp" #include "third_party/disruptorplus/include/disruptorplus/ring_buffer.hpp" #include "third_party/disruptorplus/include/disruptorplus/sequence_barrier.hpp" #include "third_party/disruptorplus/include/disruptorplus/spin_wait.hpp" #include "third_party/disruptorplus/include/disruptorplus/spin_wait_strategy.hpp" - #include "xenia/base/assert.h" #include "xenia/base/threading.h" #include "xenia/base/threading_timer_queue.h" @@ -26,6 +26,12 @@ namespace xe { namespace threading { using WaitItem = TimerQueueWaitItem; +/* + chrispy: changed this to a blocking wait from a spin-wait, the spin was + monopolizing a ton of cpu time (depending on the game 2-4% of total cpu time) + on my 3990x no complaints since that change +*/ +using WaitStrat = dp::blocking_wait_strategy; class TimerQueue { public: @@ -147,9 +153,10 @@ class TimerQueue { // This ring buffer will be used to introduce timers queued by the public API static constexpr size_t kWaitCount = 512; dp::ring_buffer> buffer_; - dp::spin_wait_strategy wait_strategy_; - dp::multi_threaded_claim_strategy claim_strategy_; - dp::sequence_barrier consumed_; + + WaitStrat wait_strategy_; + dp::multi_threaded_claim_strategy claim_strategy_; + dp::sequence_barrier consumed_; // This is a _sorted_ (ascending due_) list of active timers managed by a // dedicated thread diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc index 8f6087b05..5c00400e2 100644 --- a/src/xenia/base/threading_win.cc +++ b/src/xenia/base/threading_win.cc @@ -7,19 +7,49 @@ ****************************************************************************** */ +#include #include "xenia/base/assert.h" #include "xenia/base/chrono_steady_cast.h" #include "xenia/base/logging.h" #include "xenia/base/platform_win.h" #include "xenia/base/threading.h" #include "xenia/base/threading_timer_queue.h" - -#define LOG_LASTERROR() \ - { XELOGI("Win32 Error 0x{:08X} in " __FUNCTION__ "(...)", GetLastError()); } - +#if defined(__clang__) +// chrispy: i do not understand why this is an error for clang here +// something about the quoted __FUNCTION__ freaks it out (clang 14.0.1) +#define LOG_LASTERROR() \ + do { \ + XELOGI("Win32 Error 0x{:08X} in {} (...)", GetLastError(), __FUNCTION__); \ + } while (false) +#else +#define LOG_LASTERROR() \ + do { \ + XELOGI("Win32 Error 0x{:08X} in " __FUNCTION__ "(...)", GetLastError()); \ + } while (false) +#endif typedef HANDLE (*SetThreadDescriptionFn)(HANDLE hThread, PCWSTR lpThreadDescription); +// sys function for ntyieldexecution, by calling it we sidestep +// RtlGetCurrentUmsThread +XE_NTDLL_IMPORT(NtYieldExecution, cls_NtYieldExecution, + NtYieldExecutionPointer); +// sidestep the activation context/remapping special windows handles like stdout +XE_NTDLL_IMPORT(NtWaitForSingleObject, cls_NtWaitForSingleObject, + NtWaitForSingleObjectPointer); + +XE_NTDLL_IMPORT(NtSetEvent, cls_NtSetEvent, 
NtSetEventPointer); +// difference between NtClearEvent and NtResetEvent is that NtResetEvent returns +// the events state prior to the call, but we dont need that. might need to +// check whether one or the other is faster in the kernel though yeah, just +// checked, the code in ntoskrnl is way simpler for clearevent than resetevent +XE_NTDLL_IMPORT(NtClearEvent, cls_NtClearEvent, NtClearEventPointer); +XE_NTDLL_IMPORT(NtPulseEvent, cls_NtPulseEvent, NtPulseEventPointer); + +// heavily called, we dont skip much garbage by calling this, but every bit +// counts +XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore, + NtReleaseSemaphorePointer); namespace xe { namespace threading { @@ -80,7 +110,13 @@ void set_name(const std::string_view name) { } void MaybeYield() { +#if defined(XE_USE_NTDLL_FUNCTIONS) + NtYieldExecutionPointer.invoke(); +#else SwitchToThread(); +#endif + + // memorybarrier is really not necessary here... MemoryBarrier(); } @@ -134,8 +170,26 @@ class Win32Handle : public T { WaitResult Wait(WaitHandle* wait_handle, bool is_alertable, std::chrono::milliseconds timeout) { HANDLE handle = wait_handle->native_handle(); - DWORD result = WaitForSingleObjectEx(handle, DWORD(timeout.count()), - is_alertable ? TRUE : FALSE); + DWORD result; + DWORD timeout_dw = DWORD(timeout.count()); + BOOL bAlertable = is_alertable ? TRUE : FALSE; + // todo: we might actually be able to use NtWaitForSingleObject even if its + // alertable, just need to study whether + // RtlDeactivateActivationContextUnsafeFast/RtlActivateActivationContext are + // actually needed for us +#if XE_USE_NTDLL_FUNCTIONS == 1 + if (bAlertable) { + result = WaitForSingleObjectEx(handle, timeout_dw, bAlertable); + } else { + LARGE_INTEGER timeout_big; + timeout_big.QuadPart = -10000LL * static_cast(timeout_dw); + + result = NtWaitForSingleObjectPointer.invoke( + handle, bAlertable, timeout_dw == INFINITE ? 
nullptr : &timeout_big); + } +#else + result = WaitForSingleObjectEx(handle, timeout_dw, bAlertable); +#endif switch (result) { case WAIT_OBJECT_0: return WaitResult::kSuccess; @@ -178,7 +232,9 @@ std::pair WaitMultiple(WaitHandle* wait_handles[], size_t wait_handle_count, bool wait_all, bool is_alertable, std::chrono::milliseconds timeout) { - std::vector handles(wait_handle_count); + std::vector handles( + wait_handle_count); // max handles is like 64, so it would make more + // sense to just do a fixed size array here for (size_t i = 0; i < wait_handle_count; ++i) { handles[i] = wait_handles[i]->native_handle(); } @@ -208,9 +264,16 @@ class Win32Event : public Win32Handle { public: explicit Win32Event(HANDLE handle) : Win32Handle(handle) {} ~Win32Event() override = default; +#if XE_USE_NTDLL_FUNCTIONS == 1 + void Set() override { NtSetEventPointer.invoke(handle_, nullptr); } + void Reset() override { NtClearEventPointer.invoke(handle_); } + void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); } +#else void Set() override { SetEvent(handle_); } void Reset() override { ResetEvent(handle_); } void Pulse() override { PulseEvent(handle_); } + +#endif }; std::unique_ptr Event::CreateManualResetEvent(bool initial_state) { @@ -220,6 +283,7 @@ std::unique_ptr Event::CreateManualResetEvent(bool initial_state) { return std::make_unique(handle); } else { LOG_LASTERROR(); + return nullptr; } } @@ -240,10 +304,15 @@ class Win32Semaphore : public Win32Handle { explicit Win32Semaphore(HANDLE handle) : Win32Handle(handle) {} ~Win32Semaphore() override = default; bool Release(int release_count, int* out_previous_count) override { +#if XE_USE_NTDLL_FUNCTIONS == 1 + return NtReleaseSemaphorePointer.invoke(handle_, release_count, + out_previous_count) >= 0; +#else return ReleaseSemaphore(handle_, release_count, reinterpret_cast(out_previous_count)) ? true : false; +#endif } }; diff --git a/src/xenia/base/utf8.cc b/src/xenia/base/utf8.cc index 6405aa2f8..65f798f54 100644 --- a/src/xenia/base/utf8.cc +++ b/src/xenia/base/utf8.cc @@ -82,8 +82,9 @@ std::string upper_ascii(const std::string_view view) { template inline size_t hash_fnv1a(const std::string_view view) { const size_t offset_basis = 0xCBF29CE484222325ull; - const size_t prime = 0x00000100000001B3ull; - auto work = [&prime](size_t hash, uint8_t byte_of_data) { + // chrispy: constant capture errors on clang + auto work = [](size_t hash, uint8_t byte_of_data) { + const size_t prime = 0x00000100000001B3ull; hash ^= byte_of_data; hash *= prime; return hash; diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index 7d15d0e63..c3711f239 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -25,7 +25,7 @@ #include "xenia/cpu/breakpoint.h" #include "xenia/cpu/processor.h" #include "xenia/cpu/stack_walker.h" - +#include "xenia/cpu/xex_module.h" DEFINE_int32(x64_extension_mask, -1, "Allow the detection and utilization of specific instruction set " "features.\n" @@ -45,6 +45,12 @@ DEFINE_int32(x64_extension_mask, -1, " -1 = Detect and utilize all possible processor features\n", "x64"); +DEFINE_bool(record_mmio_access_exceptions, true, + "For guest addresses records whether we caught any mmio accesses " + "for them. 
This info can then be used on a subsequent run to " + "instruct the recompiler to emit checks", + "CPU"); + namespace xe { namespace cpu { namespace backend { @@ -86,6 +92,11 @@ X64Backend::~X64Backend() { ExceptionHandler::Uninstall(&ExceptionCallbackThunk, this); } +static void ForwardMMIOAccessForRecording(void* context, void* hostaddr) { + reinterpret_cast(context) + ->RecordMMIOExceptionForGuestInstruction(hostaddr); +} + bool X64Backend::Initialize(Processor* processor) { if (!Backend::Initialize(processor)) { return false; @@ -146,6 +157,8 @@ bool X64Backend::Initialize(Processor* processor) { // Setup exception callback ExceptionHandler::Install(&ExceptionCallbackThunk, this); + processor->memory()->SetMMIOExceptionRecordingCallback( + ForwardMMIOAccessForRecording, (void*)this); return true; } @@ -390,7 +403,28 @@ bool X64Backend::ExceptionCallbackThunk(Exception* ex, void* data) { auto backend = reinterpret_cast(data); return backend->ExceptionCallback(ex); } +void X64Backend::RecordMMIOExceptionForGuestInstruction(void* host_address) { + uint64_t host_addr_u64 = (uint64_t)host_address; + auto fnfor = code_cache()->LookupFunction(host_addr_u64); + if (fnfor) { + uint32_t guestaddr = fnfor->MapMachineCodeToGuestAddress(host_addr_u64); + + Module* guest_module = fnfor->module(); + if (guest_module) { + XexModule* xex_guest_module = dynamic_cast(guest_module); + + if (xex_guest_module) { + cpu::InfoCacheFlags* icf = + xex_guest_module->GetInstructionAddressFlags(guestaddr); + + if (icf) { + icf->accessed_mmio = true; + } + } + } + } +} bool X64Backend::ExceptionCallback(Exception* ex) { if (ex->code() != Exception::Code::kIllegalInstruction) { // We only care about illegal instructions. Other things will be handled by @@ -399,6 +433,8 @@ bool X64Backend::ExceptionCallback(Exception* ex) { return false; } + // processor_->memory()->LookupVirtualMappedRange() + // Verify an expected illegal instruction. auto instruction_bytes = xe::load_and_swap(reinterpret_cast(ex->pc())); diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h index a87cdc102..4ec930698 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.h +++ b/src/xenia/cpu/backend/x64/x64_backend.h @@ -92,6 +92,8 @@ class X64Backend : public Backend { } virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override; + void RecordMMIOExceptionForGuestInstruction(void* host_address); + private: static bool ExceptionCallbackThunk(Exception* ex, void* data); bool ExceptionCallback(Exception* ex); diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index e481788c3..dc435c39f 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -156,7 +156,7 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder, void** out_code_address, size_t* out_code_size, std::vector* out_source_map) { SCOPE_profile_cpu_f("cpu"); - + guest_module_ = dynamic_cast(function->module()); // Reset. 
debug_info_ = debug_info; debug_info_flags_ = debug_info_flags; diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 93a7babaf..93ac9915f 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -18,8 +18,8 @@ #include "xenia/cpu/hir/hir_builder.h" #include "xenia/cpu/hir/instr.h" #include "xenia/cpu/hir/value.h" +#include "xenia/cpu/xex_module.h" #include "xenia/memory.h" - // NOTE: must be included last as it expects windows.h to already be included. #include "third_party/xbyak/xbyak/xbyak.h" #include "third_party/xbyak/xbyak/xbyak_util.h" @@ -65,11 +65,7 @@ enum class SimdDomain : uint32_t { // CONFLICTING means its used in multiple domains) }; -enum class MXCSRMode : uint32_t { - Unknown, - Fpu, - Vmx -}; +enum class MXCSRMode : uint32_t { Unknown, Fpu, Vmx }; static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) { if (dom1 == dom2) { @@ -326,16 +322,21 @@ class X64Emitter : public Xbyak::CodeGenerator { size_t stack_size() const { return stack_size_; } SimdDomain DeduceSimdDomain(const hir::Value* for_value); - void ForgetMxcsrMode() { - mxcsr_mode_ = MXCSRMode::Unknown; - } + void ForgetMxcsrMode() { mxcsr_mode_ = MXCSRMode::Unknown; } /* - returns true if had to load mxcsr. DOT_PRODUCT can use this to skip clearing the overflow flag, as it will never be set in the vmx fpscr + returns true if had to load mxcsr. DOT_PRODUCT can use this to skip + clearing the overflow flag, as it will never be set in the vmx fpscr */ - bool ChangeMxcsrMode(MXCSRMode new_mode, bool already_set=false);//already_set means that the caller already did vldmxcsr, used for SET_ROUNDING_MODE + bool ChangeMxcsrMode( + MXCSRMode new_mode, + bool already_set = false); // already_set means that the caller already + // did vldmxcsr, used for SET_ROUNDING_MODE + + void LoadFpuMxcsrDirect(); // unsafe, does not change mxcsr_mode_ + void LoadVmxMxcsrDirect(); // unsafe, does not change mxcsr_mode_ + + XexModule* GuestModule() { return guest_module_; } - void LoadFpuMxcsrDirect(); //unsafe, does not change mxcsr_mode_ - void LoadVmxMxcsrDirect(); //unsafe, does not change mxcsr_mode_ protected: void* Emplace(const EmitFunctionInfo& func_info, GuestFunction* function = nullptr); @@ -348,6 +349,7 @@ class X64Emitter : public Xbyak::CodeGenerator { X64Backend* backend_ = nullptr; X64CodeCache* code_cache_ = nullptr; XbyakAllocator* allocator_ = nullptr; + XexModule* guest_module_ = nullptr; Xbyak::util::Cpu cpu_; uint32_t feature_flags_ = 0; diff --git a/src/xenia/cpu/backend/x64/x64_op.h b/src/xenia/cpu/backend/x64/x64_op.h index 745603032..b9257f179 100644 --- a/src/xenia/cpu/backend/x64/x64_op.h +++ b/src/xenia/cpu/backend/x64/x64_op.h @@ -60,23 +60,46 @@ union InstrKey { InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); } InstrKey(uint32_t v) : value(v) {} + + // this used to take about 1% cpu while precompiling + // it kept reloading opcode, and also constantly repacking and unpacking the + // bitfields. instead, we pack the fields at the very end InstrKey(const Instr* i) : value(0) { - opcode = i->opcode->num; - uint32_t sig = i->opcode->signature; - dest = - GET_OPCODE_SIG_TYPE_DEST(sig) ? 
OPCODE_SIG_TYPE_V + i->dest->type : 0; - src1 = GET_OPCODE_SIG_TYPE_SRC1(sig); - if (src1 == OPCODE_SIG_TYPE_V) { - src1 += i->src1.value->type; + const OpcodeInfo* info = i->GetOpcodeInfo(); + + uint32_t sig = info->signature; + + OpcodeSignatureType dest_type, src1_type, src2_type, src3_type; + + UnpackOpcodeSig(sig, dest_type, src1_type, src2_type, src3_type); + + uint32_t out_desttype = (uint32_t)dest_type; + uint32_t out_src1type = (uint32_t)src1_type; + uint32_t out_src2type = (uint32_t)src2_type; + uint32_t out_src3type = (uint32_t)src3_type; + + Value* destv = i->dest; + // pre-deref, even if not value + Value* src1v = i->src1.value; + Value* src2v = i->src2.value; + Value* src3v = i->src3.value; + + if (out_src1type == OPCODE_SIG_TYPE_V) { + out_src1type += src1v->type; } - src2 = GET_OPCODE_SIG_TYPE_SRC2(sig); - if (src2 == OPCODE_SIG_TYPE_V) { - src2 += i->src2.value->type; + + if (out_src2type == OPCODE_SIG_TYPE_V) { + out_src2type += src2v->type; } - src3 = GET_OPCODE_SIG_TYPE_SRC3(sig); - if (src3 == OPCODE_SIG_TYPE_V) { - src3 += i->src3.value->type; + + if (out_src3type == OPCODE_SIG_TYPE_V) { + out_src3type += src3v->type; } + opcode = info->num; + dest = out_desttype ? OPCODE_SIG_TYPE_V + destv->type : 0; + src1 = out_src1type; + src2 = out_src2type; + src3 = out_src3type; } template GuestAddressFor(); + if (!guestaddr) { + return false; + } + + auto flags = e.GuestModule()->GetInstructionAddressFlags(guestaddr); + + return flags && flags->accessed_mmio; +} // ============================================================================ // OPCODE_LOAD_OFFSET @@ -1030,6 +1049,28 @@ struct LOAD_OFFSET_I64 EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16, LOAD_OFFSET_I32, LOAD_OFFSET_I64); +template +static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) { + if (swap) { + value = xe::byte_swap(value); + } + if (guestaddr >= 0xE0000000) { + guestaddr += 0x1000; + } + + auto ctx = reinterpret_cast(_ctx); + + auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr); + if (!gaddr) { + *reinterpret_cast(ctx->virtual_membase + guestaddr) = value; + } else { + value = xe::byte_swap(value); /* + was having issues, found by comparing the values used with exceptions + to these that we were reversed... 
+ */ + gaddr->write(nullptr, gaddr->callback_context, guestaddr, value); + } +} // ============================================================================ // OPCODE_STORE_OFFSET // ============================================================================ @@ -1038,6 +1079,7 @@ struct STORE_OFFSET_I8 I> { static void Emit(X64Emitter& e, const EmitArgType& i) { auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.src3.is_constant) { e.mov(e.byte[addr], i.src3.constant()); } else { @@ -1076,23 +1118,48 @@ struct STORE_OFFSET_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src3.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.dword[addr], i.src3); - } else { - assert_always("not implemented"); + if (IsPossibleMMIOInstruction(e, i.instr)) { + void* addrptr = (void*)&MMIOAwareStore; + + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + addrptr = (void*)&MMIOAwareStore; + } + if (i.src1.is_constant) { + e.mov(e.GetNativeParam(0).cvt32(), i.src1.constant()); + } else { + e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32()); + } + if (i.src2.is_constant) { + e.add(e.GetNativeParam(0).cvt32(), (uint32_t)i.src2.constant()); + } else { + e.add(e.GetNativeParam(0).cvt32(), i.src2); } - } else { if (i.src3.is_constant) { - if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) { - e.mov(e.dword[addr], e.GetMembaseReg().cvt32()); + e.mov(e.GetNativeParam(1).cvt32(), i.src3.constant()); + } else { + e.mov(e.GetNativeParam(1).cvt32(), i.src3); + } + e.CallNativeSafe(addrptr); + + } else { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + assert_false(i.src3.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.dword[addr], i.src3); } else { - e.mov(e.dword[addr], i.src3.constant()); + assert_always("not implemented"); } } else { - e.mov(e.dword[addr], i.src3); + if (i.src3.is_constant) { + if (i.src3.constant() == 0 && e.CanUseMembaseLow32As0()) { + e.mov(e.dword[addr], e.GetMembaseReg().cvt32()); + } else { + e.mov(e.dword[addr], i.src3.constant()); + } + } else { + e.mov(e.dword[addr], i.src3); + } } } } @@ -1290,23 +1357,43 @@ struct STORE_I16 : Sequence> { }; struct STORE_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - assert_false(i.src2.is_constant); - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(e.dword[addr], i.src2); - } else { - assert_always("not implemented"); + if (IsPossibleMMIOInstruction(e, i.instr)) { + void* addrptr = (void*)&MMIOAwareStore; + + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + addrptr = (void*)&MMIOAwareStore; } - } else { - if (i.src2.is_constant) { - e.mov(e.dword[addr], i.src2.constant()); + if (i.src1.is_constant) { + e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant()); } else { - e.mov(e.dword[addr], i.src2); + e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32()); + } + if (i.src2.is_constant) { + e.mov(e.GetNativeParam(1).cvt32(), i.src2.constant()); + } else { + e.mov(e.GetNativeParam(1).cvt32(), i.src2); + } + e.CallNativeSafe(addrptr); + + } else { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + 
assert_false(i.src2.is_constant); + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(e.dword[addr], i.src2); + } else { + assert_always("not implemented"); + } + } else { + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.constant()); + } else { + e.mov(e.dword[addr], i.src2); + } } } if (IsTracingData()) { - addr = ComputeMemoryAddress(e, i.src1); + auto addr = ComputeMemoryAddress(e, i.src1); e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]); e.lea(e.GetNativeParam(0), e.ptr[addr]); e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index e99628728..3fe52857b 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -1683,6 +1683,9 @@ struct DIV_I16 : Sequence> { assert_impossible_sequence(DIV_I16); } }; +/* + TODO: hoist the overflow/zero checks into HIR +*/ struct DIV_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { Xbyak::Label skip; @@ -1766,6 +1769,9 @@ struct DIV_I32 : Sequence> { e.mov(i.dest, e.eax); } }; +/* + TODO: hoist the overflow/zero checks into HIR +*/ struct DIV_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { Xbyak::Label skip; @@ -1811,7 +1817,7 @@ struct DIV_I64 : Sequence> { } else { // check for signed overflow if (i.src1.is_constant) { - if (i.src1.constant() != (1 << 31)) { + if (i.src1.constant() != (1ll << 63)) { // we're good, overflow is impossible } else { e.cmp(i.src2, -1); // otherwise, if src2 is -1 then we have diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index f7d882279..a4e39e78c 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -149,7 +149,20 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { i->Remove(); } result = true; + } else if (i->src2.value->IsConstant()) { // chrispy: fix h3 bug from + // const indirect call true + auto function = processor_->LookupFunction( + uint32_t(i->src2.value->constant.i32)); + if (!function) { + break; + } + // i->Replace(&OPCODE_CALL_TRUE_info, i->flags); + i->opcode = &OPCODE_CALL_TRUE_info; + i->set_src2(nullptr); + i->src2.symbol = function; + result = true; } + break; case OPCODE_BRANCH_TRUE: diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index 894c4423b..a5100cff6 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -796,10 +796,13 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, if (var_definition) { var_definition = var_definition->GetDestDefSkipAssigns(); - if (var_definition != NULL) - { - def_opcode = var_definition->opcode->num; + if (!var_definition) { + return false; } + def_opcode = var_definition->opcode->num; + } + if (!var_definition) { + return false; } // x == 0 -> !x if (cmpop == OPCODE_COMPARE_EQ && constant_unpacked == 0) { @@ -1231,13 +1234,12 @@ Value* SimplificationPass::CheckValue(Value* value, bool& result) { result = false; return value; } - -bool SimplificationPass::SimplifyAddArith(hir::Instr* i, - hir::HIRBuilder* builder) { +bool SimplificationPass::SimplifyAddWithSHL(hir::Instr* i, + hir::HIRBuilder* builder) { /* - example: (x <<1 ) + x == (x*3) + example: (x <<1 ) + x == (x*3) - */ +*/ auto [shlinsn, addend] = 
i->BinaryValueArrangeByDefiningOpcode(&OPCODE_SHL_info); if (!shlinsn) { @@ -1278,11 +1280,81 @@ bool SimplificationPass::SimplifyAddArith(hir::Instr* i, return true; } +bool SimplificationPass::SimplifyAddToSelf(hir::Instr* i, + hir::HIRBuilder* builder) { + /* + heres a super easy one + */ + + if (i->src1.value != i->src2.value) { + return false; + } + + i->opcode = &OPCODE_SHL_info; + + i->set_src2(builder->LoadConstantUint8(1)); + + return true; +} +bool SimplificationPass::SimplifyAddArith(hir::Instr* i, + hir::HIRBuilder* builder) { + if (SimplifyAddWithSHL(i, builder)) { + return true; + } + if (SimplifyAddToSelf(i, builder)) { + return true; + } + return false; +} bool SimplificationPass::SimplifySubArith(hir::Instr* i, hir::HIRBuilder* builder) { + /* + todo: handle expressions like (x*8) - (x*5) == (x*3)...if these can even + happen of course */ return false; } +bool SimplificationPass::SimplifySHLArith(hir::Instr* i, + hir::HIRBuilder* builder) { + Value* sh = i->src2.value; + + Value* shifted = i->src1.value; + + if (!sh->IsConstant()) { + return false; + } + + hir::Instr* definition = shifted->GetDefSkipAssigns(); + + if (!definition) { + return false; + } + + if (definition->GetOpcodeNum() != OPCODE_MUL) { + return false; + } + + if (definition->flags != ARITHMETIC_UNSIGNED) { + return false; + } + + auto [mulconst, mulnonconst] = definition->BinaryValueArrangeAsConstAndVar(); + + if (!mulconst) { + return false; + } + + auto newmul = builder->AllocValue(mulconst->type); + newmul->set_from(mulconst); + + newmul->Shl(sh); + + i->Replace(&OPCODE_MUL_info, ARITHMETIC_UNSIGNED); + i->set_src1(mulnonconst); + i->set_src2(newmul); + + return true; +} bool SimplificationPass::SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder) { if (!i->dest) { @@ -1301,6 +1373,9 @@ bool SimplificationPass::SimplifyBasicArith(hir::Instr* i, case OPCODE_SUB: { return SimplifySubArith(i, builder); } + case OPCODE_SHL: { + return SimplifySHLArith(i, builder); + } } return false; } @@ -1317,6 +1392,97 @@ bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) { } return result; } + +/* + todo: add load-store simplification pass + + do things like load-store byteswap elimination, for instance, + + if a value is loaded, ored with a constant mask, and then stored, we + simply have to byteswap the mask it will be ored with and then we can + eliminate the two byteswaps + + the same can be done for and, or, xor, andn with constant masks + + + this can also be done for comparisons with 0 for equality and not equal + + + another optimization: with ppc you cannot move a floating point register + directly to a gp one, a gp one directly to a floating point register, or a + vmx one to either. so guest code will store the result to the stack, and then + load it to the register it needs in HIR we can sidestep this. 
we will still + need to byteswap and store the result for correctness, but we can eliminate + the load and byteswap by grabbing the original value from the store + + skyth's sanic idb, 0x824D7724 + lis r11, + lfs f0, flt_8200CBCC@l(r11) + fmuls f0, time, f0 + fctidz f0, f0 # vcvttss2si + stfd f0, 0x190+var_138(r1) + lwz r30, 0x190+var_138+4(r1) + cmplwi cr6, r30, 0x63 # 'c' + ble cr6, counter_op + + + +*/ + +/* + todo: simple loop unrolling + skyth sanic 0x831D9908 + + mr r30, r4 + mr r29, r5 + mr r11, r7 + li r31, 0 + +loc_831D9928: + slwi r9, r11, 1 + addi r10, r11, 1 + addi r8, r1, 0xD0+var_80 + clrlwi r11, r10, 16 + cmplwi cr6, r11, 0x10 + sthx r31, r9, r8 + ble cr6, loc_831D9928 + + v5 = 1; + do + { + v6 = 2 * v5; + v5 = (unsigned __int16)(v5 + 1); + *(_WORD *)&v24[v6] = 0; + } + while ( v5 <= 0x10 ); + v7 = 0; + do + { + v8 = __ROL4__(*(unsigned __int8 *)(v7 + a2), 1); + v7 = (unsigned __int16)(v7 + 1); + ++*(_WORD *)&v24[v8]; + } + while ( v7 < 8 ); + v9 = 1; + v25[0] = 0; + do + { + v10 = 2 * v9; + v11 = 16 - v9; + v9 = (unsigned __int16)(v9 + 1); + v25[v10 / 2] = (*(_WORD *)&v24[v10] << v11) + *(_WORD +*)&v24[v10 + 48]; + } + while ( v9 <= 0x10 ); + + + skyth sanic: + sub_831BBAE0 + + sub_831A41A8 + + +*/ } // namespace passes } // namespace compiler } // namespace cpu diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h index 8a5d3ee4c..078187eb1 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.h +++ b/src/xenia/cpu/compiler/passes/simplification_pass.h @@ -36,9 +36,11 @@ class SimplificationPass : public ConditionalGroupSubpass { // handles simple multiplication/addition rules bool SimplifyBasicArith(hir::HIRBuilder* builder); bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder); - + bool SimplifyAddWithSHL(hir::Instr* i, hir::HIRBuilder* builder); + bool SimplifyAddToSelf(hir::Instr* i, hir::HIRBuilder* builder); bool SimplifyAddArith(hir::Instr* i, hir::HIRBuilder* builder); bool SimplifySubArith(hir::Instr* i, hir::HIRBuilder* builder); + bool SimplifySHLArith(hir::Instr* i, hir::HIRBuilder* builder); // handle either or or xor with 0 bool CheckOrXorZero(hir::Instr* i); bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder); diff --git a/src/xenia/cpu/hir/instr.cc b/src/xenia/cpu/hir/instr.cc index 92e2848f8..149103d43 100644 --- a/src/xenia/cpu/hir/instr.cc +++ b/src/xenia/cpu/hir/instr.cc @@ -200,6 +200,20 @@ const Instr* Instr::GetNonFakePrev() const { } return curr; } + +uint32_t Instr::GuestAddressFor() const { + Instr* srch = prev; + + while (srch) { + if (srch->GetOpcodeNum() == OPCODE_SOURCE_OFFSET) { + return (uint32_t)srch->src1.offset; + } + srch = srch->prev; + } + + return 0; // eek. 
+} + } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h index 47f629227..38afef241 100644 --- a/src/xenia/cpu/hir/instr.h +++ b/src/xenia/cpu/hir/instr.h @@ -169,6 +169,8 @@ if both are constant, return nullptr, nullptr // gets previous instr, skipping instrs like COMMENT, OPCODE_CONTEXT_BARRIER, // OPCODE_SOURCE_OFFSET const hir::Instr* GetNonFakePrev() const; + + uint32_t GuestAddressFor() const; }; } // namespace hir diff --git a/src/xenia/cpu/mmio_handler.cc b/src/xenia/cpu/mmio_handler.cc index eb28703d1..61f420eaa 100644 --- a/src/xenia/cpu/mmio_handler.cc +++ b/src/xenia/cpu/mmio_handler.cc @@ -30,7 +30,8 @@ std::unique_ptr MMIOHandler::Install( HostToGuestVirtual host_to_guest_virtual, const void* host_to_guest_virtual_context, AccessViolationCallback access_violation_callback, - void* access_violation_callback_context) { + void* access_violation_callback_context, + MmioAccessRecordCallback record_mmio_callback, void* record_mmio_context) { // There can be only one handler at a time. assert_null(global_handler_); if (global_handler_) { @@ -40,7 +41,8 @@ std::unique_ptr MMIOHandler::Install( auto handler = std::unique_ptr(new MMIOHandler( virtual_membase, physical_membase, membase_end, host_to_guest_virtual, host_to_guest_virtual_context, access_violation_callback, - access_violation_callback_context)); + access_violation_callback_context, record_mmio_callback, + record_mmio_context)); // Install the exception handler directed at the MMIOHandler. ExceptionHandler::Install(ExceptionCallbackThunk, handler.get()); @@ -54,14 +56,18 @@ MMIOHandler::MMIOHandler(uint8_t* virtual_membase, uint8_t* physical_membase, HostToGuestVirtual host_to_guest_virtual, const void* host_to_guest_virtual_context, AccessViolationCallback access_violation_callback, - void* access_violation_callback_context) + void* access_violation_callback_context, + MmioAccessRecordCallback record_mmio_callback, + void* record_mmio_context) : virtual_membase_(virtual_membase), physical_membase_(physical_membase), memory_end_(membase_end), host_to_guest_virtual_(host_to_guest_virtual), host_to_guest_virtual_context_(host_to_guest_virtual_context), access_violation_callback_(access_violation_callback), - access_violation_callback_context_(access_violation_callback_context) {} + access_violation_callback_context_(access_violation_callback_context), + record_mmio_callback_(record_mmio_callback), + record_mmio_context_(record_mmio_context) {} MMIOHandler::~MMIOHandler() { ExceptionHandler::Uninstall(ExceptionCallbackThunk, this); @@ -412,6 +418,8 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) { // Quick kill anything outside our mapping. return false; } + uint64_t hostip = ex->pc(); + void* fault_host_address = reinterpret_cast(ex->fault_address()); // Access violations are pretty rare, so we can do a linear search here. @@ -561,6 +569,13 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) { } #endif // XE_ARCH_ARM64 + if (record_mmio_callback_) { + // record that the guest address corresponding to the faulting instructions' + // host address reads/writes mmio. we can backpropagate this info on future + // compilations + record_mmio_callback_(record_mmio_context_, (void*)ex->pc()); + } + // Advance RIP to the next instruction so that we resume properly. 
ex->set_resume_pc(rip + decoded_load_store.length); diff --git a/src/xenia/cpu/mmio_handler.h b/src/xenia/cpu/mmio_handler.h index 6240544e0..d9f6dc04c 100644 --- a/src/xenia/cpu/mmio_handler.h +++ b/src/xenia/cpu/mmio_handler.h @@ -29,7 +29,8 @@ typedef uint32_t (*MMIOReadCallback)(void* ppc_context, void* callback_context, uint32_t addr); typedef void (*MMIOWriteCallback)(void* ppc_context, void* callback_context, uint32_t addr, uint32_t value); - +typedef void (*MmioAccessRecordCallback)(void* context, + void* host_insn_address); struct MMIORange { uint32_t address; uint32_t mask; @@ -58,7 +59,8 @@ class MMIOHandler { HostToGuestVirtual host_to_guest_virtual, const void* host_to_guest_virtual_context, AccessViolationCallback access_violation_callback, - void* access_violation_callback_context); + void* access_violation_callback_context, + MmioAccessRecordCallback record_mmio_callback, void* record_mmio_context); static MMIOHandler* global_handler() { return global_handler_; } bool RegisterRange(uint32_t virtual_address, uint32_t mask, uint32_t size, @@ -68,13 +70,20 @@ class MMIOHandler { bool CheckLoad(uint32_t virtual_address, uint32_t* out_value); bool CheckStore(uint32_t virtual_address, uint32_t value); + void SetMMIOExceptionRecordingCallback(MmioAccessRecordCallback callback, + void* context) { + record_mmio_context_ = context; + record_mmio_callback_ = callback; + } protected: MMIOHandler(uint8_t* virtual_membase, uint8_t* physical_membase, uint8_t* membase_end, HostToGuestVirtual host_to_guest_virtual, const void* host_to_guest_virtual_context, AccessViolationCallback access_violation_callback, - void* access_violation_callback_context); + void* access_violation_callback_context, + MmioAccessRecordCallback record_mmio_callback, + void* record_mmio_context); static bool ExceptionCallbackThunk(Exception* ex, void* data); bool ExceptionCallback(Exception* ex); @@ -90,7 +99,9 @@ class MMIOHandler { AccessViolationCallback access_violation_callback_; void* access_violation_callback_context_; + MmioAccessRecordCallback record_mmio_callback_; + void* record_mmio_context_; static MMIOHandler* global_handler_; xe::global_critical_region global_critical_region_; diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 5719357a4..6274dfb71 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -1439,11 +1439,23 @@ int InstrEmit_vsel(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_vsel128(PPCHIRBuilder& f, const InstrData& i) { return InstrEmit_vsel_(f, VX128_VD128, VX128_VA128, VX128_VB128, VX128_VD128); } +// chrispy: this is test code for checking whether a game takes advantage of the +// VSR/VSL undocumented/undefined variable shift behavior +static void AssertShiftElementsOk(PPCHIRBuilder& f, Value* v) { +#if 0 + Value* splatted = f.Splat(f.Extract(v, (uint8_t)0, INT8_TYPE), VEC128_TYPE); + Value* checkequal = f.Xor(splatted, v); + f.DebugBreakTrue(f.IsTrue(checkequal)); +#endif +} int InstrEmit_vsl(PPCHIRBuilder& f, const InstrData& i) { - Value* v = f.Shl(f.LoadVR(i.VX.VA), - f.And(f.Extract(f.LoadVR(i.VX.VB), 15, INT8_TYPE), - f.LoadConstantInt8(0b111))); + Value* va = f.LoadVR(i.VX.VA); + Value* vb = f.LoadVR(i.VX.VB); + + AssertShiftElementsOk(f, vb); + Value* v = + f.Shl(va, f.And(f.Extract(vb, 15, INT8_TYPE), f.LoadConstantInt8(0b111))); f.StoreVR(i.VX.VD, v); return 0; } @@ -1623,9 +1635,13 @@ int InstrEmit_vspltisw128(PPCHIRBuilder& f, const InstrData& i) { } int 
InstrEmit_vsr(PPCHIRBuilder& f, const InstrData& i) { - Value* v = f.Shr(f.LoadVR(i.VX.VA), - f.And(f.Extract(f.LoadVR(i.VX.VB), 15, INT8_TYPE), - f.LoadConstantInt8(0b111))); + Value* va = f.LoadVR(i.VX.VA); + Value* vb = f.LoadVR(i.VX.VB); + + AssertShiftElementsOk(f, vb); + + Value* v = + f.Shr(va, f.And(f.Extract(vb, 15, INT8_TYPE), f.LoadConstantInt8(0b111))); f.StoreVR(i.VX.VD, v); return 0; } diff --git a/src/xenia/cpu/ppc/ppc_emit_control.cc b/src/xenia/cpu/ppc/ppc_emit_control.cc index c990237a7..0fe8e2d54 100644 --- a/src/xenia/cpu/ppc/ppc_emit_control.cc +++ b/src/xenia/cpu/ppc/ppc_emit_control.cc @@ -769,8 +769,14 @@ int InstrEmit_mfmsr(PPCHIRBuilder& f, const InstrData& i) { // bit 62 = RI; recoverable interrupt // return 8000h if unlocked (interrupts enabled), else 0 f.MemoryBarrier(); - f.CallExtern(f.builtins()->check_global_lock); - f.StoreGPR(i.X.RT, f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE)); + if (cvars::disable_global_lock || true) { + f.StoreGPR(i.X.RT, f.LoadConstantUint64(0)); + + } else { + f.CallExtern(f.builtins()->check_global_lock); + f.StoreGPR(i.X.RT, + f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE)); + } return 0; } @@ -782,6 +788,7 @@ int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) { f.StoreContext( offsetof(PPCContext, scratch), f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE)); +#if 0 if (i.X.RT == 13) { // iff storing from r13 we are taking a lock (disable interrupts). if (!cvars::disable_global_lock) { @@ -793,6 +800,7 @@ int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) { f.CallExtern(f.builtins()->leave_global_lock); } } +#endif return 0; } else { // L = 0 @@ -807,6 +815,7 @@ int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) { f.MemoryBarrier(); f.StoreContext(offsetof(PPCContext, scratch), f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE)); +#if 0 if (i.X.RT == 13) { // iff storing from r13 we are taking a lock (disable interrupts). 
if (!cvars::disable_global_lock) { @@ -818,6 +827,7 @@ int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) { f.CallExtern(f.builtins()->leave_global_lock); } } +#endif return 0; } else { // L = 0 diff --git a/src/xenia/cpu/ppc/ppc_opcode_disasm_gen.cc b/src/xenia/cpu/ppc/ppc_opcode_disasm_gen.cc index 4323bdfac..3a2772bd5 100644 --- a/src/xenia/cpu/ppc/ppc_opcode_disasm_gen.cc +++ b/src/xenia/cpu/ppc/ppc_opcode_disasm_gen.cc @@ -5406,6 +5406,7 @@ PPCOpcodeDisasmInfo ppc_opcode_disasm_table[] = { INSTRUCTION(0x6c000000, "xoris" , kD , kI, kGeneral, "XOR Immediate Shifted" , (PPCOpcodeField::kRS,PPCOpcodeField::kUIMM), (PPCOpcodeField::kRA), PrintDisasm_xoris), INSTRUCTION(0x7c000278, "xorx" , kX , kI, kGeneral, "XOR" , (PPCOpcodeField::kRS,PPCOpcodeField::kRB), (PPCOpcodeField::kRA,PPCOpcodeField::kCRcond), PrintDisasm_xorx), }; +#undef INSTRUCTION static_assert(sizeof(ppc_opcode_disasm_table) / sizeof(PPCOpcodeDisasmInfo) == static_cast(PPCOpcode::kInvalid), "PPC table mismatch - rerun ppc-table-gen"); const PPCOpcodeDisasmInfo& GetOpcodeDisasmInfo(PPCOpcode opcode) { diff --git a/src/xenia/cpu/ppc/ppc_opcode_table_gen.cc b/src/xenia/cpu/ppc/ppc_opcode_table_gen.cc index 43210f5fb..22c41b270 100644 --- a/src/xenia/cpu/ppc/ppc_opcode_table_gen.cc +++ b/src/xenia/cpu/ppc/ppc_opcode_table_gen.cc @@ -470,6 +470,7 @@ PPCOpcodeInfo ppc_opcode_table[] = { INSTRUCTION(0x6c000000, "xoris" , kD , kI, kGeneral), INSTRUCTION(0x7c000278, "xorx" , kX , kI, kGeneral), }; +#undef INSTRUCTION static_assert(sizeof(ppc_opcode_table) / sizeof(PPCOpcodeInfo) == static_cast(PPCOpcode::kInvalid), "PPC table mismatch - rerun ppc-table-gen"); const PPCOpcodeInfo& GetOpcodeInfo(PPCOpcode opcode) { diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index 6bd57b4f7..0fbeb30cd 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -257,11 +257,22 @@ Function* Processor::ResolveFunction(uint32_t address) { // Grab symbol declaration. 
auto function = LookupFunction(address); + if (!function) { entry->status = Entry::STATUS_FAILED; return nullptr; } + auto module_for = function->module(); + + auto xexmod = dynamic_cast(module_for); + if (xexmod) { + auto addr_flags = xexmod->GetInstructionAddressFlags(address); + if (addr_flags) { + addr_flags->was_resolved = 1; + } + } + if (!DemandFunction(function)) { entry->status = Entry::STATUS_FAILED; return nullptr; diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc index b0b963467..7ccf3f71b 100644 --- a/src/xenia/cpu/xex_module.cc +++ b/src/xenia/cpu/xex_module.cc @@ -14,13 +14,16 @@ #include "third_party/fmt/include/fmt/format.h" #include "xenia/base/byte_order.h" +#include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/memory.h" + #include "xenia/cpu/cpu_flags.h" #include "xenia/cpu/export_resolver.h" #include "xenia/cpu/lzx.h" #include "xenia/cpu/processor.h" +#include "xenia/emulator.h" #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/xmodule.h" @@ -29,6 +32,14 @@ #include "third_party/crypto/rijndael-alg-fst.h" #include "third_party/pe/pe_image.h" +DEFINE_bool(disable_instruction_infocache, false, + "Disables caching records of called instructions/mmio accesses.", + "CPU"); +DEFINE_bool(disable_function_precompilation, true, + "Disables pre-compiling guest functions that we know we've called " + "on previous runs", + "CPU"); + static const uint8_t xe_xex2_retail_key[16] = { 0x20, 0xB1, 0x85, 0xA5, 0x9D, 0x28, 0xFD, 0xC3, 0x40, 0x58, 0x3F, 0xBB, 0x08, 0x96, 0xBF, 0x91}; @@ -977,6 +988,7 @@ bool XexModule::LoadContinue() { // Scan and find the low/high addresses. // All code sections are continuous, so this should be easy. + // could use a source for the above information auto heap = memory()->LookupHeap(base_address_); auto page_size = heap->page_size(); @@ -1045,7 +1057,24 @@ bool XexModule::LoadContinue() { library_offset += library->size; } } + sha1::SHA1 final_image_sha_; + final_image_sha_.reset(); + + unsigned high_code = this->high_address_ - this->low_address_; + + final_image_sha_.processBytes(memory()->TranslateVirtual(this->low_address_), + high_code); + final_image_sha_.finalize(image_sha_bytes_); + + char fmtbuf[16]; + + for (unsigned i = 0; i < 16; ++i) { + sprintf_s(fmtbuf, "%X", image_sha_bytes_[i]); + image_sha_str_ += &fmtbuf[0]; + } + + info_cache_.Init(this); // Find __savegprlr_* and __restgprlr_* and the others. // We can flag these for special handling (inlining/etc). 
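The block above gives each module a stable on-disk identity: hash the executable address range once at load and use the hex string as a per-module cache directory name (XexInfoCache::Init below appends it under cache_root()/modules). A standalone sketch of that key derivation with an invented helper name; note it zero-pads each byte with "%02X", whereas the hunk's "%X" emits a single digit for bytes below 0x10, producing variable-length strings that can alias distinct hashes (still usable as a key, just less tidy):

#include <cstdint>
#include <cstdio>
#include <string>

// Invented helper modeling the loop above: 16 hash bytes -> 32-char hex key.
std::string ImageShaToDirName(const uint8_t (&sha)[16]) {
  std::string out;
  char buf[3];  // two hex digits + NUL
  for (unsigned i = 0; i < 16; ++i) {
    std::snprintf(buf, sizeof(buf), "%02X", sha[i]);
    out += buf;
  }
  return out;  // e.g. "20B185A59D28FDC3..."
}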
if (!FindSaveRest()) { @@ -1288,7 +1317,68 @@ std::unique_ptr XexModule::CreateFunction(uint32_t address) { return std::unique_ptr( processor_->backend()->CreateGuestFunction(this, address)); } +void XexInfoCache::Init(XexModule* xexmod) { + if (cvars::disable_instruction_infocache) { + return; + } + auto emu = xexmod->kernel_state_->emulator(); + std::filesystem::path infocache_path = emu->cache_root(); + infocache_path.append(L"modules"); + + infocache_path.append(xexmod->image_sha_str_); + + std::filesystem::create_directories(infocache_path); + infocache_path.append("executable_addr_flags.bin"); + + unsigned num_codebytes = xexmod->high_address_ - xexmod->low_address_; + num_codebytes += 3; // round up to nearest multiple of 4 + num_codebytes &= ~3; + bool did_exist = true; + if (!std::filesystem::exists(infocache_path)) { + xe::filesystem::CreateEmptyFile(infocache_path); + did_exist = false; + } + + // todo: prepopulate with stuff from pdata, dll exports + this->executable_addr_flags_ = std::move(xe::MappedMemory::Open( + infocache_path, xe::MappedMemory::Mode::kReadWrite, 0, + sizeof(InfoCacheFlagsHeader) + + (sizeof(InfoCacheFlags) * + (num_codebytes / + 4)))); // one infocacheflags entry for each PPC instr-sized addr + + if (did_exist) { + xexmod->PrecompileKnownFunctions(); + } +} + +InfoCacheFlags* XexModule::GetInstructionAddressFlags(uint32_t guest_addr) { + if (guest_addr < low_address_ || guest_addr > high_address_) { + return nullptr; + } + + guest_addr -= low_address_; + + return info_cache_.LookupFlags(guest_addr); +} + +void XexModule::PrecompileKnownFunctions() { + if (cvars::disable_function_precompilation) { + return; + } + uint32_t start = 0; + uint32_t end = (high_address_ - low_address_) / 4; + auto flags = info_cache_.LookupFlags(0); + if (!flags) { + return; + } + for (uint32_t i = 0; i < end; i++) { + if (flags[i].was_resolved) { + processor_->ResolveFunction(low_address_ + (i * 4)); + } + } +} bool XexModule::FindSaveRest() { // Special stack save/restore functions. // http://research.microsoft.com/en-us/um/redmond/projects/invisible/src/crt/md/ppc/xxx.s.htm diff --git a/src/xenia/cpu/xex_module.h b/src/xenia/cpu/xex_module.h index cd8fc49c5..06045ff92 100644 --- a/src/xenia/cpu/xex_module.h +++ b/src/xenia/cpu/xex_module.h @@ -12,7 +12,7 @@ #include #include - +#include "xenia/base/mapped_memory.h" #include "xenia/cpu/module.h" #include "xenia/kernel/util/xex2_info.h" @@ -30,6 +30,39 @@ constexpr fourcc_t kXEX2Signature = make_fourcc("XEX2"); constexpr fourcc_t kElfSignature = make_fourcc(0x7F, 'E', 'L', 'F'); class Runtime; +struct InfoCacheFlags { + uint32_t was_resolved : 1; // has this address ever been called/requested + // via resolvefunction? + uint32_t accessed_mmio : 1; + uint32_t reserved : 30; +}; +struct XexInfoCache { + struct InfoCacheFlagsHeader { + unsigned char reserved[256]; // put xenia version here + + InfoCacheFlags* LookupFlags(unsigned offset) { + return &reinterpret_cast(&this[1])[offset]; + } + }; + /* + for every 4-byte aligned address, records a 4 byte set of flags. 
+ */ + std::unique_ptr executable_addr_flags_; + + void Init(class XexModule*); + InfoCacheFlags* LookupFlags(unsigned offset) { + offset /= 4; + if (!executable_addr_flags_) { + return nullptr; + } + uint8_t* data = executable_addr_flags_->data(); + + if (!data) { + return nullptr; + } + return reinterpret_cast(data)->LookupFlags(offset); + } +}; class XexModule : public xe::cpu::Module { public: @@ -174,10 +207,14 @@ class XexModule : public xe::cpu::Module { XEX_MODULE_PATCH_FULL)); } + InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr); + void PrecompileKnownFunctions(); + protected: std::unique_ptr CreateFunction(uint32_t address) override; private: + friend struct XexInfoCache; void ReadSecurityInfo(); int ReadImage(const void* xex_addr, size_t xex_length, bool use_dev_key); @@ -217,6 +254,10 @@ class XexModule : public xe::cpu::Module { XexFormat xex_format_ = kFormatUnknown; SecurityInfoContext security_info_ = {}; + + uint8_t image_sha_bytes_[16]; + std::string image_sha_str_; + XexInfoCache info_cache_; }; } // namespace cpu diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index 23c634cf2..dfc993dee 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -16,6 +16,7 @@ #include "third_party/fmt/include/fmt/format.h" #include "xenia/base/byte_stream.h" +#include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/profiling.h" @@ -28,6 +29,10 @@ #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/user_module.h" +DEFINE_bool(log_unknown_register_writes, false, + "Log writes to unknown registers from " + "CommandProcessor::WriteRegister. Has significant performance hit.", + "GPU"); namespace xe { namespace gpu { @@ -329,19 +334,9 @@ void CommandProcessor::UpdateWritePointer(uint32_t value) { write_ptr_index_ = value; write_ptr_index_event_->Set(); } - -void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { +void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index, + uint32_t value) { RegisterFile& regs = *register_file_; - if (index >= RegisterFile::kRegisterCount) { - XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", index); - return; - } - - regs.values[index].u32 = value; - if (!regs.GetRegisterInfo(index)) { - XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value); - } - // Scratch register writeback. if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) { uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0; @@ -469,6 +464,43 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { } } } +void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { + if (XE_UNLIKELY(cvars::log_unknown_register_writes)) { + // chrispy: rearrange check order, place set after checks + if (XE_UNLIKELY(!register_file_->IsValidRegister(index))) { + XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value); + check_reg_out_of_bounds: + if (XE_UNLIKELY(index >= RegisterFile::kRegisterCount)) { + XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", + index); + return; + } + } + } else { + goto check_reg_out_of_bounds; + } + register_file_->values[index].u32 = value; + + // regs with extra logic on write: XE_GPU_REG_COHER_STATUS_HOST + // XE_GPU_REG_DC_LUT_RW_INDEX + // XE_GPU_REG_DC_LUT_SEQ_COLOR XE_GPU_REG_DC_LUT_PWL_DATA + // XE_GPU_REG_DC_LUT_30_COLOR + + // quick pre-test + // todo: figure out just how unlikely this is. 
If it is very
+  // unlikely (it ought to be; there are a ton of registers other than these),
+  // make this predicate branchless and mark it with XE_UNLIKELY, then make
+  // HandleSpecialRegisterWrite noinline. Yep, it's very unlikely. These ORs
+  // are deliberately bitwise so that we do not do branching evaluation of
+  // the conditions (we will almost always take all of the branches).
+  if (XE_UNLIKELY(
+          (index - XE_GPU_REG_SCRATCH_REG0 < 8) |
+          (index == XE_GPU_REG_COHER_STATUS_HOST) |
+          ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
+           (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)))) {
+    HandleSpecialRegisterWrite(index, value);
+  }
+}
 
 void CommandProcessor::MakeCoherent() {
   SCOPE_profile_cpu_f("gpu");
@@ -570,7 +602,7 @@ void CommandProcessor::ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) {
         // Return up a level if we encounter a bad packet.
         XELOGE("**** INDIRECT RINGBUFFER: Failed to execute packet.");
         assert_always();
-        //break;
+        // break;
       }
     } while (reader.read_count());
 
diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h
index ffc8eeffa..412e8833d 100644
--- a/src/xenia/gpu/command_processor.h
+++ b/src/xenia/gpu/command_processor.h
@@ -150,7 +150,9 @@ class CommandProcessor {
   void WorkerThreadMain();
   virtual bool SetupContext() = 0;
   virtual void ShutdownContext() = 0;
-
+  // rarely needed; most register writes have no special logic here
+  XE_NOINLINE
+  void HandleSpecialRegisterWrite(uint32_t index, uint32_t value);
   virtual void WriteRegister(uint32_t index, uint32_t value);
 
   const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const {
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index 8038b0dc2..add11e4f6 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -712,7 +712,7 @@ void D3D12CommandProcessor::SetViewport(const D3D12_VIEWPORT& viewport) {
   ff_viewport_update_needed_ |= ff_viewport_.Height != viewport.Height;
   ff_viewport_update_needed_ |= ff_viewport_.MinDepth != viewport.MinDepth;
   ff_viewport_update_needed_ |= ff_viewport_.MaxDepth != viewport.MaxDepth;
-  if (ff_viewport_update_needed_) {
+  if (XE_UNLIKELY(ff_viewport_update_needed_)) {
     ff_viewport_ = viewport;
     deferred_command_list_.RSSetViewport(ff_viewport_);
     ff_viewport_update_needed_ = false;
diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
index 46f372503..0ae6c8552 100644
--- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
+++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc
@@ -4799,18 +4799,16 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears(
     if (!current_transfers.empty()) {
       are_current_command_list_render_targets_valid_ = false;
       if (dest_rt_key.is_depth) {
-        command_list.D3DOMSetRenderTargets(
-            0, nullptr, FALSE, &dest_d3d12_rt.descriptor_draw().GetHandle());
+        auto handle = dest_d3d12_rt.descriptor_draw().GetHandle();
+        command_list.D3DOMSetRenderTargets(0, nullptr, FALSE, &handle);
         if (!use_stencil_reference_output_) {
           command_processor_.SetStencilReference(UINT8_MAX);
         }
       } else {
-        command_list.D3DOMSetRenderTargets(
-            1,
-            &(dest_d3d12_rt.descriptor_load_separate().IsValid()
-                  ? dest_d3d12_rt.descriptor_load_separate().GetHandle()
-                  : dest_d3d12_rt.descriptor_draw().GetHandle()),
-            FALSE, nullptr);
+        auto handle = dest_d3d12_rt.descriptor_load_separate().IsValid()
+                          ? 
dest_d3d12_rt.descriptor_load_separate().GetHandle() + : dest_d3d12_rt.descriptor_draw().GetHandle(); + command_list.D3DOMSetRenderTargets(1, &handle, FALSE, nullptr); } uint32_t dest_pitch_tiles = dest_rt_key.GetPitchTiles(); @@ -5425,12 +5423,12 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears( dest_d3d12_rt.SetResourceState(D3D12_RESOURCE_STATE_RENDER_TARGET), D3D12_RESOURCE_STATE_RENDER_TARGET); if (clear_via_drawing) { - command_list.D3DOMSetRenderTargets( - 1, - &(dest_d3d12_rt.descriptor_load_separate().IsValid() - ? dest_d3d12_rt.descriptor_load_separate().GetHandle() - : dest_d3d12_rt.descriptor_draw().GetHandle()), - FALSE, nullptr); + auto handle = + (dest_d3d12_rt.descriptor_load_separate().IsValid() + ? dest_d3d12_rt.descriptor_load_separate().GetHandle() + : dest_d3d12_rt.descriptor_draw().GetHandle()); + + command_list.D3DOMSetRenderTargets(1, &handle, FALSE, nullptr); are_current_command_list_render_targets_valid_ = true; D3D12_VIEWPORT clear_viewport; clear_viewport.TopLeftX = float(clear_rect.left); diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc index 24904c7e8..94e21a7e0 100644 --- a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc @@ -78,314 +78,24 @@ namespace shaders { #include "xenia/gpu/shaders/bytecode/d3d12_5_1/texture_load_r5g6b5_b5g6r5_scaled_cs.h" } // namespace shaders -const D3D12TextureCache::HostFormat D3D12TextureCache::host_formats_[64] = { - // k_1_REVERSE - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_1 - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_8 - {DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb, - DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_1_5_5_5 - // Red and blue swapped in the load shader for simplicity. - {DXGI_FORMAT_B5G5R5A1_UNORM, DXGI_FORMAT_B5G5R5A1_UNORM, - kLoadShaderIndexR5G5B5A1ToB5G5R5A1, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_5_6_5 - // Red and blue swapped in the load shader for simplicity. - {DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM, - kLoadShaderIndexR5G6B5ToB5G6R5, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_6_5_5 - // On the host, green bits in blue, blue bits in green. 
- {DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM, - kLoadShaderIndexR5G5B6ToB5G6R5WithRBGASwizzle, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, XE_GPU_MAKE_TEXTURE_SWIZZLE(R, B, G, G)}, - // k_8_8_8_8 - {DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM, - kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, kLoadShaderIndexUnknown, - false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_2_10_10_10 - {DXGI_FORMAT_R10G10B10A2_TYPELESS, DXGI_FORMAT_R10G10B10A2_UNORM, - kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_8_A - {DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb, - DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_8_B - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_8_8 - {DXGI_FORMAT_R8G8_TYPELESS, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndex16bpb, - DXGI_FORMAT_R8G8_SNORM, kLoadShaderIndexUnknown, false, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_Cr_Y1_Cb_Y0_REP - // Red and blue swapped in the load shader for simplicity. - // TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is usable for - // the signed version, separate unsigned and signed load shaders completely - // (as one doesn't need decompression for this format, while another does). - {DXGI_FORMAT_G8R8_G8B8_UNORM, DXGI_FORMAT_G8R8_G8B8_UNORM, - kLoadShaderIndexGBGR8ToGRGB8, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - true, DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexGBGR8ToRGB8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_Y1_Cr_Y0_Cb_REP - // Red and blue swapped in the load shader for simplicity. - // TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is usable for - // the signed version, separate unsigned and signed load shaders completely - // (as one doesn't need decompression for this format, while another does). - {DXGI_FORMAT_R8G8_B8G8_UNORM, DXGI_FORMAT_R8G8_B8G8_UNORM, - kLoadShaderIndexBGRG8ToRGBG8, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - true, DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexBGRG8ToRGB8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_16_16_EDRAM - // Not usable as a texture, also has -32...32 range. - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_8_8_8_8_A - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_4_4_4_4 - // Red and blue swapped in the load shader for simplicity. 
- {DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM, - kLoadShaderIndexRGBA4ToBGRA4, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_10_11_11 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, - kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, - kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_11_11_10 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, - kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, - kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_DXT1 - {DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, - DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_DXT2_3 - {DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, - DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_DXT4_5 - {DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, - DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_16_16_16_16_EDRAM - // Not usable as a texture, also has -32...32 range. - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // R32_FLOAT for depth because shaders would require an additional SRV to - // sample stencil, which we don't provide. 
- // k_24_8 - {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthUnorm, - DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_24_8_FLOAT - {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthFloat, - DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16 - {DXGI_FORMAT_R16_TYPELESS, DXGI_FORMAT_R16_UNORM, kLoadShaderIndex16bpb, - DXGI_FORMAT_R16_SNORM, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16_16 - {DXGI_FORMAT_R16G16_TYPELESS, DXGI_FORMAT_R16G16_UNORM, - kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_SNORM, kLoadShaderIndexUnknown, - false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_16_16_16_16 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, - kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_SNORM, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_16_EXPAND - {DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb, - DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16_16_EXPAND - {DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndex32bpb, - DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown, false, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_16_16_16_16_EXPAND - {DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT, - kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_16_FLOAT - {DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb, - DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16_16_FLOAT - {DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndex32bpb, - DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown, false, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_16_16_16_16_FLOAT - {DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT, - kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_32 - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_32_32 - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_32_32_32_32 - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_32_FLOAT - {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndex32bpb, - DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, 
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_32_32_FLOAT - {DXGI_FORMAT_R32G32_FLOAT, DXGI_FORMAT_R32G32_FLOAT, kLoadShaderIndex64bpb, - DXGI_FORMAT_R32G32_FLOAT, kLoadShaderIndexUnknown, false, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_32_32_32_32_FLOAT - {DXGI_FORMAT_R32G32B32A32_FLOAT, DXGI_FORMAT_R32G32B32A32_FLOAT, - kLoadShaderIndex128bpb, DXGI_FORMAT_R32G32B32A32_FLOAT, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_32_AS_8 - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_32_AS_8_8 - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_16_MPEG - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16_16_MPEG - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_8_INTERLACED - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_32_AS_8_INTERLACED - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_32_AS_8_8_INTERLACED - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_16_INTERLACED - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16_MPEG_INTERLACED - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_16_16_MPEG_INTERLACED - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_DXN - {DXGI_FORMAT_BC5_UNORM, DXGI_FORMAT_BC5_UNORM, kLoadShaderIndex128bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8G8_UNORM, - kLoadShaderIndexDXNToRG8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_8_8_8_8_AS_16_16_16_16 - {DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM, - kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, kLoadShaderIndexUnknown, - false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_DXT1_AS_16_16_16_16 - {DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, - DXGI_FORMAT_R8G8B8A8_UNORM, 
kLoadShaderIndexDXT1ToRGBA8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_DXT2_3_AS_16_16_16_16 - {DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, - DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_DXT4_5_AS_16_16_16_16 - {DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, - DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_2_10_10_10_AS_16_16_16_16 - {DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM, - kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_10_11_11_AS_16_16_16_16 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, - kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, - kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_11_11_10_AS_16_16_16_16 - {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, - kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, - kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_32_32_32_FLOAT - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, - // k_DXT3A - // R8_UNORM has the same size as BC2, but doesn't have the 4x4 size - // alignment requirement. - {DXGI_FORMAT_R8_UNORM, DXGI_FORMAT_R8_UNORM, kLoadShaderIndexDXT3A, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_DXT5A - {DXGI_FORMAT_BC4_UNORM, DXGI_FORMAT_BC4_UNORM, kLoadShaderIndex64bpb, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8_UNORM, - kLoadShaderIndexDXT5AToR8, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, - // k_CTX1 - {DXGI_FORMAT_R8G8_UNORM, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndexCTX1, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, - // k_DXT3A_AS_1_1_1_1 - {DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM, - kLoadShaderIndexDXT3AAs1111ToBGRA4, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_8_8_8_8_GAMMA_EDRAM - // Not usable as a texture. - {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, - // k_2_10_10_10_FLOAT_EDRAM - // Not usable as a texture. 
- {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, - DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, - kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, -}; +/* + chrispy: we're getting cache misses in GetHostFormatSwizzle, use a + denser array todo: not all 65536 possible swizzles are used, this could + probably be one cache line +*/ +using SwizzleArray = std::array; + +static constexpr SwizzleArray build_xenos_swizzle_for_format() { + SwizzleArray result{0}; + + for (int i = 0; i < 64; ++i) { + result[i] = + static_cast(D3D12TextureCache::host_formats_[i].swizzle); + } + return result; +} +alignas(64) constexpr SwizzleArray xenos_swizzle_for_format = + build_xenos_swizzle_for_format(); D3D12TextureCache::D3D12TextureCache(const RegisterFile& register_file, D3D12SharedMemory& shared_memory, @@ -1544,7 +1254,8 @@ bool D3D12TextureCache::IsScaledResolveSupportedForFormat( } uint32_t D3D12TextureCache::GetHostFormatSwizzle(TextureKey key) const { - return host_formats_[uint32_t(key.format)].swizzle; + // return host_formats_[uint32_t(key.format)].swizzle; + return xenos_swizzle_for_format[uint32_t(key.format)]; } uint32_t D3D12TextureCache::GetMaxHostTextureWidthHeight( diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.h b/src/xenia/gpu/d3d12/d3d12_texture_cache.h index 6a14948fe..d5aacd617 100644 --- a/src/xenia/gpu/d3d12/d3d12_texture_cache.h +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.h @@ -160,29 +160,6 @@ class D3D12TextureCache final : public TextureCache { ID3D12Resource* RequestSwapTexture( D3D12_SHADER_RESOURCE_VIEW_DESC& srv_desc_out, xenos::TextureFormat& format_out); - - protected: - bool IsSignedVersionSeparateForFormat(TextureKey key) const override; - bool IsScaledResolveSupportedForFormat(TextureKey key) const override; - uint32_t GetHostFormatSwizzle(TextureKey key) const override; - - uint32_t GetMaxHostTextureWidthHeight( - xenos::DataDimension dimension) const override; - uint32_t GetMaxHostTextureDepthOrArraySize( - xenos::DataDimension dimension) const override; - - std::unique_ptr CreateTexture(TextureKey key) override; - - // This binds pipelines, allocates descriptors, and copies! - bool LoadTextureDataFromResidentMemoryImpl(Texture& texture, bool load_base, - bool load_mips) override; - - void UpdateTextureBindingsImpl(uint32_t fetch_constant_mask) override; - - private: - static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2; - static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5; - struct HostFormat { // Format info for the regular case. // DXGI format (typeless when different signedness or number representation @@ -223,6 +200,352 @@ class D3D12TextureCache final : public TextureCache { // Mapping of Xenos swizzle components to DXGI format components. 
uint32_t swizzle; }; + static constexpr HostFormat host_formats_[64]{ + // k_1_REVERSE + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_1 + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_8 + {DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb, + DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_1_5_5_5 + // Red and blue swapped in the load shader for simplicity. + {DXGI_FORMAT_B5G5R5A1_UNORM, DXGI_FORMAT_B5G5R5A1_UNORM, + kLoadShaderIndexR5G5B5A1ToB5G5R5A1, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_5_6_5 + // Red and blue swapped in the load shader for simplicity. + {DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM, + kLoadShaderIndexR5G6B5ToB5G6R5, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_6_5_5 + // On the host, green bits in blue, blue bits in green. + {DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM, + kLoadShaderIndexR5G5B6ToB5G6R5WithRBGASwizzle, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, XE_GPU_MAKE_TEXTURE_SWIZZLE(R, B, G, G)}, + // k_8_8_8_8 + {DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM, + kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_2_10_10_10 + {DXGI_FORMAT_R10G10B10A2_TYPELESS, DXGI_FORMAT_R10G10B10A2_UNORM, + kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_8_A + {DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb, + DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_8_B + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_8_8 + {DXGI_FORMAT_R8G8_TYPELESS, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndex16bpb, + DXGI_FORMAT_R8G8_SNORM, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_Cr_Y1_Cb_Y0_REP + // Red and blue swapped in the load shader for simplicity. + // TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is + // usable for + // the signed version, separate unsigned and signed load shaders + // completely + // (as one doesn't need decompression for this format, while another + // does). + {DXGI_FORMAT_G8R8_G8B8_UNORM, DXGI_FORMAT_G8R8_G8B8_UNORM, + kLoadShaderIndexGBGR8ToGRGB8, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM, + kLoadShaderIndexGBGR8ToRGB8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_Y1_Cr_Y0_Cb_REP + // Red and blue swapped in the load shader for simplicity. 
+ // TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is + // usable for + // the signed version, separate unsigned and signed load shaders + // completely + // (as one doesn't need decompression for this format, while another + // does). + {DXGI_FORMAT_R8G8_B8G8_UNORM, DXGI_FORMAT_R8G8_B8G8_UNORM, + kLoadShaderIndexBGRG8ToRGBG8, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM, + kLoadShaderIndexBGRG8ToRGB8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_16_16_EDRAM + // Not usable as a texture, also has -32...32 range. + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_8_8_8_8_A + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_4_4_4_4 + // Red and blue swapped in the load shader for simplicity. + {DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM, + kLoadShaderIndexRGBA4ToBGRA4, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_10_11_11 + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, + kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_11_11_10 + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, + kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_DXT1 + {DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_DXT2_3 + {DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_DXT4_5 + {DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_16_16_16_16_EDRAM + // Not usable as a texture, also has -32...32 range. + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // R32_FLOAT for depth because shaders would require an additional SRV + // to + // sample stencil, which we don't provide. 
+ // k_24_8 + {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthUnorm, + DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_24_8_FLOAT + {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthFloat, + DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16 + {DXGI_FORMAT_R16_TYPELESS, DXGI_FORMAT_R16_UNORM, kLoadShaderIndex16bpb, + DXGI_FORMAT_R16_SNORM, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16_16 + {DXGI_FORMAT_R16G16_TYPELESS, DXGI_FORMAT_R16G16_UNORM, + kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_SNORM, kLoadShaderIndexUnknown, + false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_16_16_16_16 + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_SNORM, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_16_EXPAND + {DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb, + DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16_16_EXPAND + {DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, + kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown, + false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_16_16_16_16_EXPAND + {DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT, + kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_16_FLOAT + {DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb, + DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16_16_FLOAT + {DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, + kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown, + false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_16_16_16_16_FLOAT + {DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT, + kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_32 + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_32_32 + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_32_32_32_32 + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_32_FLOAT + {DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndex32bpb, + DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, + DXGI_FORMAT_UNKNOWN, 
kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_32_32_FLOAT + {DXGI_FORMAT_R32G32_FLOAT, DXGI_FORMAT_R32G32_FLOAT, + kLoadShaderIndex64bpb, DXGI_FORMAT_R32G32_FLOAT, kLoadShaderIndexUnknown, + false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_32_32_32_32_FLOAT + {DXGI_FORMAT_R32G32B32A32_FLOAT, DXGI_FORMAT_R32G32B32A32_FLOAT, + kLoadShaderIndex128bpb, DXGI_FORMAT_R32G32B32A32_FLOAT, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_32_AS_8 + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_32_AS_8_8 + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_16_MPEG + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16_16_MPEG + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_8_INTERLACED + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_32_AS_8_INTERLACED + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_32_AS_8_8_INTERLACED + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_16_INTERLACED + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16_MPEG_INTERLACED + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_16_16_MPEG_INTERLACED + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_DXN + {DXGI_FORMAT_BC5_UNORM, DXGI_FORMAT_BC5_UNORM, kLoadShaderIndex128bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndexDXNToRG8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_8_8_8_8_AS_16_16_16_16 + {DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM, + kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_DXT1_AS_16_16_16_16 + {DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + 
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_DXT2_3_AS_16_16_16_16 + {DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_DXT4_5_AS_16_16_16_16 + {DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, + DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_2_10_10_10_AS_16_16_16_16 + {DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM, + kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_10_11_11_AS_16_16_16_16 + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, + kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_11_11_10_AS_16_16_16_16 + {DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM, + kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM, + kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_32_32_32_FLOAT + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB}, + // k_DXT3A + // R8_UNORM has the same size as BC2, but doesn't have the 4x4 size + // alignment requirement. + {DXGI_FORMAT_R8_UNORM, DXGI_FORMAT_R8_UNORM, kLoadShaderIndexDXT3A, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_DXT5A + {DXGI_FORMAT_BC4_UNORM, DXGI_FORMAT_BC4_UNORM, kLoadShaderIndex64bpb, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8_UNORM, + kLoadShaderIndexDXT5AToR8, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR}, + // k_CTX1 + {DXGI_FORMAT_R8G8_UNORM, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndexCTX1, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG}, + // k_DXT3A_AS_1_1_1_1 + {DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM, + kLoadShaderIndexDXT3AAs1111ToBGRA4, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_8_8_8_8_GAMMA_EDRAM + // Not usable as a texture. + {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + // k_2_10_10_10_FLOAT_EDRAM + // Not usable as a texture. 
+ {DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, + DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN, + kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA}, + }; + + protected: + bool IsSignedVersionSeparateForFormat(TextureKey key) const override; + bool IsScaledResolveSupportedForFormat(TextureKey key) const override; + uint32_t GetHostFormatSwizzle(TextureKey key) const override; + + uint32_t GetMaxHostTextureWidthHeight( + xenos::DataDimension dimension) const override; + uint32_t GetMaxHostTextureDepthOrArraySize( + xenos::DataDimension dimension) const override; + + std::unique_ptr CreateTexture(TextureKey key) override; + + // This binds pipelines, allocates descriptors, and copies! + bool LoadTextureDataFromResidentMemoryImpl(Texture& texture, bool load_base, + bool load_mips) override; + + void UpdateTextureBindingsImpl(uint32_t fetch_constant_mask) override; + + private: + static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2; + static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5; class D3D12Texture final : public Texture { public: @@ -467,8 +790,6 @@ class D3D12TextureCache final : public TextureCache { xenos::ClampMode NormalizeClampMode(xenos::ClampMode clamp_mode) const; - static const HostFormat host_formats_[64]; - D3D12CommandProcessor& command_processor_; bool bindless_resources_used_; diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index b49b9925a..fdc9cb0cf 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -198,7 +198,7 @@ uint32_t GraphicsSystem::ReadRegister(uint32_t addr) { // maximum [width(0x0FFF), height(0x0FFF)] return 0x050002D0; default: - if (!register_file_.GetRegisterInfo(r)) { + if (!register_file_.IsValidRegister(r)) { XELOGE("GPU: Read from unknown register ({:04X})", r); } } diff --git a/src/xenia/gpu/register_file.cc b/src/xenia/gpu/register_file.cc index f65d5d87c..5dd580e07 100644 --- a/src/xenia/gpu/register_file.cc +++ b/src/xenia/gpu/register_file.cc @@ -8,7 +8,7 @@ */ #include "xenia/gpu/register_file.h" - +#include #include #include "xenia/base/math.h" @@ -17,6 +17,52 @@ namespace xe { namespace gpu { RegisterFile::RegisterFile() { std::memset(values, 0, sizeof(values)); } +constexpr unsigned int GetHighestRegisterNumber() { + uint32_t highest = 0; +#define XE_GPU_REGISTER(index, type, name) \ + highest = std::max(highest, index); +#include "xenia/gpu/register_table.inc" +#undef XE_GPU_REGISTER + + return highest; +} +constexpr unsigned int GetLowestRegisterNumber() { + uint32_t lowest = UINT_MAX; +#define XE_GPU_REGISTER(index, type, name) \ + lowest = std::min(lowest, index); +#include "xenia/gpu/register_table.inc" +#undef XE_GPU_REGISTER + + return lowest; +} + +static constexpr uint32_t lowest_register = GetLowestRegisterNumber(); +static constexpr uint32_t highest_register = GetHighestRegisterNumber(); + +static constexpr uint32_t total_num_registers = + highest_register - lowest_register; + +static constexpr uint32_t num_required_words_for_registers = + ((total_num_registers + 63) & ~63) / 64; +// can't use bitset, its not constexpr in c++ 17 +using ValidRegisterBitset = std::array< + uint64_t, + num_required_words_for_registers>; // std::bitset; + +static constexpr ValidRegisterBitset BuildValidRegisterBitset() { + ValidRegisterBitset result{}; +#define XE_GPU_REGISTER(index, type, name) \ + result[(index - lowest_register) / 64] |= \ + 1ULL << ((index - lowest_register) % 64); + +#include 
"xenia/gpu/register_table.inc" +#undef XE_GPU_REGISTER + + return result; +} +static constexpr ValidRegisterBitset valid_register_bitset = + BuildValidRegisterBitset(); const RegisterInfo* RegisterFile::GetRegisterInfo(uint32_t index) { switch (index) { @@ -34,6 +80,18 @@ const RegisterInfo* RegisterFile::GetRegisterInfo(uint32_t index) { return nullptr; } } +/* + todo: this still uses a lot of cpu! our bitset is too large +*/ +bool RegisterFile::IsValidRegister(uint32_t index) { + if (XE_UNLIKELY(index < lowest_register) || + XE_UNLIKELY(index > highest_register)) { + return false; + } + uint32_t register_linear_index = index - lowest_register; + return (valid_register_bitset[register_linear_index / 64] & + (1ULL << (register_linear_index % 64))) != 0; +} } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/register_file.h b/src/xenia/gpu/register_file.h index e9a4f1137..11eebd8c5 100644 --- a/src/xenia/gpu/register_file.h +++ b/src/xenia/gpu/register_file.h @@ -32,7 +32,7 @@ class RegisterFile { RegisterFile(); static const RegisterInfo* GetRegisterInfo(uint32_t index); - + static bool IsValidRegister(uint32_t index); static constexpr size_t kRegisterCount = 0x5003; union RegisterValue { uint32_t u32; diff --git a/src/xenia/gpu/trace_viewer.cc b/src/xenia/gpu/trace_viewer.cc index 0d43b0a5e..85ba32c18 100644 --- a/src/xenia/gpu/trace_viewer.cc +++ b/src/xenia/gpu/trace_viewer.cc @@ -41,9 +41,6 @@ #include "xenia/ui/windowed_app_context.h" #include "xenia/xbox.h" -DEFINE_string(target_trace_file, "", "Specifies the trace file to load.", - "GPU"); - namespace xe { namespace gpu { @@ -66,7 +63,7 @@ TraceViewer::TraceViewer(xe::ui::WindowedAppContext& app_context, TraceViewer::~TraceViewer() = default; bool TraceViewer::OnInitialize() { - std::string path = cvars::target_trace_file; + std::string path = cvars::target_trace_file.u8string(); // If no path passed, ask the user. // On Android, however, there's no synchronous file picker, and the trace file diff --git a/src/xenia/gpu/trace_viewer.h b/src/xenia/gpu/trace_viewer.h index 58ab16e4e..188a6eb53 100644 --- a/src/xenia/gpu/trace_viewer.h +++ b/src/xenia/gpu/trace_viewer.h @@ -12,6 +12,7 @@ #include +#include "xenia/base/cvar.h" #include "xenia/emulator.h" #include "xenia/gpu/shader.h" #include "xenia/gpu/trace_player.h" @@ -24,7 +25,7 @@ #include "xenia/ui/window.h" #include "xenia/ui/window_listener.h" #include "xenia/ui/windowed_app.h" - +DECLARE_path(target_trace_file); namespace xe { namespace gpu { diff --git a/src/xenia/gpu/trace_writer.cc b/src/xenia/gpu/trace_writer.cc index b83e21868..bc7aadd5b 100644 --- a/src/xenia/gpu/trace_writer.cc +++ b/src/xenia/gpu/trace_writer.cc @@ -25,7 +25,7 @@ namespace xe { namespace gpu { - +#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1 TraceWriter::TraceWriter(uint8_t* membase) : membase_(membase), file_(nullptr) {} @@ -362,6 +362,6 @@ void TraceWriter::WriteGammaRamp( fwrite(gamma_ramp_pwl_rgb, 1, kPWLUncompressedLength, file_); } } - +#endif } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/trace_writer.h b/src/xenia/gpu/trace_writer.h index 407166068..0239d7f95 100644 --- a/src/xenia/gpu/trace_writer.h +++ b/src/xenia/gpu/trace_writer.h @@ -17,11 +17,22 @@ #include "xenia/gpu/registers.h" #include "xenia/gpu/trace_protocol.h" +// only enable trace writer in debug builds, measured hit from the trace +// function calls (even if they just immediately return) is 0.40-0.60% cpu time +// total. 
diff --git a/src/xenia/gpu/trace_viewer.cc b/src/xenia/gpu/trace_viewer.cc
index 0d43b0a5e..85ba32c18 100644
--- a/src/xenia/gpu/trace_viewer.cc
+++ b/src/xenia/gpu/trace_viewer.cc
@@ -41,9 +41,6 @@
 #include "xenia/ui/windowed_app_context.h"
 #include "xenia/xbox.h"

-DEFINE_string(target_trace_file, "", "Specifies the trace file to load.",
-              "GPU");
-
 namespace xe {
 namespace gpu {

@@ -66,7 +63,7 @@ TraceViewer::TraceViewer(xe::ui::WindowedAppContext& app_context,
 TraceViewer::~TraceViewer() = default;

 bool TraceViewer::OnInitialize() {
-  std::string path = cvars::target_trace_file;
+  std::string path = cvars::target_trace_file.u8string();

   // If no path passed, ask the user.
   // On Android, however, there's no synchronous file picker, and the trace file
diff --git a/src/xenia/gpu/trace_viewer.h b/src/xenia/gpu/trace_viewer.h
index 58ab16e4e..188a6eb53 100644
--- a/src/xenia/gpu/trace_viewer.h
+++ b/src/xenia/gpu/trace_viewer.h
@@ -12,6 +12,7 @@

 #include <string>

+#include "xenia/base/cvar.h"
 #include "xenia/emulator.h"
 #include "xenia/gpu/shader.h"
 #include "xenia/gpu/trace_player.h"
@@ -24,7 +25,7 @@
 #include "xenia/ui/window.h"
 #include "xenia/ui/window_listener.h"
 #include "xenia/ui/windowed_app.h"
-
+DECLARE_path(target_trace_file);
 namespace xe {
 namespace gpu {

diff --git a/src/xenia/gpu/trace_writer.cc b/src/xenia/gpu/trace_writer.cc
index b83e21868..bc7aadd5b 100644
--- a/src/xenia/gpu/trace_writer.cc
+++ b/src/xenia/gpu/trace_writer.cc
@@ -25,7 +25,7 @@

 namespace xe {
 namespace gpu {
-
+#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
 TraceWriter::TraceWriter(uint8_t* membase)
     : membase_(membase), file_(nullptr) {}

@@ -362,6 +362,6 @@ void TraceWriter::WriteGammaRamp(
     fwrite(gamma_ramp_pwl_rgb, 1, kPWLUncompressedLength, file_);
   }
 }
-
+#endif
 }  // namespace gpu
 }  // namespace xe
diff --git a/src/xenia/gpu/trace_writer.h b/src/xenia/gpu/trace_writer.h
index 407166068..0239d7f95 100644
--- a/src/xenia/gpu/trace_writer.h
+++ b/src/xenia/gpu/trace_writer.h
@@ -17,11 +17,22 @@
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/trace_protocol.h"

+// Only enable the trace writer in debug builds: the measured hit from the
+// trace function calls (even if they just immediately return) is 0.40-0.60%
+// of total cpu time. With inlining they just bloat the caller and negatively
+// impact its register allocation.
+#ifdef NDEBUG
+#define XE_ENABLE_TRACE_WRITER_INSTRUMENTATION 0
+#else
+#define XE_ENABLE_TRACE_WRITER_INSTRUMENTATION 1
+#endif
+
 namespace xe {
 namespace gpu {

 class TraceWriter {
  public:
+#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
   explicit TraceWriter(uint8_t* membase);
   ~TraceWriter();

@@ -61,6 +72,49 @@ class TraceWriter {
   bool compress_output_ = true;
   size_t compression_threshold_ = 1024;  // Min. number of bytes to compress.
+
+#else
+  // This could be annoying to maintain if new methods are added or the
+  // signatures change.
+  constexpr explicit TraceWriter(uint8_t* /*membase*/) {}
+
+  static constexpr bool is_open() { return false; }
+
+  static constexpr bool Open(const std::filesystem::path& path,
+                             uint32_t title_id) {
+    return false;
+  }
+  static constexpr void Flush() {}
+  static constexpr void Close() {}
+
+  static constexpr void WritePrimaryBufferStart(uint32_t base_ptr,
+                                                uint32_t count) {}
+  static constexpr void WritePrimaryBufferEnd() {}
+  static constexpr void WriteIndirectBufferStart(uint32_t base_ptr,
+                                                 uint32_t count) {}
+  static constexpr void WriteIndirectBufferEnd() {}
+  static constexpr void WritePacketStart(uint32_t base_ptr, uint32_t count) {}
+  static constexpr void WritePacketEnd() {}
+  static constexpr void WriteMemoryRead(uint32_t base_ptr, size_t length,
+                                        const void* host_ptr = nullptr) {}
+  static constexpr void WriteMemoryReadCached(uint32_t base_ptr,
+                                              size_t length) {}
+  static constexpr void WriteMemoryReadCachedNop(uint32_t base_ptr,
+                                                 size_t length) {}
+  static constexpr void WriteMemoryWrite(uint32_t base_ptr, size_t length,
+                                         const void* host_ptr = nullptr) {}
+  static constexpr void WriteEdramSnapshot(const void* snapshot) {}
+  static constexpr void WriteEvent(EventCommand::Type event_type) {}
+  static constexpr void WriteRegisters(uint32_t first_register,
+                                       const uint32_t* register_values,
+                                       uint32_t register_count,
+                                       bool execute_callbacks_on_play) {}
+  static constexpr void WriteGammaRamp(
+      const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table,
+      const reg::DC_LUT_PWL_DATA* gamma_ramp_pwl_rgb,
+      uint32_t gamma_ramp_rw_component) {}
+
+#endif
 };

 }  // namespace gpu
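The pattern above generalizes: one preprocessor switch selects either the real class or a stub whose empty constexpr bodies vanish at every call site. A generic sketch, with a hypothetical Tracer type rather than xenia's actual one:

    // Sketch of the compile-time stub pattern; Tracer is hypothetical.
    #include <cstddef>
    #include <cstdint>

    #ifndef NDEBUG
    struct Tracer {
      void WriteMemoryRead(uint32_t base_ptr, size_t length);  // real, in .cc
    };
    #else
    struct Tracer {
      // Empty constexpr body: the optimizer deletes the call and the
      // argument setup around it, so release builds pay nothing.
      static constexpr void WriteMemoryRead(uint32_t /*base_ptr*/,
                                            size_t /*length*/) {}
    };
    #endif

    // tracer.WriteMemoryRead(addr, len);  // identical call site either way

Keeping call sites identical is the point: no #ifdefs spread through the command processor, at the cost of mirroring every signature in the stub.
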
diff --git a/src/xenia/kernel/user_module.cc b/src/xenia/kernel/user_module.cc
index 5d1cb0f39..f2dc5b1d8 100644
--- a/src/xenia/kernel/user_module.cc
+++ b/src/xenia/kernel/user_module.cc
@@ -225,6 +225,7 @@ X_STATUS UserModule::LoadContinue() {
   ldr_data->xex_header_base = guest_xex_header_;
   ldr_data->full_image_size = security_header->image_size;
   ldr_data->image_base = this->xex_module()->base_address();
+  ldr_data->entry_point = entry_point_;

   OnLoad();

diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc
index 730972f25..388fefc62 100644
--- a/src/xenia/memory.cc
+++ b/src/xenia/memory.cc
@@ -198,7 +198,8 @@ bool Memory::Initialize() {
   // Add handlers for MMIO.
   mmio_handler_ = cpu::MMIOHandler::Install(
       virtual_membase_, physical_membase_, physical_membase_ + 0x1FFFFFFF,
-      HostToGuestVirtualThunk, this, AccessViolationCallbackThunk, this);
+      HostToGuestVirtualThunk, this, AccessViolationCallbackThunk, this,
+      nullptr, nullptr);
   if (!mmio_handler_) {
     XELOGE("Unable to install MMIO handlers");
     assert_always();
@@ -213,6 +214,11 @@ bool Memory::Initialize() {
   return true;
 }

+void Memory::SetMMIOExceptionRecordingCallback(
+    cpu::MmioAccessRecordCallback callback, void* context) {
+  mmio_handler_->SetMMIOExceptionRecordingCallback(callback, context);
+}
+
 static const struct {
   uint64_t virtual_address_start;
   uint64_t virtual_address_end;
@@ -1528,9 +1534,10 @@ bool PhysicalHeap::AllocRange(uint32_t low_address, uint32_t high_address,
 }

 bool PhysicalHeap::AllocSystemHeap(uint32_t size, uint32_t alignment,
-                                   uint32_t allocation_type, uint32_t protect,
-                                   bool top_down, uint32_t* out_address) {
-  return Alloc(size, alignment, allocation_type, protect, top_down, out_address);
+                                   uint32_t allocation_type, uint32_t protect,
+                                   bool top_down, uint32_t* out_address) {
+  return Alloc(size, alignment, allocation_type, protect, top_down,
+               out_address);
 }

 bool PhysicalHeap::Decommit(uint32_t address, uint32_t size) {
diff --git a/src/xenia/memory.h b/src/xenia/memory.h
index 813eb25bc..ed313a26d 100644
--- a/src/xenia/memory.h
+++ b/src/xenia/memory.h
@@ -498,6 +498,9 @@ class Memory {
   bool Save(ByteStream* stream);
   bool Restore(ByteStream* stream);

+  void SetMMIOExceptionRecordingCallback(cpu::MmioAccessRecordCallback callback,
+                                         void* context);
+
  private:
   int MapViews(uint8_t* mapping_base);
   void UnmapViews();
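SetMMIOExceptionRecordingCallback just forwards to the MMIO handler; the interesting part is the consumer. A minimal sketch of one, assuming a (context, guest_address) callback shape, since the real cpu::MmioAccessRecordCallback signature is not shown in this patch:

    // Sketch of a recording consumer; the callback signature is assumed.
    #include <cstdint>
    #include <mutex>
    #include <set>

    static std::mutex g_record_mutex;
    static std::set<uint32_t> g_guest_mmio_writers;

    static void RecordMmioWriter(void* /*context*/, uint32_t guest_address) {
      std::lock_guard<std::mutex> lock(g_record_mutex);
      // Remember which guest code touched MMIO; a later run can treat those
      // addresses as MMIO up front instead of trapping an access violation.
      g_guest_mmio_writers.insert(guest_address);
    }

    // memory->SetMMIOExceptionRecordingCallback(&RecordMmioWriter, nullptr);
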
diff --git a/src/xenia/ui/window_win.cc b/src/xenia/ui/window_win.cc
index f2458b69d..1de3e0448 100644
--- a/src/xenia/ui/window_win.cc
+++ b/src/xenia/ui/window_win.cc
@@ -181,7 +181,6 @@ bool Win32Window::OpenImpl() {
       SetWindowPlacement(hwnd_, &initial_dpi_placement);
     }
   }
-
   // Disable rounded corners starting with Windows 11 (or silently receive and
   // ignore E_INVALIDARG on Windows versions before 10.0.22000.0), primarily to
   // preserve all pixels of the guest output.
@@ -189,7 +188,6 @@ bool Win32Window::OpenImpl() {
   DwmSetWindowAttribute(hwnd_, DWMWA_WINDOW_CORNER_PREFERENCE,
                         &window_corner_preference,
                         sizeof(window_corner_preference));
-
   // Disable flicks.
   ATOM atom = GlobalAddAtomW(L"MicrosoftTabletPenServiceProperty");
   const DWORD_PTR dwHwndTabletProperty =
@@ -1047,7 +1045,9 @@ LRESULT Win32Window::WndProc(HWND hWnd, UINT message, WPARAM wParam,
     } break;
     case WM_MOVE: {
-      OnMonitorUpdate(MonitorUpdateEvent(this, false));
+      // chrispy: fix clang error from passing a temporary by reference
+      MonitorUpdateEvent update_event{this, false};
+      OnMonitorUpdate(update_event);
     } break;

     case WM_SIZE: {
@@ -1084,7 +1084,9 @@ LRESULT Win32Window::WndProc(HWND hWnd, UINT message, WPARAM wParam,
     } break;
     case WM_DISPLAYCHANGE: {
-      OnMonitorUpdate(MonitorUpdateEvent(this, true));
+      // chrispy: fix clang error from passing a temporary by reference
+      MonitorUpdateEvent update_event{this, true};
+      OnMonitorUpdate(update_event);
     } break;

     case WM_DPICHANGED: {

From 020d64a1a1e7d31df5099097d99632e61f1f6b90 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Sat, 13 Aug 2022 13:20:35 -0700
Subject: [PATCH 2/4] Revert to the old spinwait; disruptorplus'
 blocking_wait code does not compile

---
 src/xenia/base/threading_timer_queue.cc | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/xenia/base/threading_timer_queue.cc b/src/xenia/base/threading_timer_queue.cc
index b55b618ae..5cd7bf1d2 100644
--- a/src/xenia/base/threading_timer_queue.cc
+++ b/src/xenia/base/threading_timer_queue.cc
@@ -31,7 +31,12 @@ using WaitItem = TimerQueueWaitItem;
    monopolizing a ton of cpu time (depending on the game 2-4% of total cpu
    time) on my 3990x, no complaints since that change
 */
-using WaitStrat = dp::blocking_wait_strategy;
+
+/*
+   edit: actually had to change it back. when I was testing, it only worked
+   because I had fixed disruptorplus' code to compile (it gives the wrong
+   args to condition_variable::wait_until); stock builds use the unpatched
+   submodule, so back to the spin wait for now.
+*/
+using WaitStrat = dp::spin_wait_strategy;  // dp::blocking_wait_strategy;

 class TimerQueue {
  public:
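For reference, since the comment above pins the bug on bad arguments to condition_variable::wait_until, here is the correct shape of that call in a minimal, self-contained sketch (illustrative only, not the disruptorplus code itself):

    // Minimal sketch of a correct wait_until call.
    #include <chrono>
    #include <condition_variable>
    #include <mutex>

    std::mutex m;
    std::condition_variable cv;
    bool ready = false;

    bool WaitUntilReady(std::chrono::steady_clock::time_point deadline) {
      std::unique_lock<std::mutex> lock(m);
      // wait_until takes (unique_lock, time_point, predicate) and returns
      // the predicate's result, i.e. false if the deadline passed while the
      // condition was still unmet.
      return cv.wait_until(lock, deadline, [] { return ready; });
    }
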
From c9e41194283f65bfd28e4faa251ef1afa347d8d7 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Sat, 13 Aug 2022 13:43:45 -0700
Subject: [PATCH 3/4] Add branch of ffmpeg with non-recursive
 split_radix_permutation

Add branch of disruptorplus with working blocking_wait_strategy.
Switch back to the blocking wait for the timer queue.
---
 .gitmodules                             | 4 ++--
 src/xenia/base/threading_timer_queue.cc | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index a73061e22..3e780194f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -48,7 +48,7 @@
 	url = https://github.com/fmtlib/fmt.git
 [submodule "third_party/disruptorplus"]
 	path = third_party/disruptorplus
-	url = https://github.com/xenia-project/disruptorplus.git
+	url = https://github.com/chrisps/disruptorpus.git
 [submodule "third_party/DirectXShaderCompiler"]
 	path = third_party/DirectXShaderCompiler
 	url = https://github.com/microsoft/DirectXShaderCompiler.git
@@ -63,7 +63,7 @@
 	url = https://github.com/Cyan4973/xxHash.git
 [submodule "third_party/FFmpeg"]
 	path = third_party/FFmpeg
-	url = https://github.com/xenia-project/FFmpeg.git
+	url = https://github.com/chrisps/FFmpeg_radixsplit.git
 [submodule "third_party/premake-androidndk"]
 	path = third_party/premake-androidndk
 	url = https://github.com/Triang3l/premake-androidndk.git
diff --git a/src/xenia/base/threading_timer_queue.cc b/src/xenia/base/threading_timer_queue.cc
index 5cd7bf1d2..8e19b50dd 100644
--- a/src/xenia/base/threading_timer_queue.cc
+++ b/src/xenia/base/threading_timer_queue.cc
@@ -36,7 +36,7 @@ using WaitItem = TimerQueueWaitItem;
    args to condition_variable::wait_until); stock builds use the unpatched
    submodule, so back to the spin wait for now.
 */
-using WaitStrat = dp::spin_wait_strategy;  // dp::blocking_wait_strategy;
+using WaitStrat = dp::blocking_wait_strategy;

 class TimerQueue {
  public:

From 495b1f8bc8f4e2ec1cc0d1409ee01ac8b04f9b73 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Sat, 13 Aug 2022 14:05:35 -0700
Subject: [PATCH 4/4] Once again return to the spinloop

---
 src/xenia/base/threading_timer_queue.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/xenia/base/threading_timer_queue.cc b/src/xenia/base/threading_timer_queue.cc
index 8e19b50dd..7d6e612cf 100644
--- a/src/xenia/base/threading_timer_queue.cc
+++ b/src/xenia/base/threading_timer_queue.cc
@@ -36,7 +36,7 @@ using WaitItem = TimerQueueWaitItem;
    args to condition_variable::wait_until); stock builds use the unpatched
    submodule, so back to the spin wait for now.
 */
-using WaitStrat = dp::blocking_wait_strategy;
+using WaitStrat = dp::spin_wait_strategy;  // dp::blocking_wait_strategy;

 class TimerQueue {
  public:
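For contrast with the wait_until sketch above, this is the shape of the spin wait the queue ends up using (a sketch only; dp::spin_wait_strategy's real implementation lives in the disruptorplus submodule):

    // Sketch of a yielding spin wait; this is why a spinning timer thread
    // shows up in profiles: it keeps a core busy polling instead of blocking.
    #include <atomic>
    #include <thread>

    void SpinUntil(const std::atomic<bool>& flag) {
      while (!flag.load(std::memory_order_acquire)) {
        std::this_thread::yield();  // cede the core between polls
      }
    }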