diff --git a/src/xenia/app/emulator_window.cc b/src/xenia/app/emulator_window.cc index 65a6f5e0e..1fd73dd2f 100644 --- a/src/xenia/app/emulator_window.cc +++ b/src/xenia/app/emulator_window.cc @@ -979,7 +979,12 @@ void EmulatorWindow::ToggleDisplayConfigDialog() { } void EmulatorWindow::ToggleControllerVibration() { - emulator()->input_system()->ToggleVibration(); + auto input_sys = emulator()->input_system(); + if (input_sys) { + auto input_lock = input_sys->lock(); + + input_sys->ToggleVibration(); + } } void EmulatorWindow::ShowCompatibility() { diff --git a/src/xenia/base/clock.cc b/src/xenia/base/clock.cc index 5f4905dda..774f8e25d 100644 --- a/src/xenia/base/clock.cc +++ b/src/xenia/base/clock.cc @@ -50,14 +50,8 @@ uint64_t last_guest_tick_count_ = 0; // Last sampled host tick count. uint64_t last_host_tick_count_ = Clock::QueryHostTickCount(); -struct null_lock { - public: - static void lock() {} - static void unlock() {} - static bool try_lock() { return true; } -}; -using tick_mutex_type = null_lock; // xe::xe_mutex; +using tick_mutex_type = xe_unlikely_mutex; // Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync // std::mutex tick_mutex_; @@ -176,6 +170,7 @@ uint64_t Clock::QueryGuestTickCount() { return guest_tick_count; } +uint64_t* Clock::GetGuestTickCountPointer() { return &last_guest_tick_count_; } uint64_t Clock::QueryGuestSystemTime() { if (cvars::clock_no_scaling) { return Clock::QueryHostSystemTime(); diff --git a/src/xenia/base/clock.h b/src/xenia/base/clock.h index 81894ca97..0f27ce081 100644 --- a/src/xenia/base/clock.h +++ b/src/xenia/base/clock.h @@ -74,6 +74,8 @@ class Clock { // Queries the current guest tick count, accounting for frequency adjustment // and scaling. static uint64_t QueryGuestTickCount(); + + static uint64_t* GetGuestTickCountPointer(); // Queries the guest time, in FILETIME format, accounting for scaling. static uint64_t QueryGuestSystemTime(); // Queries the milliseconds since the guest began, accounting for scaling. diff --git a/src/xenia/base/clock_win.cc b/src/xenia/base/clock_win.cc index e087aa946..c391731d3 100644 --- a/src/xenia/base/clock_win.cc +++ b/src/xenia/base/clock_win.cc @@ -12,18 +12,17 @@ #include "xenia/base/platform_win.h" namespace xe { - #if XE_USE_KUSER_SHARED==1 +#if XE_USE_KUSER_SHARED == 1 uint64_t Clock::host_tick_frequency_platform() { return 10000000ULL; } uint64_t Clock::host_tick_count_platform() { - return *reinterpret_cast(&KUserShared()->SystemTime); + return *reinterpret_cast(GetKUserSharedSystemTime()); } uint64_t Clock::QueryHostSystemTime() { - return *reinterpret_cast(&KUserShared()->SystemTime); + return *reinterpret_cast(GetKUserSharedSystemTime()); } - - #else +#else uint64_t Clock::host_tick_frequency_platform() { LARGE_INTEGER frequency; QueryPerformanceFrequency(&frequency); @@ -44,13 +43,9 @@ uint64_t Clock::QueryHostSystemTime() { return (uint64_t(t.dwHighDateTime) << 32) | t.dwLowDateTime; } -uint64_t Clock::QueryHostUptimeMillis() { - return host_tick_count_platform() * 1000 / host_tick_frequency_platform(); -} #endif uint64_t Clock::QueryHostUptimeMillis() { return host_tick_count_platform() * 1000 / host_tick_frequency_platform(); } - } // namespace xe diff --git a/src/xenia/base/console_win.cc b/src/xenia/base/console_win.cc index 252e99b84..2549a46ed 100644 --- a/src/xenia/base/console_win.cc +++ b/src/xenia/base/console_win.cc @@ -19,7 +19,7 @@ namespace xe { // TODO(Triang3l): Set the default depending on the actual subsystem. Currently // it inhibits message boxes. 
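// (with the default flipped to false below, message boxes are no longer
// inhibited unless a console really is attached)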
-static bool has_console_attached_ = true; +static bool has_console_attached_ = false; bool has_console_attached() { return has_console_attached_; } diff --git a/src/xenia/base/logging.h b/src/xenia/base/logging.h index 208abd3e3..6b3c41561 100644 --- a/src/xenia/base/logging.h +++ b/src/xenia/base/logging.h @@ -78,17 +78,25 @@ std::pair GetThreadBuffer(); void AppendLogLine(LogLevel log_level, const char prefix_char, size_t written); } // namespace internal - -// Appends a line to the log with {fmt}-style formatting. template -void AppendLogLineFormat(LogLevel log_level, const char prefix_char, +XE_NOINLINE XE_COLD static void AppendLogLineFormat_Impl(LogLevel log_level, + const char prefix_char, + const char* format, + const Args&... args) { + auto target = internal::GetThreadBuffer(); + auto result = fmt::format_to_n(target.first, target.second, format, args...); + internal::AppendLogLine(log_level, prefix_char, result.size); +} + + // Appends a line to the log with {fmt}-style formatting. +//chrispy: inline the initial check, outline the append. the append should happen rarely for end users +template +XE_FORCEINLINE static void AppendLogLineFormat(LogLevel log_level, const char prefix_char, const char* format, const Args&... args) { if (!internal::ShouldLog(log_level)) { return; } - auto target = internal::GetThreadBuffer(); - auto result = fmt::format_to_n(target.first, target.second, format, args...); - internal::AppendLogLine(log_level, prefix_char, result.size); + AppendLogLineFormat_Impl(log_level, prefix_char, format, args...); } // Appends a line to the log. @@ -98,18 +106,19 @@ void AppendLogLine(LogLevel log_level, const char prefix_char, } // namespace logging // Logs a fatal error and aborts the program. -void FatalError(const std::string_view str); +[[noreturn]] void FatalError(const std::string_view str); } // namespace xe #if XE_OPTION_ENABLE_LOGGING template -void XELOGE(const char* format, const Args&... args) { +XE_COLD void XELOGE(const char* format, const Args&... args) { xe::logging::AppendLogLineFormat(xe::LogLevel::Error, '!', format, args...); } template +XE_COLD void XELOGW(const char* format, const Args&... args) { xe::logging::AppendLogLineFormat(xe::LogLevel::Warning, 'w', format, args...); } @@ -131,12 +140,12 @@ void XELOGCPU(const char* format, const Args&... args) { template void XELOGAPU(const char* format, const Args&... args) { - xe::logging::AppendLogLineFormat(xe::LogLevel::Info, 'A', format, args...); + xe::logging::AppendLogLineFormat(xe::LogLevel::Debug, 'A', format, args...); } template void XELOGGPU(const char* format, const Args&... 
args) { - xe::logging::AppendLogLineFormat(xe::LogLevel::Info, 'G', format, args...); + xe::logging::AppendLogLineFormat(xe::LogLevel::Debug, 'G', format, args...); } template diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h index 979b390ba..1afeedc14 100644 --- a/src/xenia/base/memory.h +++ b/src/xenia/base/memory.h @@ -466,9 +466,11 @@ constexpr inline fourcc_t make_fourcc(const std::string_view fourcc) { } return make_fourcc(fourcc[0], fourcc[1], fourcc[2], fourcc[3]); } -//chrispy::todo:use for command stream vector, resize happens a ton and has to call memset + +// chrispy::todo:use for command stream vector, resize happens a ton and has to +// call memset template -class fixed_vmem_vector { +class FixedVMemVector { static_assert((sz & 65535) == 0, "Always give fixed_vmem_vector a size divisible by 65536 to " "avoid wasting memory on windows"); @@ -477,12 +479,12 @@ class fixed_vmem_vector { size_t nbytes_; public: - fixed_vmem_vector() + FixedVMemVector() : data_((uint8_t*)memory::AllocFixed( nullptr, sz, memory::AllocationType::kReserveCommit, memory::PageAccess::kReadWrite)), nbytes_(0) {} - ~fixed_vmem_vector() { + ~FixedVMemVector() { if (data_) { memory::DeallocFixed(data_, sz, memory::DeallocationType::kRelease); data_ = nullptr; @@ -503,13 +505,221 @@ class fixed_vmem_vector { resize(0); // todo:maybe zero out } void reserve(size_t size) { xenia_assert(size < sz); } - - }; +// software prefetches/cache operations +namespace swcache { +/* + warning, prefetchw's current behavior is not consistent across msvc and + clang, for clang it will only compile to prefetchw if the set architecture + supports it, for msvc however it will unconditionally compile to prefetchw! + so prefetchw support is still in process + + + only use these if you're absolutely certain you know what you're doing; + you can easily tank performance through misuse CPUS have excellent automatic + prefetchers that can predict patterns, but in situations where memory + accesses are super unpredictable and follow no pattern you can make use of + them + + another scenario where it can be handy is when crossing page boundaries, + as many automatic prefetchers do not allow their streams to cross pages (no + idea what this means for huge pages) + + I believe software prefetches do not kick off an automatic prefetcher + stream, so you can't just prefetch one line of the data you're about to + access and be fine, you need to go all the way + + prefetchnta is implementation dependent, and that makes its use a bit + limited. For intel cpus, i believe it only prefetches the line into one way + of the L3 + + for amd cpus, it marks the line as requiring immediate eviction, the + next time an entry is needed in the set it resides in it will be evicted. 
ms + does dumb shit for memcpy, like looping over the contents of the source + buffer and doing prefetchnta on them, likely evicting some of the data they + just prefetched by the end of the buffer, and probably messing up data that + was already in the cache + + + another warning for these: this bypasses what i think is called + "critical word load", the data will always become available starting from the + very beginning of the line instead of from the piece that is needed + + L1I cache is not prefetchable, however likely all cpus can fulfill + requests for the L1I from L2, so prefetchL2 on instructions should be fine + + todo: clwb, clflush +*/ +#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1 + +XE_FORCEINLINE +static void PrefetchW(const void* addr) { __builtin_prefetch(addr, 1, 0); } +XE_FORCEINLINE + +static void PrefetchNTA(const void* addr) { __builtin_prefetch(addr, 0, 0); } +XE_FORCEINLINE + +static void PrefetchL3(const void* addr) { __builtin_prefetch(addr, 0, 1); } +XE_FORCEINLINE + +static void PrefetchL2(const void* addr) { __builtin_prefetch(addr, 0, 2); } +XE_FORCEINLINE + +static void PrefetchL1(const void* addr) { __builtin_prefetch(addr, 0, 3); } +#elif XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1 +XE_FORCEINLINE +static void PrefetchW(const void* addr) { _m_prefetchw(addr); } + +XE_FORCEINLINE +static void PrefetchNTA(const void* addr) { + _mm_prefetch((const char*)addr, _MM_HINT_NTA); +} +XE_FORCEINLINE + +static void PrefetchL3(const void* addr) { + _mm_prefetch((const char*)addr, _MM_HINT_T2); +} +XE_FORCEINLINE + +static void PrefetchL2(const void* addr) { + _mm_prefetch((const char*)addr, _MM_HINT_T1); +} +XE_FORCEINLINE + +static void PrefetchL1(const void* addr) { + _mm_prefetch((const char*)addr, _MM_HINT_T0); +} + +#else +XE_FORCEINLINE +static void PrefetchW(const void* addr) {} + +XE_FORCEINLINE +static void PrefetchNTA(const void* addr) {} +XE_FORCEINLINE + +static void PrefetchL3(const void* addr) {} +XE_FORCEINLINE + +static void PrefetchL2(const void* addr) {} +XE_FORCEINLINE + +static void PrefetchL1(const void* addr) {} + +#endif + +enum class PrefetchTag { Write, Nontemporal, Level3, Level2, Level1 }; + +template +static void Prefetch(const void* addr) { + static_assert(false, "Unknown tag"); +} + +template <> +static void Prefetch(const void* addr) { + PrefetchW(addr); +} +template <> +static void Prefetch(const void* addr) { + PrefetchNTA(addr); +} +template <> +static void Prefetch(const void* addr) { + PrefetchL3(addr); +} +template <> +static void Prefetch(const void* addr) { + PrefetchL2(addr); +} +template <> +static void Prefetch(const void* addr) { + PrefetchL1(addr); +} +// todo: does aarch64 have streaming stores/loads? + +/* + non-temporal stores/loads + + the stores allow cacheable memory to behave like write-combining memory. + on the first nt store to a line, an intermediate buffer will be + allocated by the cpu for stores that come after. once the entire contents of + the line have been written the intermediate buffer will be transmitted to + memory + + the written line will not be cached and if it is in the cache it will be + invalidated from all levels of the hierarchy + + the cpu in this case does not have to read line from memory when we + first write to it if it is not anywhere in the cache, so we use half the + memory bandwidth using these stores + + non-temporal loads are... loads, but they dont use the cache. 
you need + to manually insert memory barriers (_ReadWriteBarrier, ReadBarrier, etc, do + not use any barriers that generate actual code) if on msvc to prevent it from + moving the load of the data to just before the use of the data (immediately + requiring the memory to be available = big stall) +*/ +#if XE_COMPILER_MSVC == 1 && XE_COMPILER_CLANG_CL == 0 +#define XE_MSVC_REORDER_BARRIER _ReadWriteBarrier + +#else +// if the compiler actually has pipelining for instructions we dont need a +// barrier +#define XE_MSVC_REORDER_BARRIER() static_cast(0) +#endif +#if XE_ARCH_AMD64 == 1 + +XE_FORCEINLINE +static void WriteLineNT(void* destination, const void* source) { + assert((reinterpret_cast(destination) & 63ULL) == 0); + __m256i low = _mm256_loadu_si256((const __m256i*)source); + __m256i high = _mm256_loadu_si256(&((const __m256i*)source)[1]); + XE_MSVC_REORDER_BARRIER(); + _mm256_stream_si256((__m256i*)destination, low); + _mm256_stream_si256(&((__m256i*)destination)[1], high); +} + +XE_FORCEINLINE +static void ReadLineNT(void* destination, const void* source) { + assert((reinterpret_cast(source) & 63ULL) == 0); + __m256i low = _mm256_stream_load_si256((const __m256i*)source); + __m256i high = _mm256_stream_load_si256(&((const __m256i*)source)[1]); + XE_MSVC_REORDER_BARRIER(); + _mm256_storeu_si256((__m256i*)destination, low); + _mm256_storeu_si256(&((__m256i*)destination)[1], high); +} + +XE_FORCEINLINE +static void WriteFence() { _mm_sfence(); } +XE_FORCEINLINE +static void ReadFence() { _mm_lfence(); } +XE_FORCEINLINE +static void ReadWriteFence() { _mm_mfence(); } +#else + +XE_FORCEINLINE +static void WriteLineNT(void* destination, const void* source) { + assert((reinterpret_cast(destination) & 63ULL) == 0); + memcpy(destination, source, 64); +} + +XE_FORCEINLINE +static void ReadLineNT(void* destination, const void* source) { + assert((reinterpret_cast(source) & 63ULL) == 0); + memcpy(destination, source, 64); +} +XE_FORCEINLINE +static void WriteFence() {} +XE_FORCEINLINE +static void ReadFence() {} +XE_FORCEINLINE +static void ReadWriteFence() {} +#endif +} // namespace swcache } // namespace xe #endif // XENIA_BASE_MEMORY_H_ diff --git a/src/xenia/base/mutex.cc b/src/xenia/base/mutex.cc index 762f05490..027cd7882 100644 --- a/src/xenia/base/mutex.cc +++ b/src/xenia/base/mutex.cc @@ -12,12 +12,14 @@ #include "xenia/base/platform_win.h" #endif - namespace xe { -#if XE_PLATFORM_WIN32 == 1 &&XE_ENABLE_FAST_WIN32_MUTEX == 1 - //default spincount for entercriticalsection is insane on windows, 0x20007D0i64 (33556432 times!!) - //when a lock is highly contended performance degrades sharply on some processors - #define XE_CRIT_SPINCOUNT 128 +#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1 +// default spincount for entercriticalsection is insane on windows, 0x20007D0i64 +// (33556432 times!!) 
when a lock is highly contended performance degrades +// sharply on some processors todo: perhaps we should have a set of optional +// jobs that processors can do instead of spinning, for instance, sorting a list +// so we have better locality later or something +#define XE_CRIT_SPINCOUNT 128 /* chrispy: todo, if a thread exits before releasing the global mutex we need to check this and release the mutex one way to do this is by using FlsAlloc and @@ -30,8 +32,8 @@ static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) { } xe_global_mutex::xe_global_mutex() { - InitializeCriticalSectionAndSpinCount(global_critical_section(this), - XE_CRIT_SPINCOUNT); + InitializeCriticalSectionEx(global_critical_section(this), XE_CRIT_SPINCOUNT, + CRITICAL_SECTION_NO_DEBUG_INFO); } xe_global_mutex ::~xe_global_mutex() { DeleteCriticalSection(global_critical_section(this)); @@ -65,7 +67,8 @@ CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) { return reinterpret_cast(mutex); } xe_fast_mutex::xe_fast_mutex() { - InitializeCriticalSectionAndSpinCount(fast_crit(this), XE_CRIT_SPINCOUNT); + InitializeCriticalSectionEx(fast_crit(this), XE_CRIT_SPINCOUNT, + CRITICAL_SECTION_NO_DEBUG_INFO); } xe_fast_mutex::~xe_fast_mutex() { DeleteCriticalSection(fast_crit(this)); } diff --git a/src/xenia/base/mutex.h b/src/xenia/base/mutex.h index 36351377b..92c3a9405 100644 --- a/src/xenia/base/mutex.h +++ b/src/xenia/base/mutex.h @@ -12,10 +12,10 @@ #include #include "platform.h" -#define XE_ENABLE_FAST_WIN32_MUTEX 1 +#define XE_ENABLE_FAST_WIN32_MUTEX 1 namespace xe { -#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX==1 +#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1 /* must conform to BasicLockable:https://en.cppreference.com/w/cpp/named_req/BasicLockable as @@ -23,7 +23,8 @@ namespace xe { this emulates a recursive mutex, except with far less overhead */ -class alignas(64) xe_global_mutex { + +class alignas(4096) xe_global_mutex { char detail[64]; public: @@ -47,11 +48,50 @@ class alignas(64) xe_fast_mutex { void unlock(); bool try_lock(); }; +// a mutex that is extremely unlikely to ever be locked +// use for race conditions that have extremely remote odds of happening +class xe_unlikely_mutex { + std::atomic mut; + bool _tryget() { + uint32_t lock_expected = 0; + return mut.compare_exchange_strong(lock_expected, 1); + } + + public: + xe_unlikely_mutex() : mut(0) {} + ~xe_unlikely_mutex() { mut = 0; } + + void lock() { + uint32_t lock_expected = 0; + + if (XE_LIKELY(_tryget())) { + return; + } else { + do { + // chrispy: warning, if no SMT, mm_pause does nothing... +#if XE_ARCH_AMD64 == 1 + _mm_pause(); +#endif + + } while (!_tryget()); + } + } + void unlock() { mut.exchange(0); } + bool try_lock() { return _tryget(); } +}; using xe_mutex = xe_fast_mutex; #else using global_mutex_type = std::recursive_mutex; using xe_mutex = std::mutex; +using xe_unlikely_mutex = std::mutex; #endif +struct null_mutex { + public: + static void lock() {} + static void unlock() {} + static bool try_lock() { return true; } +}; + using global_unique_lock_type = std::unique_lock; // The global critical region mutex singleton. // This must guard any operation that may suspend threads or be sensitive to diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h index 6d1a6d5f9..ebb555034 100644 --- a/src/xenia/base/platform.h +++ b/src/xenia/base/platform.h @@ -122,6 +122,7 @@ #define XE_COLD __attribute__((cold)) #define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true) #define XE_UNLIKELY(...) 
__builtin_expect(!!(__VA_ARGS__), false) + #else #define XE_FORCEINLINE inline #define XE_NOINLINE @@ -129,6 +130,24 @@ #define XE_LIKELY(...) (!!(__VA_ARGS__)) #define XE_UNLIKELY(...) (!!(__VA_ARGS__)) #endif +// only use __restrict if MSVC, for clang/gcc we can use -fstrict-aliasing which +// acts as __restrict across the board todo: __restrict is part of the type +// system, we might actually have to still emit it on clang and gcc +#if XE_COMPILER_CLANG_CL == 0 && XE_COMPILER_MSVC == 1 + +#define XE_RESTRICT __restrict +#else +#define XE_RESTRICT +#endif + +#if XE_ARCH_AMD64 == 1 +#define XE_HOST_CACHE_LINE_SIZE 64 +#elif XE_ARCH_ARM64 == 1 +#define XE_HOST_CACHE_LINE_SIZE 64 +#else + +#error unknown cache line size for unknown architecture! +#endif namespace xe { diff --git a/src/xenia/base/platform_win.h b/src/xenia/base/platform_win.h index a9ea263d6..a608f04b4 100644 --- a/src/xenia/base/platform_win.h +++ b/src/xenia/base/platform_win.h @@ -35,7 +35,9 @@ #undef GetFirstChild #define XE_USE_NTDLL_FUNCTIONS 1 -#define XE_USE_KUSER_SHARED 1 +//chrispy: disabling this for now, more research needs to be done imo, although it does work very well on my machine +// +#define XE_USE_KUSER_SHARED 0 #if XE_USE_NTDLL_FUNCTIONS == 1 /* ntdll versions of functions often skip through a lot of extra garbage in @@ -61,142 +63,19 @@ #define XE_NTDLL_IMPORT(name, cls, clsvar) static constexpr bool clsvar = false #endif - +#if XE_USE_KUSER_SHARED==1 // KUSER_SHARED struct __declspec(align(4)) _KSYSTEM_TIME { unsigned int LowPart; int High1Time; int High2Time; }; -enum _NT_PRODUCT_TYPE { - NtProductWinNt = 0x1, - NtProductLanManNt = 0x2, - NtProductServer = 0x3, -}; -enum _ALTERNATIVE_ARCHITECTURE_TYPE { - StandardDesign = 0x0, - NEC98x86 = 0x1, - EndAlternatives = 0x2, -}; -#pragma pack(push, 1) -struct $3D940D5D03EF7F98CEE6737EDE752E57 { - __int8 _bf_0; -}; - -union $DA7A7E727E24E4DD62317E27558CCADA { - unsigned __int8 MitigationPolicies; - $3D940D5D03EF7F98CEE6737EDE752E57 __s1; -}; -struct __declspec(align(4)) $4BF4056B39611650D41923F164DAFA52 { - __int32 _bf_0; -}; - -union __declspec(align(4)) $BB68545E345A5F8046EF3BC0FE928142 { - unsigned int SharedDataFlags; - $4BF4056B39611650D41923F164DAFA52 __s1; -}; -union $5031D289C483414B89DA3F368D1FE62C { - volatile _KSYSTEM_TIME TickCount; - volatile unsigned __int64 TickCountQuad; - unsigned int ReservedTickCountOverlay[3]; -}; -struct $F91ACE6F13277DFC9425B9B8BBCB30F7 { - volatile unsigned __int8 QpcBypassEnabled; - unsigned __int8 QpcShift; -}; - -union __declspec(align(2)) $3C927F8BB7EAEE13CF0CFC3E60EDC8A9 { - unsigned __int16 QpcData; - $F91ACE6F13277DFC9425B9B8BBCB30F7 __s1; -}; - -struct __declspec(align(8)) _KUSER_SHARED_DATA { - unsigned int TickCountLowDeprecated; - unsigned int TickCountMultiplier; - volatile _KSYSTEM_TIME InterruptTime; - volatile _KSYSTEM_TIME SystemTime; - volatile _KSYSTEM_TIME TimeZoneBias; - unsigned __int16 ImageNumberLow; - unsigned __int16 ImageNumberHigh; - wchar_t NtSystemRoot[260]; - unsigned int MaxStackTraceDepth; - unsigned int CryptoExponent; - unsigned int TimeZoneId; - unsigned int LargePageMinimum; - unsigned int AitSamplingValue; - unsigned int AppCompatFlag; - unsigned __int64 RNGSeedVersion; - unsigned int GlobalValidationRunlevel; - volatile int TimeZoneBiasStamp; - unsigned int NtBuildNumber; - _NT_PRODUCT_TYPE NtProductType; - unsigned __int8 ProductTypeIsValid; - unsigned __int8 Reserved0[1]; - unsigned __int16 NativeProcessorArchitecture; - unsigned int NtMajorVersion; - unsigned int NtMinorVersion; 
- unsigned __int8 ProcessorFeatures[64]; - unsigned int Reserved1; - unsigned int Reserved3; - volatile unsigned int TimeSlip; - _ALTERNATIVE_ARCHITECTURE_TYPE AlternativeArchitecture; - unsigned int BootId; - _LARGE_INTEGER SystemExpirationDate; - unsigned int SuiteMask; - unsigned __int8 KdDebuggerEnabled; - $DA7A7E727E24E4DD62317E27558CCADA ___u33; - unsigned __int8 Reserved6[2]; - volatile unsigned int ActiveConsoleId; - volatile unsigned int DismountCount; - unsigned int ComPlusPackage; - unsigned int LastSystemRITEventTickCount; - unsigned int NumberOfPhysicalPages; - unsigned __int8 SafeBootMode; - unsigned __int8 VirtualizationFlags; - unsigned __int8 Reserved12[2]; - $BB68545E345A5F8046EF3BC0FE928142 ___u43; - unsigned int DataFlagsPad[1]; - unsigned __int64 TestRetInstruction; - __int64 QpcFrequency; - unsigned int SystemCall; - unsigned int SystemCallPad0; - unsigned __int64 SystemCallPad[2]; - $5031D289C483414B89DA3F368D1FE62C ___u50; - unsigned int TickCountPad[1]; - unsigned int Cookie; - unsigned int CookiePad[1]; - __int64 ConsoleSessionForegroundProcessId; - unsigned __int64 TimeUpdateLock; - unsigned __int64 BaselineSystemTimeQpc; - unsigned __int64 BaselineInterruptTimeQpc; - unsigned __int64 QpcSystemTimeIncrement; - unsigned __int64 QpcInterruptTimeIncrement; - unsigned __int8 QpcSystemTimeIncrementShift; - unsigned __int8 QpcInterruptTimeIncrementShift; - unsigned __int16 UnparkedProcessorCount; - unsigned int EnclaveFeatureMask[4]; - unsigned int TelemetryCoverageRound; - unsigned __int16 UserModeGlobalLogger[16]; - unsigned int ImageFileExecutionOptions; - unsigned int LangGenerationCount; - unsigned __int64 Reserved4; - volatile unsigned __int64 InterruptTimeBias; - volatile unsigned __int64 QpcBias; - unsigned int ActiveProcessorCount; - volatile unsigned __int8 ActiveGroupCount; - unsigned __int8 Reserved9; - $3C927F8BB7EAEE13CF0CFC3E60EDC8A9 ___u74; - _LARGE_INTEGER TimeZoneBiasEffectiveStart; - _LARGE_INTEGER TimeZoneBiasEffectiveEnd; - _XSTATE_CONFIGURATION XState; -}; -static constexpr unsigned KUSER_SIZE = sizeof(_KUSER_SHARED_DATA); - -static_assert(KUSER_SIZE == 1808, "yay"); -#pragma pack(pop) - -static _KUSER_SHARED_DATA* KUserShared() { - return (_KUSER_SHARED_DATA*)0x7FFE0000; +static constexpr size_t KSUER_SHARED_SYSTEMTIME_OFFSET = 0x14; +static unsigned char* KUserShared() { return (unsigned char*)0x7FFE0000ULL; } +static volatile _KSYSTEM_TIME* GetKUserSharedSystemTime() { + return reinterpret_cast( + KUserShared() + KSUER_SHARED_SYSTEMTIME_OFFSET); } +#endif #endif // XENIA_BASE_PLATFORM_WIN_H_ diff --git a/src/xenia/base/ring_buffer.cc b/src/xenia/base/ring_buffer.cc index b4e1013f8..d7176c068 100644 --- a/src/xenia/base/ring_buffer.cc +++ b/src/xenia/base/ring_buffer.cc @@ -8,46 +8,52 @@ */ #include "xenia/base/ring_buffer.h" - #include #include namespace xe { RingBuffer::RingBuffer(uint8_t* buffer, size_t capacity) - : buffer_(buffer), capacity_(capacity) {} + : buffer_(buffer), + capacity_(static_cast(capacity)), + read_offset_(0), + write_offset_(0) {} -void RingBuffer::AdvanceRead(size_t count) { +void RingBuffer::AdvanceRead(size_t _count) { + ring_size_t count = static_cast(_count); if (read_offset_ + count < capacity_) { read_offset_ += count; } else { - size_t left_half = capacity_ - read_offset_; - size_t right_half = count - left_half; + ring_size_t left_half = capacity_ - read_offset_; + ring_size_t right_half = count - left_half; read_offset_ = right_half; } } -void RingBuffer::AdvanceWrite(size_t count) { +void 
RingBuffer::AdvanceWrite(size_t _count) { + ring_size_t count = static_cast(_count); + if (write_offset_ + count < capacity_) { write_offset_ += count; } else { - size_t left_half = capacity_ - write_offset_; - size_t right_half = count - left_half; + ring_size_t left_half = capacity_ - write_offset_; + ring_size_t right_half = count - left_half; write_offset_ = right_half; } } -RingBuffer::ReadRange RingBuffer::BeginRead(size_t count) { - count = std::min(count, capacity_); +RingBuffer::ReadRange RingBuffer::BeginRead(size_t _count) { + ring_size_t count = + std::min(static_cast(_count), capacity_); if (!count) { return {0}; } if (read_offset_ + count < capacity_) { - return {buffer_ + read_offset_, count, nullptr, 0}; + return {buffer_ + read_offset_, nullptr, count, 0}; } else { - size_t left_half = capacity_ - read_offset_; - size_t right_half = count - left_half; - return {buffer_ + read_offset_, left_half, buffer_, right_half}; + ring_size_t left_half = capacity_ - read_offset_; + ring_size_t right_half = count - left_half; + return {buffer_ + read_offset_, buffer_, left_half, right_half}; } } @@ -59,7 +65,8 @@ void RingBuffer::EndRead(ReadRange read_range) { } } -size_t RingBuffer::Read(uint8_t* buffer, size_t count) { +size_t RingBuffer::Read(uint8_t* buffer, size_t _count) { + ring_size_t count = static_cast(_count); count = std::min(count, capacity_); if (!count) { return 0; @@ -69,7 +76,7 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t count) { if (read_offset_ < write_offset_) { assert_true(read_offset_ + count <= write_offset_); } else if (read_offset_ + count >= capacity_) { - size_t left_half = capacity_ - read_offset_; + ring_size_t left_half = capacity_ - read_offset_; assert_true(count - left_half <= write_offset_); } @@ -77,8 +84,8 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t count) { std::memcpy(buffer, buffer_ + read_offset_, count); read_offset_ += count; } else { - size_t left_half = capacity_ - read_offset_; - size_t right_half = count - left_half; + ring_size_t left_half = capacity_ - read_offset_; + ring_size_t right_half = count - left_half; std::memcpy(buffer, buffer_ + read_offset_, left_half); std::memcpy(buffer + left_half, buffer_, right_half); read_offset_ = right_half; @@ -87,7 +94,8 @@ size_t RingBuffer::Read(uint8_t* buffer, size_t count) { return count; } -size_t RingBuffer::Write(const uint8_t* buffer, size_t count) { +size_t RingBuffer::Write(const uint8_t* buffer, size_t _count) { + ring_size_t count = static_cast(_count); count = std::min(count, capacity_); if (!count) { return 0; @@ -105,8 +113,8 @@ size_t RingBuffer::Write(const uint8_t* buffer, size_t count) { std::memcpy(buffer_ + write_offset_, buffer, count); write_offset_ += count; } else { - size_t left_half = capacity_ - write_offset_; - size_t right_half = count - left_half; + ring_size_t left_half = capacity_ - write_offset_; + ring_size_t right_half = count - left_half; std::memcpy(buffer_ + write_offset_, buffer, left_half); std::memcpy(buffer_, buffer + left_half, right_half); write_offset_ = right_half; diff --git a/src/xenia/base/ring_buffer.h b/src/xenia/base/ring_buffer.h index 9925622de..a4befb686 100644 --- a/src/xenia/base/ring_buffer.h +++ b/src/xenia/base/ring_buffer.h @@ -17,6 +17,8 @@ #include "xenia/base/assert.h" #include "xenia/base/byte_order.h" +#include "xenia/base/math.h" +#include "xenia/base/memory.h" namespace xe { /* @@ -39,18 +41,24 @@ namespace xe { that the registers no longer need the rex prefix, shrinking the generated code a bit.. 
like i said, every bit helps in this class */ +using ring_size_t = uint32_t; class RingBuffer { public: RingBuffer(uint8_t* buffer, size_t capacity); uint8_t* buffer() const { return buffer_; } - size_t capacity() const { return capacity_; } + ring_size_t capacity() const { return capacity_; } bool empty() const { return read_offset_ == write_offset_; } - size_t read_offset() const { return read_offset_; } - uintptr_t read_ptr() const { return uintptr_t(buffer_) + read_offset_; } + ring_size_t read_offset() const { return read_offset_; } + uintptr_t read_ptr() const { + return uintptr_t(buffer_) + static_cast(read_offset_); + } + + // todo: offset/ capacity_ is probably always 1 when its over, just check and + // subtract instead void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; } - size_t read_count() const { + ring_size_t read_count() const { // chrispy: these branches are unpredictable #if 0 if (read_offset_ == write_offset_) { @@ -61,14 +69,14 @@ class RingBuffer { return (capacity_ - read_offset_) + write_offset_; } #else - size_t read_offs = read_offset_; - size_t write_offs = write_offset_; - size_t cap = capacity_; + ring_size_t read_offs = read_offset_; + ring_size_t write_offs = write_offset_; + ring_size_t cap = capacity_; - size_t offset_delta = write_offs - read_offs; - size_t wrap_read_count = (cap - read_offs) + write_offs; + ring_size_t offset_delta = write_offs - read_offs; + ring_size_t wrap_read_count = (cap - read_offs) + write_offs; - size_t comparison_value = read_offs <= write_offs; + ring_size_t comparison_value = read_offs <= write_offs; #if 0 size_t selector = static_cast(-static_cast(comparison_value)); @@ -89,10 +97,12 @@ class RingBuffer { #endif } - size_t write_offset() const { return write_offset_; } + ring_size_t write_offset() const { return write_offset_; } uintptr_t write_ptr() const { return uintptr_t(buffer_) + write_offset_; } - void set_write_offset(size_t offset) { write_offset_ = offset % capacity_; } - size_t write_count() const { + void set_write_offset(size_t offset) { + write_offset_ = static_cast(offset) % capacity_; + } + ring_size_t write_count() const { if (read_offset_ == write_offset_) { return capacity_; } else if (write_offset_ < read_offset_) { @@ -107,13 +117,35 @@ class RingBuffer { struct ReadRange { const uint8_t* first; - size_t first_length; + const uint8_t* second; - size_t second_length; + ring_size_t first_length; + ring_size_t second_length; }; ReadRange BeginRead(size_t count); void EndRead(ReadRange read_range); + /* + BeginRead, but if there is a second Range it will prefetch all lines of it + + this does not prefetch the first range, because software prefetching can do that faster than we can + */ + template + XE_FORCEINLINE ReadRange BeginPrefetchedRead(size_t count) { + ReadRange range = BeginRead(count); + + if (range.second) { + ring_size_t numlines = + xe::align(range.second_length, XE_HOST_CACHE_LINE_SIZE) / + XE_HOST_CACHE_LINE_SIZE; + //chrispy: maybe unroll? 
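      // (each iteration below prefetches one XE_HOST_CACHE_LINE_SIZE (64-byte)
      // line of the wrapped second range; numlines rounds second_length up so
      // the trailing partial line is fetched as well)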
+ for (ring_size_t i = 0; i < numlines; ++i) { + swcache::Prefetch(range.second + (i * XE_HOST_CACHE_LINE_SIZE)); + } + } + return range; + } + size_t Read(uint8_t* buffer, size_t count); template size_t Read(T* buffer, size_t count) { @@ -156,29 +188,29 @@ class RingBuffer { private: uint8_t* buffer_ = nullptr; - size_t capacity_ = 0; - size_t read_offset_ = 0; - size_t write_offset_ = 0; + ring_size_t capacity_ = 0; + ring_size_t read_offset_ = 0; + ring_size_t write_offset_ = 0; }; template <> inline uint32_t RingBuffer::ReadAndSwap() { - size_t read_offset = this->read_offset_; + ring_size_t read_offset = this->read_offset_; xenia_assert(this->capacity_ >= 4); - size_t next_read_offset = read_offset + 4; - #if 0 + ring_size_t next_read_offset = read_offset + 4; +#if 0 size_t zerotest = next_read_offset - this->capacity_; // unpredictable branch, use bit arith instead // todo: it would be faster to use lzcnt, but we need to figure out if all // machines we support support it next_read_offset &= -static_cast(!!zerotest); - #else +#else if (XE_UNLIKELY(next_read_offset == this->capacity_)) { next_read_offset = 0; - //todo: maybe prefetch next? or should that happen much earlier? + // todo: maybe prefetch next? or should that happen much earlier? } - #endif +#endif this->read_offset_ = next_read_offset; unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset]; return xe::byte_swap(ring_value); diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index c3711f239..ba17c3caf 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -10,7 +10,7 @@ #include "xenia/cpu/backend/x64/x64_backend.h" #include - +#include #include "third_party/capstone/include/capstone/capstone.h" #include "third_party/capstone/include/capstone/x86.h" @@ -50,6 +50,9 @@ DEFINE_bool(record_mmio_access_exceptions, true, "for them. This info can then be used on a subsequent run to " "instruct the recompiler to emit checks", "CPU"); +#if XE_X64_PROFILER_AVAILABLE == 1 +DECLARE_bool(instrument_call_times); +#endif namespace xe { namespace cpu { @@ -96,6 +99,68 @@ static void ForwardMMIOAccessForRecording(void* context, void* hostaddr) { reinterpret_cast(context) ->RecordMMIOExceptionForGuestInstruction(hostaddr); } +#if XE_X64_PROFILER_AVAILABLE == 1 +// todo: better way of passing to atexit. maybe do in destructor instead? 
+// nope, destructor is never called +static GuestProfilerData* backend_profiler_data = nullptr; + +static uint64_t nanosecond_lifetime_start = 0; +static void WriteGuestProfilerData() { + if (cvars::instrument_call_times) { + uint64_t end = Clock::QueryHostSystemTime(); + + uint64_t total = end - nanosecond_lifetime_start; + + double totaltime_divisor = static_cast(total); + + FILE* output_file = nullptr; + std::vector> unsorted_profile{}; + for (auto&& entry : *backend_profiler_data) { + if (entry.second) { // skip times of 0 + unsorted_profile.emplace_back(entry.first, entry.second); + } + } + + std::sort(unsorted_profile.begin(), unsorted_profile.end(), + [](auto& x, auto& y) { return x.second < y.second; }); + + fopen_s(&output_file, "profile_times.txt", "w"); + FILE* idapy_file = nullptr; + fopen_s(&idapy_file, "profile_print_times.py", "w"); + + for (auto&& sorted_entry : unsorted_profile) { + // double time_in_seconds = + // static_cast(sorted_entry.second) / 10000000.0; + double time_in_milliseconds = + static_cast(sorted_entry.second) / (10000000.0 / 1000.0); + + double slice = static_cast(sorted_entry.second) / + static_cast(totaltime_divisor); + + fprintf(output_file, + "%X took %.20f milliseconds, totaltime slice percentage %.20f \n", + sorted_entry.first, time_in_milliseconds, slice); + + fprintf(idapy_file, + "print(get_name(0x%X) + ' took %.20f ms, %.20f percent')\n", + sorted_entry.first, time_in_milliseconds, slice); + } + + fclose(output_file); + fclose(idapy_file); + } +} + +static void GuestProfilerUpdateThreadProc() { + nanosecond_lifetime_start = Clock::QueryHostSystemTime(); + + do { + xe::threading::Sleep(std::chrono::seconds(30)); + WriteGuestProfilerData(); + } while (true); +} +static std::unique_ptr g_profiler_update_thread{}; +#endif bool X64Backend::Initialize(Processor* processor) { if (!Backend::Initialize(processor)) { @@ -159,6 +224,21 @@ bool X64Backend::Initialize(Processor* processor) { processor->memory()->SetMMIOExceptionRecordingCallback( ForwardMMIOAccessForRecording, (void*)this); + +#if XE_X64_PROFILER_AVAILABLE == 1 + if (cvars::instrument_call_times) { + backend_profiler_data = &profiler_data_; + xe::threading::Thread::CreationParameters slimparams; + + slimparams.create_suspended = false; + slimparams.initial_priority = xe::threading::ThreadPriority::kLowest; + slimparams.stack_size = 65536 * 4; + + g_profiler_update_thread = std::move(xe::threading::Thread::Create( + slimparams, GuestProfilerUpdateThreadProc)); + } +#endif + return true; } @@ -734,6 +814,7 @@ void X64Backend::InitializeBackendContext(void* ctx) { bctx->flags = 0; // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png bctx->Ox1000 = 0x1000; + bctx->guest_tick_count = Clock::GetGuestTickCountPointer(); } const uint32_t mxcsr_table[8] = { 0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80, @@ -747,6 +828,23 @@ void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) { bctx->mxcsr_fpu = mxcsr_table[control]; ((ppc::PPCContext*)ctx)->fpscr.bits.rn = control; } + +#if XE_X64_PROFILER_AVAILABLE == 1 +uint64_t* X64Backend::GetProfilerRecordForFunction(uint32_t guest_address) { + // who knows, we might want to compile different versions of a function one + // day + auto entry = profiler_data_.find(guest_address); + + if (entry != profiler_data_.end()) { + return &entry->second; + } else { + profiler_data_[guest_address] = 0; + + return &profiler_data_[guest_address]; + } +} + +#endif } // namespace x64 } // namespace 
backend } // namespace cpu diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h index 4ec930698..d4ded3e83 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.h +++ b/src/xenia/cpu/backend/x64/x64_backend.h @@ -15,6 +15,14 @@ #include "xenia/base/cvar.h" #include "xenia/cpu/backend/backend.h" +#if XE_PLATFORM_WIN32 == 1 +// we use KUSER_SHARED's systemtime field, which is at a fixed address and +// obviously windows specific, to get the start/end time for a function using +// rdtsc would be too slow and skew the results by consuming extra cpu time, so +// we have lower time precision but better overall accuracy +#define XE_X64_PROFILER_AVAILABLE 1 +#endif + DECLARE_int32(x64_extension_mask); namespace xe { @@ -24,6 +32,8 @@ namespace xe { namespace cpu { namespace backend { namespace x64 { +// mapping of guest function addresses to total nanoseconds taken in the func +using GuestProfilerData = std::map; class X64CodeCache; @@ -37,8 +47,10 @@ typedef void (*ResolveFunctionThunk)(); // negatively index the membase reg) struct X64BackendContext { void* ResolveFunction_Ptr; // cached pointer to resolvefunction - unsigned int mxcsr_fpu; // currently, the way we implement rounding mode - // affects both vmx and the fpu + uint64_t* guest_tick_count; + + unsigned int mxcsr_fpu; // currently, the way we implement rounding mode + // affects both vmx and the fpu unsigned int mxcsr_vmx; unsigned int flags; // bit 0 = 0 if mxcsr is fpu, else it is vmx unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted @@ -93,7 +105,9 @@ class X64Backend : public Backend { virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override; void RecordMMIOExceptionForGuestInstruction(void* host_address); - +#if XE_X64_PROFILER_AVAILABLE == 1 + uint64_t* GetProfilerRecordForFunction(uint32_t guest_address); +#endif private: static bool ExceptionCallbackThunk(Exception* ex, void* data); bool ExceptionCallback(Exception* ex); @@ -106,6 +120,10 @@ class X64Backend : public Backend { HostToGuestThunk host_to_guest_thunk_; GuestToHostThunk guest_to_host_thunk_; ResolveFunctionThunk resolve_function_thunk_; + +#if XE_X64_PROFILER_AVAILABLE == 1 + GuestProfilerData profiler_data_; +#endif }; } // namespace x64 diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index dc435c39f..ccd6e969a 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -57,6 +57,12 @@ DEFINE_bool(enable_incorrect_roundingmode_behavior, false, "code. 
The workaround may cause reduced CPU performance but is a " "more accurate emulation", "x64"); + +#if XE_X64_PROFILER_AVAILABLE == 1 +DEFINE_bool(instrument_call_times, false, + "Compute time taken for functions, for profiling guest code", + "x64"); +#endif namespace xe { namespace cpu { namespace backend { @@ -120,28 +126,37 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) */ unsigned int data[4]; Xbyak::util::Cpu::getCpuid(0x80000001, data); - if (data[2] & (1U << 5)) { + unsigned amd_flags = data[2]; + if (amd_flags & (1U << 5)) { if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) { feature_flags_ |= kX64EmitLZCNT; } } + // todo: although not reported by cpuid, zen 1 and zen+ also have fma4 + if (amd_flags & (1U << 16)) { + if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) { + feature_flags_ |= kX64EmitFMA4; + } + } + if (amd_flags & (1U << 21)) { + if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) { + feature_flags_ |= kX64EmitTBM; + } + } if (cpu_.has(Xbyak::util::Cpu::tAMD)) { bool is_zennish = cpu_.displayFamily >= 0x17; - + /* + chrispy: according to agner's tables, all amd architectures that + we support (ones with avx) have the same timings for + jrcxz/loop/loope/loopne as for other jmps + */ + feature_flags_ |= kX64FastJrcx; + feature_flags_ |= kX64FastLoop; if (is_zennish) { // ik that i heard somewhere that this is the case for zen, but i need to // verify. cant find my original source for that. // todo: ask agner? feature_flags_ |= kX64FlagsIndependentVars; - feature_flags_ |= kX64FastJrcx; - - if (cpu_.displayFamily > 0x17) { - feature_flags_ |= kX64FastLoop; - - } else if (cpu_.displayFamily == 0x17 && cpu_.displayModel >= 0x31) { - feature_flags_ |= kX64FastLoop; - } // todo:figure out at model zen+ became zen2, this is just the model - // for my cpu, which is ripper90 } } may_use_membase32_as_zero_reg_ = @@ -157,6 +172,7 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder, std::vector* out_source_map) { SCOPE_profile_cpu_f("cpu"); guest_module_ = dynamic_cast(function->module()); + current_guest_function_ = function->address(); // Reset. debug_info_ = debug_info; debug_info_flags_ = debug_info_flags; @@ -286,10 +302,19 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { * chrispy: removed this, it serves no purpose mov(qword[rsp + StackLayout::GUEST_CTX_HOME], GetContextReg()); */ + mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rcx); mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], rax); // 0 +#if XE_X64_PROFILER_AVAILABLE == 1 + if (cvars::instrument_call_times) { + mov(rdx, 0x7ffe0014); // load pointer to kusershared systemtime + mov(rdx, qword[rdx]); + mov(qword[rsp + StackLayout::GUEST_PROFILER_START], + rdx); // save time for end of function + } +#endif // Safe now to do some tracing. if (debug_info_flags_ & DebugInfoFlags::kDebugInfoTraceFunctions) { // We require 32-bit addresses. 
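// Illustrative host-side sketch of what the emitted prolog/epilogue pair above
// measures (the helper names here are invented; the real work is done in
// generated code with the start sample parked in
// StackLayout::GUEST_PROFILER_START). KUSER_SHARED_DATA::SystemTime lives at
// the fixed Windows address 0x7FFE0014 and advances in 100ns units, which is
// why WriteGuestProfilerData divides the accumulated totals by 10000 to report
// milliseconds.
#include <cstdint>

static inline uint64_t SampleKUserSharedSystemTime() {
  // same 64-bit load the emitter generates:
  //   mov(rdx, 0x7ffe0014); mov(rdx, qword[rdx]);
  // only meaningful on Windows x64.
  return *reinterpret_cast<const volatile uint64_t*>(0x7FFE0014ULL);
}

static inline void ProfileGuestCall(uint64_t* profiler_entry,
                                    void (*guest_fn)()) {
  uint64_t start = SampleKUserSharedSystemTime();            // prolog sample
  guest_fn();                                                // guest function body
  *profiler_entry += SampleKUserSharedSystemTime() - start;  // EmitProfilerEpilogue: add(qword[rbx], rdx)
}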
@@ -363,6 +388,7 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { mov(GetContextReg(), qword[rsp + StackLayout::GUEST_CTX_HOME]); */ code_offsets.epilog = getSize(); + EmitProfilerEpilogue(); add(rsp, (uint32_t)stack_size); ret(); @@ -391,6 +417,27 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { return true; } +// dont use rax, we do this in tail call handling +void X64Emitter::EmitProfilerEpilogue() { +#if XE_X64_PROFILER_AVAILABLE == 1 + if (cvars::instrument_call_times) { + uint64_t* profiler_entry = + backend()->GetProfilerRecordForFunction(current_guest_function_); + mov(ecx, 0x7ffe0014); + mov(rdx, qword[rcx]); + mov(rbx, (uintptr_t)profiler_entry); + sub(rdx, qword[rsp + StackLayout::GUEST_PROFILER_START]); + + // atomic add our time to the profiler entry + // this could be atomic free if we had per thread profile counts, and on a + // threads exit we lock and sum up to the global counts, which would make + // this a few cycles less intrusive, but its good enough for now + // actually... lets just try without atomics lol + // lock(); + add(qword[rbx], rdx); + } +#endif +} void X64Emitter::MarkSourceOffset(const Instr* i) { auto entry = source_map_arena_.Alloc(); @@ -558,7 +605,7 @@ void X64Emitter::Call(const hir::Instr* instr, GuestFunction* function) { if (instr->flags & hir::CALL_TAIL) { // Since we skip the prolog we need to mark the return here. EmitTraceUserCallReturn(); - + EmitProfilerEpilogue(); // Pass the callers return address over. mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]); @@ -602,7 +649,7 @@ void X64Emitter::CallIndirect(const hir::Instr* instr, if (instr->flags & hir::CALL_TAIL) { // Since we skip the prolog we need to mark the return here. EmitTraceUserCallReturn(); - + EmitProfilerEpilogue(); // Pass the callers return address over. 
mov(rcx, qword[rsp + StackLayout::GUEST_RET_ADDR]); @@ -952,7 +999,34 @@ static const vec128_t xmm_consts[] = { /*XMMVSRShlByteshuf*/ v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80), // XMMVSRMask - vec128b(1)}; + vec128b(1), + /* + XMMF16UnpackLCPI2 + */ + + vec128i(0x38000000), + /* + XMMF16UnpackLCPI3 + */ + vec128q(0x7fe000007fe000ULL), + + /* XMMF16PackLCPI0*/ + vec128i(0x8000000), + /*XMMF16PackLCPI2*/ + vec128i(0x47ffe000), + /*XMMF16PackLCPI3*/ + vec128i(0xc7800000), + /*XMMF16PackLCPI4 + */ + vec128i(0xf7fdfff), + /*XMMF16PackLCPI5*/ + vec128i(0x7fff), + /* + XMMF16PackLCPI6 + */ + vec128i(0x8000) + +}; void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { for (auto& vec : xmm_consts) { diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 93ac9915f..48cfa9909 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -159,7 +159,15 @@ enum XmmConst { XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3 XMMXenosF16ExtRangeStart, XMMVSRShlByteshuf, - XMMVSRMask + XMMVSRMask, + XMMF16UnpackLCPI2, // 0x38000000, 1/ 32768 + XMMF16UnpackLCPI3, // 0x0x7fe000007fe000 + XMMF16PackLCPI0, + XMMF16PackLCPI2, + XMMF16PackLCPI3, + XMMF16PackLCPI4, + XMMF16PackLCPI5, + XMMF16PackLCPI6 }; // X64Backend specific Instr->runtime_flags enum : uint32_t { @@ -177,7 +185,7 @@ class XbyakAllocator : public Xbyak::Allocator { enum X64EmitterFeatureFlags { kX64EmitAVX2 = 1 << 0, kX64EmitFMA = 1 << 1, - kX64EmitLZCNT = 1 << 2, + kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount kX64EmitBMI1 = 1 << 3, kX64EmitBMI2 = 1 << 4, kX64EmitF16C = 1 << 5, @@ -201,7 +209,11 @@ enum X64EmitterFeatureFlags { // inc/dec) do not introduce false dependencies on EFLAGS // because the individual flags are treated as different vars by // the processor. (this applies to zen) - kX64EmitPrefetchW = 1 << 16 + kX64EmitPrefetchW = 1 << 16, + kX64EmitXOP = 1 << 17, // chrispy: xop maps really well to many vmx + // instructions, and FX users need the boost + kX64EmitFMA4 = 1 << 18, // todo: also use on zen1? 
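  // TBM = AMD trailing bit manipulation, a Bulldozer-family extension
  // (Piledriver onward) that, like XOP/FMA4, was dropped again with Zen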
+ kX64EmitTBM = 1 << 19 }; class ResolvableGuestCall { public: @@ -337,6 +349,8 @@ class X64Emitter : public Xbyak::CodeGenerator { XexModule* GuestModule() { return guest_module_; } + void EmitProfilerEpilogue(); + protected: void* Emplace(const EmitFunctionInfo& func_info, GuestFunction* function = nullptr); @@ -352,7 +366,7 @@ class X64Emitter : public Xbyak::CodeGenerator { XexModule* guest_module_ = nullptr; Xbyak::util::Cpu cpu_; uint32_t feature_flags_ = 0; - + uint32_t current_guest_function_ = 0; Xbyak::Label* epilog_label_ = nullptr; hir::Instr* current_instr_ = nullptr; diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 846eda234..3d9a5f797 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -19,10 +19,6 @@ #include "xenia/base/cvar.h" #include "xenia/cpu/backend/x64/x64_stack_layout.h" -DEFINE_bool(use_extended_range_half, true, - "Emulate extended range half-precision, may be slower on games " - "that use it heavily", - "CPU"); namespace xe { namespace cpu { namespace backend { @@ -1982,6 +1978,137 @@ struct PERMUTE_V128 }; EMITTER_OPCODE_TABLE(OPCODE_PERMUTE, PERMUTE_I32, PERMUTE_V128); +#define LCPI(name, quad1) const __m128i name = _mm_set1_epi32(quad1) +// xmm0 is precasted to int, but contains float +// chrispy: todo: make available to gpu code +static __m128i xenos_float4_to_float16_x4(__m128i xmm0) { + LCPI(LCPI0_0, 2147483647); + LCPI(LCPI0_1, 1207951360); + LCPI(LCPI0_2, 134217728); + LCPI(LCPI0_3, 3347054592); + LCPI(LCPI0_4, 260038655); + LCPI(LCPI0_5, 32767); + LCPI(LCPI0_6, 4294934528); + + __m128i xmm1 = _mm_and_si128(xmm0, LCPI0_0); + + __m128i xmm2 = LCPI0_1; + + __m128i xmm3 = _mm_add_epi32(xmm0, LCPI0_2); + xmm2 = _mm_cmpgt_epi32(xmm2, xmm1); + xmm3 = _mm_srli_epi32(xmm3, 13); + xmm1 = _mm_add_epi32(xmm1, LCPI0_3); + __m128i xmm4 = _mm_min_epu32(xmm1, LCPI0_4); + xmm1 = _mm_cmpeq_epi32(xmm1, xmm4); + xmm4 = LCPI0_5; + xmm3 = _mm_and_si128(xmm3, xmm4); + xmm1 = _mm_and_si128(xmm1, xmm3); + + xmm1 = _mm_castps_si128(_mm_blendv_ps( + _mm_castsi128_ps(xmm4), _mm_castsi128_ps(xmm1), _mm_castsi128_ps(xmm2))); + xmm0 = _mm_srli_epi32(xmm0, 16); + xmm0 = _mm_and_si128(xmm0, LCPI0_6); + xmm0 = _mm_or_si128(xmm1, xmm0); + xmm0 = _mm_packus_epi32(xmm0, _mm_setzero_si128()); + return xmm0; +} +// returns floats, uncasted +// chrispy: todo, make this available to gpu code? 
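// (summary of the sequence below: zero-extend each 16-bit half, rebias the
//  5-bit exponent into float position with (exp << 23) + 0x38000000, move the
//  10-bit mantissa up via << 13 masked with 0x7fe000, keep the sign bit from
//  the sign-extended input, and force a ±0 result whenever the exponent field
//  is 0; exponent 31 is not treated as Inf/NaN, which matches Xenos
//  extended-range halves. e.g. 0x3C00 -> (15 << 23) + 0x38000000 = 0x3F800000
//  = 1.0f)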
+static __m128i xenos_halves_to_floats(__m128i xmm0) { + LCPI(LCPI3_0, 0x1f); + LCPI(LCPI3_1, 0x80000000); + LCPI(LCPI3_2, 0x38000000); + LCPI(LCPI3_3, 0x7fe000); + + __m128i xmm1, xmm2, xmm3, xmm4; + + xmm1 = _mm_cvtepu16_epi32(xmm0); + + xmm2 = _mm_srli_epi32(xmm1, 10); + + xmm2 = _mm_and_si128(xmm2, LCPI3_0); + + xmm0 = _mm_cvtepi16_epi32(xmm0); + + xmm0 = _mm_and_si128(xmm0, LCPI3_1); + + xmm3 = _mm_setzero_si128(); + + xmm4 = _mm_slli_epi32(xmm2, 23); + + xmm4 = _mm_add_epi32(xmm4, LCPI3_2); + + xmm2 = _mm_cmpeq_epi32(xmm2, xmm3); + + xmm1 = _mm_slli_epi32(xmm1, 13); + + xmm1 = _mm_and_si128(xmm1, LCPI3_3); + + xmm3 = _mm_andnot_si128(xmm2, xmm4); + + xmm1 = _mm_andnot_si128(xmm2, xmm1); + + xmm0 = _mm_or_si128(xmm1, xmm0); + xmm0 = _mm_or_si128(xmm0, xmm3); + return xmm0; +} + +#undef LCPI +template +static void emit_fast_f16_unpack(X64Emitter& e, const Inst& i, + XmmConst initial_shuffle) { + auto src1 = i.src1; + + e.vpshufb(i.dest, src1, e.GetXmmConstPtr(initial_shuffle)); + e.vpmovsxwd(e.xmm1, i.dest); + + e.vpsrld(e.xmm2, e.xmm1, 10); + e.vpmovsxwd(e.xmm0, i.dest); + e.vpand(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMSignMaskPS)); + e.vpand(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask)); + + e.vpslld(e.xmm3, e.xmm2, 23); + + e.vpaddd(e.xmm3, e.xmm3, e.GetXmmConstPtr(XMMF16UnpackLCPI2)); + + e.vpcmpeqd(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMZero)); + + e.vpslld(e.xmm1, e.xmm1, 13); + + e.vpandn(e.xmm1, e.xmm2, e.xmm1); + e.vpandn(e.xmm2, e.xmm2, e.xmm3); + + e.vpand(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMF16UnpackLCPI3)); + e.vpor(e.xmm0, e.xmm1, e.xmm0); + e.vpor(i.dest, e.xmm0, e.xmm2); +} +template +static void emit_fast_f16_pack(X64Emitter& e, const Inst& i, + XmmConst final_shuffle) { + e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMF16PackLCPI0)); + e.vpand(e.xmm2, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); + e.vmovdqa(e.xmm3, e.GetXmmConstPtr(XMMF16PackLCPI2)); + + e.vpcmpgtd(e.xmm3, e.xmm3, e.xmm2); + e.vpsrld(e.xmm1, e.xmm1, 13); + + e.vpaddd(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMF16PackLCPI3)); + e.vpminud(e.xmm0, e.xmm2, e.GetXmmConstPtr(XMMF16PackLCPI4)); + + e.vpcmpeqd(e.xmm2, e.xmm2, e.xmm0); + e.vmovdqa(e.xmm0, e.GetXmmConstPtr(XMMF16PackLCPI5)); + e.vpand(e.xmm1, e.xmm1, e.xmm0); + e.vpand(e.xmm1, e.xmm2, e.xmm1); + e.vpxor(e.xmm2, e.xmm2, e.xmm2); + + e.vblendvps(e.xmm1, e.xmm0, e.xmm1, e.xmm3); + + e.vpsrld(e.xmm0, i.src1, 16); + e.vpand(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMF16PackLCPI6)); + e.vorps(e.xmm0, e.xmm1, e.xmm0); + e.vpackusdw(i.dest, e.xmm0, e.xmm2); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(final_shuffle)); +} // ============================================================================ // OPCODE_SWIZZLE // ============================================================================ @@ -2081,14 +2208,9 @@ struct PACK : Sequence> { alignas(16) uint16_t b[8]; _mm_store_ps(a, src1); std::memset(b, 0, sizeof(b)); - if (!cvars::use_extended_range_half) { - for (int i = 0; i < 2; i++) { - b[7 - i] = half_float::detail::float2half(a[i]); - } - } else { - for (int i = 0; i < 2; i++) { - b[7 - i] = float_to_xenos_half(a[i]); - } + + for (int i = 0; i < 2; i++) { + b[7 - i] = float_to_xenos_half(a[i]); } return _mm_load_si128(reinterpret_cast<__m128i*>(b)); @@ -2098,70 +2220,26 @@ struct PACK : Sequence> { // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx // dest = [(src1.x | src1.y), 0, 0, 0] - if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) { - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - 
e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // 0|0|0|0|W|Z|Y|X - e.vcvtps2ph(i.dest, src, 0b00000011); - // Shuffle to X|Y|0|0|0|0|0|0 - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_2)); + if (i.src1.is_constant) { + e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant())); } else { - if (i.src1.is_constant) { - e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant())); - } else { - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - } - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); - e.vmovaps(i.dest, e.xmm0); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); } + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); + e.vmovaps(i.dest, e.xmm0); } - static __m128i EmulateFLOAT16_4(void*, __m128 src1) { - alignas(16) float a[4]; - alignas(16) uint16_t b[8]; - _mm_store_ps(a, src1); - std::memset(b, 0, sizeof(b)); - if (!cvars::use_extended_range_half) { - for (int i = 0; i < 4; i++) { - b[7 - (i ^ 2)] = - half_float::detail::float2half(a[i]); - } - } else { - for (int i = 0; i < 4; i++) { - b[7 - (i ^ 2)] = float_to_xenos_half(a[i]); - } - } - return _mm_load_si128(reinterpret_cast<__m128i*>(b)); - } static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->IsConstantZero()); - // dest = [(src1.z | src1.w), (src1.x | src1.y), 0, 0] - - if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) { - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // 0|0|0|0|W|Z|Y|X - e.vcvtps2ph(i.dest, src, 0b00000011); - // Shuffle to Z|W|X|Y|0|0|0|0 - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_4)); + if (!i.src1.is_constant) { + emit_fast_f16_pack(e, i, XMMPackFLOAT16_4); } else { - if (i.src1.is_constant) { - e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant())); - } else { - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); + vec128_t result = vec128b(0); + for (unsigned idx = 0; idx < 4; ++idx) { + result.u16[(7 - (idx ^ 2))] = + float_to_xenos_half(i.src1.constant().f32[idx]); } - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); - e.vmovaps(i.dest, e.xmm0); + + e.LoadConstantXmm(i.dest, result); } } static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { @@ -2508,15 +2586,10 @@ struct UNPACK : Sequence> { alignas(16) float b[4]; _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - if (!cvars::use_extended_range_half) { - for (int i = 0; i < 2; i++) { - b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]); - } - } else { - for (int i = 0; i < 2; i++) { - b[i] = xenos_half_to_float(a[VEC128_W(6 + i)]); - } + for (int i = 0; i < 2; i++) { + b[i] = xenos_half_to_float(a[VEC128_W(6 + i)]); } + // Constants, or something b[2] = 0.f; b[3] = 1.f; @@ -2536,74 +2609,28 @@ struct UNPACK : Sequence> { // Also zero out the high end. // TODO(benvanik): special case constant unpacks that just get 0/1/etc. 
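      // note: FLOAT16_2 now always takes the EmulateFLOAT16_2 /
      // xenos_half_to_float helper path; only the FLOAT16_4 form further down
      // uses the inline SSE sequence (emit_fast_f16_unpack). the old F16C
      // vcvtph2ps shortcut converts IEEE halves and presumably cannot
      // reproduce the Xenos extended-range behavior, hence its removal below.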
- if (e.IsFeatureEnabled(kX64EmitF16C) && - !cvars::use_extended_range_half) { // todo: can use cvtph and bit logic - // to implement - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; - } - // sx = src.iw >> 16; - // sy = src.iw & 0xFFFF; - // dest = { XMConvertHalfToFloat(sx), - // XMConvertHalfToFloat(sy), - // 0.0, - // 1.0 }; - // Shuffle to 0|0|0|0|0|0|Y|X - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_2)); - e.vcvtph2ps(i.dest, i.dest); - e.vpshufd(i.dest, i.dest, 0b10100100); - e.vpor(i.dest, e.GetXmmConstPtr(XMM0001)); + if (i.src1.is_constant) { + e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant())); } else { - if (i.src1.is_constant) { - e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant())); - } else { - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - } - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); - e.vmovaps(i.dest, e.xmm0); + e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); } + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_2)); + e.vmovaps(i.dest, e.xmm0); } - static __m128 EmulateFLOAT16_4(void*, __m128i src1) { - alignas(16) uint16_t a[8]; - alignas(16) float b[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - if (!cvars::use_extended_range_half) { - for (int i = 0; i < 4; i++) { - b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]); - } - } else { - for (int i = 0; i < 4; i++) { - b[i] = xenos_half_to_float(a[VEC128_W(4 + i)]); - } - } - return _mm_load_ps(b); - } static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { - // src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0] - if (e.IsFeatureEnabled(kX64EmitF16C) && !cvars::use_extended_range_half) { - Xmm src; - if (i.src1.is_constant) { - src = i.dest; - e.LoadConstantXmm(src, i.src1.constant()); - } else { - src = i.src1; + if (i.src1.is_constant) { + vec128_t result{}; + + for (int idx = 0; idx < 4; ++idx) { + result.f32[idx] = + xenos_half_to_float(i.src1.constant().u16[VEC128_W(4 + idx)]); } - // Shuffle to 0|0|0|0|W|Z|Y|X - e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackFLOAT16_4)); - e.vcvtph2ps(i.dest, i.dest); + + e.LoadConstantXmm(i.dest, result); + } else { - if (i.src1.is_constant) { - e.lea(e.GetNativeParam(0), e.StashConstantXmm(0, i.src1.constant())); - } else { - e.lea(e.GetNativeParam(0), e.StashXmm(0, i.src1)); - } - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_4)); - e.vmovaps(i.dest, e.xmm0); + emit_fast_f16_unpack(e, i, XMMUnpackFLOAT16_4); } } static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 3fe52857b..10498f92b 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -50,6 +50,10 @@ DEFINE_bool(no_round_to_single, false, "Not for users, breaks games. 
Skip rounding double values to " "single precision and back", "CPU"); +DEFINE_bool( + inline_loadclock, false, + "Directly read cached guest clock without calling the LoadClock method (it gets repeatedly updated by calls from other threads)", + "CPU"); namespace xe { namespace cpu { namespace backend { @@ -475,33 +479,39 @@ EMITTER_OPCODE_TABLE(OPCODE_ROUND, ROUND_F32, ROUND_F64, ROUND_V128); // ============================================================================ struct LOAD_CLOCK : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // When scaling is disabled and the raw clock source is selected, the code - // in the Clock class is actually just forwarding tick counts after one - // simple multiply and division. In that case we rather bake the scaling in - // here to cut extra function calls with CPU cache misses and stack frame - // overhead. - if (cvars::clock_no_scaling && cvars::clock_source_raw) { - auto ratio = Clock::guest_tick_ratio(); - // The 360 CPU is an in-order CPU, AMD64 usually isn't. Without - // mfence/lfence magic the rdtsc instruction can be executed sooner or - // later in the cache window. Since it's resolution however is much higher - // than the 360's mftb instruction this can safely be ignored. - - // Read time stamp in edx (high part) and eax (low part). - e.rdtsc(); - // Make it a 64 bit number in rax. - e.shl(e.rdx, 32); - e.or_(e.rax, e.rdx); - // Apply tick frequency scaling. - e.mov(e.rcx, ratio.first); - e.mul(e.rcx); - // We actually now have a 128 bit number in rdx:rax. - e.mov(e.rcx, ratio.second); - e.div(e.rcx); - e.mov(i.dest, e.rax); + if (cvars::inline_loadclock) { + e.mov(e.rcx, + e.GetBackendCtxPtr(offsetof(X64BackendContext, guest_tick_count))); + e.mov(i.dest, e.qword[e.rcx]); } else { - e.CallNative(LoadClock); - e.mov(i.dest, e.rax); + // When scaling is disabled and the raw clock source is selected, the code + // in the Clock class is actually just forwarding tick counts after one + // simple multiply and division. In that case we rather bake the scaling + // in here to cut extra function calls with CPU cache misses and stack + // frame overhead. + if (cvars::clock_no_scaling && cvars::clock_source_raw) { + auto ratio = Clock::guest_tick_ratio(); + // The 360 CPU is an in-order CPU, AMD64 usually isn't. Without + // mfence/lfence magic the rdtsc instruction can be executed sooner or + // later in the cache window. Since it's resolution however is much + // higher than the 360's mftb instruction this can safely be ignored. + + // Read time stamp in edx (high part) and eax (low part). + e.rdtsc(); + // Make it a 64 bit number in rax. + e.shl(e.rdx, 32); + e.or_(e.rax, e.rdx); + // Apply tick frequency scaling. + e.mov(e.rcx, ratio.first); + e.mul(e.rcx); + // We actually now have a 128 bit number in rdx:rax. 
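// Host-side arithmetic equivalent of the rdtsc scaling sequence in this
// block, as a sketch (scale_host_ticks is a hypothetical name): the mul/div
// pair performs a 64x64 -> 128-bit multiply followed by a 128/64-bit divide,
// i.e. host_ticks * ratio.first / ratio.second without intermediate overflow.
#include <cstdint>
#include <utility>
static uint64_t scale_host_ticks(uint64_t host_ticks,
                                 std::pair<uint64_t, uint64_t> ratio) {
  // GCC/Clang spelling; MSVC would need the _umul128/_udiv128 intrinsics.
  unsigned __int128 wide = (unsigned __int128)host_ticks * ratio.first;
  return (uint64_t)(wide / ratio.second);
}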
+ e.mov(e.rcx, ratio.second); + e.div(e.rcx); + e.mov(i.dest, e.rax); + } else { + e.CallNative(LoadClock); + e.mov(i.dest, e.rax); + } } } static uint64_t LoadClock(void* raw_context) { @@ -539,10 +549,12 @@ struct MAX_F64 : Sequence> { struct MAX_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { e.ChangeMxcsrMode(MXCSRMode::Vmx); - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vmaxps(dest, src1, src2); - }); + + auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + e.vmaxps(e.xmm2, src1, src2); + e.vmaxps(e.xmm3, src2, src1); + e.vorps(i.dest, e.xmm2, e.xmm3); } }; EMITTER_OPCODE_TABLE(OPCODE_MAX, MAX_F32, MAX_F64, MAX_V128); @@ -597,10 +609,11 @@ struct MIN_F64 : Sequence> { struct MIN_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { e.ChangeMxcsrMode(MXCSRMode::Vmx); - EmitCommutativeBinaryXmmOp(e, i, - [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { - e.vminps(dest, src1, src2); - }); + auto src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + auto src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + e.vminps(e.xmm2, src1, src2); + e.vminps(e.xmm3, src2, src1); + e.vorps(i.dest, e.xmm2, e.xmm3); } }; EMITTER_OPCODE_TABLE(OPCODE_MIN, MIN_I8, MIN_I16, MIN_I32, MIN_I64, MIN_F32, @@ -768,6 +781,7 @@ struct SELECT_V128_V128 } else if (mayblend == PermittedBlend::Ps) { e.vblendvps(i.dest, src2, src3, src1); } else { + //ideally we would have an xop path here... // src1 ? src2 : src3; e.vpandn(e.xmm3, src1, src2); e.vpand(i.dest, src1, src3); @@ -1932,6 +1946,53 @@ struct MUL_ADD_V128 }; EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128); +struct NEGATED_MUL_ADD_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); + + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); + if (e.IsFeatureEnabled(kX64EmitFMA)) { + // todo: this is garbage + e.vmovapd(e.xmm3, src1); + e.vfmadd213sd(e.xmm3, src2, src3); + e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD)); + } else { + // todo: might need to use x87 in this case... + e.vmulsd(e.xmm3, src1, src2); + e.vaddsd(i.dest, e.xmm3, src3); + e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD)); + } + } +}; +struct NEGATED_MUL_ADD_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Vmx); + + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); + if (e.IsFeatureEnabled(kX64EmitFMA)) { + // todo: this is garbage + e.vmovaps(e.xmm3, src1); + e.vfmadd213ps(e.xmm3, src2, src3); + e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS)); + } else { + // todo: might need to use x87 in this case... 
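// Scalar illustration of the MAX_V128/MIN_V128 change above (max_like_vmx is
// illustrative, not part of the patch): x86 vmaxps returns its second operand
// whenever either input is NaN, so max(a, b) and max(b, a) can disagree.
// OR-ing the two results keeps the ordinary maximum when both inputs are
// numbers and yields a NaN pattern whenever either input was NaN, presumably
// to better match the guest's vmaxfp/vminfp NaN propagation than a single
// vmaxps/vminps would.
#include <cstdint>
#include <cstring>
static float max_like_vmx(float a, float b) {
  float ab = (a > b) ? a : b;  // stands in for vmaxps(a, b): b wins on NaN
  float ba = (b > a) ? b : a;  // stands in for vmaxps(b, a): a wins on NaN
  uint32_t ua, ub;
  std::memcpy(&ua, &ab, sizeof(ua));
  std::memcpy(&ub, &ba, sizeof(ub));
  uint32_t r = ua | ub;  // any NaN bits survive the OR
  float out;
  std::memcpy(&out, &r, sizeof(out));
  return out;
}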
+ e.vmulps(e.xmm3, src1, src2); + e.vaddps(i.dest, e.xmm3, src3); + e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_ADD, NEGATED_MUL_ADD_F64, + NEGATED_MUL_ADD_V128); + // ============================================================================ // OPCODE_MUL_SUB // ============================================================================ @@ -1944,12 +2005,7 @@ EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128); // - 132 -> $1 = $1 * $3 - $2 // - 213 -> $1 = $2 * $1 - $3 // - 231 -> $1 = $2 * $3 - $1 -struct MUL_SUB_F32 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_impossible_sequence(MUL_SUB_F32); - } -}; + struct MUL_SUB_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { @@ -1991,7 +2047,54 @@ struct MUL_SUB_V128 } } }; -EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F32, MUL_SUB_F64, MUL_SUB_V128); +EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F64, MUL_SUB_V128); + +struct NEGATED_MUL_SUB_F64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Fpu); + + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); + if (e.IsFeatureEnabled(kX64EmitFMA)) { + // todo: this is garbage + e.vmovapd(e.xmm3, src1); + e.vfmsub213sd(e.xmm3, src2, src3); + e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD)); + } else { + // todo: might need to use x87 in this case... + e.vmulsd(e.xmm3, src1, src2); + e.vsubsd(i.dest, e.xmm3, src3); + e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD)); + } + } +}; +struct NEGATED_MUL_SUB_V128 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.ChangeMxcsrMode(MXCSRMode::Vmx); + + Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); + if (e.IsFeatureEnabled(kX64EmitFMA)) { + // todo: this is garbage + e.vmovaps(e.xmm3, src1); + e.vfmsub213ps(e.xmm3, src2, src3); + e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS)); + } else { + // todo: might need to use x87 in this case... + e.vmulps(e.xmm3, src1, src2); + e.vsubps(i.dest, e.xmm3, src3); + e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS)); + } + } +}; +EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_SUB, NEGATED_MUL_SUB_F64, + NEGATED_MUL_SUB_V128); // ============================================================================ // OPCODE_NEG @@ -2264,7 +2367,7 @@ struct DOT_PRODUCT_3_V128 e.ChangeMxcsrMode(MXCSRMode::Vmx); // todo: add fast_dot_product path that just checks for infinity instead of // using mxcsr - auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64]; + auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH]; // this is going to hurt a bit... 
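// What the new NEGATED_MUL_ADD / NEGATED_MUL_SUB opcodes compute, as a scalar
// sketch (the *_ref helpers are illustrative only): folding the negation into
// the opcode, instead of a separate OPCODE_NEG on the result, lets the x64
// backend emit one FMA plus a sign flip.
#include <cmath>
static double negated_mul_add_ref(double a, double b, double c) {
  return -std::fma(a, b, c);   // -((a * b) + c)
}
static double negated_mul_sub_ref(double a, double b, double c) {
  return -std::fma(a, b, -c);  // -((a * b) - c)
}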
/* @@ -2380,7 +2483,7 @@ struct DOT_PRODUCT_4_V128 e.ChangeMxcsrMode(MXCSRMode::Vmx); // todo: add fast_dot_product path that just checks for infinity instead of // using mxcsr - auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH64]; + auto mxcsr_storage = e.dword[e.rsp + StackLayout::GUEST_SCRATCH]; bool is_lensqr = i.instr->src1.value == i.instr->src2.value; @@ -3162,9 +3265,9 @@ struct SET_ROUNDING_MODE_I32 // backends dont have to worry about it if (i.src1.is_constant) { e.mov(e.eax, mxcsr_table[i.src1.constant()]); - e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH64], e.eax); + e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH], e.eax); e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.eax); - e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH64]); + e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH]); } else { e.mov(e.ecx, i.src1); diff --git a/src/xenia/cpu/backend/x64/x64_stack_layout.h b/src/xenia/cpu/backend/x64/x64_stack_layout.h index 5bd50a803..aee51e63a 100644 --- a/src/xenia/cpu/backend/x64/x64_stack_layout.h +++ b/src/xenia/cpu/backend/x64/x64_stack_layout.h @@ -123,7 +123,10 @@ class StackLayout { */ static const size_t GUEST_STACK_SIZE = 104; //was GUEST_CTX_HOME, can't remove because that'd throw stack alignment off. instead, can be used as a temporary in sequences - static const size_t GUEST_SCRATCH64 = 80; + static const size_t GUEST_SCRATCH = 0; + + //when profiling is on, this stores the nanosecond time at the start of the function + static const size_t GUEST_PROFILER_START = 80; static const size_t GUEST_RET_ADDR = 88; static const size_t GUEST_CALL_RET_ADDR = 96; }; diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index 760b7fc2c..df5b72375 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -1636,15 +1636,7 @@ Value* HIRBuilder::Div(Value* value1, Value* value2, Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) { ASSERT_TYPES_EQUAL(value1, value2); ASSERT_TYPES_EQUAL(value1, value3); - #if 0 - bool c1 = value1->IsConstant(); - bool c2 = value2->IsConstant(); - if (c1 && c2) { - Value* dest = CloneValue(value1); - dest->Mul(value2); - return Add(dest, value3); - } - #endif + Instr* i = AppendInstr(OPCODE_MUL_ADD_info, 0, AllocValue(value1->type)); i->set_src1(value1); i->set_src2(value2); @@ -1655,15 +1647,7 @@ Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) { Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) { ASSERT_TYPES_EQUAL(value1, value2); ASSERT_TYPES_EQUAL(value1, value3); - #if 0 - bool c1 = value1->IsConstant(); - bool c2 = value2->IsConstant(); - if (c1 && c2) { - Value* dest = CloneValue(value1); - dest->Mul(value2); - return Sub(dest, value3); - } - #endif + Instr* i = AppendInstr(OPCODE_MUL_SUB_info, 0, AllocValue(value1->type)); i->set_src1(value1); i->set_src2(value2); @@ -1671,6 +1655,30 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) { return i->dest; } +Value* HIRBuilder::NegatedMulAdd(Value* value1, Value* value2, Value* value3) { + ASSERT_TYPES_EQUAL(value1, value2); + ASSERT_TYPES_EQUAL(value1, value3); + + Instr* i = + AppendInstr(OPCODE_NEGATED_MUL_ADD_info, 0, AllocValue(value1->type)); + i->set_src1(value1); + i->set_src2(value2); + i->set_src3(value3); + return i->dest; +} + +Value* HIRBuilder::NegatedMulSub(Value* value1, Value* value2, Value* value3) { + ASSERT_TYPES_EQUAL(value1, value2); + ASSERT_TYPES_EQUAL(value1, value3); + + Instr* i = 
+ AppendInstr(OPCODE_NEGATED_MUL_SUB_info, 0, AllocValue(value1->type)); + i->set_src1(value1); + i->set_src2(value2); + i->set_src3(value3); + return i->dest; +} + Value* HIRBuilder::Neg(Value* value) { Instr* i = AppendInstr(OPCODE_NEG_info, 0, AllocValue(value->type)); i->set_src1(value); diff --git a/src/xenia/cpu/hir/hir_builder.h b/src/xenia/cpu/hir/hir_builder.h index 05cb14d34..62164e52d 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -214,6 +214,10 @@ class HIRBuilder { Value* Div(Value* value1, Value* value2, uint32_t arithmetic_flags = 0); Value* MulAdd(Value* value1, Value* value2, Value* value3); // (1 * 2) + 3 Value* MulSub(Value* value1, Value* value2, Value* value3); // (1 * 2) - 3 + Value* NegatedMulAdd(Value* value1, Value* value2, + Value* value3); // -((1 * 2) + 3) + Value* NegatedMulSub(Value* value1, Value* value2, + Value* value3); // -((1 * 2) - 3) Value* Neg(Value* value); Value* Abs(Value* value); Value* Sqrt(Value* value); @@ -265,6 +269,7 @@ class HIRBuilder { Value* AtomicAdd(Value* address, Value* value); Value* AtomicSub(Value* address, Value* value); void SetNJM(Value* value); + protected: void DumpValue(StringBuffer* str, Value* value); void DumpOp(StringBuffer* str, OpcodeSignatureType sig_type, Instr::Op* op); diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 2f7676861..a51fda6d5 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -208,6 +208,12 @@ enum Opcode { OPCODE_STORE_OFFSET, OPCODE_LOAD, OPCODE_STORE, + // chrispy: todo: implement, our current codegen for the unaligned loads is + // very bad + OPCODE_LVLX, + OPCODE_LVRX, + OPCODE_STVLX, + OPCODE_STVRX, OPCODE_MEMSET, OPCODE_CACHE_CONTROL, OPCODE_MEMORY_BARRIER, @@ -244,7 +250,9 @@ enum Opcode { OPCODE_MUL_HI, // TODO(benvanik): remove this and add INT128 type. OPCODE_DIV, OPCODE_MUL_ADD, + OPCODE_NEGATED_MUL_ADD, OPCODE_MUL_SUB, + OPCODE_NEGATED_MUL_SUB, OPCODE_NEG, OPCODE_ABS, OPCODE_SQRT, @@ -284,7 +292,8 @@ enum Opcode { OPCODE_TO_SINGLE, // i could not find a decent name to assign to this opcode, // as we already have OPCODE_ROUND. round double to float ( // ppc "single" fpu instruction result rounding behavior ) - OPCODE_SET_NJM, + OPCODE_SET_NJM, + __OPCODE_MAX_VALUE, // Keep at end. 
}; diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index b68e9158b..46a328903 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -464,6 +464,19 @@ DEFINE_OPCODE( OPCODE_SIG_V_V_V_V, OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING) +DEFINE_OPCODE( + OPCODE_NEGATED_MUL_ADD, + "negated_mul_add", + OPCODE_SIG_V_V_V_V, + OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING) + +DEFINE_OPCODE( + OPCODE_NEGATED_MUL_SUB, + "negated_mul_sub", + OPCODE_SIG_V_V_V_V, + OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING) + + DEFINE_OPCODE( OPCODE_NEG, "neg", @@ -692,4 +705,5 @@ DEFINE_OPCODE( "set_njm", OPCODE_SIG_X_V, 0 -) \ No newline at end of file +) + diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index 4cb7ee17d..3a1cd442e 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -613,10 +613,12 @@ class Value { // returns true if every single use is as an operand to a single instruction // (add var2, var1, var1) bool AllUsesByOneInsn() const; - //the maybe is here because this includes vec128, which is untyped data that can be treated as float or int depending on the context + // the maybe is here because this includes vec128, which is untyped data that + // can be treated as float or int depending on the context bool MaybeFloaty() const { return type == FLOAT32_TYPE || type == FLOAT64_TYPE || type == VEC128_TYPE; } + private: static bool CompareInt8(Opcode opcode, Value* a, Value* b); static bool CompareInt16(Opcode opcode, Value* a, Value* b); diff --git a/src/xenia/cpu/ppc/ppc_context.h b/src/xenia/cpu/ppc/ppc_context.h index 09205850b..1528d3378 100644 --- a/src/xenia/cpu/ppc/ppc_context.h +++ b/src/xenia/cpu/ppc/ppc_context.h @@ -14,8 +14,8 @@ #include #include -#include "xenia/base/vec128.h" #include "xenia/base/mutex.h" +#include "xenia/base/vec128.h" namespace xe { namespace cpu { class Processor; @@ -390,9 +390,11 @@ typedef struct alignas(64) PPCContext_s { // These are split to make it easier to do DCE on unused stores. 
uint64_t cr() const; void set_cr(uint64_t value); - + // todo: remove, saturation should be represented by a vector uint8_t vscr_sat; + uint32_t vrsave; + // uint32_t get_fprf() { // return fpscr.value & 0x000F8000; // } diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 6274dfb71..c95f068c0 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -1197,7 +1197,7 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb, Value* b = f.VectorDenormFlush(f.LoadVR(vb)); Value* c = f.VectorDenormFlush(f.LoadVR(vc)); - Value* v = f.Neg(f.MulSub(a, c, b)); + Value* v = f.NegatedMulSub(a, c, b); f.StoreVR(vd, v); return 0; } diff --git a/src/xenia/cpu/ppc/ppc_emit_control.cc b/src/xenia/cpu/ppc/ppc_emit_control.cc index 0fe8e2d54..097b91bfa 100644 --- a/src/xenia/cpu/ppc/ppc_emit_control.cc +++ b/src/xenia/cpu/ppc/ppc_emit_control.cc @@ -16,6 +16,12 @@ #include "xenia/cpu/ppc/ppc_hir_builder.h" #include +// chrispy: added this, we can have simpler control flow and do dce on the +// inputs +DEFINE_bool(ignore_trap_instructions, true, + "Generate no code for powerpc trap instructions, can result in " + "better performance in games that aggressively check with trap.", + "CPU"); namespace xe { namespace cpu { @@ -449,6 +455,9 @@ constexpr uint32_t TRAP_SLT = 1 << 4, TRAP_SGT = 1 << 3, TRAP_EQ = 1 << 2, int InstrEmit_trap(PPCHIRBuilder& f, const InstrData& i, Value* va, Value* vb, uint32_t TO) { + if (cvars::ignore_trap_instructions) { + return 0; + } // if (a < b) & TO[0] then TRAP // if (a > b) & TO[1] then TRAP // if (a = b) & TO[2] then TRAP @@ -521,6 +530,9 @@ int InstrEmit_trap(PPCHIRBuilder& f, const InstrData& i, Value* va, Value* vb, } int InstrEmit_td(PPCHIRBuilder& f, const InstrData& i) { + if (cvars::ignore_trap_instructions) { + return 0; + } // a <- (RA) // b <- (RB) // if (a < b) & TO[0] then TRAP @@ -534,6 +546,9 @@ int InstrEmit_td(PPCHIRBuilder& f, const InstrData& i) { } int InstrEmit_tdi(PPCHIRBuilder& f, const InstrData& i) { + if (cvars::ignore_trap_instructions) { + return 0; + } // a <- (RA) // if (a < EXTS(SI)) & TO[0] then TRAP // if (a > EXTS(SI)) & TO[1] then TRAP @@ -546,6 +561,9 @@ int InstrEmit_tdi(PPCHIRBuilder& f, const InstrData& i) { } int InstrEmit_tw(PPCHIRBuilder& f, const InstrData& i) { + if (cvars::ignore_trap_instructions) { + return 0; + } // a <- EXTS((RA)[32:63]) // b <- EXTS((RB)[32:63]) // if (a < b) & TO[0] then TRAP @@ -561,6 +579,9 @@ int InstrEmit_tw(PPCHIRBuilder& f, const InstrData& i) { } int InstrEmit_twi(PPCHIRBuilder& f, const InstrData& i) { + if (cvars::ignore_trap_instructions) { + return 0; + } // a <- EXTS((RA)[32:63]) // if (a < EXTS(SI)) & TO[0] then TRAP // if (a > EXTS(SI)) & TO[1] then TRAP @@ -645,7 +666,9 @@ int InstrEmit_mfspr(PPCHIRBuilder& f, const InstrData& i) { break; case 256: // VRSAVE - v = f.LoadZeroInt64(); + + v = f.ZeroExtend(f.LoadContext(offsetof(PPCContext, vrsave), INT32_TYPE), + INT64_TYPE); break; case 268: // TB @@ -749,6 +772,8 @@ int InstrEmit_mtspr(PPCHIRBuilder& f, const InstrData& i) { f.StoreCTR(rt); break; case 256: + + f.StoreContext(offsetof(PPCContext, vrsave), f.Truncate(rt, INT32_TYPE)); // VRSAVE break; default: @@ -768,6 +793,7 @@ int InstrEmit_mfmsr(PPCHIRBuilder& f, const InstrData& i) { // bit 48 = EE; interrupt enabled // bit 62 = RI; recoverable interrupt // return 8000h if unlocked (interrupts enabled), else 0 +#if 0 f.MemoryBarrier(); if (cvars::disable_global_lock || true) { 
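// Net effect of the mfspr/mtspr SPR 256 change above, in plain C terms (a
// sketch; the helper names are illustrative and the namespace qualification
// is assumed): VRSAVE now round-trips through a real per-context field
// instead of always reading back as zero.
#include <cstdint>
#include "xenia/cpu/ppc/ppc_context.h"
static uint64_t read_vrsave(const xe::cpu::ppc::PPCContext* ctx) {
  return uint64_t(ctx->vrsave);  // zero-extended into the destination GPR
}
static void write_vrsave(xe::cpu::ppc::PPCContext* ctx, uint64_t rt) {
  ctx->vrsave = uint32_t(rt);    // truncated to the low 32 bits
}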
f.StoreGPR(i.X.RT, f.LoadConstantUint64(0)); @@ -777,63 +803,23 @@ int InstrEmit_mfmsr(PPCHIRBuilder& f, const InstrData& i) { f.StoreGPR(i.X.RT, f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE)); } +#else + f.StoreGPR(i.X.RT, f.LoadConstantUint64(0)); +#endif return 0; } int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) { - if (i.X.RA & 0x01) { - // L = 1 - // iff storing from r13 - f.MemoryBarrier(); - f.StoreContext( - offsetof(PPCContext, scratch), - f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE)); -#if 0 - if (i.X.RT == 13) { - // iff storing from r13 we are taking a lock (disable interrupts). - if (!cvars::disable_global_lock) { - f.CallExtern(f.builtins()->enter_global_lock); - } - } else { - // Otherwise we are restoring interrupts (probably). - if (!cvars::disable_global_lock) { - f.CallExtern(f.builtins()->leave_global_lock); - } - } -#endif - return 0; - } else { - // L = 0 - XEINSTRNOTIMPLEMENTED(); - return 1; - } + f.StoreContext( + offsetof(PPCContext, scratch), + f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE)); + return 0; } int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) { - if (i.X.RA & 0x01) { - // L = 1 - f.MemoryBarrier(); - f.StoreContext(offsetof(PPCContext, scratch), - f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE)); -#if 0 - if (i.X.RT == 13) { - // iff storing from r13 we are taking a lock (disable interrupts). - if (!cvars::disable_global_lock) { - f.CallExtern(f.builtins()->enter_global_lock); - } - } else { - // Otherwise we are restoring interrupts (probably). - if (!cvars::disable_global_lock) { - f.CallExtern(f.builtins()->leave_global_lock); - } - } -#endif - return 0; - } else { - // L = 0 - XEINSTRNOTIMPLEMENTED(); - return 1; - } + f.StoreContext(offsetof(PPCContext, scratch), + f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE)); + return 0; } void RegisterEmitCategoryControl() { diff --git a/src/xenia/cpu/ppc/ppc_emit_fpu.cc b/src/xenia/cpu/ppc/ppc_emit_fpu.cc index 5723c6bfd..71d323f2e 100644 --- a/src/xenia/cpu/ppc/ppc_emit_fpu.cc +++ b/src/xenia/cpu/ppc/ppc_emit_fpu.cc @@ -195,8 +195,8 @@ int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] + frB) - Value* v = f.Neg( - f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); + Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), + f.LoadFPR(i.A.FRB)); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; @@ -204,8 +204,8 @@ int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] + frB) - Value* v = f.Neg( - f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); + Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), + f.LoadFPR(i.A.FRB)); v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); @@ -214,8 +214,8 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] - frB) - Value* v = f.Neg( - f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); + Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), + f.LoadFPR(i.A.FRB)); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; @@ -223,8 +223,8 @@ int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] - frB) 
- Value* v = f.Neg( - f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); + Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), + f.LoadFPR(i.A.FRB)); v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); diff --git a/src/xenia/cpu/ppc/ppc_emit_memory.cc b/src/xenia/cpu/ppc/ppc_emit_memory.cc index 7e7636adb..9bb7d5593 100644 --- a/src/xenia/cpu/ppc/ppc_emit_memory.cc +++ b/src/xenia/cpu/ppc/ppc_emit_memory.cc @@ -834,6 +834,7 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) { // Issue memory barrier for when we go out of lock and want others to see our // updates. + f.MemoryBarrier(); return 0; diff --git a/src/xenia/cpu/thread_state.cc b/src/xenia/cpu/thread_state.cc index 1383646e1..fe9467dd8 100644 --- a/src/xenia/cpu/thread_state.cc +++ b/src/xenia/cpu/thread_state.cc @@ -77,7 +77,8 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id, // Allocate with 64b alignment. - context_ = reinterpret_cast(AllocateContext()); // memory::AlignedAlloc(64); + context_ = reinterpret_cast( + AllocateContext()); processor->backend()->InitializeBackendContext(context_); assert_true(((uint64_t)context_ & 0x3F) == 0); std::memset(context_, 0, sizeof(ppc::PPCContext)); @@ -93,6 +94,7 @@ ThreadState::ThreadState(Processor* processor, uint32_t thread_id, // Set initial registers. context_->r[1] = stack_base; context_->r[13] = pcr_address; + // fixme: VSCR must be set here! } ThreadState::~ThreadState() { @@ -105,7 +107,7 @@ ThreadState::~ThreadState() { if (context_) { FreeContext(reinterpret_cast(context_)); } - // memory::AlignedFree(context_); + // memory::AlignedFree(context_); } void ThreadState::Bind(ThreadState* thread_state) { diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index 4205016cd..de5c6fb5f 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -29,10 +29,20 @@ #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/user_module.h" +#if defined(NDEBUG) +static constexpr bool should_log_unknown_reg_writes() { return false; } + +#else + DEFINE_bool(log_unknown_register_writes, false, "Log writes to unknown registers from " "CommandProcessor::WriteRegister. 
Has significant performance hit.", "GPU"); +static bool should_log_unknown_reg_writes() { + return cvars::log_unknown_register_writes; +} +#endif + namespace xe { namespace gpu { @@ -465,7 +475,7 @@ void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index, } } void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { - if (XE_UNLIKELY(cvars::log_unknown_register_writes)) { + if (should_log_unknown_reg_writes()) { // chrispy: rearrange check order, place set after checks if (XE_UNLIKELY(!register_file_->IsValidRegister(index))) { XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value); @@ -498,15 +508,40 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { (index == XE_GPU_REG_COHER_STATUS_HOST) | ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <= (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)); - //chrispy: reordered for msvc branch probability (assumes if is taken and else is not) + // chrispy: reordered for msvc branch probability (assumes if is taken and + // else is not) if (XE_LIKELY(expr == 0)) { - } else { HandleSpecialRegisterWrite(index, value); } - +} +void CommandProcessor::WriteRegistersFromMem(uint32_t start_index, + uint32_t* base, + uint32_t num_registers) { + for (uint32_t i = 0; i < num_registers; ++i) { + uint32_t data = xe::load_and_swap(base + i); + this->WriteRegister(start_index + i, data); + } } +void CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, + uint32_t base, + uint32_t num_registers) { + for (uint32_t i = 0; i < num_registers; ++i) { + uint32_t data = ring->ReadAndSwap(); + WriteRegister(base + i, data); + } +} + +void CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring, + uint32_t base, + uint32_t num_times) { + for (uint32_t m = 0; m < num_times; m++) { + uint32_t reg_data = ring->ReadAndSwap(); + uint32_t target_index = base; + WriteRegister(target_index, reg_data); + } +} void CommandProcessor::MakeCoherent() { SCOPE_profile_cpu_f("gpu"); @@ -628,15 +663,20 @@ void CommandProcessor::ExecutePacket(uint32_t ptr, uint32_t count) { } bool CommandProcessor::ExecutePacket(RingBuffer* reader) { + // prefetch the wraparound range + // it likely is already in L3 cache, but in a zen system it may be another + // chiplets l3 + reader->BeginPrefetchedRead( + reader->read_count()); const uint32_t packet = reader->ReadAndSwap(); const uint32_t packet_type = packet >> 30; - if (packet == 0 || packet == 0x0BADF00D) { + if (XE_UNLIKELY(packet == 0 || packet == 0x0BADF00D)) { trace_writer_.WritePacketStart(uint32_t(reader->read_ptr() - 4), 1); trace_writer_.WritePacketEnd(); return true; } - if (packet == 0xCDCDCDCD) { + if (XE_UNLIKELY(packet == 0xCDCDCDCD)) { XELOGW("GPU packet is CDCDCDCD - probably read uninitialized memory!"); } @@ -672,10 +712,10 @@ bool CommandProcessor::ExecutePacketType0(RingBuffer* reader, uint32_t packet) { uint32_t base_index = (packet & 0x7FFF); uint32_t write_one_reg = (packet >> 15) & 0x1; - for (uint32_t m = 0; m < count; m++) { - uint32_t reg_data = reader->ReadAndSwap(); - uint32_t target_index = write_one_reg ? 
base_index : base_index + m; - WriteRegister(target_index, reg_data); + if (write_one_reg) { + WriteOneRegisterFromRing(reader, base_index, count); + } else { + WriteRegisterRangeFromRing(reader, base_index, count); } trace_writer_.WritePacketEnd(); @@ -939,7 +979,7 @@ bool CommandProcessor::ExecutePacketType3_XE_SWAP(RingBuffer* reader, uint32_t count) { SCOPE_profile_cpu_f("gpu"); - XELOGI("XE_SWAP"); + XELOGD("XE_SWAP"); Profiler::Flip(); @@ -1472,10 +1512,9 @@ bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingBuffer* reader, reader->AdvanceRead((count - 1) * sizeof(uint32_t)); return true; } - for (uint32_t n = 0; n < count - 1; n++, index++) { - uint32_t data = reader->ReadAndSwap(); - WriteRegister(index, data); - } + + WriteRegisterRangeFromRing(reader, index, count - 1); + return true; } @@ -1484,10 +1523,9 @@ bool CommandProcessor::ExecutePacketType3_SET_CONSTANT2(RingBuffer* reader, uint32_t count) { uint32_t offset_type = reader->ReadAndSwap(); uint32_t index = offset_type & 0xFFFF; - for (uint32_t n = 0; n < count - 1; n++, index++) { - uint32_t data = reader->ReadAndSwap(); - WriteRegister(index, data); - } + + WriteRegisterRangeFromRing(reader, index, count - 1); + return true; } @@ -1522,12 +1560,12 @@ bool CommandProcessor::ExecutePacketType3_LOAD_ALU_CONSTANT(RingBuffer* reader, assert_always(); return true; } + trace_writer_.WriteMemoryRead(CpuToGpu(address), size_dwords * 4); - for (uint32_t n = 0; n < size_dwords; n++, index++) { - uint32_t data = xe::load_and_swap( - memory_->TranslatePhysical(address + n * 4)); - WriteRegister(index, data); - } + + WriteRegistersFromMem(index, (uint32_t*)memory_->TranslatePhysical(address), + size_dwords); + return true; } @@ -1535,10 +1573,9 @@ bool CommandProcessor::ExecutePacketType3_SET_SHADER_CONSTANTS( RingBuffer* reader, uint32_t packet, uint32_t count) { uint32_t offset_type = reader->ReadAndSwap(); uint32_t index = offset_type & 0xFFFF; - for (uint32_t n = 0; n < count - 1; n++, index++) { - uint32_t data = reader->ReadAndSwap(); - WriteRegister(index, data); - } + + WriteRegisterRangeFromRing(reader, index, count - 1); + return true; } diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index c9245773b..cde2f4fdb 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -156,6 +156,21 @@ class CommandProcessor { XE_FORCEINLINE virtual void WriteRegister(uint32_t index, uint32_t value); + // mem has big-endian register values + XE_FORCEINLINE + virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); + + XE_FORCEINLINE + virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base, + uint32_t num_registers); + + XE_FORCEINLINE + virtual void WriteOneRegisterFromRing( + xe::RingBuffer* ring, uint32_t base, + uint32_t + num_times); // repeatedly write a value to one register, presumably a + // register with special handling for writes const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const { return gamma_ramp_256_entry_table_; } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index add11e4f6..db8c874f2 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1710,7 +1710,60 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { } } } +void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index, + uint32_t* base, + uint32_t num_registers) { 
+ for (uint32_t i = 0; i < num_registers; ++i) { + uint32_t data = xe::load_and_swap(base + i); + D3D12CommandProcessor::WriteRegister(start_index + i, data); + } +} +void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, + uint32_t base, + uint32_t num_registers) { + // we already brought it into L2 earlier + RingBuffer::ReadRange range = + ring->BeginPrefetchedRead(num_registers * + sizeof(uint32_t)); + uint32_t num_regs_firstrange = + static_cast(range.first_length / sizeof(uint32_t)); + + D3D12CommandProcessor::WriteRegistersFromMem( + base, reinterpret_cast(const_cast(range.first)), + num_regs_firstrange); + if (range.second) { + D3D12CommandProcessor::WriteRegistersFromMem( + base + num_regs_firstrange, + reinterpret_cast(const_cast(range.second)), + num_registers - num_regs_firstrange); + } + ring->EndRead(range); +} +void D3D12CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring, + uint32_t base, + uint32_t num_times) { + auto read = ring->BeginPrefetchedRead( + num_times * sizeof(uint32_t)); + + uint32_t first_length = read.first_length / sizeof(uint32_t); + + for (uint32_t i = 0; i < first_length; ++i) { + D3D12CommandProcessor::WriteRegister( + base, xe::load_and_swap(read.first + (sizeof(uint32_t) * i))); + } + + if (read.second) { + uint32_t second_length = read.second_length / sizeof(uint32_t); + + for (uint32_t i = 0; i < second_length; ++i) { + D3D12CommandProcessor::WriteRegister( + base, + xe::load_and_swap(read.second + (sizeof(uint32_t) * i))); + } + } + ring->EndRead(read); +} void D3D12CommandProcessor::OnGammaRamp256EntryTableValueWritten() { gamma_ramp_256_entry_table_up_to_date_ = false; } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 6162b4683..2b43233b9 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -42,7 +42,7 @@ namespace xe { namespace gpu { namespace d3d12 { -class D3D12CommandProcessor : public CommandProcessor { +class D3D12CommandProcessor final : public CommandProcessor { public: explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system, kernel::KernelState* kernel_state); @@ -203,9 +203,17 @@ class D3D12CommandProcessor : public CommandProcessor { protected: bool SetupContext() override; void ShutdownContext() override; - + XE_FORCEINLINE void WriteRegister(uint32_t index, uint32_t value) override; - + XE_FORCEINLINE + virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers) override; + XE_FORCEINLINE + virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base, + uint32_t num_registers) override; + XE_FORCEINLINE + virtual void WriteOneRegisterFromRing(xe::RingBuffer* ring, uint32_t base, + uint32_t num_times) override; void OnGammaRamp256EntryTableValueWritten() override; void OnGammaRampPWLValueWritten() override; diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc index e946a3319..b180ea8bf 100644 --- a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc @@ -406,14 +406,16 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange( } bool D3D12SharedMemory::UploadRanges( - const std::vector>& upload_page_ranges) { - if (upload_page_ranges.empty()) { + const std::pair* upload_page_ranges, unsigned num_upload_page_ranges) { + if (!num_upload_page_ranges) { return true; } 
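// Presumed reason for marking the command processor classes `final` and for
// the explicitly qualified D3D12CommandProcessor::WriteRegister(...) calls
// above: a class-qualified call is resolved statically, so these hot
// register-write loops skip the vtable entirely. Minimal sketch of the
// pattern (the types here are illustrative):
struct BaseWriter {
  virtual void write_reg(unsigned index, unsigned value) {}
  virtual ~BaseWriter() = default;
};
struct FastWriter final : public BaseWriter {
  void write_reg(unsigned index, unsigned value) override {}
  void write_many(const unsigned* values, unsigned count) {
    for (unsigned i = 0; i < count; ++i) {
      // Qualified call: no per-iteration virtual dispatch.
      FastWriter::write_reg(i, values[i]);
    }
  }
};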
CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST); command_processor_.SubmitBarriers(); auto& command_list = command_processor_.GetDeferredCommandList(); - for (auto upload_range : upload_page_ranges) { + //for (auto upload_range : upload_page_ranges) { + for (unsigned int i = 0; i < num_upload_page_ranges; ++i) { + auto& upload_range = upload_page_ranges[i]; uint32_t upload_range_start = upload_range.first; uint32_t upload_range_length = upload_range.second; trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2(), diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.h b/src/xenia/gpu/d3d12/d3d12_shared_memory.h index abf069447..76200ef7f 100644 --- a/src/xenia/gpu/d3d12/d3d12_shared_memory.h +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.h @@ -91,8 +91,8 @@ class D3D12SharedMemory : public SharedMemory { bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations, uint32_t length_allocations) override; - bool UploadRanges(const std::vector>& - upload_page_ranges) override; + bool UploadRanges(const std::pair* + upload_page_ranges, unsigned num_ranges) override; private: D3D12CommandProcessor& command_processor_; diff --git a/src/xenia/gpu/d3d12/deferred_command_list.h b/src/xenia/gpu/d3d12/deferred_command_list.h index 925956a8a..1d1600389 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.h +++ b/src/xenia/gpu/d3d12/deferred_command_list.h @@ -567,7 +567,7 @@ class DeferredCommandList { // uintmax_t to ensure uint64_t and pointer alignment of all structures. //std::vector command_stream_; - fixed_vmem_vector command_stream_; + FixedVMemVector command_stream_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index c51cc61a0..10017fff1 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -868,7 +868,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, xenos::kMaxResolveSize); y1 = y0 + int32_t(xenos::kMaxResolveSize); } - + //fails in forza horizon 1 assert_true(x0 < x1 && y0 < y1); if (x0 >= x1 || y0 >= y1) { XELOGE("Resolve region is empty"); diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index dc38a42b8..4530f57d4 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -320,8 +320,7 @@ void Shader::GatherVertexFetchInformation( for (auto& vertex_binding : vertex_bindings_) { if (vertex_binding.fetch_constant == op.fetch_constant_index()) { // It may not hold that all strides are equal, but I hope it does. - assert_true(!fetch_instr.attributes.stride || - vertex_binding.stride_words == fetch_instr.attributes.stride); + vertex_binding.attributes.push_back({}); attrib = &vertex_binding.attributes.back(); break; diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc index 89743b71c..428a18f78 100644 --- a/src/xenia/gpu/shared_memory.cc +++ b/src/xenia/gpu/shared_memory.cc @@ -14,6 +14,7 @@ #include "xenia/base/assert.h" #include "xenia/base/bit_range.h" +#include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/memory.h" #include "xenia/base/profiling.h" @@ -344,7 +345,7 @@ void SharedMemory::UnlinkWatchRange(WatchRange* range) { range->next_free = watch_range_first_free_; watch_range_first_free_ = range; } - +// todo: optimize, an enormous amount of cpu time (1.34%) is spent here. 
bool SharedMemory::RequestRange(uint32_t start, uint32_t length, bool* any_data_resolved_out) { if (!length) { @@ -364,14 +365,20 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length, return false; } + unsigned int current_upload_range = 0; uint32_t page_first = start >> page_size_log2_; uint32_t page_last = (start + length - 1) >> page_size_log2_; upload_ranges_.clear(); + + std::pair* uploads = + reinterpret_cast*>(upload_ranges_.data()); + bool any_data_resolved = false; uint32_t block_first = page_first >> 6; uint32_t block_last = page_last >> 6; uint32_t range_start = UINT32_MAX; + { auto global_lock = global_critical_region_.Acquire(); for (uint32_t i = block_first; i <= block_last; ++i) { @@ -412,8 +419,13 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length, if (!xe::bit_scan_forward(block_valid_from_start, &block_page)) { break; } - upload_ranges_.push_back( - std::make_pair(range_start, (i << 6) + block_page - range_start)); + if (current_upload_range + 1 >= MAX_UPLOAD_RANGES) { + xe::FatalError( + "Hit max upload ranges in shared_memory.cc, tell a dev to " + "raise the limit!"); + } + uploads[current_upload_range++] = + std::make_pair(range_start, (i << 6) + block_page - range_start); // In the next iteration within this block, consider this range valid // since it has been queued for upload. block_valid |= (uint64_t(1) << block_page) - 1; @@ -423,17 +435,17 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length, } } if (range_start != UINT32_MAX) { - upload_ranges_.push_back( - std::make_pair(range_start, page_last + 1 - range_start)); + uploads[current_upload_range++] = + (std::make_pair(range_start, page_last + 1 - range_start)); } if (any_data_resolved_out) { *any_data_resolved_out = any_data_resolved; } - if (upload_ranges_.empty()) { + if (!current_upload_range) { return true; } - return UploadRanges(upload_ranges_); + return UploadRanges(uploads, current_upload_range); } std::pair SharedMemory::MemoryInvalidationCallbackThunk( diff --git a/src/xenia/gpu/shared_memory.h b/src/xenia/gpu/shared_memory.h index 63cc380d0..7f8d3a892 100644 --- a/src/xenia/gpu/shared_memory.h +++ b/src/xenia/gpu/shared_memory.h @@ -50,8 +50,8 @@ class SharedMemory { void* callback_context); void UnregisterGlobalWatch(GlobalWatchHandle handle); typedef void (*WatchCallback)(const global_unique_lock_type& global_lock, - void* context, - void* data, uint64_t argument, bool invalidated_by_gpu); + void* context, void* data, uint64_t argument, + bool invalidated_by_gpu); typedef void* WatchHandle; // Registers a callback invoked when the specified memory range is invalidated // in the GPU memory copy by the CPU or (if triggered explicitly - such as by @@ -140,7 +140,8 @@ class SharedMemory { // ascending address order, so front and back can be used to determine the // overall bounds of pages to be uploaded. virtual bool UploadRanges( - const std::vector>& upload_page_ranges) = 0; + const std::pair* upload_page_ranges, + unsigned num_upload_ranges) = 0; const std::vector>& trace_download_ranges() { return trace_download_ranges_; @@ -174,10 +175,13 @@ class SharedMemory { void* memory_invalidation_callback_handle_ = nullptr; void* memory_data_provider_handle_ = nullptr; + static constexpr unsigned int MAX_UPLOAD_RANGES = 65536; // Ranges that need to be uploaded, generated by GetRangesToUpload (a // persistently allocated vector). 
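// The bounded-append pattern RequestRange now uses, in isolation
// (collect_range is a hypothetical stand-in): ranges are written into
// preallocated storage behind an explicit capacity check, so the loop that
// runs under the global critical region never grows a std::vector; on
// overflow the patch raises a FatalError instead.
#include <cstdint>
#include <utility>
static bool collect_range(std::pair<uint32_t, uint32_t>* out, unsigned& count,
                          unsigned capacity, uint32_t start, uint32_t length) {
  if (count >= capacity) {
    return false;  // the patch reports this case via xe::FatalError
  }
  out[count++] = std::make_pair(start, length);
  return true;
}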
- std::vector> upload_ranges_; + // std::vector> upload_ranges_; + FixedVMemVector)> + upload_ranges_; // GPU-written memory downloading for traces. . std::vector> trace_download_ranges_; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 68a00cbe8..47d7506e6 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -1157,7 +1157,14 @@ void VulkanCommandProcessor::WriteRegister(uint32_t index, uint32_t value) { } } } - +void VulkanCommandProcessor::WriteRegistersFromMem(uint32_t start_index, + uint32_t* base, + uint32_t num_registers) { + for (uint32_t i = 0; i < num_registers; ++i) { + uint32_t data = xe::load_and_swap(base + i); + VulkanCommandProcessor::WriteRegister(start_index + i, data); + } +} void VulkanCommandProcessor::SparseBindBuffer( VkBuffer buffer, uint32_t bind_count, const VkSparseMemoryBind* binds, VkPipelineStageFlags wait_stage_mask) { diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 7920981fb..3b09e0fce 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -45,7 +45,7 @@ namespace xe { namespace gpu { namespace vulkan { -class VulkanCommandProcessor : public CommandProcessor { +class VulkanCommandProcessor final : public CommandProcessor { public: // Single-descriptor layouts for use within a single frame. enum class SingleTransientDescriptorLayout { @@ -259,8 +259,11 @@ class VulkanCommandProcessor : public CommandProcessor { protected: bool SetupContext() override; void ShutdownContext() override; - + XE_FORCEINLINE void WriteRegister(uint32_t index, uint32_t value) override; + XE_FORCEINLINE + virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers) override; void OnGammaRamp256EntryTableValueWritten() override; void OnGammaRampPWLValueWritten() override; diff --git a/src/xenia/gpu/vulkan/vulkan_shared_memory.cc b/src/xenia/gpu/vulkan/vulkan_shared_memory.cc index c321b9840..eb6403441 100644 --- a/src/xenia/gpu/vulkan/vulkan_shared_memory.cc +++ b/src/xenia/gpu/vulkan/vulkan_shared_memory.cc @@ -376,18 +376,21 @@ bool VulkanSharedMemory::AllocateSparseHostGpuMemoryRange( } bool VulkanSharedMemory::UploadRanges( - const std::vector>& upload_page_ranges) { - if (upload_page_ranges.empty()) { + const std::pair* upload_page_ranges, + unsigned num_upload_ranges) { + if (!num_upload_ranges) { return true; } + + auto& range_front = upload_page_ranges[0]; + auto& range_back = upload_page_ranges[num_upload_ranges - 1]; + // upload_page_ranges are sorted, use them to determine the range for the // ordering barrier. 
Use(Usage::kTransferDestination, - std::make_pair( - upload_page_ranges.front().first << page_size_log2(), - (upload_page_ranges.back().first + upload_page_ranges.back().second - - upload_page_ranges.front().first) - << page_size_log2())); + std::make_pair(range_front.first << page_size_log2(), + (range_back.first + range_back.second - range_front.first) + << page_size_log2())); command_processor_.SubmitBarriers(true); DeferredCommandBuffer& command_buffer = command_processor_.deferred_command_buffer(); @@ -395,9 +398,11 @@ bool VulkanSharedMemory::UploadRanges( bool successful = true; upload_regions_.clear(); VkBuffer upload_buffer_previous = VK_NULL_HANDLE; - for (auto upload_range : upload_page_ranges) { - uint32_t upload_range_start = upload_range.first; - uint32_t upload_range_length = upload_range.second; + + // for (auto upload_range : upload_page_ranges) { + for (unsigned int i = 0; i < num_upload_ranges; ++i) { + uint32_t upload_range_start = upload_page_ranges[i].first; + uint32_t upload_range_length = upload_page_ranges[i].second; trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2(), upload_range_length << page_size_log2()); while (upload_range_length) { diff --git a/src/xenia/gpu/vulkan/vulkan_shared_memory.h b/src/xenia/gpu/vulkan/vulkan_shared_memory.h index 14214a5d0..b83877a38 100644 --- a/src/xenia/gpu/vulkan/vulkan_shared_memory.h +++ b/src/xenia/gpu/vulkan/vulkan_shared_memory.h @@ -62,8 +62,8 @@ class VulkanSharedMemory : public SharedMemory { bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations, uint32_t length_allocations) override; - bool UploadRanges(const std::vector>& - upload_page_ranges) override; + bool UploadRanges(const std::pair* + upload_page_ranges, unsigned num_ranges) override; private: void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask, diff --git a/src/xenia/hid/input_system.cc b/src/xenia/hid/input_system.cc index c61a24802..588faefe3 100644 --- a/src/xenia/hid/input_system.cc +++ b/src/xenia/hid/input_system.cc @@ -137,6 +137,8 @@ X_INPUT_VIBRATION InputSystem::ModifyVibrationLevel( modified_vibration.right_motor_speed = 0; return modified_vibration; } - +std::unique_lock InputSystem::lock() { + return std::unique_lock{lock_}; +} } // namespace hid } // namespace xe diff --git a/src/xenia/hid/input_system.h b/src/xenia/hid/input_system.h index 0063598c7..333116499 100644 --- a/src/xenia/hid/input_system.h +++ b/src/xenia/hid/input_system.h @@ -12,7 +12,7 @@ #include #include - +#include "xenia/base/mutex.h" #include "xenia/hid/input.h" #include "xenia/hid/input_driver.h" #include "xenia/xbox.h" @@ -48,6 +48,8 @@ class InputSystem { void UpdateUsedSlot(uint8_t slot, bool connected); uint8_t GetConnectedSlots() const { return connected_slot; } + std::unique_lock lock(); + private: xe::ui::Window* window_ = nullptr; @@ -55,6 +57,7 @@ class InputSystem { X_INPUT_VIBRATION ModifyVibrationLevel(X_INPUT_VIBRATION* vibration); uint8_t connected_slot = 0b0001; + xe_unlikely_mutex lock_; }; } // namespace hid diff --git a/src/xenia/hid/xinput/xinput_input_driver.cc b/src/xenia/hid/xinput/xinput_input_driver.cc index 497d80089..221914f1a 100644 --- a/src/xenia/hid/xinput/xinput_input_driver.cc +++ b/src/xenia/hid/xinput/xinput_input_driver.cc @@ -14,9 +14,9 @@ #include // NOLINT(build/include_order) +#include "xenia/base/clock.h" #include "xenia/base/logging.h" #include "xenia/hid/hid_flags.h" - namespace xe { namespace hid { namespace xinput { @@ -81,13 +81,39 @@ X_STATUS XInputInputDriver::Setup() { } return 
X_STATUS_SUCCESS; } +constexpr uint64_t SKIP_INVALID_CONTROLLER_TIME = 1100; +static uint64_t last_invalid_time[4]; + +static DWORD should_skip(uint32_t user_index) { + uint64_t time = last_invalid_time[user_index]; + if (time) { + uint64_t deltatime = xe::Clock::QueryHostUptimeMillis() - time; + + if (deltatime < SKIP_INVALID_CONTROLLER_TIME) { + return ERROR_DEVICE_NOT_CONNECTED; + } + last_invalid_time[user_index] = 0; + } + return 0; +} + +static void set_skip(uint32_t user_index) { + last_invalid_time[user_index] = xe::Clock::QueryHostUptimeMillis(); +} X_RESULT XInputInputDriver::GetCapabilities(uint32_t user_index, uint32_t flags, X_INPUT_CAPABILITIES* out_caps) { + DWORD skipper = should_skip(user_index); + if (skipper) { + return skipper; + } XINPUT_CAPABILITIES native_caps; auto xigc = (decltype(&XInputGetCapabilities))XInputGetCapabilities_; DWORD result = xigc(user_index, flags, &native_caps); if (result) { + if (result == ERROR_DEVICE_NOT_CONNECTED) { + set_skip(user_index); + } return result; } @@ -110,10 +136,18 @@ X_RESULT XInputInputDriver::GetCapabilities(uint32_t user_index, uint32_t flags, X_RESULT XInputInputDriver::GetState(uint32_t user_index, X_INPUT_STATE* out_state) { + DWORD skipper = should_skip(user_index); + if (skipper) { + return skipper; + } XINPUT_STATE native_state; auto xigs = (decltype(&XInputGetState))XInputGetState_; + DWORD result = xigs(user_index, &native_state); if (result) { + if (result == ERROR_DEVICE_NOT_CONNECTED) { + set_skip(user_index); + } return result; } @@ -131,11 +165,18 @@ X_RESULT XInputInputDriver::GetState(uint32_t user_index, X_RESULT XInputInputDriver::SetState(uint32_t user_index, X_INPUT_VIBRATION* vibration) { + DWORD skipper = should_skip(user_index); + if (skipper) { + return skipper; + } XINPUT_VIBRATION native_vibration; native_vibration.wLeftMotorSpeed = vibration->left_motor_speed; native_vibration.wRightMotorSpeed = vibration->right_motor_speed; auto xiss = (decltype(&XInputSetState))XInputSetState_; DWORD result = xiss(user_index, &native_vibration); + if (result == ERROR_DEVICE_NOT_CONNECTED) { + set_skip(user_index); + } return result; } diff --git a/src/xenia/kernel/kernel_state.cc b/src/xenia/kernel/kernel_state.cc index 9b234f526..958489d48 100644 --- a/src/xenia/kernel/kernel_state.cc +++ b/src/xenia/kernel/kernel_state.cc @@ -948,7 +948,11 @@ bool KernelState::Restore(ByteStream* stream) { } uint8_t KernelState::GetConnectedUsers() const { - return emulator_->input_system()->GetConnectedSlots(); + auto input_sys = emulator_->input_system(); + + auto lock = input_sys->lock(); + + return input_sys->GetConnectedSlots(); } void KernelState::UpdateUsedUserProfiles() { diff --git a/src/xenia/kernel/xam/xam_input.cc b/src/xenia/kernel/xam/xam_input.cc index aeb3b403a..789c276ea 100644 --- a/src/xenia/kernel/xam/xam_input.cc +++ b/src/xenia/kernel/xam/xam_input.cc @@ -58,6 +58,7 @@ dword_result_t XamInputGetCapabilities_entry( } auto input_system = kernel_state()->emulator()->input_system(); + auto lock = input_system->lock(); return input_system->GetCapabilities(actual_user_index, flags, caps); } DECLARE_XAM_EXPORT1(XamInputGetCapabilities, kInput, kSketchy); @@ -81,6 +82,7 @@ dword_result_t XamInputGetCapabilitiesEx_entry( } auto input_system = kernel_state()->emulator()->input_system(); + auto lock = input_system->lock(); return input_system->GetCapabilities(actual_user_index, flags, caps); } DECLARE_XAM_EXPORT1(XamInputGetCapabilitiesEx, kInput, kSketchy); @@ -88,6 +90,13 @@ 
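// The controller-disconnect backoff above, reduced to its core (the helper
// name is illustrative; 1100 ms is SKIP_INVALID_CONTROLLER_TIME from the
// patch): once a slot reports ERROR_DEVICE_NOT_CONNECTED, native XInput
// queries for it are skipped for roughly a second, presumably because polling
// a missing controller through XInput is comparatively expensive and games
// poll every frame.
#include <cstdint>
static bool should_query_native(uint64_t now_ms, uint64_t last_failure_ms,
                                uint64_t backoff_ms = 1100) {
  return last_failure_ms == 0 || (now_ms - last_failure_ms) >= backoff_ms;
}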
DECLARE_XAM_EXPORT1(XamInputGetCapabilitiesEx, kInput, kSketchy); // https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputgetstate(v=vs.85).aspx dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags, pointer_t input_state) { + if (input_state) { + memset((void*)input_state.host_address(), 0, sizeof X_INPUT_STATE); + } + if (user_index >= 4) { + return X_ERROR_DEVICE_NOT_CONNECTED; + } + // Games call this with a NULL state ptr, probably as a query. if ((flags & 0xFF) && (flags & XINPUT_FLAG_GAMEPAD) == 0) { @@ -96,12 +105,14 @@ dword_result_t XamInputGetState_entry(dword_t user_index, dword_t flags, } uint32_t actual_user_index = user_index; + // chrispy: change this, logic is not right if ((actual_user_index & 0xFF) == 0xFF || (flags & XINPUT_FLAG_ANY_USER)) { // Always pin user to 0. actual_user_index = 0; } auto input_system = kernel_state()->emulator()->input_system(); + auto lock = input_system->lock(); return input_system->GetState(user_index, input_state); } DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency); @@ -109,6 +120,9 @@ DECLARE_XAM_EXPORT2(XamInputGetState, kInput, kImplemented, kHighFrequency); // https://msdn.microsoft.com/en-us/library/windows/desktop/microsoft.directx_sdk.reference.xinputsetstate(v=vs.85).aspx dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk, pointer_t vibration) { + if (user_index >= 4) { + return X_E_DEVICE_NOT_CONNECTED; + } if (!vibration) { return X_ERROR_BAD_ARGUMENTS; } @@ -120,6 +134,7 @@ dword_result_t XamInputSetState_entry(dword_t user_index, dword_t unk, } auto input_system = kernel_state()->emulator()->input_system(); + auto lock = input_system->lock(); return input_system->SetState(user_index, vibration); } DECLARE_XAM_EXPORT1(XamInputSetState, kInput, kImplemented); @@ -147,6 +162,7 @@ dword_result_t XamInputGetKeystroke_entry( } auto input_system = kernel_state()->emulator()->input_system(); + auto lock = input_system->lock(); return input_system->GetKeystroke(user_index, flags, keystroke); } DECLARE_XAM_EXPORT1(XamInputGetKeystroke, kInput, kImplemented); @@ -166,14 +182,15 @@ dword_result_t XamInputGetKeystrokeEx_entry( uint32_t user_index = *user_index_ptr; auto input_system = kernel_state()->emulator()->input_system(); - + auto lock = input_system->lock(); if ((user_index & 0xFF) == 0xFF) { // Always pin user to 0. user_index = 0; } if (flags & XINPUT_FLAG_ANY_USER) { - // That flag means we should iterate over every connected controller and check which one have pending request. + // That flag means we should iterate over every connected controller and + // check which one have pending request. auto result = X_ERROR_DEVICE_NOT_CONNECTED; for (uint32_t i = 0; i < 4; i++) { auto result = input_system->GetKeystroke(i, flags, keystroke); @@ -188,6 +205,7 @@ dword_result_t XamInputGetKeystrokeEx_entry( } auto result = input_system->GetKeystroke(user_index, flags, keystroke); + if (XSUCCEEDED(result)) { *user_index_ptr = keystroke->user_index; } @@ -202,7 +220,8 @@ X_HRESULT_result_t XamUserGetDeviceContext_entry(dword_t user_index, // If this function fails they assume zero, so let's fail AND // set zero just to be safe. 
*out_ptr = 0; - if (kernel_state()->IsUserSignedIn(user_index) || (user_index & 0xFF) == 0xFF) { + if (kernel_state()->IsUserSignedIn(user_index) || + (user_index & 0xFF) == 0xFF) { *out_ptr = (uint32_t)user_index; return X_E_SUCCESS; } else { diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc index c21d88517..978f24062 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc @@ -121,7 +121,8 @@ dword_result_t NtAllocateVirtualMemory_entry(lpdword_t base_addr_ptr, ? -int32_t(region_size_ptr.value()) : region_size_ptr.value(); - adjusted_size = xe::round_up(adjusted_size, adjusted_base ? page_size : 64 * 1024); + adjusted_size = + xe::round_up(adjusted_size, adjusted_base ? page_size : 64 * 1024); // Allocate. uint32_t allocation_type = 0; @@ -295,10 +296,19 @@ struct X_MEMORY_BASIC_INFORMATION { be protect; be type; }; - +// chrispy: added region_type ? guessed name, haven't seen any value except 0 used dword_result_t NtQueryVirtualMemory_entry( dword_t base_address, - pointer_t memory_basic_information_ptr) { + pointer_t memory_basic_information_ptr, + dword_t region_type) { + switch (region_type) { + case 0: + case 1: + case 2: + break; + default: + return X_STATUS_INVALID_PARAMETER; + } auto heap = kernel_state()->memory()->LookupHeap(base_address); HeapAllocationInfo alloc_info; if (heap == nullptr || !heap->QueryRegionInfo(base_address, &alloc_info)) { @@ -373,8 +383,9 @@ dword_result_t MmAllocatePhysicalMemoryEx_entry( // min_addr_range/max_addr_range are bounds in physical memory, not virtual. uint32_t heap_base = heap->heap_base(); uint32_t heap_physical_address_offset = heap->GetPhysicalAddress(heap_base); - // TODO(Gliniak): Games like 545108B4 compares min_addr_range with value returned. - // 0x1000 offset causes it to go below that minimal range and goes haywire + // TODO(Gliniak): Games like 545108B4 compare min_addr_range with value + // returned.
0x1000 offset causes it to go below that minimal range and goes + // haywire if (min_addr_range && max_addr_range) { heap_physical_address_offset = 0; } diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc index db7f72ea4..000e75a1a 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_rtl.cc @@ -53,21 +53,20 @@ DECLARE_XBOXKRNL_EXPORT1(RtlCompareMemory, kMemory, kImplemented); // https://msdn.microsoft.com/en-us/library/ff552123 dword_result_t RtlCompareMemoryUlong_entry(lpvoid_t source, dword_t length, dword_t pattern) { - // Return 0 if source/length not aligned - if (source.guest_address() % 4 || length % 4) { - return 0; - } + uint32_t num_compared_bytes = 0; - uint32_t n = 0; - for (uint32_t i = 0; i < (length / 4); i++) { - // FIXME: This assumes as_array returns xe::be - uint32_t val = source.as_array()[i]; - if (val == pattern) { - n++; + uint32_t swapped_pattern = xe::byte_swap(pattern.value()); + + char* host_source = (char*)source.host_address(); + + for (uint32_t aligned_length = length & 0xFFFFFFFCU; aligned_length; + num_compared_bytes += 4) { + if (*(uint32_t*)(host_source + num_compared_bytes) != swapped_pattern) { + break; } + aligned_length = aligned_length - 4; } - - return n; + return num_compared_bytes; } DECLARE_XBOXKRNL_EXPORT1(RtlCompareMemoryUlong, kMemory, kImplemented); @@ -85,23 +84,61 @@ void RtlFillMemoryUlong_entry(lpvoid_t destination, dword_t length, } DECLARE_XBOXKRNL_EXPORT1(RtlFillMemoryUlong, kMemory, kImplemented); -dword_result_t RtlUpperChar_entry(dword_t in) { - char c = in & 0xFF; - if (c >= 'a' && c <= 'z') { - return c ^ 0x20; - } +static constexpr const unsigned char rtl_lower_table[256] = { + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, + 0xC, 0xD, 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, + 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, + 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, + 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, + 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83, + 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, + 0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, + 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, + 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, + 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, + 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, + 0xFC, 0xFD, 0xFE, 0xFF}; - return c; +static constexpr const unsigned char rtl_upper_table[256] = { + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, + 0xC, 0xD, 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 
0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, + 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, + 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83, + 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, + 0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, + 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, + 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, + 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xC0, 0xC1, 0xC2, 0xC3, + 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xF7, 0xD8, 0xD9, 0xDA, 0xDB, + 0xDC, 0xDD, 0xDE, 0x3F}; + +dword_result_t RtlUpperChar_entry(dword_t in) { + return rtl_upper_table[in & 0xff]; } DECLARE_XBOXKRNL_EXPORT1(RtlUpperChar, kNone, kImplemented); dword_result_t RtlLowerChar_entry(dword_t in) { - char c = in & 0xFF; - if (c >= 'A' && c <= 'Z') { - return c ^ 0x20; - } - - return c; + return rtl_lower_table[in & 0xff]; } DECLARE_XBOXKRNL_EXPORT1(RtlLowerChar, kNone, kImplemented); diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc index fdc9c7ef4..715578dee 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc @@ -473,7 +473,8 @@ void VdSwap_entry( dwords[i] = xenos::MakePacketType2(); } } -DECLARE_XBOXKRNL_EXPORT2(VdSwap, kVideo, kImplemented, kImportant); +DECLARE_XBOXKRNL_EXPORT3(VdSwap, kVideo, kImplemented, kHighFrequency, + kImportant); void RegisterVideoExports(xe::cpu::ExportResolver* export_resolver, KernelState* kernel_state) {
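RtlUpperChar and RtlLowerChar are now table-driven, so the Latin-1 letters (0xC0-0xFE) are case-mapped as well instead of only ASCII. For illustration only, the two tables are equivalent to the arithmetic below: letters toggle bit 0x20, the multiplication and division signs (0xD7, 0xF7) and sharp s (0xDF) are left unchanged, and 0xFF upper-cases to '?' (0x3F) because its uppercase form does not fit in a single byte. This is a sketch for reading the tables, not code from the patch.

// Arithmetic reading of rtl_upper_table / rtl_lower_table (sketch only; the
// tables above remain the authoritative mapping).
#include <cstdint>

uint8_t UpperLatin1(uint8_t c) {
  if ((c >= 'a' && c <= 'z') || (c >= 0xE0 && c <= 0xFE && c != 0xF7)) {
    return c & ~0x20;  // clear the lowercase bit
  }
  return c == 0xFF ? 0x3F : c;  // 0xFF has no single-byte uppercase
}

uint8_t LowerLatin1(uint8_t c) {
  if ((c >= 'A' && c <= 'Z') || (c >= 0xC0 && c <= 0xDE && c != 0xD7)) {
    return c | 0x20;  // set the lowercase bit
  }
  return c;
}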