From 7cc364dcb8586fce2b1594a7808abb389cb45e83 Mon Sep 17 00:00:00 2001 From: "chss95cs@gmail.com" Date: Sun, 14 Aug 2022 13:42:08 -0700 Subject: [PATCH] Squash reallocs in command buffers by using a large preallocated buffer; use virtual memory directly for it so the OS allocates on demand. Mark the raw clock functions as noinline: the way MSVC was inlining them and ordering the branches meant that rdtsc would often be speculatively executed. Add an alternative clock implementation for Windows: instead of using QueryPerformanceCounter, we grab SystemTime from KUSER_SHARED_DATA. It does not have the same precision as QueryPerformanceCounter (we only have 100-nanosecond precision), but we round to milliseconds, so it never made sense to use the performance counter in the first place. Stubbed out the "guest clock mutex" (the entirety of clock.cc needs a rewrite). Added some helpers for minf/maxf without the NaN handling behavior. --- src/xenia/base/clock.cc | 27 ++- src/xenia/base/clock.h | 4 + src/xenia/base/clock_win.cc | 17 +- src/xenia/base/clock_x64.cc | 9 +- src/xenia/base/math.h | 23 +++ src/xenia/base/memory.h | 7 +- src/xenia/base/platform_win.h | 178 ++++++++++++++++--- src/xenia/base/threading.h | 1 + src/xenia/base/threading_timer_queue.cc | 4 +- src/xenia/gpu/d3d12/deferred_command_list.cc | 23 ++- src/xenia/gpu/d3d12/deferred_command_list.h | 8 +- 11 files changed, 263 insertions(+), 38 deletions(-) diff --git a/src/xenia/base/clock.cc b/src/xenia/base/clock.cc index 058eae43a..5f4905dda 100644 --- a/src/xenia/base/clock.cc +++ b/src/xenia/base/clock.cc @@ -15,6 +15,13 @@ #include "xenia/base/assert.h" #include "xenia/base/math.h" +#include "xenia/base/mutex.h" + +#if defined(_WIN32) + +#include "xenia/base/platform_win.h" + +#endif DEFINE_bool(clock_no_scaling, false, "Disable scaling code. Time management and locking is bypassed. " @@ -42,8 +49,19 @@ std::pair guest_tick_ratio_ = std::make_pair(1, 1); uint64_t last_guest_tick_count_ = 0; // Last sampled host tick count. 
uint64_t last_host_tick_count_ = Clock::QueryHostTickCount(); + +struct null_lock { + public: + static void lock() {} + static void unlock() {} + static bool try_lock() { return true; } +}; + +using tick_mutex_type = null_lock; // xe::xe_mutex; + // Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync -std::mutex tick_mutex_; +// std::mutex tick_mutex_; +static tick_mutex_type tick_mutex_; void RecomputeGuestTickScalar() { // Create a rational number with numerator (first) and denominator (second) @@ -61,7 +79,7 @@ void RecomputeGuestTickScalar() { // Keep this a rational calculation and reduce the fraction reduce_fraction(frac); - std::lock_guard lock(tick_mutex_); + std::lock_guard lock(tick_mutex_); guest_tick_ratio_ = frac; } @@ -75,7 +93,7 @@ uint64_t UpdateGuestClock() { return host_tick_count * guest_tick_ratio_.first / guest_tick_ratio_.second; } - std::unique_lock lock(tick_mutex_, std::defer_lock); + std::unique_lock lock(tick_mutex_, std::defer_lock); if (lock.try_lock()) { // Translate host tick count to guest tick count. uint64_t host_tick_delta = host_tick_count > last_host_tick_count_ @@ -107,7 +125,6 @@ inline uint64_t QueryGuestSystemTimeOffset() { return guest_tick_count * numerator / denominator; } - uint64_t Clock::QueryHostTickFrequency() { #if XE_CLOCK_RAW_AVAILABLE if (cvars::clock_source_raw) { @@ -137,7 +154,7 @@ void Clock::set_guest_time_scalar(double scalar) { } std::pair Clock::guest_tick_ratio() { - std::lock_guard lock(tick_mutex_); + std::lock_guard lock(tick_mutex_); return guest_tick_ratio_; } diff --git a/src/xenia/base/clock.h b/src/xenia/base/clock.h index 67a3ebb67..81894ca97 100644 --- a/src/xenia/base/clock.h +++ b/src/xenia/base/clock.h @@ -33,11 +33,15 @@ class Clock { // Either from platform suplied time source or from hardware directly. static uint64_t host_tick_frequency_platform(); #if XE_CLOCK_RAW_AVAILABLE + XE_NOINLINE static uint64_t host_tick_frequency_raw(); #endif // Host tick count. 
Generally QueryHostTickCount() should be used. static uint64_t host_tick_count_platform(); #if XE_CLOCK_RAW_AVAILABLE + //chrispy: the way msvc was ordering the branches was causing rdtsc to be speculatively executed each time + //the branch history was lost + XE_NOINLINE static uint64_t host_tick_count_raw(); #endif diff --git a/src/xenia/base/clock_win.cc b/src/xenia/base/clock_win.cc index 4a0c8aeb5..e087aa946 100644 --- a/src/xenia/base/clock_win.cc +++ b/src/xenia/base/clock_win.cc @@ -12,7 +12,21 @@ #include "xenia/base/platform_win.h" namespace xe { + #if XE_USE_KUSER_SHARED==1 +uint64_t Clock::host_tick_frequency_platform() { return 10000000ULL; } +// Ticks come from InterruptTime (100 ns units since boot), not SystemTime: +// it is monotonic, and QueryHostUptimeMillis() multiplies the tick count by +// 1000, which overflows 64 bits for a wall-clock value counted from 1601. +uint64_t Clock::host_tick_count_platform() { + return *reinterpret_cast(&KUserShared()->InterruptTime); +} +uint64_t Clock::QueryHostSystemTime() { + return *reinterpret_cast(&KUserShared()->SystemTime); +} + + + #else uint64_t Clock::host_tick_frequency_platform() { LARGE_INTEGER frequency; QueryPerformanceFrequency(&frequency); @@ -27,7 +41,6 @@ uint64_t Clock::host_tick_count_platform() { } return time; } - uint64_t Clock::QueryHostSystemTime() { FILETIME t; GetSystemTimeAsFileTime(&t); @@ -37,5 +50,7 @@ uint64_t Clock::QueryHostSystemTime() { +#endif +// Shared by both clock sources, so it is defined once, outside the #if/#else. uint64_t Clock::QueryHostUptimeMillis() { return host_tick_count_platform() * 1000 / host_tick_frequency_platform(); } } // namespace xe diff --git a/src/xenia/base/clock_x64.cc b/src/xenia/base/clock_x64.cc index 14155303a..b07df79d2 100644 --- a/src/xenia/base/clock_x64.cc +++ b/src/xenia/base/clock_x64.cc @@ -41,10 +41,14 @@ "\n" \ "Set the cvar 'clock_source_raw' to 'false'."); + + + namespace xe { // Getting the TSC frequency can be a bit tricky. This method here only works on // Intel as it seems. There is no easy way to get the frequency outside of ring0 // on AMD, so we fail gracefully if not possible. 
+XE_NOINLINE uint64_t Clock::host_tick_frequency_raw() { uint32_t eax, ebx, ecx, edx; @@ -71,6 +75,8 @@ uint64_t Clock::host_tick_frequency_raw() { return 0; } + + if (max_cpuid >= 0x15) { // 15H Get TSC/Crystal ratio and Crystal Hz. xe_cpu_cpuid(0x15, eax, ebx, ecx, edx); @@ -92,10 +98,11 @@ uint64_t Clock::host_tick_frequency_raw() { return cpu_base_freq; } + CLOCK_FATAL("The clock frequency could not be determined."); return 0; } - +XE_NOINLINE uint64_t Clock::host_tick_count_raw() { return xe_cpu_rdtsc(); } } // namespace xe diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index 889cf03ed..4cafc7178 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -376,6 +376,29 @@ template int64_t m128_i64(const __m128& v) { return m128_i64(_mm_castps_pd(v)); } +/* + + std::min/max float has handling for nans, where if either argument is nan the first argument is returned + + minss/maxss are different, if either argument is nan the second operand to the instruction is returned + this is problematic because we have no assurances from the compiler on the argument ordering + + so only use in places where nan handling is not needed +*/ +static float xe_minf(float x, float y) { + return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y))); +} +static float xe_maxf(float x, float y) { + return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y))); +} +static float xe_rcpf(float den) { + return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den))); +} + +#else +static float xe_minf(float x, float y) { return std::min(x, y); } +static float xe_maxf(float x, float y) { return std::max(x, y); } +static float xe_rcpf(float den) { return 1.0f / den; } #endif // Similar to the C++ implementation of XMConvertFloatToHalf and diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h index 01cf40f87..979b390ba 100644 --- a/src/xenia/base/memory.h +++ b/src/xenia/base/memory.h @@ -478,12 +478,13 @@ class fixed_vmem_vector { public: fixed_vmem_vector() - : 
data_((uint8_t*)AllocFixed(nullptr, sz, AllocationType::kReserveCommit, - PageAccess::kReadWrite)), + : data_((uint8_t*)memory::AllocFixed( + nullptr, sz, memory::AllocationType::kReserveCommit, + memory::PageAccess::kReadWrite)), nbytes_(0) {} ~fixed_vmem_vector() { if (data_) { - DeallocFixed(data_, sz, DeallocationType::kRelease); + memory::DeallocFixed(data_, sz, memory::DeallocationType::kRelease); data_ = nullptr; } nbytes_ = 0; diff --git a/src/xenia/base/platform_win.h b/src/xenia/base/platform_win.h index 3013a9c14..a9ea263d6 100644 --- a/src/xenia/base/platform_win.h +++ b/src/xenia/base/platform_win.h @@ -34,31 +34,169 @@ #undef DeleteFile #undef GetFirstChild -#define XE_USE_NTDLL_FUNCTIONS 1 -#if XE_USE_NTDLL_FUNCTIONS==1 +#define XE_USE_NTDLL_FUNCTIONS 1 +#define XE_USE_KUSER_SHARED 1 +#if XE_USE_NTDLL_FUNCTIONS == 1 /* - ntdll versions of functions often skip through a lot of extra garbage in KernelBase + ntdll versions of functions often skip through a lot of extra garbage in + KernelBase */ -#define XE_NTDLL_IMPORT(name, cls, clsvar) \ - static class cls { \ - public: \ - FARPROC fn;\ - cls() : fn(nullptr) {\ - auto ntdll = GetModuleHandleA("ntdll.dll");\ - if (ntdll) { \ - fn = GetProcAddress(ntdll, #name );\ - }\ - } \ - template \ - inline TRet invoke(TArgs... args) {\ - return reinterpret_cast(fn)(args...);\ - }\ - inline operator bool() const {\ - return fn!=nullptr;\ - }\ +#define XE_NTDLL_IMPORT(name, cls, clsvar) \ + static class cls { \ + public: \ + FARPROC fn; \ + cls() : fn(nullptr) { \ + auto ntdll = GetModuleHandleA("ntdll.dll"); \ + if (ntdll) { \ + fn = GetProcAddress(ntdll, #name); \ + } \ + } \ + template \ + inline TRet invoke(TArgs... 
args) { \ + return reinterpret_cast(fn)(args...); \ + } \ + inline operator bool() const { return fn != nullptr; } \ } clsvar #else #define XE_NTDLL_IMPORT(name, cls, clsvar) static constexpr bool clsvar = false #endif + +// KUSER_SHARED +struct __declspec(align(4)) _KSYSTEM_TIME { + unsigned int LowPart; + int High1Time; + int High2Time; +}; +enum _NT_PRODUCT_TYPE { + NtProductWinNt = 0x1, + NtProductLanManNt = 0x2, + NtProductServer = 0x3, +}; +enum _ALTERNATIVE_ARCHITECTURE_TYPE { + StandardDesign = 0x0, + NEC98x86 = 0x1, + EndAlternatives = 0x2, +}; + +#pragma pack(push, 1) +struct $3D940D5D03EF7F98CEE6737EDE752E57 { + __int8 _bf_0; +}; + +union $DA7A7E727E24E4DD62317E27558CCADA { + unsigned __int8 MitigationPolicies; + $3D940D5D03EF7F98CEE6737EDE752E57 __s1; +}; +struct __declspec(align(4)) $4BF4056B39611650D41923F164DAFA52 { + __int32 _bf_0; +}; + +union __declspec(align(4)) $BB68545E345A5F8046EF3BC0FE928142 { + unsigned int SharedDataFlags; + $4BF4056B39611650D41923F164DAFA52 __s1; +}; +union $5031D289C483414B89DA3F368D1FE62C { + volatile _KSYSTEM_TIME TickCount; + volatile unsigned __int64 TickCountQuad; + unsigned int ReservedTickCountOverlay[3]; +}; +struct $F91ACE6F13277DFC9425B9B8BBCB30F7 { + volatile unsigned __int8 QpcBypassEnabled; + unsigned __int8 QpcShift; +}; + +union __declspec(align(2)) $3C927F8BB7EAEE13CF0CFC3E60EDC8A9 { + unsigned __int16 QpcData; + $F91ACE6F13277DFC9425B9B8BBCB30F7 __s1; +}; + +struct __declspec(align(8)) _KUSER_SHARED_DATA { + unsigned int TickCountLowDeprecated; + unsigned int TickCountMultiplier; + volatile _KSYSTEM_TIME InterruptTime; + volatile _KSYSTEM_TIME SystemTime; + volatile _KSYSTEM_TIME TimeZoneBias; + unsigned __int16 ImageNumberLow; + unsigned __int16 ImageNumberHigh; + wchar_t NtSystemRoot[260]; + unsigned int MaxStackTraceDepth; + unsigned int CryptoExponent; + unsigned int TimeZoneId; + unsigned int LargePageMinimum; + unsigned int AitSamplingValue; + unsigned int AppCompatFlag; + unsigned __int64 
RNGSeedVersion; + unsigned int GlobalValidationRunlevel; + volatile int TimeZoneBiasStamp; + unsigned int NtBuildNumber; + _NT_PRODUCT_TYPE NtProductType; + unsigned __int8 ProductTypeIsValid; + unsigned __int8 Reserved0[1]; + unsigned __int16 NativeProcessorArchitecture; + unsigned int NtMajorVersion; + unsigned int NtMinorVersion; + unsigned __int8 ProcessorFeatures[64]; + unsigned int Reserved1; + unsigned int Reserved3; + volatile unsigned int TimeSlip; + _ALTERNATIVE_ARCHITECTURE_TYPE AlternativeArchitecture; + unsigned int BootId; + _LARGE_INTEGER SystemExpirationDate; + unsigned int SuiteMask; + unsigned __int8 KdDebuggerEnabled; + $DA7A7E727E24E4DD62317E27558CCADA ___u33; + unsigned __int8 Reserved6[2]; + volatile unsigned int ActiveConsoleId; + volatile unsigned int DismountCount; + unsigned int ComPlusPackage; + unsigned int LastSystemRITEventTickCount; + unsigned int NumberOfPhysicalPages; + unsigned __int8 SafeBootMode; + unsigned __int8 VirtualizationFlags; + unsigned __int8 Reserved12[2]; + $BB68545E345A5F8046EF3BC0FE928142 ___u43; + unsigned int DataFlagsPad[1]; + unsigned __int64 TestRetInstruction; + __int64 QpcFrequency; + unsigned int SystemCall; + unsigned int SystemCallPad0; + unsigned __int64 SystemCallPad[2]; + $5031D289C483414B89DA3F368D1FE62C ___u50; + unsigned int TickCountPad[1]; + unsigned int Cookie; + unsigned int CookiePad[1]; + __int64 ConsoleSessionForegroundProcessId; + unsigned __int64 TimeUpdateLock; + unsigned __int64 BaselineSystemTimeQpc; + unsigned __int64 BaselineInterruptTimeQpc; + unsigned __int64 QpcSystemTimeIncrement; + unsigned __int64 QpcInterruptTimeIncrement; + unsigned __int8 QpcSystemTimeIncrementShift; + unsigned __int8 QpcInterruptTimeIncrementShift; + unsigned __int16 UnparkedProcessorCount; + unsigned int EnclaveFeatureMask[4]; + unsigned int TelemetryCoverageRound; + unsigned __int16 UserModeGlobalLogger[16]; + unsigned int ImageFileExecutionOptions; + unsigned int LangGenerationCount; + unsigned __int64 
Reserved4; + volatile unsigned __int64 InterruptTimeBias; + volatile unsigned __int64 QpcBias; + unsigned int ActiveProcessorCount; + volatile unsigned __int8 ActiveGroupCount; + unsigned __int8 Reserved9; + $3C927F8BB7EAEE13CF0CFC3E60EDC8A9 ___u74; + _LARGE_INTEGER TimeZoneBiasEffectiveStart; + _LARGE_INTEGER TimeZoneBiasEffectiveEnd; + _XSTATE_CONFIGURATION XState; +}; +static constexpr unsigned KUSER_SIZE = sizeof(_KUSER_SHARED_DATA); + +static_assert(KUSER_SIZE == 1808, "_KUSER_SHARED_DATA does not match the expected 1808-byte layout"); +#pragma pack(pop) + +static _KUSER_SHARED_DATA* KUserShared() { + return (_KUSER_SHARED_DATA*)0x7FFE0000; // fixed user-mode address of the shared page +} #endif // XENIA_BASE_PLATFORM_WIN_H_ diff --git a/src/xenia/base/threading.h b/src/xenia/base/threading.h index 28d9a780e..67297716b 100644 --- a/src/xenia/base/threading.h +++ b/src/xenia/base/threading.h @@ -148,6 +148,7 @@ bool SetTlsValue(TlsHandle handle, uintptr_t value); // be kept short or else all timers will be impacted. This is a simplified // wrapper around QueueTimerRecurring which automatically cancels the timer on // destruction. 
+//only used by XboxkrnlModule::XboxkrnlModule class HighResolutionTimer { HighResolutionTimer(std::chrono::milliseconds interval, std::function callback) { diff --git a/src/xenia/base/threading_timer_queue.cc b/src/xenia/base/threading_timer_queue.cc index 8e19b50dd..e79d86f4e 100644 --- a/src/xenia/base/threading_timer_queue.cc +++ b/src/xenia/base/threading_timer_queue.cc @@ -205,7 +205,7 @@ void TimerQueueWaitItem::Disarm() { spinner.spin_once(); } } - +//unused std::weak_ptr QueueTimerOnce(std::function callback, void* userdata, WaitItem::clock::time_point due) { @@ -213,7 +213,7 @@ std::weak_ptr QueueTimerOnce(std::function callback, std::make_shared(std::move(callback), userdata, &timer_queue_, due, WaitItem::clock::duration::zero())); } - +// only used by HighResolutionTimer std::weak_ptr QueueTimerRecurring( std::function callback, void* userdata, WaitItem::clock::time_point due, WaitItem::clock::duration interval) { diff --git a/src/xenia/gpu/d3d12/deferred_command_list.cc b/src/xenia/gpu/d3d12/deferred_command_list.cc index 581d1b71a..c27c8b226 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.cc +++ b/src/xenia/gpu/d3d12/deferred_command_list.cc @@ -31,8 +31,8 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES - const uintmax_t* stream = command_stream_.data(); - size_t stream_remaining = command_stream_.size(); + const uintmax_t* stream = (const uintmax_t*)command_stream_.data(); + size_t stream_remaining = command_stream_.size() / sizeof(uintmax_t); ID3D12PipelineState* current_pipeline_state = nullptr; while (stream_remaining != 0) { const CommandHeader& header = @@ -266,8 +266,12 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, void* DeferredCommandList::WriteCommand(Command command, size_t arguments_size_bytes) { + size_t arguments_size_elements = - (arguments_size_bytes + 
sizeof(uintmax_t) - 1) / sizeof(uintmax_t); + round_up(arguments_size_bytes, sizeof(uintmax_t), false); + + //(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t); + #if 0 size_t offset = command_stream_.size(); command_stream_.resize(offset + kCommandHeaderSizeElements + arguments_size_elements); @@ -276,6 +280,19 @@ void* DeferredCommandList::WriteCommand(Command command, header.command = command; header.arguments_size_elements = uint32_t(arguments_size_elements); return command_stream_.data() + (offset + kCommandHeaderSizeElements); + #else + + size_t offset = command_stream_.size(); + constexpr size_t kCommandHeaderSizeBytes = + kCommandHeaderSizeElements * sizeof(uintmax_t); + command_stream_.resize(offset + kCommandHeaderSizeBytes + + arguments_size_elements); + CommandHeader& header = + *reinterpret_cast(command_stream_.data() + offset); + header.command = command; + header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t); + return command_stream_.data() + (offset + kCommandHeaderSizeBytes); + #endif } } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/deferred_command_list.h b/src/xenia/gpu/d3d12/deferred_command_list.h index a1b063558..925956a8a 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.h +++ b/src/xenia/gpu/d3d12/deferred_command_list.h @@ -19,7 +19,7 @@ #include "xenia/base/literals.h" #include "xenia/base/math.h" #include "xenia/ui/d3d12/d3d12_api.h" - +#include "xenia/base/memory.h" namespace xe { namespace gpu { namespace d3d12 { @@ -30,11 +30,12 @@ class D3D12CommandProcessor; class DeferredCommandList { public: + static constexpr size_t MAX_SIZEOF_COMMANDLIST = 65536 * 128; //around 8 mb /* chrispy: upped from 1_MiB to 4_MiB, m:durandal hits frequent resizes in large open maps */ DeferredCommandList(const D3D12CommandProcessor& command_processor, - size_t initial_size_bytes = 4_MiB); + size_t initial_size_bytes = MAX_SIZEOF_COMMANDLIST); void Reset(); void 
Execute(ID3D12GraphicsCommandList* command_list, @@ -565,7 +566,8 @@ class DeferredCommandList { const D3D12CommandProcessor& command_processor_; // uintmax_t to ensure uint64_t and pointer alignment of all structures. - std::vector command_stream_; + //std::vector command_stream_; + fixed_vmem_vector command_stream_; }; } // namespace d3d12