diff --git a/src/xenia/base/clock.cc b/src/xenia/base/clock.cc index 058eae43a..5f4905dda 100644 --- a/src/xenia/base/clock.cc +++ b/src/xenia/base/clock.cc @@ -15,6 +15,13 @@ #include "xenia/base/assert.h" #include "xenia/base/math.h" +#include "xenia/base/mutex.h" + +#if defined(_WIN32) + +#include "xenia/base/platform_win.h" + +#endif DEFINE_bool(clock_no_scaling, false, "Disable scaling code. Time management and locking is bypassed. " @@ -42,8 +49,19 @@ std::pair guest_tick_ratio_ = std::make_pair(1, 1); uint64_t last_guest_tick_count_ = 0; // Last sampled host tick count. uint64_t last_host_tick_count_ = Clock::QueryHostTickCount(); + +struct null_lock { + public: + static void lock() {} + static void unlock() {} + static bool try_lock() { return true; } +}; + +using tick_mutex_type = std::mutex; // not null_lock: a no-op lock leaves the tick state unsynchronized + // Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync -std::mutex tick_mutex_; +// std::mutex tick_mutex_; +static tick_mutex_type tick_mutex_; void RecomputeGuestTickScalar() { // Create a rational number with numerator (first) and denominator (second) @@ -61,7 +79,7 @@ void RecomputeGuestTickScalar() { // Keep this a rational calculation and reduce the fraction reduce_fraction(frac); - std::lock_guard lock(tick_mutex_); + std::lock_guard lock(tick_mutex_); guest_tick_ratio_ = frac; } @@ -75,7 +93,7 @@ uint64_t UpdateGuestClock() { return host_tick_count * guest_tick_ratio_.first / guest_tick_ratio_.second; } - std::unique_lock lock(tick_mutex_, std::defer_lock); + std::unique_lock lock(tick_mutex_, std::defer_lock); if (lock.try_lock()) { // Translate host tick count to guest tick count. 
uint64_t host_tick_delta = host_tick_count > last_host_tick_count_ @@ -107,7 +125,6 @@ inline uint64_t QueryGuestSystemTimeOffset() { return guest_tick_count * numerator / denominator; } - uint64_t Clock::QueryHostTickFrequency() { #if XE_CLOCK_RAW_AVAILABLE if (cvars::clock_source_raw) { @@ -137,7 +154,7 @@ void Clock::set_guest_time_scalar(double scalar) { } std::pair Clock::guest_tick_ratio() { - std::lock_guard lock(tick_mutex_); + std::lock_guard lock(tick_mutex_); return guest_tick_ratio_; } diff --git a/src/xenia/base/clock.h b/src/xenia/base/clock.h index 67a3ebb67..81894ca97 100644 --- a/src/xenia/base/clock.h +++ b/src/xenia/base/clock.h @@ -33,11 +33,15 @@ class Clock { // Either from platform suplied time source or from hardware directly. static uint64_t host_tick_frequency_platform(); #if XE_CLOCK_RAW_AVAILABLE + XE_NOINLINE static uint64_t host_tick_frequency_raw(); #endif // Host tick count. Generally QueryHostTickCount() should be used. static uint64_t host_tick_count_platform(); #if XE_CLOCK_RAW_AVAILABLE + //chrispy: the way msvc was ordering the branches was causing rdtsc to be speculatively executed each time + //the branch history was lost + XE_NOINLINE static uint64_t host_tick_count_raw(); #endif diff --git a/src/xenia/base/clock_win.cc b/src/xenia/base/clock_win.cc index 4a0c8aeb5..e087aa946 100644 --- a/src/xenia/base/clock_win.cc +++ b/src/xenia/base/clock_win.cc @@ -12,7 +12,18 @@ #include "xenia/base/platform_win.h" namespace xe { + #if XE_USE_KUSER_SHARED==1 +uint64_t Clock::host_tick_frequency_platform() { return 10000000ULL; } +uint64_t Clock::host_tick_count_platform() { + return *reinterpret_cast(&KUserShared()->SystemTime); +} +uint64_t Clock::QueryHostSystemTime() { + return *reinterpret_cast(&KUserShared()->SystemTime); +} + + + #else uint64_t Clock::host_tick_frequency_platform() { LARGE_INTEGER frequency; QueryPerformanceFrequency(&frequency); @@ -27,7 +38,6 @@ uint64_t Clock::host_tick_count_platform() { } return time; } - 
uint64_t Clock::QueryHostSystemTime() { FILETIME t; GetSystemTimeAsFileTime(&t); @@ -37,5 +47,6 @@ uint64_t Clock::QueryHostSystemTime() { +#endif uint64_t Clock::QueryHostUptimeMillis() { return host_tick_count_platform() * 1000 / host_tick_frequency_platform(); } } // namespace xe diff --git a/src/xenia/base/clock_x64.cc b/src/xenia/base/clock_x64.cc index 14155303a..b07df79d2 100644 --- a/src/xenia/base/clock_x64.cc +++ b/src/xenia/base/clock_x64.cc @@ -41,10 +41,14 @@ "\n" \ "Set the cvar 'clock_source_raw' to 'false'."); + + + namespace xe { // Getting the TSC frequency can be a bit tricky. This method here only works on // Intel as it seems. There is no easy way to get the frequency outside of ring0 // on AMD, so we fail gracefully if not possible. +XE_NOINLINE uint64_t Clock::host_tick_frequency_raw() { uint32_t eax, ebx, ecx, edx; @@ -71,6 +75,8 @@ uint64_t Clock::host_tick_frequency_raw() { return 0; } + + if (max_cpuid >= 0x15) { // 15H Get TSC/Crystal ratio and Crystal Hz. 
xe_cpu_cpuid(0x15, eax, ebx, ecx, edx); @@ -92,10 +98,11 @@ uint64_t Clock::host_tick_frequency_raw() { return cpu_base_freq; } + CLOCK_FATAL("The clock frequency could not be determined."); return 0; } - +XE_NOINLINE uint64_t Clock::host_tick_count_raw() { return xe_cpu_rdtsc(); } } // namespace xe diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index 889cf03ed..4cafc7178 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -376,6 +376,29 @@ template int64_t m128_i64(const __m128& v) { return m128_i64(_mm_castps_pd(v)); } +/* + + std::min/max float has handling for nans, where if either argument is nan the first argument is returned + + minss/maxss are different, if either argument is nan the second operand to the instruction is returned + this is problematic because we have no assurances from the compiler on the argument ordering + + so only use in places where nan handling is not needed +*/ +static float xe_minf(float x, float y) { + return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y))); +} +static float xe_maxf(float x, float y) { + return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y))); +} +static float xe_rcpf(float den) { + return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den))); +} + +#else +static float xe_minf(float x, float y) { return std::min(x, y); } +static float xe_maxf(float x, float y) { return std::max(x, y); } +static float xe_rcpf(float den) { return 1.0f / den; } #endif // Similar to the C++ implementation of XMConvertFloatToHalf and diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h index 01cf40f87..979b390ba 100644 --- a/src/xenia/base/memory.h +++ b/src/xenia/base/memory.h @@ -478,12 +478,13 @@ class fixed_vmem_vector { public: fixed_vmem_vector() - : data_((uint8_t*)AllocFixed(nullptr, sz, AllocationType::kReserveCommit, - PageAccess::kReadWrite)), + : data_((uint8_t*)memory::AllocFixed( + nullptr, sz, memory::AllocationType::kReserveCommit, + memory::PageAccess::kReadWrite)), nbytes_(0) {} 
~fixed_vmem_vector() { if (data_) { - DeallocFixed(data_, sz, DeallocationType::kRelease); + memory::DeallocFixed(data_, sz, memory::DeallocationType::kRelease); data_ = nullptr; } nbytes_ = 0; diff --git a/src/xenia/base/platform_win.h b/src/xenia/base/platform_win.h index 3013a9c14..a9ea263d6 100644 --- a/src/xenia/base/platform_win.h +++ b/src/xenia/base/platform_win.h @@ -34,31 +34,169 @@ #undef DeleteFile #undef GetFirstChild -#define XE_USE_NTDLL_FUNCTIONS 1 -#if XE_USE_NTDLL_FUNCTIONS==1 +#define XE_USE_NTDLL_FUNCTIONS 1 +#define XE_USE_KUSER_SHARED 1 +#if XE_USE_NTDLL_FUNCTIONS == 1 /* - ntdll versions of functions often skip through a lot of extra garbage in KernelBase + ntdll versions of functions often skip through a lot of extra garbage in + KernelBase */ -#define XE_NTDLL_IMPORT(name, cls, clsvar) \ - static class cls { \ - public: \ - FARPROC fn;\ - cls() : fn(nullptr) {\ - auto ntdll = GetModuleHandleA("ntdll.dll");\ - if (ntdll) { \ - fn = GetProcAddress(ntdll, #name );\ - }\ - } \ - template \ - inline TRet invoke(TArgs... args) {\ - return reinterpret_cast(fn)(args...);\ - }\ - inline operator bool() const {\ - return fn!=nullptr;\ - }\ +#define XE_NTDLL_IMPORT(name, cls, clsvar) \ + static class cls { \ + public: \ + FARPROC fn; \ + cls() : fn(nullptr) { \ + auto ntdll = GetModuleHandleA("ntdll.dll"); \ + if (ntdll) { \ + fn = GetProcAddress(ntdll, #name); \ + } \ + } \ + template \ + inline TRet invoke(TArgs... 
args) { \ + return reinterpret_cast(fn)(args...); \ + } \ + inline operator bool() const { return fn != nullptr; } \ } clsvar #else #define XE_NTDLL_IMPORT(name, cls, clsvar) static constexpr bool clsvar = false #endif + +// KUSER_SHARED +struct __declspec(align(4)) _KSYSTEM_TIME { + unsigned int LowPart; + int High1Time; + int High2Time; +}; +enum _NT_PRODUCT_TYPE { + NtProductWinNt = 0x1, + NtProductLanManNt = 0x2, + NtProductServer = 0x3, +}; +enum _ALTERNATIVE_ARCHITECTURE_TYPE { + StandardDesign = 0x0, + NEC98x86 = 0x1, + EndAlternatives = 0x2, +}; + +#pragma pack(push, 1) +struct $3D940D5D03EF7F98CEE6737EDE752E57 { + __int8 _bf_0; +}; + +union $DA7A7E727E24E4DD62317E27558CCADA { + unsigned __int8 MitigationPolicies; + $3D940D5D03EF7F98CEE6737EDE752E57 __s1; +}; +struct __declspec(align(4)) $4BF4056B39611650D41923F164DAFA52 { + __int32 _bf_0; +}; + +union __declspec(align(4)) $BB68545E345A5F8046EF3BC0FE928142 { + unsigned int SharedDataFlags; + $4BF4056B39611650D41923F164DAFA52 __s1; +}; +union $5031D289C483414B89DA3F368D1FE62C { + volatile _KSYSTEM_TIME TickCount; + volatile unsigned __int64 TickCountQuad; + unsigned int ReservedTickCountOverlay[3]; +}; +struct $F91ACE6F13277DFC9425B9B8BBCB30F7 { + volatile unsigned __int8 QpcBypassEnabled; + unsigned __int8 QpcShift; +}; + +union __declspec(align(2)) $3C927F8BB7EAEE13CF0CFC3E60EDC8A9 { + unsigned __int16 QpcData; + $F91ACE6F13277DFC9425B9B8BBCB30F7 __s1; +}; + +struct __declspec(align(8)) _KUSER_SHARED_DATA { + unsigned int TickCountLowDeprecated; + unsigned int TickCountMultiplier; + volatile _KSYSTEM_TIME InterruptTime; + volatile _KSYSTEM_TIME SystemTime; + volatile _KSYSTEM_TIME TimeZoneBias; + unsigned __int16 ImageNumberLow; + unsigned __int16 ImageNumberHigh; + wchar_t NtSystemRoot[260]; + unsigned int MaxStackTraceDepth; + unsigned int CryptoExponent; + unsigned int TimeZoneId; + unsigned int LargePageMinimum; + unsigned int AitSamplingValue; + unsigned int AppCompatFlag; + unsigned __int64 
RNGSeedVersion; + unsigned int GlobalValidationRunlevel; + volatile int TimeZoneBiasStamp; + unsigned int NtBuildNumber; + _NT_PRODUCT_TYPE NtProductType; + unsigned __int8 ProductTypeIsValid; + unsigned __int8 Reserved0[1]; + unsigned __int16 NativeProcessorArchitecture; + unsigned int NtMajorVersion; + unsigned int NtMinorVersion; + unsigned __int8 ProcessorFeatures[64]; + unsigned int Reserved1; + unsigned int Reserved3; + volatile unsigned int TimeSlip; + _ALTERNATIVE_ARCHITECTURE_TYPE AlternativeArchitecture; + unsigned int BootId; + _LARGE_INTEGER SystemExpirationDate; + unsigned int SuiteMask; + unsigned __int8 KdDebuggerEnabled; + $DA7A7E727E24E4DD62317E27558CCADA ___u33; + unsigned __int8 Reserved6[2]; + volatile unsigned int ActiveConsoleId; + volatile unsigned int DismountCount; + unsigned int ComPlusPackage; + unsigned int LastSystemRITEventTickCount; + unsigned int NumberOfPhysicalPages; + unsigned __int8 SafeBootMode; + unsigned __int8 VirtualizationFlags; + unsigned __int8 Reserved12[2]; + $BB68545E345A5F8046EF3BC0FE928142 ___u43; + unsigned int DataFlagsPad[1]; + unsigned __int64 TestRetInstruction; + __int64 QpcFrequency; + unsigned int SystemCall; + unsigned int SystemCallPad0; + unsigned __int64 SystemCallPad[2]; + $5031D289C483414B89DA3F368D1FE62C ___u50; + unsigned int TickCountPad[1]; + unsigned int Cookie; + unsigned int CookiePad[1]; + __int64 ConsoleSessionForegroundProcessId; + unsigned __int64 TimeUpdateLock; + unsigned __int64 BaselineSystemTimeQpc; + unsigned __int64 BaselineInterruptTimeQpc; + unsigned __int64 QpcSystemTimeIncrement; + unsigned __int64 QpcInterruptTimeIncrement; + unsigned __int8 QpcSystemTimeIncrementShift; + unsigned __int8 QpcInterruptTimeIncrementShift; + unsigned __int16 UnparkedProcessorCount; + unsigned int EnclaveFeatureMask[4]; + unsigned int TelemetryCoverageRound; + unsigned __int16 UserModeGlobalLogger[16]; + unsigned int ImageFileExecutionOptions; + unsigned int LangGenerationCount; + unsigned __int64 
Reserved4; + volatile unsigned __int64 InterruptTimeBias; + volatile unsigned __int64 QpcBias; + unsigned int ActiveProcessorCount; + volatile unsigned __int8 ActiveGroupCount; + unsigned __int8 Reserved9; + $3C927F8BB7EAEE13CF0CFC3E60EDC8A9 ___u74; + _LARGE_INTEGER TimeZoneBiasEffectiveStart; + _LARGE_INTEGER TimeZoneBiasEffectiveEnd; + _XSTATE_CONFIGURATION XState; +}; +static constexpr unsigned KUSER_SIZE = sizeof(_KUSER_SHARED_DATA); + +static_assert(KUSER_SIZE == 1808, "yay"); +#pragma pack(pop) + +static _KUSER_SHARED_DATA* KUserShared() { + return (_KUSER_SHARED_DATA*)0x7FFE0000; +} #endif // XENIA_BASE_PLATFORM_WIN_H_ diff --git a/src/xenia/base/threading.h b/src/xenia/base/threading.h index 28d9a780e..67297716b 100644 --- a/src/xenia/base/threading.h +++ b/src/xenia/base/threading.h @@ -148,6 +148,7 @@ bool SetTlsValue(TlsHandle handle, uintptr_t value); // be kept short or else all timers will be impacted. This is a simplified // wrapper around QueueTimerRecurring which automatically cancels the timer on // destruction. 
+//only used by XboxkrnlModule::XboxkrnlModule class HighResolutionTimer { HighResolutionTimer(std::chrono::milliseconds interval, std::function callback) { diff --git a/src/xenia/base/threading_timer_queue.cc b/src/xenia/base/threading_timer_queue.cc index 8e19b50dd..e79d86f4e 100644 --- a/src/xenia/base/threading_timer_queue.cc +++ b/src/xenia/base/threading_timer_queue.cc @@ -205,7 +205,7 @@ void TimerQueueWaitItem::Disarm() { spinner.spin_once(); } } - +//unused std::weak_ptr QueueTimerOnce(std::function callback, void* userdata, WaitItem::clock::time_point due) { @@ -213,7 +213,7 @@ std::weak_ptr QueueTimerOnce(std::function callback, std::make_shared(std::move(callback), userdata, &timer_queue_, due, WaitItem::clock::duration::zero())); } - +// only used by HighResolutionTimer std::weak_ptr QueueTimerRecurring( std::function callback, void* userdata, WaitItem::clock::time_point due, WaitItem::clock::duration interval) { diff --git a/src/xenia/gpu/d3d12/deferred_command_list.cc b/src/xenia/gpu/d3d12/deferred_command_list.cc index 581d1b71a..c27c8b226 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.cc +++ b/src/xenia/gpu/d3d12/deferred_command_list.cc @@ -31,8 +31,8 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES - const uintmax_t* stream = command_stream_.data(); - size_t stream_remaining = command_stream_.size(); + const uintmax_t* stream = (const uintmax_t*)command_stream_.data(); + size_t stream_remaining = command_stream_.size() / sizeof(uintmax_t); ID3D12PipelineState* current_pipeline_state = nullptr; while (stream_remaining != 0) { const CommandHeader& header = @@ -266,8 +266,12 @@ void DeferredCommandList::Execute(ID3D12GraphicsCommandList* command_list, void* DeferredCommandList::WriteCommand(Command command, size_t arguments_size_bytes) { + size_t arguments_size_elements = - (arguments_size_bytes + 
sizeof(uintmax_t) - 1) / sizeof(uintmax_t); + round_up(arguments_size_bytes, sizeof(uintmax_t), false); + + //(arguments_size_bytes + sizeof(uintmax_t) - 1) / sizeof(uintmax_t); + #if 0 size_t offset = command_stream_.size(); command_stream_.resize(offset + kCommandHeaderSizeElements + arguments_size_elements); @@ -276,6 +280,19 @@ void* DeferredCommandList::WriteCommand(Command command, header.command = command; header.arguments_size_elements = uint32_t(arguments_size_elements); return command_stream_.data() + (offset + kCommandHeaderSizeElements); + #else + + size_t offset = command_stream_.size(); + constexpr size_t kCommandHeaderSizeBytes = + kCommandHeaderSizeElements * sizeof(uintmax_t); + command_stream_.resize(offset + kCommandHeaderSizeBytes + + arguments_size_elements); + CommandHeader& header = + *reinterpret_cast(command_stream_.data() + offset); + header.command = command; + header.arguments_size_elements = uint32_t(arguments_size_elements) / sizeof(uintmax_t); + return command_stream_.data() + (offset + kCommandHeaderSizeBytes); + #endif } } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/deferred_command_list.h b/src/xenia/gpu/d3d12/deferred_command_list.h index a1b063558..925956a8a 100644 --- a/src/xenia/gpu/d3d12/deferred_command_list.h +++ b/src/xenia/gpu/d3d12/deferred_command_list.h @@ -19,7 +19,7 @@ #include "xenia/base/literals.h" #include "xenia/base/math.h" #include "xenia/ui/d3d12/d3d12_api.h" - +#include "xenia/base/memory.h" namespace xe { namespace gpu { namespace d3d12 { @@ -30,11 +30,12 @@ class D3D12CommandProcessor; class DeferredCommandList { public: + static constexpr size_t MAX_SIZEOF_COMMANDLIST = 65536 * 128; //around 8 mb /* chrispy: upped from 1_MiB to 4_MiB, m:durandal hits frequent resizes in large open maps */ DeferredCommandList(const D3D12CommandProcessor& command_processor, - size_t initial_size_bytes = 4_MiB); + size_t initial_size_bytes = MAX_SIZEOF_COMMANDLIST); void Reset(); void 
Execute(ID3D12GraphicsCommandList* command_list, @@ -565,7 +566,8 @@ class DeferredCommandList { const D3D12CommandProcessor& command_processor_; // uintmax_t to ensure uint64_t and pointer alignment of all structures. - std::vector command_stream_; + //std::vector command_stream_; + fixed_vmem_vector command_stream_; }; } // namespace d3d12