From efbeae660cd78601bc6c8f74feecec8b881d7f1a Mon Sep 17 00:00:00 2001 From: "chss95cs@gmail.com" Date: Sat, 15 Oct 2022 03:07:07 -0700 Subject: [PATCH] Drastically reduce cpu time wasted by XMADecoderThread spinning, went from 13% of all cpu time to about 0.6% in my tests Commented out lock in WatchMemoryRange, lock is always held by caller properly set the value/check the irql for spinlocks in xboxkrnl_threading --- src/xenia/apu/xma_decoder.cc | 11 +-- .../gpu/d3d12/d3d12_command_processor.cc | 4 +- src/xenia/gpu/d3d12/d3d12_command_processor.h | 6 +- src/xenia/gpu/shared_memory.cc | 4 +- .../kernel/xboxkrnl/xboxkrnl_threading.cc | 77 +++++++++++++------ src/xenia/kernel/xthread.h | 2 +- 6 files changed, 62 insertions(+), 42 deletions(-) diff --git a/src/xenia/apu/xma_decoder.cc b/src/xenia/apu/xma_decoder.cc index 43b82ea73..eac5d3d53 100644 --- a/src/xenia/apu/xma_decoder.cc +++ b/src/xenia/apu/xma_decoder.cc @@ -177,14 +177,7 @@ void XmaDecoder::WorkerThreadMain() { } else { idle_loop_count = 0; } - - if (idle_loop_count > 500) { - // Idle for an extended period. Introduce a 20ms wait. - xe::threading::Wait(work_event_.get(), false, - std::chrono::milliseconds(20)); - } - - xe::threading::MaybeYield(); + xe::threading::Wait(work_event_.get(), false); } } @@ -316,7 +309,7 @@ void XmaDecoder::WriteRegister(uint32_t addr, uint32_t value) { } } // Signal the decoder thread to start processing. - work_event_->Set(); + work_event_->SetBoostPriority(); } else if (r >= XmaRegister::Context0Lock && r <= XmaRegister::Context9Lock) { // Context lock command. // This requests a lock by flagging the context. 
diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 584917e45..f5d4bf002 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -4357,7 +4357,7 @@ bool D3D12CommandProcessor::UpdateBindings( uint32_t float_constant_index; while (xe::bit_scan_forward(float_constant_map_entry, &float_constant_index)) { - float_constant_map_entry &= ~(1ull << float_constant_index); + float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry); std::memcpy(float_constants, &regs[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) + (float_constant_index << 2)] @@ -4388,7 +4388,7 @@ bool D3D12CommandProcessor::UpdateBindings( uint32_t float_constant_index; while (xe::bit_scan_forward(float_constant_map_entry, &float_constant_index)) { - float_constant_map_entry = xe::clear_lowest_bit(float_constant_map_entry); std::memcpy(float_constants, &regs[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) + (float_constant_index << 2)] diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 9412116ac..53a23add8 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -680,9 +680,6 @@ class D3D12CommandProcessor final : public CommandProcessor { ID3D12Resource* readback_buffer_ = nullptr; uint32_t readback_buffer_size_ = 0; - std::atomic<bool> pix_capture_requested_ = false; - bool pix_capturing_; - // The current fixed-function drawing state. 
D3D12_VIEWPORT ff_viewport_; D3D12_RECT ff_scissor_; @@ -776,6 +773,9 @@ class D3D12CommandProcessor final : public CommandProcessor { // scratch memexport data MemExportRange memexport_ranges_[512]; uint32_t memexport_range_count_ = 0; + + std::atomic<bool> pix_capture_requested_ = false; + bool pix_capturing_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc index c15da8a9b..b891b5f38 100644 --- a/src/xenia/gpu/shared_memory.cc +++ b/src/xenia/gpu/shared_memory.cc @@ -150,8 +150,8 @@ SharedMemory::WatchHandle SharedMemory::WatchMemoryRange( watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2; uint32_t bucket_last = watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2; - - auto global_lock = global_critical_region_.Acquire(); + //chrispy: Not required the global lock is always held by the caller + // auto global_lock = global_critical_region_.Acquire(); // Allocate the range. WatchRange* range = watch_range_first_free_; diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index 95b26dfb3..8d0283744 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -957,13 +957,14 @@ static void PrefetchForCAS(const void* value) { } } -uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) { +uint32_t xeKeKfAcquireSpinLock(uint32_t* lock, uint64_t r13 = 1) { // XELOGD( // "KfAcquireSpinLock({:08X})", // lock_ptr); PrefetchForCAS(lock); + assert_true(*lock != static_cast<uint32_t>(r13)); // Lock. - while (!xe::atomic_cas(0, 1, lock)) { + while (!xe::atomic_cas(0, static_cast<uint32_t>(r13), lock)) { // Spin! // TODO(benvanik): error on deadlock? 
xe::threading::MaybeYield(); @@ -976,34 +977,51 @@ uint32_t xeKeKfAcquireSpinLock(uint32_t* lock) { return old_irql; } -dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr) { +dword_result_t KfAcquireSpinLock_entry(lpdword_t lock_ptr, + ppc_context_t& ppc_context) { auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); - return xeKeKfAcquireSpinLock(lock); + return xeKeKfAcquireSpinLock(lock, ppc_context->r[13]); } DECLARE_XBOXKRNL_EXPORT3(KfAcquireSpinLock, kThreading, kImplemented, kBlocking, kHighFrequency); void xeKeKfReleaseSpinLock(uint32_t* lock, dword_t old_irql) { + // Unlock. + *lock = 0; + if (old_irql >= 2) { + return; + } // Restore IRQL. XThread* thread = XThread::GetCurrentThread(); thread->LowerIrql(old_irql); - - // Unlock. - xe::atomic_dec(lock); } -void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql) { +void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql, + ppc_context_t& ppc_ctx) { auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); - xeKeKfReleaseSpinLock(lock, old_irql); + + assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13])); + + *lock_ptr = 0; + if (old_irql >= 2) { + return; + } + // Restore IRQL. + XThread* thread = XThread::GetCurrentThread(); + thread->LowerIrql(old_irql); } DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented, kHighFrequency); // todo: this is not accurate -void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { +void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr, + ppc_context_t& ppc_ctx) { // Lock. auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); + // must not be our own thread + assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13])); + PrefetchForCAS(lock); - while (!xe::atomic_cas(0, 1, lock)) { + while (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) { #if XE_ARCH_AMD64 == 1 // todo: this is just a nop if they don't have SMT, which is not great // either... 
@@ -1017,11 +1035,13 @@ void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { DECLARE_XBOXKRNL_EXPORT3(KeAcquireSpinLockAtRaisedIrql, kThreading, kImplemented, kBlocking, kHighFrequency); -dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { +dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry( + lpdword_t lock_ptr, ppc_context_t& ppc_ctx) { // Lock. auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); + assert_true(*lock_ptr != static_cast<uint32_t>(ppc_ctx->r[13])); PrefetchForCAS(lock); - if (!xe::atomic_cas(0, 1, lock)) { + if (!xe::atomic_cas(0, static_cast<uint32_t>(ppc_ctx->r[13]), lock)) { return 0; } return 1; @@ -1029,10 +1049,12 @@ dword_result_t KeTryToAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) { DECLARE_XBOXKRNL_EXPORT4(KeTryToAcquireSpinLockAtRaisedIrql, kThreading, kImplemented, kBlocking, kHighFrequency, kSketchy); -void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr) { +void KeReleaseSpinLockFromRaisedIrql_entry(lpdword_t lock_ptr, + ppc_context_t& ppc_ctx) { // Unlock. 
+ assert_true(*lock_ptr == static_cast<uint32_t>(ppc_ctx->r[13])); auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address()); - xe::atomic_dec(lock); + *lock_ptr = 0; } DECLARE_XBOXKRNL_EXPORT2(KeReleaseSpinLockFromRaisedIrql, kThreading, kImplemented, kHighFrequency); @@ -1261,8 +1283,8 @@ void ExInitializeReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr) { } DECLARE_XBOXKRNL_EXPORT1(ExInitializeReadWriteLock, kThreading, kImplemented); -void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock); +void ExAcquireReadWriteLockExclusive_entry(pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) { + auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); int32_t lock_count = ++lock_ptr->lock_count; if (!lock_count) { @@ -1279,8 +1301,9 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockExclusive, kThreading, kImplemented, kBlocking); dword_result_t ExTryToAcquireReadWriteLockExclusive_entry( - pointer_t<X_ERWLOCK> lock_ptr) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock); + pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) { + auto old_irql = + xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); uint32_t result; if (lock_ptr->lock_count < 0) { @@ -1296,8 +1319,9 @@ dword_result_t ExTryToAcquireReadWriteLockExclusive_entry( DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockExclusive, kThreading, kImplemented); -void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock); +void ExAcquireReadWriteLockShared_entry(pointer_t<X_ERWLOCK> lock_ptr, + ppc_context_t& ppc_context) { + auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); int32_t lock_count = ++lock_ptr->lock_count; if (!lock_count || @@ -1316,8 +1340,9 @@ DECLARE_XBOXKRNL_EXPORT2(ExAcquireReadWriteLockShared, kThreading, kImplemented, kBlocking); dword_result_t ExTryToAcquireReadWriteLockShared_entry( - pointer_t<X_ERWLOCK> lock_ptr) { - auto old_irql = 
xeKeKfAcquireSpinLock(&lock_ptr->spin_lock); + pointer_t<X_ERWLOCK> lock_ptr, ppc_context_t& ppc_context) { + auto old_irql = + xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); uint32_t result; if (lock_ptr->lock_count < 0 || @@ -1335,8 +1360,10 @@ dword_result_t ExTryToAcquireReadWriteLockShared_entry( DECLARE_XBOXKRNL_EXPORT1(ExTryToAcquireReadWriteLockShared, kThreading, kImplemented); -void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr) { - auto old_irql = xeKeKfAcquireSpinLock(&lock_ptr->spin_lock); +void ExReleaseReadWriteLock_entry(pointer_t<X_ERWLOCK> lock_ptr, + ppc_context_t& ppc_context) { + auto old_irql = + xeKeKfAcquireSpinLock(&lock_ptr->spin_lock, ppc_context->r[13]); int32_t lock_count = --lock_ptr->lock_count; diff --git a/src/xenia/kernel/xthread.h b/src/xenia/kernel/xthread.h index 9eef807b2..35af2bc12 100644 --- a/src/xenia/kernel/xthread.h +++ b/src/xenia/kernel/xthread.h @@ -100,7 +100,7 @@ struct X_KTHREAD { uint8_t unk_58[0x4]; // 0x58 xe::be<uint32_t> stack_base; // 0x5C xe::be<uint32_t> stack_limit; // 0x60 - uint8_t unk_64[0x4]; // 0x64 + xe::be<uint32_t> stack_kernel; // 0x64 xe::be<uint32_t> tls_address; // 0x68 uint8_t unk_6C; // 0x6C uint8_t unk_6D[0x7]; // 0x6D