From 0f85e9123e781b6ec7fa78fa1c54b1de40179a23 Mon Sep 17 00:00:00 2001 From: Malcolm Date: Thu, 15 Jan 2026 04:20:25 +0000 Subject: [PATCH] utils: Scale busy_wait according to arm timer frequency > - This is a fatal issue that was impacting ARM builds, since busy_waits were written assuming an approx. 3 GHz x86 machine > and most ARM machines have a hardware timer that runs south of 100 MHz, meaning the top items in the profiler were calls to busy_wait() > all over the code. Fixing this gives a very significant speedup: on my Snapdragon 8 Gen 2 device, 27->37 FPS in Metal Gear Rising, but almost > all games benefit when run on ARM. --- rpcs3/rpcs3.cpp | 5 +++++ rpcs3/util/asm.hpp | 29 +++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/rpcs3/rpcs3.cpp b/rpcs3/rpcs3.cpp index 007de26bdb..6c2130eca6 100644 --- a/rpcs3/rpcs3.cpp +++ b/rpcs3/rpcs3.cpp @@ -26,6 +26,7 @@ #include "Utilities/sema.h" #include "Utilities/date_time.h" #include "util/console.h" +#include "util/asm.hpp" #include "Crypto/decrypt_binaries.h" #ifdef _WIN32 #include "module_verifier.hpp" @@ -681,6 +682,10 @@ int run_rpcs3(int argc, char** argv) logs::set_init({std::move(ver), std::move(sys), std::move(os), std::move(qt), std::move(time)}); } +#ifdef ARCH_ARM64 + utils::init_arm_timer_scale(); +#endif + #ifdef _WIN32 sys_log.notice("Initialization times before main(): %fGc", intro_cycles / 1000000000.); #elif defined(RUSAGE_THREAD) diff --git a/rpcs3/util/asm.hpp b/rpcs3/util/asm.hpp index 8942dc2a09..56aa955652 100644 --- a/rpcs3/util/asm.hpp +++ b/rpcs3/util/asm.hpp @@ -183,10 +183,35 @@ namespace utils #endif } - // Synchronization helper (cache-friendly busy waiting) - inline void busy_wait(usz cycles = 3000) + // The hardware timers on many ARM machines run south of 100 MHz + // and the busy waits in RPCS3 were written assuming an x86 machine + // with hardware timers that run around 3 GHz.
+	// For instance, on the Snapdragon 8 Gen 2, the hardware timer runs at 19.2 MHz.
+	// Unscaled, a busy wait that would have taken nanoseconds on x86 would run for
+	// many microseconds on such machines, so busy_wait() rescales its cycle count.
+#ifdef ARCH_ARM64
+	// Counter frequency in Hz; the default maps 100 "x86 cycles" to one timer tick.
+	inline u64 arm_timer_freq = 30'000'000;
+
+	// Caches CNTFRQ_EL0 for busy_wait(); a zero reading keeps the default above.
+	inline void init_arm_timer_scale()
 	{
+		u64 freq = 0;
+		asm volatile("mrs %0, cntfrq_el0" : "=r"(freq));
+		if (freq)
+			arm_timer_freq = freq;
+	}
+#endif
+
+	// Synchronization helper (cache-friendly busy waiting); `cycles` assumes a ~3 GHz clock
+	inline void busy_wait(u64 cycles = 3000)
+	{
+#ifdef ARCH_ARM64
+		// Multiply before dividing so slow timers (e.g. 19.2 MHz) are not truncated away
+		const u64 stop = get_tsc() + cycles * arm_timer_freq / 3'000'000'000ull;
+#else
 		const u64 stop = get_tsc() + cycles;
+#endif
 		do pause();
 		while (get_tsc() < stop);
 	}