#pragma once #include "util/types.hpp" #include "util/tsc.hpp" #include "util/atomic.hpp" #include #ifdef ARCH_X64 #if defined(_MSC_VER) && !defined(__clang__) #include #else #include #include #endif #endif namespace utils { // Try to prefetch to Level 2 cache since it's not split to data/code on most processors template constexpr void prefetch_exec(T func) { if (std::is_constant_evaluated()) { return; } const u64 value = reinterpret_cast(func); const void* ptr = reinterpret_cast(value); #ifdef ARCH_X64 return _mm_prefetch(static_cast(ptr), _MM_HINT_T1); #else return __builtin_prefetch(ptr, 0, 2); #endif } // Try to prefetch to Level 1 cache constexpr void prefetch_read(const void* ptr) { if (std::is_constant_evaluated()) { return; } #ifdef ARCH_X64 return _mm_prefetch(static_cast(ptr), _MM_HINT_T0); #else return __builtin_prefetch(ptr, 0, 3); #endif } constexpr void prefetch_write(const void* ptr) { if (std::is_constant_evaluated()) { return; } #if defined(ARCH_X64) return _m_prefetchw(const_cast(ptr)); #else return __builtin_prefetch(ptr, 1, 3); #endif } constexpr u32 popcnt128(const u128& v) { #if defined(_MSC_VER) && !defined(__clang__) return std::popcount(v.lo) + std::popcount(v.hi); #else const u64 lo = static_cast(v); const u64 hi = static_cast(v >> 64); return static_cast(std::popcount(lo) + std::popcount(hi)); #endif } constexpr u64 umulh64(u64 x, u64 y) { #ifdef _MSC_VER if (std::is_constant_evaluated()) #endif { return static_cast((u128{x} * u128{y}) >> 64); } #ifdef _MSC_VER return __umulh(x, y); #endif } inline s64 mulh64(s64 x, s64 y) { #ifdef _MSC_VER return __mulh(x, y); #else return (s128{x} * s128{y}) >> 64; #endif } inline s64 div128(s64 high, s64 low, s64 divisor, s64* remainder = nullptr) { #if defined(_MSC_VER) && !defined(__clang__) s64 rem = 0; s64 r = _div128(high, low, divisor, &rem); if (remainder) { *remainder = rem; } #else const s128 x = (u128{static_cast(high)} << 64) | u64(low); const s128 r = x / divisor; if (remainder) { *remainder = x % divisor; } #endif return r; } inline u64 udiv128(u64 high, u64 low, u64 divisor, u64* remainder = nullptr) { #if defined(_MSC_VER) && !defined(__clang__) u64 rem = 0; u64 r = _udiv128(high, low, divisor, &rem); if (remainder) { *remainder = rem; } #else const u128 x = (u128{high} << 64) | low; const u128 r = x / divisor; if (remainder) { *remainder = x % divisor; } #endif return r; } #if defined(_MSC_VER) && !defined(__clang__) inline u128 operator/(u128 lhs, u64 rhs) { u64 rem = 0; return _udiv128(lhs.hi, lhs.lo, rhs, &rem); } #endif constexpr u32 ctz128(u128 arg) { #if defined(_MSC_VER) && !defined(__clang__) if (!arg.lo) return std::countr_zero(arg.hi) + 64u; else return std::countr_zero(arg.lo); #else const u64 hi = static_cast(arg >> 64); if (hi != 0) return static_cast(std::countr_zero(hi)); const u64 lo = static_cast(arg); return static_cast(std::countr_zero(lo) + 64u); #endif } constexpr u32 clz128(u128 arg) { #if defined(_MSC_VER) && !defined(__clang__) if (arg.hi) return std::countl_zero(arg.hi); else return std::countl_zero(arg.lo) + 64; #else const u64 hi = static_cast(arg >> 64); if (hi != 0) return static_cast(std::countl_zero(hi)); const u64 lo = static_cast(arg); return static_cast(std::countl_zero(lo) + 64u); #endif } inline void pause() { #if defined(ARCH_ARM64) __asm__ volatile("isb" ::: "memory"); #elif defined(ARCH_X64) _mm_pause(); #else #error "Missing utils::pause() implementation" #endif } // The hardware clock on many arm timers run south of 100mhz // and the busy waits in RPCS3 were written assuming an x86 machine // with hardware timers that run around 3GHz. // For instance, on the snapdragon 8 gen 2, the hardware timer runs at 19.2mhz. // This means that a busy wait that would have taken nanoseconds on x86 will run for // many microseconds on many arm machines. #ifdef ARCH_ARM64 inline u64 arm_timer_scale = 1; inline void init_arm_timer_scale() { u64 freq = 0; asm volatile("mrs %0, cntfrq_el0" : "=r"(freq)); // Try to scale hardware timer to match 3GHz u64 timer_scale = freq / 30000000; if (timer_scale) arm_timer_scale = timer_scale; } #endif inline void busy_wait(u64 cycles = 3000) { #ifdef ARCH_ARM64 const u64 stop = get_tsc() + ((cycles / 100) * arm_timer_scale); #else const u64 stop = get_tsc() + cycles; #endif do pause(); while (get_tsc() < stop); } // Align to power of 2 template requires std::is_unsigned_v constexpr std::make_unsigned_t> align(T value, U align) { return static_cast>>((value + (align - 1)) & (T{0} - align)); } // General purpose aligned division, the result is rounded up not truncated template requires std::is_unsigned_v constexpr T aligned_div(T value, std::type_identity_t align) { return static_cast(value / align + T{!!(value % align)}); } // General purpose aligned division, the result is rounded to nearest template requires std::is_integral_v constexpr T rounded_div(T value, std::type_identity_t align) { if constexpr (std::is_unsigned_v) { return static_cast(value / align + T{(value % align) > (align / 2)}); } return static_cast(value / align + (value > 0 ? T{(value % align) > (align / 2)} : 0 - T{(value % align) < (align / 2)})); } // Multiplying by ratio, semi-resistant to overflows template constexpr T rational_mul(T value, std::type_identity_t numerator, std::type_identity_t denominator) { if constexpr (sizeof(T) <= sizeof(u64) / 2) { return static_cast(value * u64{numerator} / u64{denominator}); } return static_cast(value / denominator * numerator + (value % denominator) * numerator / denominator); } template constexpr T add_saturate(T addend1, T addend2) { return static_cast(~addend1) < addend2 ? T{umax} : static_cast(addend1 + addend2); } template constexpr T sub_saturate(T minuend, T subtrahend) { return minuend < subtrahend ? T{0} : static_cast(minuend - subtrahend); } template constexpr T mul_saturate(T factor1, T factor2) { return factor1 > 0 && T{umax} / factor1 < factor2 ? T{umax} : static_cast(factor1 * factor2); } inline void trigger_write_page_fault(void* ptr) { #if defined(ARCH_X64) && !defined(_MSC_VER) __asm__ volatile("lock orl $0, 0(%0)" :: "r" (ptr)); #elif defined(ARCH_ARM64) u32 value = 0; u32* u32_ptr = static_cast(ptr); __asm__ volatile("ldset %w0, %w0, %1" : "+r"(value), "=Q"(*u32_ptr) : "r"(value)); #else *static_cast *>(ptr) += 0; #endif } inline void trap() { #ifdef _M_X64 __debugbreak(); #elif defined(ARCH_X64) __asm__ volatile("int3"); #elif defined(ARCH_ARM64) __asm__ volatile("brk 0x42"); #else #error "Missing utils::trap() implementation" #endif } } // namespace utils using utils::busy_wait; #if defined(_MSC_VER) && !defined(__clang__) using utils::operator/; #endif