diff --git a/android/src/rpcsx-android.cpp b/android/src/rpcsx-android.cpp index f1359005e..ce38967c1 100644 --- a/android/src/rpcsx-android.cpp +++ b/android/src/rpcsx-android.cpp @@ -41,12 +41,13 @@ #include "rpcs3_version.h" #include "rpcsx/fw/ps3/cellMsgDialog.h" #include "rpcsx/fw/ps3/cellSysutil.h" +#include "rx/asm.hpp" +#include "rx/debug.hpp" #include "util/File.h" #include "util/JIT.h" #include "util/StrFmt.h" #include "util/StrUtil.h" #include "util/Thread.h" -#include "util/asm.hpp" #include "util/console.h" #include "util/fixed_typemap.hpp" #include "util/logs.hpp" @@ -241,7 +242,7 @@ void jit_announce(uptr, usz, std::string_view); __android_log_write(ANDROID_LOG_FATAL, "RPCS3", buf.c_str()); jit_announce(0, 0, ""); - utils::trap(); + rx::breakpoint(); std::abort(); std::terminate(); } diff --git a/kernel/cellos/src/lv2.cpp b/kernel/cellos/src/lv2.cpp index ef8a010d7..72b867e5d 100644 --- a/kernel/cellos/src/lv2.cpp +++ b/kernel/cellos/src/lv2.cpp @@ -53,10 +53,10 @@ #include "sys_usbd.h" #include "sys_vm.h" +#include "rx/tsc.hpp" #include "util/atomic_bit_set.h" #include "util/init_mutex.hpp" #include "util/sysinfo.hpp" -#include "util/tsc.hpp" #include #include #include @@ -2138,7 +2138,7 @@ void lv2_obj::schedule_all(u64 current_time) { } if (const u64 freq = s_yield_frequency) { - const u64 tsc = utils::get_tsc(); + const u64 tsc = rx::get_tsc(); const u64 last_tsc = s_last_yield_tsc; if (tsc >= last_tsc && tsc <= s_max_allowed_yield_tsc && @@ -2297,7 +2297,7 @@ mwaitx_func static void __mwaitx(u32 cycles, u32 cstate) { // First bit indicates cstate, 0x0 for C.02 state (lower power) or 0x1 for C.01 // state (higher power) waitpkg_func static void __tpause(u32 cycles, u32 cstate) { - const u64 tsc = utils::get_tsc() + cycles; + const u64 tsc = rx::get_tsc() + cycles; _tpause(cstate, tsc); } #endif diff --git a/kernel/cellos/src/sys_cond.cpp b/kernel/cellos/src/sys_cond.cpp index 17c066b85..8e7246bd6 100644 --- a/kernel/cellos/src/sys_cond.cpp +++ b/kernel/cellos/src/sys_cond.cpp @@ -9,7 +9,7 @@ #include "sys_cond.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(sys_cond); @@ -454,7 +454,7 @@ error_code sys_cond_wait(ppu_thread &ppu, u32 cond_id, u64 timeout) { } for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) { - busy_wait(500); + rx::busy_wait(500); } if (ppu.state & cpu_flag::signal) { diff --git a/kernel/cellos/src/sys_dbg.cpp b/kernel/cellos/src/sys_dbg.cpp index 40b3b38ba..002921e99 100644 --- a/kernel/cellos/src/sys_dbg.cpp +++ b/kernel/cellos/src/sys_dbg.cpp @@ -8,7 +8,8 @@ #include "Emu/Memory/vm_locking.h" #include "rpcsx/fw/ps3/sys_lv2dbg.h" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr = nullptr); @@ -92,7 +93,7 @@ error_code sys_dbg_write_process_memory(s32 pid, u32 address, u32 size, for (u32 i = address, exec_update_size = 0; i < end;) { const u32 op_size = - std::min(utils::align(i + 1, 0x10000), end) - i; + std::min(rx::alignUp(i + 1, 0x10000), end) - i; const bool is_exec = vm::check_addr(i, vm::page_executable | vm::page_readable); diff --git a/kernel/cellos/src/sys_event.cpp b/kernel/cellos/src/sys_event.cpp index c49f77b59..256d35940 100644 --- a/kernel/cellos/src/sys_event.cpp +++ b/kernel/cellos/src/sys_event.cpp @@ -11,7 +11,7 @@ #include "Emu/Cell/SPUThread.h" #include "sys_process.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(sys_event); @@ -468,7 +468,7 @@ error_code sys_event_queue_receive(ppu_thread &ppu, 
u32 equeue_id, } for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) { - busy_wait(500); + rx::busy_wait(500); } if (ppu.state & cpu_flag::signal) { diff --git a/kernel/cellos/src/sys_event_flag.cpp b/kernel/cellos/src/sys_event_flag.cpp index 8fa85dc6a..cbcbbf138 100644 --- a/kernel/cellos/src/sys_event_flag.cpp +++ b/kernel/cellos/src/sys_event_flag.cpp @@ -7,7 +7,7 @@ #include "Emu/Cell/ErrorCodes.h" #include "Emu/Cell/PPUThread.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(sys_event_flag); @@ -195,7 +195,7 @@ error_code sys_event_flag_wait(ppu_thread &ppu, u32 id, u64 bitptn, u32 mode, } for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) { - busy_wait(500); + rx::busy_wait(500); } if (ppu.state & cpu_flag::signal) { diff --git a/kernel/cellos/src/sys_fs.cpp b/kernel/cellos/src/sys_fs.cpp index a0907463d..9f890644f 100644 --- a/kernel/cellos/src/sys_fs.cpp +++ b/kernel/cellos/src/sys_fs.cpp @@ -1,9 +1,9 @@ #include "stdafx.h" +#include "rx/asm.hpp" #include "sys_fs.h" #include "sys_memory.h" #include "sys_sync.h" -#include "util/asm.hpp" #include "Crypto/unedat.h" #include "Emu/Cell/PPUThread.h" @@ -618,7 +618,7 @@ struct lv2_file::file_view : fs::file_base { fs::stat_t stat = m_file->file.get_stat(); // TODO: Check this on realhw - // stat.size = utils::sub_saturate(stat.size, m_off); + // stat.size = rx::sub_saturate(stat.size, m_off); stat.is_writable = false; return stat; @@ -655,7 +655,7 @@ struct lv2_file::file_view : fs::file_base { } u64 size() override { - return utils::sub_saturate(m_file->file.size(), m_off); + return rx::sub_saturate(m_file->file.size(), m_off); } fs::file_id get_id() override { diff --git a/kernel/cellos/src/sys_lwcond.cpp b/kernel/cellos/src/sys_lwcond.cpp index b4873020f..93c7a47a6 100644 --- a/kernel/cellos/src/sys_lwcond.cpp +++ b/kernel/cellos/src/sys_lwcond.cpp @@ -8,7 +8,7 @@ #include "Emu/Cell/PPUThread.h" #include "sys_lwmutex.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(sys_lwcond); @@ -490,7 +490,7 @@ error_code _sys_lwcond_queue_wait(ppu_thread &ppu, u32 lwcond_id, } for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) { - busy_wait(500); + rx::busy_wait(500); } if (ppu.state & cpu_flag::signal) { diff --git a/kernel/cellos/src/sys_lwmutex.cpp b/kernel/cellos/src/sys_lwmutex.cpp index 3b7c9aa92..b860823ad 100644 --- a/kernel/cellos/src/sys_lwmutex.cpp +++ b/kernel/cellos/src/sys_lwmutex.cpp @@ -7,7 +7,7 @@ #include "Emu/Cell/ErrorCodes.h" #include "Emu/Cell/PPUThread.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(sys_lwmutex); @@ -194,7 +194,7 @@ error_code _sys_lwmutex_lock(ppu_thread &ppu, u32 lwmutex_id, u64 timeout) { } for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) { - busy_wait(500); + rx::busy_wait(500); } if (ppu.state & cpu_flag::signal) { diff --git a/kernel/cellos/src/sys_memory.cpp b/kernel/cellos/src/sys_memory.cpp index 180fed708..1a4979667 100644 --- a/kernel/cellos/src/sys_memory.cpp +++ b/kernel/cellos/src/sys_memory.cpp @@ -8,7 +8,8 @@ #include "Emu/IdManager.h" #include "Emu/Memory/vm_locking.h" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(sys_memory); @@ -75,11 +76,11 @@ struct sys_memory_address_table { }; std::shared_ptr reserve_map(u32 alloc_size, u32 align) { - return vm::reserve_map( - align == 0x10000 ? vm::user64k : vm::user1m, 0, - align == 0x10000 ? 0x20000000 : utils::align(alloc_size, 0x10000000), - align == 0x10000 ? 
(vm::page_size_64k | vm::bf0_0x1) - : (vm::page_size_1m | vm::bf0_0x1)); + return vm::reserve_map(align == 0x10000 ? vm::user64k : vm::user1m, 0, + align == 0x10000 ? 0x20000000 + : rx::alignUp(alloc_size, 0x10000000), + align == 0x10000 ? (vm::page_size_64k | vm::bf0_0x1) + : (vm::page_size_1m | vm::bf0_0x1)); } // Todo: fix order of error checks diff --git a/kernel/cellos/src/sys_mutex.cpp b/kernel/cellos/src/sys_mutex.cpp index 93a749506..b43137661 100644 --- a/kernel/cellos/src/sys_mutex.cpp +++ b/kernel/cellos/src/sys_mutex.cpp @@ -5,7 +5,7 @@ #include "Emu/Cell/ErrorCodes.h" #include "Emu/Cell/PPUThread.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "sys_mutex.h" @@ -147,7 +147,7 @@ error_code sys_mutex_lock(ppu_thread &ppu, u32 mutex_id, u64 timeout) { // Try busy waiting a bit if advantageous for (u32 i = 0, end = lv2_obj::has_ppus_in_running_state() ? 3 : 10; id_manager::g_mutex.is_lockable() && i < end; i++) { - busy_wait(300); + rx::busy_wait(300); result = mutex.try_lock(ppu); if (!result || @@ -212,7 +212,7 @@ error_code sys_mutex_lock(ppu_thread &ppu, u32 mutex_id, u64 timeout) { } for (usz i = 0; cpu_flag::signal - ppu.state && i < 40; i++) { - busy_wait(500); + rx::busy_wait(500); } if (ppu.state & cpu_flag::signal) { diff --git a/kernel/cellos/src/sys_ppu_thread.cpp b/kernel/cellos/src/sys_ppu_thread.cpp index 7e0ca06bd..5e80ac5e2 100644 --- a/kernel/cellos/src/sys_ppu_thread.cpp +++ b/kernel/cellos/src/sys_ppu_thread.cpp @@ -15,7 +15,8 @@ #include "sys_mmapper.h" #include "sys_process.h" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" #include @@ -148,7 +149,7 @@ void _sys_ppu_thread_exit(ppu_thread &ppu, u64 errorcode) { // Need to wait until the current writers finish if (ppu.state & cpu_flag::memory) { for (; writer_mask; writer_mask &= vm::g_range_lock_bits[1]) { - busy_wait(200); + rx::busy_wait(200); } } } @@ -468,7 +469,7 @@ error_code _sys_ppu_thread_create(ppu_thread &ppu, vm::ptr thread_id, const u32 tls = param->tls; // Compute actual stack size and allocate - const u32 stack_size = utils::align(std::max(_stacksz, 4096), 4096); + const u32 stack_size = rx::alignUp(std::max(_stacksz, 4096), 4096); auto &dct = g_fxo->get(); diff --git a/kernel/cellos/src/sys_rsx.cpp b/kernel/cellos/src/sys_rsx.cpp index ade5a3a26..2407ad4d0 100644 --- a/kernel/cellos/src/sys_rsx.cpp +++ b/kernel/cellos/src/sys_rsx.cpp @@ -10,9 +10,9 @@ #include "Emu/RSX/Core/RSXReservationLock.hpp" #include "Emu/RSX/RSXThread.h" #include "Emu/System.h" +#include "rx/asm.hpp" #include "sys_event.h" #include "sys_vm.h" -#include "util/asm.hpp" LOG_CHANNEL(sys_rsx); @@ -46,7 +46,7 @@ static void set_rsx_dmactl(rsx::thread *render, u64 get_put) { } } - utils::pause(); + rx::pause(); } // Schedule FIFO interrupt to deal with this immediately diff --git a/kernel/cellos/src/sys_rwlock.cpp b/kernel/cellos/src/sys_rwlock.cpp index c6cdef964..e31a08e4f 100644 --- a/kernel/cellos/src/sys_rwlock.cpp +++ b/kernel/cellos/src/sys_rwlock.cpp @@ -7,7 +7,7 @@ #include "Emu/Cell/ErrorCodes.h" #include "Emu/Cell/PPUThread.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(sys_rwlock); @@ -151,7 +151,7 @@ error_code sys_rwlock_rlock(ppu_thread &ppu, u32 rw_lock_id, u64 timeout) { } for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) { - busy_wait(500); + rx::busy_wait(500); } if (ppu.state & cpu_flag::signal) { @@ -355,7 +355,7 @@ error_code sys_rwlock_wlock(ppu_thread &ppu, u32 rw_lock_id, u64 timeout) { } for (usz i = 0; cpu_flag::signal - ppu.state && i < 
50; i++) { - busy_wait(500); + rx::busy_wait(500); } if (ppu.state & cpu_flag::signal) { diff --git a/kernel/cellos/src/sys_semaphore.cpp b/kernel/cellos/src/sys_semaphore.cpp index d24632dbe..2f9dba9c7 100644 --- a/kernel/cellos/src/sys_semaphore.cpp +++ b/kernel/cellos/src/sys_semaphore.cpp @@ -7,7 +7,7 @@ #include "Emu/Cell/ErrorCodes.h" #include "Emu/Cell/PPUThread.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(sys_semaphore); @@ -167,7 +167,7 @@ error_code sys_semaphore_wait(ppu_thread &ppu, u32 sem_id, u64 timeout) { } for (usz i = 0; cpu_flag::signal - ppu.state && i < 50; i++) { - busy_wait(500); + rx::busy_wait(500); } if (ppu.state & cpu_flag::signal) { diff --git a/kernel/cellos/src/sys_spu.cpp b/kernel/cellos/src/sys_spu.cpp index b12c88cd9..d5794c396 100644 --- a/kernel/cellos/src/sys_spu.cpp +++ b/kernel/cellos/src/sys_spu.cpp @@ -21,7 +21,8 @@ #include "sys_mmapper.h" #include "sys_process.h" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(sys_spu); @@ -129,7 +130,7 @@ void sys_spu_image::load(const fs::file &stream) { this->nsegs = 0; this->segs = vm::null; - vm::page_protect(segs.addr(), utils::align(mem_size, 4096), 0, 0, + vm::page_protect(segs.addr(), rx::alignUp(mem_size, 4096), 0, 0, vm::page_writable); } @@ -196,8 +197,8 @@ void sys_spu_image::deploy(u8 *loc, std::span segs, } auto mem_translate = [loc](u32 addr, u32 size) { - return utils::add_saturate(addr, size) <= SPU_LS_SIZE ? loc + addr - : nullptr; + return rx::add_saturate(addr, size) <= SPU_LS_SIZE ? loc + addr + : nullptr; }; // Apply the patch @@ -1259,7 +1260,7 @@ error_code sys_spu_thread_group_terminate(ppu_thread &ppu, u32 id, s32 value) { // termination auto short_sleep = [](ppu_thread &ppu) { lv2_obj::sleep(ppu); - busy_wait(3000); + rx::busy_wait(3000); ppu.check_state(); ppu.state += cpu_flag::wait; }; diff --git a/kernel/cellos/src/sys_time.cpp b/kernel/cellos/src/sys_time.cpp index 06ea57d17..20d66046c 100644 --- a/kernel/cellos/src/sys_time.cpp +++ b/kernel/cellos/src/sys_time.cpp @@ -5,8 +5,8 @@ #include "Emu/Cell/ErrorCodes.h" #include "Emu/Cell/timers.hpp" #include "Emu/system_config.h" +#include "rx/tsc.hpp" #include "sys_process.h" -#include "util/tsc.hpp" #include "util/sysinfo.hpp" @@ -14,7 +14,7 @@ u64 g_timebase_offs{}; static u64 systemtime_offset; #ifndef __linux__ -#include "util/asm.hpp" +#include "rx/asm.hpp" #endif #ifdef _WIN32 @@ -151,7 +151,7 @@ u64 convert_to_timebased_time(u64 time) { u64 get_timebased_time() { if (u64 freq = utils::get_tsc_freq()) { - const u64 tsc = utils::get_tsc(); + const u64 tsc = rx::get_tsc(); #if _MSC_VER const u64 result = @@ -218,7 +218,7 @@ void initialize_timebased_time(u64 timebased_init, bool reset) { // Returns some relative time in microseconds, don't change this fact u64 get_system_time() { if (u64 freq = utils::get_tsc_freq()) { - const u64 tsc = utils::get_tsc(); + const u64 tsc = rx::get_tsc(); #if _MSC_VER const u64 result = static_cast(u128_from_mul(tsc, 1000000ull) / freq); @@ -358,7 +358,7 @@ error_code sys_time_get_current_time(vm::ptr sec, vm::ptr nsec) { // Get time difference in nanoseconds (using 128 bit accumulator) const u64 diff_sl = diff_base * 1000000000ull; - const u64 diff_sh = utils::umulh64(diff_base, 1000000000ull); + const u64 diff_sh = rx::umulh64(diff_base, 1000000000ull); const u64 diff = utils::udiv128(diff_sh, diff_sl, s_time_aux_info.perf_freq); // get time since Epoch in nanoseconds diff --git a/kernel/cellos/src/sys_timer.cpp b/kernel/cellos/src/sys_timer.cpp 
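// ---- editor sketch (illustrative, not part of the original patch) ----
// The sys_time.cpp hunks above swap utils::get_tsc()/utils::umulh64() for rx::get_tsc() and
// rx::umulh64() from rx/tsc.hpp and rx/asm.hpp. From the call sites alone, the assumed contracts are:
//   u64 rx::get_tsc();              // raw CPU timestamp-counter read
//   u64 rx::umulh64(u64 a, u64 b);  // high 64 bits of the unsigned 128-bit product a*b
// A minimal stand-in for the latter, assuming a compiler with unsigned __int128 and the
// project-wide u64 alias:
constexpr u64 umulh64_sketch(u64 a, u64 b) // hypothetical helper, for illustration only
{
    return static_cast<u64>((static_cast<unsigned __int128>(a) * b) >> 64);
}
// This mirrors the tick-to-nanosecond conversion above: the 128-bit product of a tick delta and
// 1000000000 is split into (high, low) halves and then divided by the timer frequency.
// ---- end editor sketch ----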
index 5409081f6..71678e83e 100644 --- a/kernel/cellos/src/sys_timer.cpp +++ b/kernel/cellos/src/sys_timer.cpp @@ -9,9 +9,9 @@ #include "Emu/System.h" #include "Emu/system_config.h" +#include "rx/asm.hpp" #include "sys_event.h" #include "sys_process.h" -#include "util/asm.hpp" #include #include @@ -77,9 +77,9 @@ u64 lv2_timer::check_unlocked(u64 _now) noexcept { if (period) { // Set next expiration time and check again - const u64 expire0 = utils::add_saturate(next, period); + const u64 expire0 = rx::add_saturate(next, period); expire.release(expire0); - return utils::sub_saturate(expire0, _now); + return rx::sub_saturate(expire0, _now); } // Stop after oneshot @@ -265,11 +265,11 @@ error_code _sys_timer_start(ppu_thread &ppu, u32 timer_id, u64 base_time, const u64 expire = period == 0 ? base_time : // oneshot base_time == 0 - ? utils::add_saturate(start_time, period) + ? rx::add_saturate(start_time, period) : // periodic timer with no base (using start time as base) - start_time < utils::add_saturate(base_time, period) - ? utils::add_saturate(base_time, period) + start_time < rx::add_saturate(base_time, period) + ? rx::add_saturate(base_time, period) : // periodic with base time over start time [&]() -> u64 // periodic timer base before start time (align to @@ -282,10 +282,10 @@ error_code _sys_timer_start(ppu_thread &ppu, u32 timer_id, u64 base_time, // } // while (base_time < start_time); - const u64 start_time_with_base_time_reminder = utils::add_saturate( + const u64 start_time_with_base_time_reminder = rx::add_saturate( start_time - start_time % period, base_time % period); - return utils::add_saturate( + return rx::add_saturate( start_time_with_base_time_reminder, start_time_with_base_time_reminder < start_time ? period : 0); }(); @@ -428,10 +428,10 @@ error_code sys_timer_usleep(ppu_thread &ppu, u64 sleep_time) { // Over/underflow checks if (add_time >= 0) { - sleep_time = utils::add_saturate(sleep_time, add_time); + sleep_time = rx::add_saturate(sleep_time, add_time); } else { sleep_time = - std::max(1, utils::sub_saturate(sleep_time, -add_time)); + std::max(1, rx::sub_saturate(sleep_time, -add_time)); } lv2_obj::sleep(ppu, g_cfg.core.sleep_timers_accuracy < diff --git a/ps3fw/cellAdec.cpp b/ps3fw/cellAdec.cpp index 3805e7072..c9b5d1c42 100644 --- a/ps3fw/cellAdec.cpp +++ b/ps3fw/cellAdec.cpp @@ -1,4 +1,6 @@ #include "stdafx.h" + +#include "rx/align.hpp" #include "Emu/perf_meter.hpp" #include "Emu/Cell/PPUModule.h" #include "cellos/sys_sync.h" @@ -9,7 +11,7 @@ #include "cellAdec.h" #include "util/simd.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(cellAdec); @@ -415,7 +417,7 @@ void LpcmDecContext::exec(ppu_thread& ppu) be_t* const _output = std::assume_aligned<0x80>(output.get_ptr()); s64 output_size = cmd.au_size; - s32 sample_num = static_cast(utils::align(+lpcm_param->audioPayloadSize, 0x10)); + s32 sample_num = static_cast(rx::alignUp(+lpcm_param->audioPayloadSize, 0x10)); s32 channel_num = 0; if (!dvd_packing) @@ -860,11 +862,11 @@ error_code _CellAdecCoreOpGetMemSize_lpcm(vm::ptr attr) cellAdec.notice("_CellAdecCoreOpGetMemSize_lpcm(attr=*0x%x)", attr); constexpr u32 mem_size = - utils::align(static_cast(sizeof(LpcmDecContext)), 0x80) + utils::align(static_cast(sizeof(CellAdecParamLpcm)), 0x80) + 0x100 // Command data for Spurs task - + LPCM_DEC_OUTPUT_BUFFER_SIZE + 0x2900 // sizeof(CellSpurs) + sizeof(CellSpursTaskset) - + 0x3b400 // Spurs context - + 0x300 // (sizeof(CellSpursQueue) + 0x80 + queue buffer) * 2 - + 0x855; // Unused + 
rx::alignUp(static_cast(sizeof(LpcmDecContext)), 0x80) + rx::alignUp(static_cast(sizeof(CellAdecParamLpcm)), 0x80) + 0x100 // Command data for Spurs task + + LPCM_DEC_OUTPUT_BUFFER_SIZE + 0x2900 // sizeof(CellSpurs) + sizeof(CellSpursTaskset) + + 0x3b400 // Spurs context + + 0x300 // (sizeof(CellSpursQueue) + 0x80 + queue buffer) * 2 + + 0x855; // Unused static_assert(mem_size == 0x7ebd5); @@ -883,7 +885,7 @@ error_code _CellAdecCoreOpOpenExt_lpcm(ppu_thread& ppu, vm::ptr ensure(handle.aligned(0x80)); // LLE doesn't check the alignment or aligns the address itself ensure(!!notifyAuDone && !!notifyAuDoneArg && !!notifyPcmOut && !!notifyPcmOutArg && !!notifyError && !!notifyErrorArg && !!notifySeqDone && !!notifySeqDoneArg); // These should always be set - const u32 end_of_context_addr = handle.addr() + utils::align(static_cast(sizeof(LpcmDecContext)), 0x80); + const u32 end_of_context_addr = handle.addr() + rx::alignUp(static_cast(sizeof(LpcmDecContext)), 0x80); handle->cmd_queue.front = 0; handle->cmd_queue.back = 0; @@ -1587,10 +1589,10 @@ error_code adecOpen(ppu_thread& ppu, vm::ptr type, vm::cptrgetPcmHandleNum(ppu); const u32 bitstream_info_size = core_ops->getBsiInfoSize(ppu); - const auto _this = vm::ptr::make(utils::align(+res->startAddr, 0x80)); + const auto _this = vm::ptr::make(rx::alignUp(+res->startAddr, 0x80)); const auto frames = vm::ptr::make(_this.addr() + sizeof(AdecContext)); const u32 bitstream_infos_addr = frames.addr() + pcm_handle_num * sizeof(AdecFrame); - const auto core_handle = vm::ptr::make(utils::align(bitstream_infos_addr + bitstream_info_size * pcm_handle_num, 0x80)); + const auto core_handle = vm::ptr::make(rx::alignUp(bitstream_infos_addr + bitstream_info_size * pcm_handle_num, 0x80)); if (type->audioCodecType == CELL_ADEC_TYPE_LPCM_DVD) { diff --git a/ps3fw/cellAtracXdec.cpp b/ps3fw/cellAtracXdec.cpp index 4b856e044..70ab395d5 100644 --- a/ps3fw/cellAtracXdec.cpp +++ b/ps3fw/cellAtracXdec.cpp @@ -1,11 +1,13 @@ #include "stdafx.h" + +#include "rx/align.hpp" #include "Emu/perf_meter.hpp" #include "Emu/Cell/PPUModule.h" #include "cellos/sys_sync.h" #include "cellos/sys_ppu_thread.h" #include "Emu/savestate_utils.hpp" #include "sysPrxForUser.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "util/media_utils.h" #include "cellAtracXdec.h" @@ -182,7 +184,7 @@ error_code AtracXdecDecoder::set_config_info(u32 sampling_freq, u32 ch_config_id this->sampling_freq = sampling_freq; this->ch_config_idx = ch_config_idx; this->nbytes = nbytes; - this->nbytes_128_aligned = utils::align(nbytes, 0x80); + this->nbytes_128_aligned = rx::alignUp(nbytes, 0x80); this->nch_in = ch_config_idx <= 4 ? 
ch_config_idx : ch_config_idx + 1; if (ch_config_idx > 7u) @@ -741,7 +743,7 @@ error_code _CellAdecCoreOpGetMemSize_atracx(vm::ptr attr) constexpr u32 mem_size = sizeof(AtracXdecContext) + 0x7f + ATXDEC_SPURS_STRUCTS_SIZE + 0x1d8 + atracXdecGetSpursMemSize(nch_in) + ATXDEC_SAMPLES_PER_FRAME * sizeof(f32) * nch_in; - attr->workMemSize = utils::align(mem_size, 0x80); + attr->workMemSize = rx::alignUp(mem_size, 0x80); return CELL_OK; } @@ -765,7 +767,7 @@ error_code _CellAdecCoreOpOpenExt_atracx(ppu_thread& ppu, vm::ptr::make(handle.addr() + utils::align(static_cast(sizeof(AtracXdecContext)), 0x80) + ATXDEC_SPURS_STRUCTS_SIZE))); + vm::bptr::make(handle.addr() + rx::alignUp(static_cast(sizeof(AtracXdecContext)), 0x80) + ATXDEC_SPURS_STRUCTS_SIZE))); const vm::var mutex_attr{{SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, {"_atd001"_u64}}}; const vm::var cond_attr{{SYS_SYNC_NOT_PROCESS_SHARED, 0, 0, {"_atd002"_u64}}}; diff --git a/ps3fw/cellDmux.cpp b/ps3fw/cellDmux.cpp index 095aa643d..5d2c0206c 100644 --- a/ps3fw/cellDmux.cpp +++ b/ps3fw/cellDmux.cpp @@ -1,4 +1,6 @@ #include "stdafx.h" + +#include "rx/align.hpp" #include "Emu/System.h" #include "Emu/IdManager.h" #include "Emu/Cell/PPUModule.h" @@ -7,7 +9,7 @@ #include "cellPamf.h" #include "cellDmux.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include @@ -765,7 +767,7 @@ PesHeader::PesHeader(DemuxerStream& stream) } ElementaryStream::ElementaryStream(Demuxer* dmux, u32 addr, u32 size, u32 fidMajor, u32 fidMinor, u32 sup1, u32 sup2, vm::ptr cbFunc, u32 cbArg, u32 spec) - : put(utils::align(addr, 128)), dmux(dmux), memAddr(utils::align(addr, 128)), memSize(size - (addr - memAddr)), fidMajor(fidMajor), fidMinor(fidMinor), sup1(sup1), sup2(sup2), cbFunc(cbFunc), cbArg(cbArg), spec(spec) + : put(rx::alignUp(addr, 128)), dmux(dmux), memAddr(rx::alignUp(addr, 128)), memSize(size - (addr - memAddr)), fidMajor(fidMajor), fidMinor(fidMinor), sup1(sup1), sup2(sup2), cbFunc(cbFunc), cbArg(cbArg), spec(spec) { } @@ -849,7 +851,7 @@ void ElementaryStream::push_au(u32 size, u64 dts, u64 pts, u64 userdata, bool ra addr = put; - put = utils::align(put + 128 + size, 128); + put = rx::alignUp(put + 128 + size, 128); put_count++; } diff --git a/ps3fw/cellGame.cpp b/ps3fw/cellGame.cpp index d9e5f187b..b99d2913a 100644 --- a/ps3fw/cellGame.cpp +++ b/ps3fw/cellGame.cpp @@ -18,7 +18,7 @@ #include "Crypto/utils.h" #include "Loader/PSF.h" #include "util/StrUtil.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "util/init_mutex.hpp" #include @@ -691,7 +691,7 @@ error_code cellHddGameGetSizeKB(ppu_thread& ppu, vm::ptr size) // This function is very slow by nature // TODO: Check if after first use the result is being cached so the sleep can // be reduced in this case - lv2_sleep(utils::sub_saturate(dirsz == umax ? 2000 : 200000, + lv2_sleep(rx::sub_saturate(dirsz == umax ? 2000 : 200000, get_guest_system_time() - start_sleep), &ppu); @@ -757,7 +757,7 @@ error_code cellGameDataGetSizeKB(ppu_thread& ppu, vm::ptr size) // This function is very slow by nature // TODO: Check if after first use the result is being cached so the sleep can // be reduced in this case - lv2_sleep(utils::sub_saturate(dirsz == umax ? 2000 : 200000, + lv2_sleep(rx::sub_saturate(dirsz == umax ? 
2000 : 200000, get_guest_system_time() - start_sleep), &ppu); @@ -1127,7 +1127,7 @@ cellGameContentPermit(ppu_thread& ppu, } // This function is very slow by nature - lv2_sleep(utils::sub_saturate( + lv2_sleep(rx::sub_saturate( !perm.temp.empty() || perm.can_create ? 200000 : 2000, get_guest_system_time() - start_sleep), &ppu); @@ -1886,7 +1886,7 @@ error_code cellGameGetSizeKB(ppu_thread& ppu, vm::ptr size) // This function is very slow by nature // TODO: Check if after first use the result is being cached so the sleep can // be reduced in this case - lv2_sleep(utils::sub_saturate(dirsz == umax ? 1000 : 200000, + lv2_sleep(rx::sub_saturate(dirsz == umax ? 1000 : 200000, get_guest_system_time() - start_sleep), &ppu); diff --git a/ps3fw/cellSaveData.cpp b/ps3fw/cellSaveData.cpp index 73503fce7..380f6b1b5 100644 --- a/ps3fw/cellSaveData.cpp +++ b/ps3fw/cellSaveData.cpp @@ -1,3 +1,5 @@ +#include "stdafx.h" + #include "cellSysutil.h" #include "cellUserInfo.h" #include "Emu/Cell/PPUModule.h" @@ -12,7 +14,6 @@ #include "Emu/localized_string.h" #include "Emu/savestate_utils.hpp" #include "Emu/system_config.h" -#include "stdafx.h" #include "cellMsgDialog.h" #include "cellSaveData.h" @@ -26,7 +27,9 @@ #include #include -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" +#include "rx/types.hpp" LOG_CHANNEL(cellSaveData); @@ -65,11 +68,11 @@ std::string SaveDataEntry::date() const std::string SaveDataEntry::data_size() const { std::string metric = "KB"; - u64 sz = utils::aligned_div(size, 1000); + u64 sz = rx::aligned_div(size, 1000); if (sz > 1000) { metric = "MB"; - sz = utils::aligned_div(sz, 1000); + sz = rx::aligned_div(sz, 1000); } return fmt::format("%lu %s", sz, metric); } @@ -1286,7 +1289,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, { if (!file.is_directory) { - size_bytes += utils::align(file.size, 1024); + size_bytes += rx::alignUp(file.size, 1024); } } @@ -1728,7 +1731,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, statGet->fileNum++; size_bytes += - utils::align(entry.size, 1024); // firmware rounds this value up + rx::alignUp(entry.size, 1024); // firmware rounds this value up if (statGet->fileListNum >= setBuf->fileListMax) continue; @@ -2345,7 +2348,7 @@ savedata_op(ppu_thread& ppu, u32 operation, u32 version, vm::cptr dirName, final_blist = fmt::merge(blist, "/"); psf::assign( psf, "RPCS3_BLIST", - psf::string(utils::align(::size32(final_blist) + 1, 4), final_blist)); + psf::string(rx::alignUp(::size32(final_blist) + 1, 4), final_blist)); // Write all files in temporary directory auto& fsfo = all_files["PARAM.SFO"]; diff --git a/ps3fw/cellSpurs.cpp b/ps3fw/cellSpurs.cpp index ba1ec29bd..691b9f56e 100644 --- a/ps3fw/cellSpurs.cpp +++ b/ps3fw/cellSpurs.cpp @@ -15,7 +15,7 @@ #include "sysPrxForUser.h" #include "cellSpurs.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "util/v128.hpp" #include "util/simd.hpp" diff --git a/ps3fw/cellSpursSpu.cpp b/ps3fw/cellSpursSpu.cpp index d68cf93f3..3b1dc37fb 100644 --- a/ps3fw/cellSpursSpu.cpp +++ b/ps3fw/cellSpursSpu.cpp @@ -6,7 +6,7 @@ #include "Emu/Cell/SPURecompiler.h" #include "cellSpurs.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "util/v128.hpp" #include "util/simd.hpp" diff --git a/ps3fw/cellSysutilAvc2.cpp b/ps3fw/cellSysutilAvc2.cpp index 5e7af74a5..fd543af67 100644 --- a/ps3fw/cellSysutilAvc2.cpp +++ b/ps3fw/cellSysutilAvc2.cpp @@ -1,7 +1,9 @@ #include "stdafx.h" + #include "Emu/Cell/PPUModule.h" #include "Emu/IdManager.h" 
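// ---- editor note (illustrative, not part of the original patch) ----
// The include churn across these files follows one pattern: the catch-all util/asm.hpp (and
// util/tsc.hpp) is replaced by the rx/ headers providing the helpers each translation unit
// actually calls. Mapping inferred from the call sites in this diff, not an authoritative list:
//   utils::align(v, a)            -> rx::alignUp(v, a)              (rx/align.hpp)
//   utils::aligned_div(v, d)      -> rx::aligned_div(v, d)          (rx/align.hpp, assumed)
//   utils::rounded_div(v, d)      -> rx::rounded_div(v, d)          (assumed)
//   busy_wait(n) / utils::pause() -> rx::busy_wait(n) / rx::pause() (rx/asm.hpp)
//   utils::rol8/rol32/rol64, umulh64/mulh64, ctz128,
//   add_saturate/sub_saturate     -> same names under rx::          (rx/asm.hpp)
//   utils::get_tsc()              -> rx::get_tsc()                  (rx/tsc.hpp)
//   utils::trap()                 -> rx::breakpoint()               (rx/debug.hpp)
// ---- end editor note ----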
-#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" #include "sceNp.h" #include "sceNp2.h" @@ -946,7 +948,7 @@ error_code cellSysutilAvc2Load_shared(SceNpMatching2ContextId /*ctx_id*/, u32 /* window_count++; } - total_bitrate = utils::align(window_count * bitrate, 0x100000) + 0x100000; + total_bitrate = rx::alignUp(window_count * bitrate, 0x100000) + 0x100000; } settings.video_stream_sharing = init_param->video_param.video_stream_sharing; diff --git a/ps3fw/cellVdec.cpp b/ps3fw/cellVdec.cpp index 202ab126e..13ccc71f9 100644 --- a/ps3fw/cellVdec.cpp +++ b/ps3fw/cellVdec.cpp @@ -1,3 +1,5 @@ +#include "stdafx.h" + #include "Emu/Cell/PPUModule.h" #include "cellos/sys_ppu_thread.h" #include "cellos/sys_process.h" @@ -5,7 +7,7 @@ #include "Emu/IdManager.h" #include "Emu/perf_meter.hpp" #include "Emu/savestate_utils.hpp" -#include "stdafx.h" +#include "rx/align.hpp" #include "sysPrxForUser.h" #include "util/media_utils.h" @@ -32,7 +34,7 @@ extern "C" #include "cellPamf.h" #include "cellVdec.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "util/lockless.h" #include #include @@ -1660,7 +1662,7 @@ error_code cellVdecGetPicItem(ppu_thread& ppu, u32 handle, const int buffer_size = av_image_get_buffer_size( vdec->ctx->pix_fmt, vdec->ctx->width, vdec->ctx->height, 1); ensure(buffer_size >= 0); - info->size = utils::align(buffer_size, 128); + info->size = rx::alignUp(buffer_size, 128); info->auNum = 1; info->auPts[0].lower = static_cast(pts); info->auPts[0].upper = static_cast(pts >> 32); diff --git a/ps3fw/sceNpTrophy.cpp b/ps3fw/sceNpTrophy.cpp index 7bf6cd263..b6f295e4c 100644 --- a/ps3fw/sceNpTrophy.cpp +++ b/ps3fw/sceNpTrophy.cpp @@ -20,7 +20,7 @@ #include "cellos/sys_event.h" #include "cellos/sys_fs.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include #include #include @@ -1490,7 +1490,7 @@ error_code sceNpTrophyGetGameProgress(u32 context, u32 handle, const u32 trp_count = ctxt->tropusr->GetTrophiesCount(); // Round result to nearest (TODO: Check 0 trophies) - *percentage = trp_count ? utils::rounded_div(unlocked * 100, trp_count) : 0; + *percentage = trp_count ? rx::rounded_div(unlocked * 100, trp_count) : 0; if (trp_count == 0 || trp_count > 128) { diff --git a/ps3fw/sys_lwmutex_.cpp b/ps3fw/sys_lwmutex_.cpp index ff64d2e90..a3fe6eb9d 100644 --- a/ps3fw/sys_lwmutex_.cpp +++ b/ps3fw/sys_lwmutex_.cpp @@ -7,7 +7,7 @@ #include "cellos/sys_mutex.h" #include "sysPrxForUser.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(sysPrxForUser); @@ -151,7 +151,7 @@ error_code sys_lwmutex_lock(ppu_thread& ppu, vm::ptr lwmutex, u64 for (u32 i = 0; i < 10; i++) { - busy_wait(); + rx::busy_wait(); if (lwmutex->vars.owner.load() == lwmutex_free) { @@ -210,7 +210,7 @@ error_code sys_lwmutex_lock(ppu_thread& ppu, vm::ptr lwmutex, u64 { for (u32 i = 0; i < 10; i++) { - busy_wait(); + rx::busy_wait(); if (lwmutex->vars.owner.load() == lwmutex_free) { diff --git a/rpcs3/Crypto/unedat.cpp b/rpcs3/Crypto/unedat.cpp index c31a849b3..d2e6b9386 100644 --- a/rpcs3/Crypto/unedat.cpp +++ b/rpcs3/Crypto/unedat.cpp @@ -1,4 +1,6 @@ #include "stdafx.h" + +#include "rx/align.hpp" #include "key_vault.h" #include "unedat.h" #include "sha1.h" @@ -8,7 +10,7 @@ #include "Emu/system_utils.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include #include @@ -233,7 +235,7 @@ s64 decrypt_block(const fs::file* in, u8* out, EDAT_HEADER* edat, NPD_HEADER* np // Locate the real data. 
const usz pad_length = length; - length = utils::align(pad_length, 0x10); + length = rx::alignUp(pad_length, 0x10); // Setup buffers for decryption and read the data. std::vector enc_data_buf(is_out_buffer_aligned || length == pad_length ? 0 : length); @@ -432,12 +434,12 @@ bool check_data(u8* key, EDAT_HEADER* edat, NPD_HEADER* npd, const fs::file* f, return false; } - const usz block_num = utils::aligned_div(edat->file_size, edat->block_size); + const usz block_num = rx::aligned_div(edat->file_size, edat->block_size); constexpr usz metadata_offset = 0x100; - const usz metadata_size = utils::mul_saturate(metadata_section_size, block_num); + const usz metadata_size = rx::mul_saturate(metadata_section_size, block_num); u64 metadata_section_offset = metadata_offset; - if (utils::add_saturate(utils::add_saturate(file_offset, metadata_section_offset), metadata_size) > f->size()) + if (rx::add_saturate(rx::add_saturate(file_offset, metadata_section_offset), metadata_size) > f->size()) { return false; } @@ -860,7 +862,7 @@ bool EDATADecrypter::ReadHeader() //} file_size = edatHeader.file_size; - total_blocks = ::narrow(utils::aligned_div(edatHeader.file_size, edatHeader.block_size)); + total_blocks = ::narrow(rx::aligned_div(edatHeader.file_size, edatHeader.block_size)); // Try decrypting the first block instead u8 data_sample[1]; @@ -886,7 +888,7 @@ u64 EDATADecrypter::ReadData(u64 pos, u8* data, u64 size) // Now we need to offset things to account for the actual 'range' requested const u64 startOffset = pos % edatHeader.block_size; - const u64 num_blocks = utils::aligned_div(startOffset + size, edatHeader.block_size); + const u64 num_blocks = rx::aligned_div(startOffset + size, edatHeader.block_size); // Find and decrypt block range covering pos + size const u32 starting_block = ::narrow(pos / edatHeader.block_size); diff --git a/rpcs3/Crypto/unself.cpp b/rpcs3/Crypto/unself.cpp index 5673e60cf..2bf55f926 100644 --- a/rpcs3/Crypto/unself.cpp +++ b/rpcs3/Crypto/unself.cpp @@ -1,7 +1,7 @@ #include "stdafx.h" #include "aes.h" #include "unself.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "Emu/System.h" #include "Emu/system_utils.hpp" #include "Crypto/unzip.h" @@ -887,7 +887,7 @@ bool SELFDecrypter::LoadHeaders(bool isElf32, SelfAdditionalInfo* out_info) m_seg_ext_hdr.back().Load(self_f); } - if (m_ext_hdr.version_hdr_offset == 0 || utils::add_saturate(m_ext_hdr.version_hdr_offset, sizeof(version_header)) > self_f.size()) + if (m_ext_hdr.version_hdr_offset == 0 || rx::add_saturate(m_ext_hdr.version_hdr_offset, sizeof(version_header)) > self_f.size()) { return false; } diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp index d7a34a068..141a700ad 100644 --- a/rpcs3/Emu/CPU/CPUThread.cpp +++ b/rpcs3/Emu/CPU/CPUThread.cpp @@ -1,4 +1,5 @@ #include "stdafx.h" + #include "CPUThread.h" #include "CPUDisAsm.h" @@ -14,7 +15,7 @@ #include "Emu/RSX/RSXThread.h" #include "Emu/perf_meter.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include #include #include @@ -64,7 +65,6 @@ void fmt_class_string::format(std::string& out, u64 arg) case cpu_flag::dbg_global_pause: return "G-PAUSE"; case cpu_flag::dbg_pause: return "PAUSE"; case cpu_flag::dbg_step: return "STEP"; - case cpu_flag::bitset_last: break; } return unknown; @@ -124,7 +124,7 @@ void fmt_class_string::format(std::string& ou for (u32 i = 0; !rlock.try_lock() && i < 100; i++) { - busy_wait(); + rx::busy_wait(); } if (rlock) @@ -533,7 +533,7 @@ namespace cpu_counter if (ok) [[likely]] { // Get actual slot number - id 
= utils::ctz128(~bits); + id = rx::ctz128(~bits); // Register thread if (s_cpu_list[id].compare_and_swap_test(nullptr, _this)) [[likely]] @@ -552,7 +552,7 @@ namespace cpu_counter return; } - busy_wait(300); + rx::busy_wait(300); } s_tls_thread_slot = id; @@ -599,7 +599,7 @@ namespace cpu_counter { for (u128 bits = copy; bits; bits &= bits - 1) { - const u32 index = utils::ctz128(bits); + const u32 index = rx::ctz128(bits); if (cpu_thread* cpu = s_cpu_list[index].load()) { @@ -1062,7 +1062,7 @@ bool cpu_thread::check_state() noexcept { if (i < 20 || ctr & 1) { - busy_wait(300); + rx::busy_wait(300); } else { @@ -1404,7 +1404,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept { if (cpu != _this) { - utils::prefetch_write(&cpu->state); + rx::prefetch_write(&cpu->state); return true; } @@ -1446,7 +1446,7 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept break; } - utils::pause(); + rx::pause(); } // Second increment: all threads paused @@ -1480,13 +1480,13 @@ bool cpu_thread::suspend_work::push(cpu_thread* _this) noexcept { for (u32 i = 0; i < work->prf_size; i++) { - utils::prefetch_write(work->prf_list[0]); + rx::prefetch_write(work->prf_list[0]); } } cpu_counter::for_all_cpu(copy2, [&](cpu_thread* cpu) { - utils::prefetch_write(&cpu->state); + rx::prefetch_write(&cpu->state); return true; }); diff --git a/rpcs3/Emu/CPU/sse2neon.h b/rpcs3/Emu/CPU/sse2neon.h index 88ec9d6cb..dc60297f5 100644 --- a/rpcs3/Emu/CPU/sse2neon.h +++ b/rpcs3/Emu/CPU/sse2neon.h @@ -8719,10 +8719,22 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) uint8x16_t dest = { // Undo ShiftRows step from AESE and extract X1 and X3 - u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) - u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) - u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) - u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + u8[0x4], + u8[0x1], + u8[0xE], + u8[0xB], // SubBytes(X1) + u8[0x1], + u8[0xE], + u8[0xB], + u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], + u8[0x9], + u8[0x6], + u8[0x3], // SubBytes(X3) + u8[0x9], + u8[0x6], + u8[0x3], + u8[0xC], // ROT(SubBytes(X3)) }; uint32x4_t r = {0, (unsigned)rcon, 0, (unsigned)rcon}; return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); diff --git a/rpcs3/Emu/Cell/PPUAnalyser.cpp b/rpcs3/Emu/Cell/PPUAnalyser.cpp index 824c478a6..2f5917807 100644 --- a/rpcs3/Emu/Cell/PPUAnalyser.cpp +++ b/rpcs3/Emu/Cell/PPUAnalyser.cpp @@ -1,4 +1,5 @@ #include "stdafx.h" + #include "PPUAnalyser.h" #include "cellos/sys_sync.h" @@ -8,7 +9,8 @@ #include #include "util/yaml.hpp" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(ppu_validator); @@ -25,7 +27,6 @@ void fmt_class_string::format(std::string& out, u64 arg) case ppu_attr::no_return: return "no_return"; case ppu_attr::no_size: return "no_size"; case ppu_attr::has_mfvscr: return "has_mfvscr"; - case ppu_attr::bitset_last: break; } return unknown; @@ -2243,7 +2244,7 @@ bool ppu_module::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con } } - jt_end = utils::align(static_cast(std::min(jt_end - 1, ctr(maxv) - 1) + 1), 4); + jt_end = rx::alignUp(static_cast(std::min(jt_end - 1, ctr(maxv) - 1) + 1), 4); get_jumptable_end(jumpatble_off, jumpatble_ptr, false); @@ -2882,7 +2883,7 @@ bool ppu_module::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con block.attr = ppu_attr::no_size; } - per_instruction_bytes += utils::sub_saturate(lim, func.addr); + per_instruction_bytes += rx::sub_saturate(lim, func.addr); 
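// ---- editor sketch (illustrative, not part of the original patch) ----
// rx::sub_saturate / rx::add_saturate replace the utils:: versions 1:1 in this patch. From the
// call sites (e.g. the line above, and the rx::add_saturate(addr, size) <= SPU_LS_SIZE checks),
// they are assumed to clamp instead of wrapping on unsigned overflow/underflow:
constexpr u64 sub_saturate_sketch(u64 a, u64 b) // hypothetical name
{
    return a > b ? a - b : 0; // clamps at 0 instead of wrapping around
}
constexpr u64 add_saturate_sketch(u64 a, u64 b) // hypothetical name
{
    return a + b >= a ? a + b : ~u64{0}; // clamps at UINT64_MAX on overflow
}
// ---- end editor sketch ----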
addr_next = std::max(addr_next, lim); continue; } @@ -3291,7 +3292,7 @@ bool ppu_module::analyse(u32 lib_toc, u32 entry, const u32 sec_end, con if (per_instruction_bytes) { - const bool error = per_instruction_bytes >= 200 && per_instruction_bytes / 4 >= utils::aligned_div(::size32(funcs), 128); + const bool error = per_instruction_bytes >= 200 && per_instruction_bytes / 4 >= rx::aligned_div(::size32(funcs), 128); (error ? ppu_log.error : ppu_log.notice)("%d instructions will be compiled on per-instruction basis in total", per_instruction_bytes / 4); } diff --git a/rpcs3/Emu/Cell/PPUAnalyser.h b/rpcs3/Emu/Cell/PPUAnalyser.h index abb76f1bc..2668bb3dc 100644 --- a/rpcs3/Emu/Cell/PPUAnalyser.h +++ b/rpcs3/Emu/Cell/PPUAnalyser.h @@ -1,11 +1,12 @@ #pragma once +#include #include #include #include #include -#include "util/types.hpp" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/types.hpp" #include "util/to_endian.hpp" #include "rx/EnumBitSet.hpp" @@ -218,7 +219,7 @@ struct ppu_module : public Type const u32 seg_size = seg.size; const u32 seg_addr = seg.addr; - if (seg_size >= std::max(size_bytes, 1) && addr <= utils::align(seg_addr + seg_size, 0x10000) - size_bytes) + if (seg_size >= std::max(size_bytes, 1) && addr <= rx::alignUp(seg_addr + seg_size, 0x10000) - size_bytes) { return reinterpret_cast*>(static_cast(seg.ptr) + (addr - seg_addr)); } diff --git a/rpcs3/Emu/Cell/PPUDisAsm.cpp b/rpcs3/Emu/Cell/PPUDisAsm.cpp index 78bf17e02..71d8d7d1d 100644 --- a/rpcs3/Emu/Cell/PPUDisAsm.cpp +++ b/rpcs3/Emu/Cell/PPUDisAsm.cpp @@ -4,7 +4,7 @@ #include "PPUAnalyser.h" #include "Emu/IdManager.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include @@ -222,7 +222,7 @@ std::pair PPUDisAsm::try_get_const_op_gpr_value(u32 re GET_CONST_REG(reg_rs, op.rs); - return {form, utils::rol64(reg_rs, op.sh64) & (~0ull << (op.mbe64 ^ 63))}; + return {form, rx::rol64(reg_rs, op.sh64) & (~0ull << (op.mbe64 ^ 63))}; } case ppu_itype::OR: { diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp index 646f2fc27..92fa96cdb 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/PPUInterpreter.cpp @@ -15,7 +15,7 @@ #include #include -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "util/v128.hpp" #include "util/simd.hpp" #include "util/sysinfo.hpp" @@ -3509,7 +3509,7 @@ auto RLWIMI() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 mask = ppu_rotate_mask(32 + op.mb32, 32 + op.me32); - ppu.gpr[op.ra] = (ppu.gpr[op.ra] & ~mask) | (dup32(utils::rol32(static_cast(ppu.gpr[op.rs]), op.sh32)) & mask); + ppu.gpr[op.ra] = (ppu.gpr[op.ra] & ~mask) | (dup32(rx::rol32(static_cast(ppu.gpr[op.rs]), op.sh32)) & mask); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); }; @@ -3524,7 +3524,7 @@ auto RLWINM() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - ppu.gpr[op.ra] = dup32(utils::rol32(static_cast(ppu.gpr[op.rs]), op.sh32)) & ppu_rotate_mask(32 + op.mb32, 32 + op.me32); + ppu.gpr[op.ra] = dup32(rx::rol32(static_cast(ppu.gpr[op.rs]), op.sh32)) & ppu_rotate_mask(32 + op.mb32, 32 + op.me32); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); }; @@ -3539,7 +3539,7 @@ auto RLWNM() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - ppu.gpr[op.ra] = dup32(utils::rol32(static_cast(ppu.gpr[op.rs]), ppu.gpr[op.rb] & 0x1f)) & ppu_rotate_mask(32 + op.mb32, 32 + op.me32); + ppu.gpr[op.ra] = dup32(rx::rol32(static_cast(ppu.gpr[op.rs]), ppu.gpr[op.rb] & 0x1f)) & 
ppu_rotate_mask(32 + op.mb32, 32 + op.me32); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); }; @@ -3634,7 +3634,7 @@ auto RLDICL() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], op.sh64) & (~0ull >> op.mbe64); + ppu.gpr[op.ra] = rx::rol64(ppu.gpr[op.rs], op.sh64) & (~0ull >> op.mbe64); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); }; @@ -3649,7 +3649,7 @@ auto RLDICR() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], op.sh64) & (~0ull << (op.mbe64 ^ 63)); + ppu.gpr[op.ra] = rx::rol64(ppu.gpr[op.rs], op.sh64) & (~0ull << (op.mbe64 ^ 63)); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); }; @@ -3664,7 +3664,7 @@ auto RLDIC() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], op.sh64) & ppu_rotate_mask(op.mbe64, op.sh64 ^ 63); + ppu.gpr[op.ra] = rx::rol64(ppu.gpr[op.rs], op.sh64) & ppu_rotate_mask(op.mbe64, op.sh64 ^ 63); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); }; @@ -3680,7 +3680,7 @@ auto RLDIMI() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { const u64 mask = ppu_rotate_mask(op.mbe64, op.sh64 ^ 63); - ppu.gpr[op.ra] = (ppu.gpr[op.ra] & ~mask) | (utils::rol64(ppu.gpr[op.rs], op.sh64) & mask); + ppu.gpr[op.ra] = (ppu.gpr[op.ra] & ~mask) | (rx::rol64(ppu.gpr[op.rs], op.sh64) & mask); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); }; @@ -3695,7 +3695,7 @@ auto RLDCL() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], ppu.gpr[op.rb] & 0x3f) & (~0ull >> op.mbe64); + ppu.gpr[op.ra] = rx::rol64(ppu.gpr[op.rs], ppu.gpr[op.rb] & 0x3f) & (~0ull >> op.mbe64); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); }; @@ -3710,7 +3710,7 @@ auto RLDCR() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - ppu.gpr[op.ra] = utils::rol64(ppu.gpr[op.rs], ppu.gpr[op.rb] & 0x3f) & (~0ull << (op.mbe64 ^ 63)); + ppu.gpr[op.ra] = rx::rol64(ppu.gpr[op.rs], ppu.gpr[op.rb] & 0x3f) & (~0ull << (op.mbe64 ^ 63)); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.ra], 0); }; @@ -3842,7 +3842,7 @@ auto MULHDU() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - ppu.gpr[op.rd] = utils::umulh64(ppu.gpr[op.ra], ppu.gpr[op.rb]); + ppu.gpr[op.rd] = rx::umulh64(ppu.gpr[op.ra], ppu.gpr[op.rb]); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); }; @@ -4243,7 +4243,7 @@ auto MULHD() static const auto exec = [](ppu_thread& ppu, ppu_opcode_t op) { - ppu.gpr[op.rd] = utils::mulh64(ppu.gpr[op.ra], ppu.gpr[op.rb]); + ppu.gpr[op.rd] = rx::mulh64(ppu.gpr[op.ra], ppu.gpr[op.rb]); if constexpr (((Flags == has_rc) || ...)) ppu_cr_set(ppu, 0, ppu.gpr[op.rd], 0); }; @@ -4675,7 +4675,7 @@ auto MULLD() ppu.gpr[op.rd] = RA * RB; if (op.oe) [[unlikely]] { - const s64 high = utils::mulh64(RA, RB); + const s64 high = rx::mulh64(RA, RB); ppu_ov_set(ppu, high != s64(ppu.gpr[op.rd]) >> 63); } if constexpr (((Flags == has_rc) || ...)) diff --git a/rpcs3/Emu/Cell/PPUModule.cpp b/rpcs3/Emu/Cell/PPUModule.cpp index 65c3ff0b6..3163ce199 100644 --- a/rpcs3/Emu/Cell/PPUModule.cpp +++ b/rpcs3/Emu/Cell/PPUModule.cpp @@ -28,7 +28,8 @@ #include #include #include -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include 
"rx/align.hpp" LOG_CHANNEL(ppu_loader); @@ -341,7 +342,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link, utils::serial* ar = n if (!hle_funcs_addr) hle_funcs_addr = vm::alloc(::size32(hle_funcs) * 8, vm::main); else - vm::page_protect(hle_funcs_addr, utils::align(::size32(hle_funcs) * 8, 0x1000), 0, vm::page_writable); + vm::page_protect(hle_funcs_addr, rx::alignUp(::size32(hle_funcs) * 8, 0x1000), 0, vm::page_writable); // Initialize as PPU executable code ppu_register_range(hle_funcs_addr, ::size32(hle_funcs) * 8); @@ -359,7 +360,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link, utils::serial* ar = n } // Set memory protection to read-only - vm::page_protect(hle_funcs_addr, utils::align(::size32(hle_funcs) * 8, 0x1000), 0, 0, vm::page_writable); + vm::page_protect(hle_funcs_addr, rx::alignUp(::size32(hle_funcs) * 8, 0x1000), 0, 0, vm::page_writable); // Initialize function names const bool is_first = g_ppu_function_names.empty(); @@ -489,7 +490,7 @@ static void ppu_initialize_modules(ppu_linkage_info* link, utils::serial* ar = n } else { - const u32 next = utils::align(alloc_addr, variable.second.align); + const u32 next = rx::alignUp(alloc_addr, variable.second.align); const u32 end = next + variable.second.size - 1; if (!next || (end >> 16 != alloc_addr >> 16)) @@ -1191,7 +1192,7 @@ static void ppu_check_patch_spu_images(const ppu_module& mod, const ppu u32 prev_bound = 0; - for (u32 i = find_first_of_multiple(seg_view, prefixes, 0); i < seg.size; i = find_first_of_multiple(seg_view, prefixes, utils::align(i + 1, 4))) + for (u32 i = find_first_of_multiple(seg_view, prefixes, 0); i < seg.size; i = find_first_of_multiple(seg_view, prefixes, rx::alignUp(i + 1, 4))) { const auto elf_header = ensure(mod.get_ptr(seg.addr + i)); @@ -1201,7 +1202,7 @@ static void ppu_check_patch_spu_images(const ppu_module& mod, const ppu const u32 old_i = i; u32 guid_start = umax, guid_end = umax; - for (u32 search = i & -128, tries = 10; tries && search >= prev_bound; tries--, search = utils::sub_saturate(search, 128)) + for (u32 search = i & -128, tries = 10; tries && search >= prev_bound; tries--, search = rx::sub_saturate(search, 128)) { if (seg_view[search] != 0x42 && seg_view[search] != 0x43) { @@ -1271,7 +1272,7 @@ static void ppu_check_patch_spu_images(const ppu_module& mod, const ppu if (addr_last >= 0x80 && valid_count >= 2) { const u32 begin = i & -128; - u32 end = std::min(seg.size, utils::align(i + addr_last + 256, 128)); + u32 end = std::min(seg.size, rx::alignUp(i + addr_last + 256, 128)); u32 guessed_ls_addr = 0; @@ -1611,7 +1612,7 @@ shared_ptr ppu_load_prx(const ppu_prx_object& elf, bool virtual_load, c if (virtual_load) { - addr = std::exchange(allocating_address, allocating_address + utils::align(mem_size, 0x10000)); + addr = std::exchange(allocating_address, allocating_address + rx::alignUp(mem_size, 0x10000)); } else { @@ -1625,7 +1626,7 @@ shared_ptr ppu_load_prx(const ppu_prx_object& elf, bool virtual_load, c // Leave additional room for the analyser so it can safely access beyond limit a bit // Because with VM the address sapce is not really a limit so any u32 address is valid there, here it is UB to create pointer that goes beyond the boundaries // TODO: Use make_shared_for_overwrite when all compilers support it - const usz alloc_size = utils::align(mem_size, 0x10000) + 4096; + const usz alloc_size = rx::alignUp(mem_size, 0x10000) + 4096; prx->allocations.push_back(std::shared_ptr(new u8[alloc_size])); _seg.ptr = prx->allocations.back().get(); 
std::memset(static_cast(_seg.ptr) + prog.bin.size(), 0, alloc_size - 4096 - prog.bin.size()); @@ -1725,7 +1726,7 @@ shared_ptr ppu_load_prx(const ppu_prx_object& elf, bool virtual_load, c { const auto& rel = reinterpret_cast(prog.bin[i]); - if (rel.offset >= utils::align(::at32(prx->segs, rel.index_addr).size, 0x100)) + if (rel.offset >= rx::alignUp(::at32(prx->segs, rel.index_addr).size, 0x100)) { fmt::throw_exception("Relocation offset out of segment memory! (offset=0x%x, index_addr=%u, seg_size=0x%x)", rel.offset, rel.index_addr, prx->segs[rel.index_addr].size); } @@ -2201,7 +2202,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str // Leave additional room for the analyser so it can safely access beyond limit a bit // Because with VM the address sapce is not really a limit so any u32 address is valid there, here it is UB to create pointer that goes beyond the boundaries // TODO: Use make_shared_for_overwrite when all compilers support it - const usz alloc_size = utils::align(size, 0x10000) + 4096; + const usz alloc_size = rx::alignUp(size, 0x10000) + 4096; _main.allocations.push_back(std::shared_ptr(new u8[alloc_size])); _seg.ptr = _main.allocations.back().get(); std::memset(static_cast(_seg.ptr) + prog.bin.size(), 0, alloc_size - 4096 - prog.bin.size()); @@ -2247,7 +2248,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str else { // For backwards compatibility: already loaded memory will always be writable - const u32 size0 = utils::align(size + addr % 0x10000, 0x10000); + const u32 size0 = rx::alignUp(size + addr % 0x10000, 0x10000); const u32 addr0 = addr & -0x10000; vm::page_protect(addr0, size0, 0, vm::page_writable | vm::page_readable, vm::page_executable); } @@ -2721,7 +2722,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str default: { // According to elad335, the min value seems to be 64KB instead of the expected 4KB (SYS_PROCESS_PARAM_STACK_SIZE_MIN) - primary_stacksize = utils::align(std::clamp(sz, 0x10000, SYS_PROCESS_PARAM_STACK_SIZE_MAX), 4096); + primary_stacksize = rx::alignUp(std::clamp(sz, 0x10000, SYS_PROCESS_PARAM_STACK_SIZE_MAX), 4096); break; } } @@ -2738,29 +2739,29 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str if (!Emu.data.empty()) { std::memcpy(vm::base(ppu->stack_addr + ppu->stack_size - ::size32(Emu.data)), Emu.data.data(), Emu.data.size()); - ppu->gpr[1] -= utils::align(::size32(Emu.data), 0x10); + ppu->gpr[1] -= rx::alignUp(::size32(Emu.data), 0x10); } // Initialize process arguments // Calculate storage requirements on the stack - const u32 pointers_storage_size = u32{sizeof(u64)} * utils::align(::size32(Emu.envp) + ::size32(Emu.argv) + 2, 2); + const u32 pointers_storage_size = u32{sizeof(u64)} * rx::alignUp(::size32(Emu.envp) + ::size32(Emu.argv) + 2, 2); u32 stack_alloc_size = pointers_storage_size; for (const auto& arg : Emu.argv) { - stack_alloc_size += utils::align(::size32(arg) + 1, 0x10); + stack_alloc_size += rx::alignUp(::size32(arg) + 1, 0x10); } for (const auto& arg : Emu.envp) { - stack_alloc_size += utils::align(::size32(arg) + 1, 0x10); + stack_alloc_size += rx::alignUp(::size32(arg) + 1, 0x10); } ensure(ppu->stack_size > stack_alloc_size); - vm::ptr args = vm::cast(static_cast(ppu->stack_addr + ppu->stack_size - stack_alloc_size - utils::align(::size32(Emu.data), 0x10))); + vm::ptr args = vm::cast(static_cast(ppu->stack_addr + ppu->stack_size - stack_alloc_size - rx::alignUp(::size32(Emu.data), 0x10))); 
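// ---- editor note (illustrative, not part of the original patch) ----
// Layout of the guest stack region prepared here, as implied by the arithmetic above
// (ascending addresses): [argv pointers, 0, envp pointers, 0 -- the whole array padded to an
// even number of u64 slots] [argument/environment strings, each rounded up to 0x10 bytes via
// rx::alignUp] [Emu.data, rounded up to 0x10 bytes, at the very top of the stack]. `args`
// points at the first pointer slot and `args_data` (defined just below) at the string storage.
// ---- end editor note ----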
vm::ptr args_data = vm::cast(args.addr() + pointers_storage_size); const vm::ptr argv = args; @@ -2772,7 +2773,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str std::memcpy(args_data.get_ptr(), arg.data(), arg_size); *args++ = args_data.addr(); - args_data = vm::cast(args_data.addr() + utils::align(arg_size, 0x10)); + args_data = vm::cast(args_data.addr() + rx::alignUp(arg_size, 0x10)); } *args++ = 0; @@ -2787,7 +2788,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str std::memcpy(args_data.get_ptr(), arg.data(), arg_size); *args++ = args_data.addr(); - args_data = vm::cast(args_data.addr() + utils::align(arg_size, 0x10)); + args_data = vm::cast(args_data.addr() + rx::alignUp(arg_size, 0x10)); } *args++ = 0; @@ -2855,7 +2856,7 @@ bool ppu_load_exec(const ppu_exec_object& elf, bool virtual_load, const std::str if (prog.p_type == 0x1u /* LOAD */ && prog.p_memsz && (prog.p_flags & 0x022000002) == 0u /* W */) { // Set memory protection to read-only when necessary (only if PPU-W, SPU-W, RSX-W are all disabled) - ensure(vm::page_protect(addr, utils::align(size, 0x1000), 0, 0, vm::page_writable)); + ensure(vm::page_protect(addr, rx::alignUp(size, 0x1000), 0, 0, vm::page_writable)); } } @@ -2934,7 +2935,7 @@ std::pair, CellError> ppu_load_overlay(const ppu_exec_ob // Leave additional room for the analyser so it can safely access beyond limit a bit // Because with VM the address sapce is not really a limit so any u32 address is valid there, here it is UB to create pointer that goes beyond the boundaries // TODO: Use make_shared_for_overwrite when all compilers support it - const usz alloc_size = utils::align(size, 0x10000) + 4096; + const usz alloc_size = rx::alignUp(size, 0x10000) + 4096; ovlm->allocations.push_back(std::shared_ptr(new u8[alloc_size])); _seg.ptr = ovlm->allocations.back().get(); std::memset(static_cast(_seg.ptr) + prog.bin.size(), 0, alloc_size - 4096 - prog.bin.size()); @@ -3230,7 +3231,7 @@ bool ppu_load_rel_exec(const ppu_rel_object& elf) { if (s.sh_type != sec_type::sht_progbits) { - memsize = utils::align(memsize + vm::cast(s.sh_size), 128); + memsize = rx::alignUp(memsize + vm::cast(s.sh_size), 128); } } @@ -3278,7 +3279,7 @@ bool ppu_load_rel_exec(const ppu_rel_object& elf) relm.secs.emplace_back(_sec); std::memcpy(vm::base(addr), s.get_bin().data(), size); - addr = utils::align(addr + size, 128); + addr = rx::alignUp(addr + size, 128); } } diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 0f42a7b78..b1d559ef7 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -62,7 +62,8 @@ #include #include -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" #include "util/vm.hpp" #include "util/v128.hpp" #include "util/simd.hpp" @@ -217,7 +218,7 @@ public: user acquire(u64 amount) { - amount = utils::aligned_div(amount, k_block_size); + amount = rx::aligned_div(amount, k_block_size); u32 allocated = 0; while (!m_free.fetch_op([&, this](u32& value) @@ -225,7 +226,7 @@ public: if (value >= amount || value == m_total) { // Allow at least allocation, make 0 the "memory unavailable" sign value for atomic waiting efficiency - const u32 new_val = static_cast(utils::sub_saturate(value, amount)); + const u32 new_val = static_cast(rx::sub_saturate(value, amount)); allocated = value - new_val; value = new_val; return true; @@ -869,7 +870,7 @@ extern void ppu_register_range(u32 addr, u32 size) return; } - size = utils::align(size + addr % 0x10000, 
0x10000); + size = rx::alignUp(size + addr % 0x10000, 0x10000); addr &= -0x10000; // Register executable range at @@ -1816,7 +1817,7 @@ std::vector> ppu_thread::dump_callstack_list() const if (pos_dist >= inst_pos.size()) { - const u32 inst_bound = utils::align(pos, 256); + const u32 inst_bound = rx::alignUp(pos, 256); const usz old_size = inst_pos.size(); const usz new_size = pos_dist + (inst_bound - pos) / 4 + 1; @@ -1903,7 +1904,7 @@ std::vector> ppu_thread::dump_callstack_list() const for (u32 back = 1; back < 20; back++) { - be_t& opcode = get_inst(utils::sub_saturate(_cia, back * 4)); + be_t& opcode = get_inst(rx::sub_saturate(_cia, back * 4)); if (!opcode) { @@ -3588,11 +3589,11 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value) return false; } - utils::prefetch_read(ppu.rdata); - utils::prefetch_read(ppu.rdata + 64); + rx::prefetch_read(ppu.rdata); + rx::prefetch_read(ppu.rdata + 64); ppu.last_faddr = addr; ppu.last_ftime = res.load() & -128; - ppu.last_ftsc = utils::get_tsc(); + ppu.last_ftsc = rx::get_tsc(); return false; } default: @@ -3699,7 +3700,7 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value) ppu.last_faddr = addr; ppu.last_ftime = old_rtime & -128; - ppu.last_ftsc = utils::get_tsc(); + ppu.last_ftsc = rx::get_tsc(); std::memcpy(&ppu.rdata[addr & 0x78], &old_data, 8); } @@ -3941,7 +3942,7 @@ namespace fs::stat_t get_stat() override { fs::stat_t stat = m_file.get_stat(); - stat.size = std::min(utils::sub_saturate(stat.size, m_off), m_max_size); + stat.size = std::min(rx::sub_saturate(stat.size, m_off), m_max_size); stat.is_writable = false; return stat; } @@ -3960,7 +3961,7 @@ namespace u64 read_at(u64 offset, void* buffer, u64 size) override { - return m_file.read_at(offset + m_off, buffer, std::min(size, utils::sub_saturate(m_max_size, offset))); + return m_file.read_at(offset + m_off, buffer, std::min(size, rx::sub_saturate(m_max_size, offset))); } u64 write(const void*, u64) override @@ -3988,7 +3989,7 @@ namespace u64 size() override { - return std::min(utils::sub_saturate(m_file.size(), m_off), m_max_size); + return std::min(rx::sub_saturate(m_file.size(), m_off), m_max_size); } }; } // namespace @@ -5624,7 +5625,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s } // Initialize compiler instance - while (jits.size() < utils::aligned_div(module_counter, c_moudles_per_jit) && is_being_used_in_emulation) + while (jits.size() < rx::aligned_div(module_counter, c_moudles_per_jit) && is_being_used_in_emulation) { jits.emplace_back(std::make_shared(s_link_table, g_cfg.core.llvm_cpu, 0, symbols_cement)); @@ -5652,7 +5653,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s const bool divide_by_twenty = !workload.empty(); const usz increment_link_count_at = (divide_by_twenty ? 
20 : 1); - g_progr_ptotal += static_cast(utils::aligned_div(link_workload.size(), increment_link_count_at)); + g_progr_ptotal += static_cast(rx::aligned_div(link_workload.size(), increment_link_count_at)); usz mod_index = umax; @@ -5785,7 +5786,7 @@ bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_s bool ppu_initialize(const ppu_module& info, bool check_only, u64 file_size) { - concurent_memory_limit memory_limit(utils::aligned_div(utils::get_total_memory(), 2)); + concurent_memory_limit memory_limit(rx::aligned_div(utils::get_total_memory(), 2)); return ppu_initialize(info, check_only, file_size, memory_limit); } diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp index 419c4a8dd..85ec70e0b 100644 --- a/rpcs3/Emu/Cell/PPUTranslator.cpp +++ b/rpcs3/Emu/Cell/PPUTranslator.cpp @@ -847,7 +847,7 @@ Value* PPUTranslator::ReadMemory(Value* addr, Type* type, bool is_be, u32 align) m_may_be_mmio = false; - if (auto ptr = m_info.get_ptr(std::max(m_info.segs[0].addr, (m_reloc ? m_reloc->addr : 0) + utils::sub_saturate(::narrow(m_addr), sizeof(instructions_to_test) / 2)))) + if (auto ptr = m_info.get_ptr(std::max(m_info.segs[0].addr, (m_reloc ? m_reloc->addr : 0) + rx::sub_saturate(::narrow(m_addr), sizeof(instructions_to_test) / 2)))) { if (ppu_test_address_may_be_mmio(std::span(ptr->insts))) { @@ -920,7 +920,7 @@ void PPUTranslator::WriteMemory(Value* addr, Value* value, bool is_be, u32 align be_t insts[128]; }; - if (auto ptr = m_info.get_ptr(std::max(m_info.segs[0].addr, (m_reloc ? m_reloc->addr : 0) + utils::sub_saturate(::narrow(m_addr), sizeof(instructions_to_test) / 2)))) + if (auto ptr = m_info.get_ptr(std::max(m_info.segs[0].addr, (m_reloc ? m_reloc->addr : 0) + rx::sub_saturate(::narrow(m_addr), sizeof(instructions_to_test) / 2)))) { if (ppu_test_address_may_be_mmio(std::span(ptr->insts))) { diff --git a/rpcs3/Emu/Cell/RawSPUThread.cpp b/rpcs3/Emu/Cell/RawSPUThread.cpp index be8d49782..ca8395aa0 100644 --- a/rpcs3/Emu/Cell/RawSPUThread.cpp +++ b/rpcs3/Emu/Cell/RawSPUThread.cpp @@ -1,7 +1,8 @@ #include "stdafx.h" #include "Emu/IdManager.h" #include "Loader/ELF.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" #include "SPUThread.h" @@ -450,7 +451,7 @@ void spu_load_rel_exec(const spu_rel_object& elf) { if (shdr.sh_type == sec_type::sht_progbits && shdr.sh_flags().all_of(sh_flag::shf_alloc)) { - total_memsize = utils::align(total_memsize + shdr.sh_size, 4); + total_memsize = rx::alignUp(total_memsize + shdr.sh_size, 4); } } @@ -462,7 +463,7 @@ void spu_load_rel_exec(const spu_rel_object& elf) if (shdr.sh_type == sec_type::sht_progbits && shdr.sh_flags().all_of(sh_flag::shf_alloc)) { std::memcpy(spu->_ptr(offs), shdr.get_bin().data(), shdr.sh_size); - offs = utils::align(offs + shdr.sh_size, 4); + offs = rx::alignUp(offs + shdr.sh_size, 4); } } diff --git a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp index b57a07b53..187d022c8 100644 --- a/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUASMJITRecompiler.cpp @@ -9,7 +9,8 @@ #include "SPUInterpreter.h" #include "Crypto/sha1.h" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" #include "util/v128.hpp" #include "util/sysinfo.hpp" @@ -282,7 +283,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func) words_align = 64; const u32 starta = start & -64; - const u32 enda = utils::align(end, 64); + const u32 enda = rx::alignUp(end, 64); const u32 sizea = (enda - starta) / 64; ensure(sizea); @@ 
-363,7 +364,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func) words_align = 32; const u32 starta = start & -32; - const u32 enda = utils::align(end, 32); + const u32 enda = rx::alignUp(end, 32); const u32 sizea = (enda - starta) / 32; ensure(sizea); @@ -486,7 +487,7 @@ spu_function_t spu_recompiler::compile(spu_program&& _func) words_align = 32; const u32 starta = start & -32; - const u32 enda = utils::align(end, 32); + const u32 enda = rx::alignUp(end, 32); const u32 sizea = (enda - starta) / 32; ensure(sizea); @@ -3211,7 +3212,7 @@ void spu_recompiler::ROTQBYI(spu_opcode_t op) } else if (s == 4 || s == 8 || s == 12) { - c->pshufd(va, va, utils::rol8(0xE4, s / 2)); + c->pshufd(va, va, rx::rol8(0xE4, s / 2)); } else if (utils::has_ssse3()) { diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index 9657d6d4e..9acaaf515 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -25,6 +25,7 @@ #include #include +#include "rx/align.hpp" #include "util/v128.hpp" #include "util/simd.hpp" #include "util/sysinfo.hpp" @@ -658,7 +659,7 @@ std::deque spu_cache::get() const u32 size = block_info.size; const u32 addr = block_info.addr; - if (utils::add_saturate(addr, size * 4) > SPU_LS_SIZE) + if (rx::add_saturate(addr, size * 4) > SPU_LS_SIZE) { break; } @@ -1253,7 +1254,7 @@ void spu_cache::initialize(bool build_existing_cache) fmt::append(dump, "\n\t%49s", ""); - for (u32 i = 0; i < std::min(f->data.size(), std::max(64, utils::aligned_div(depth_m, 4))); i++) + for (u32 i = 0; i < std::min(f->data.size(), std::max(64, rx::aligned_div(depth_m, 4))); i++) { fmt::append(dump, "%-10s", g_spu_iname.decode(std::bit_cast>(f->data[i]))); } @@ -2308,12 +2309,12 @@ std::vector spu_thread::discover_functions(u32 base_addr, std::span(base_addr, 0x10); i < std::min(base_addr + ::size32(ls), 0x3FFF0); i += 0x10) + for (u32 i = rx::alignUp(base_addr, 0x10); i < std::min(base_addr + ::size32(ls), 0x3FFF0); i += 0x10) { // Search for BRSL LR and BRASL LR or BR // TODO: BISL const v128 inst = read_from_ptr>(ls.data(), i - base_addr); - const v128 cleared_i16 = gv_and32(inst, v128::from32p(utils::rol32(~0xffff, 7))); + const v128 cleared_i16 = gv_and32(inst, v128::from32p(rx::rol32(~0xffff, 7))); const v128 eq_brsl = gv_eq32(cleared_i16, v128::from32p(0x66u << 23)); const v128 eq_brasl = gv_eq32(cleared_i16, brasl_mask); const v128 eq_br = gv_eq32(cleared_i16, v128::from32p(0x64u << 23)); @@ -5376,7 +5377,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s const usz block_tail = duplicate_positions[it_begin - it_tail]; // Check if the distance is precisely two times from the end - if (reg_state_it.size() - block_start != utils::rol64(reg_state_it.size() - block_tail, 1)) + if (reg_state_it.size() - block_start != rx::rol64(reg_state_it.size() - block_tail, 1)) { continue; } @@ -7143,7 +7144,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s v_reg2 = 3, }; - for (auto it = infos.lower_bound(utils::sub_saturate(pattern.put_pc, 512)); it != infos.end() && it->first < pattern.put_pc + 512; it++) + for (auto it = infos.lower_bound(rx::sub_saturate(pattern.put_pc, 512)); it != infos.end() && it->first < pattern.put_pc + 512; it++) { for (auto& state : it->second->end_reg_state) { @@ -7622,7 +7623,7 @@ struct spu_llvm // Notify all before queue runs out if there is considerable excess // Optimized that: if there are many workers, it acts soon // If there are only a few 
workers, it postpones notifications until there is some more workload - if (notify_compile_count && std::min(7, utils::aligned_div(worker_count * 2, 3) + 2) <= compile_pending) + if (notify_compile_count && std::min(7, rx::aligned_div(worker_count * 2, 3) + 2) <= compile_pending) { for (usz i = 0; i < worker_count; i++) { diff --git a/rpcs3/Emu/Cell/SPUInterpreter.cpp b/rpcs3/Emu/Cell/SPUInterpreter.cpp index 5e64a5aa2..f34fea30c 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.cpp +++ b/rpcs3/Emu/Cell/SPUInterpreter.cpp @@ -6,7 +6,7 @@ #include "Emu/Cell/SPUAnalyser.h" #include "Emu/system_config.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "util/v128.hpp" #include "util/simd.hpp" #include "util/sysinfo.hpp" @@ -289,7 +289,7 @@ bool ROT(spu_thread& spu, spu_opcode_t op) for (u32 i = 0; i < 4; i++) { - spu.gpr[op.rt]._u32[i] = utils::rol32(a._u32[i], b._u32[i]); + spu.gpr[op.rt]._u32[i] = rx::rol32(a._u32[i], b._u32[i]); } return true; } @@ -344,7 +344,7 @@ bool ROTH(spu_thread& spu, spu_opcode_t op) for (u32 i = 0; i < 8; i++) { - spu.gpr[op.rt]._u16[i] = utils::rol16(a._u16[i], b._u16[i]); + spu.gpr[op.rt]._u16[i] = rx::rol16(a._u16[i], b._u16[i]); } return true; } diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp index 57aa5492c..16bf4c863 100644 --- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp @@ -1215,7 +1215,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator rsx::reservation_lock rsx_lock(raddr, 128); // Touch memory - utils::trigger_write_page_fault(vm::base(dest ^ (4096 / 2))); + rx::trigger_write_page_fault(vm::base(dest ^ (4096 / 2))); auto [old_res, ok] = res.fetch_op([&](u64& rval) { diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index c9b724572..f511b3c55 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -1,3 +1,4 @@ +#include "rx/align.hpp" #include "stdafx.h" #include "util/JIT.h" #include "util/date_time.h" @@ -31,7 +32,7 @@ #include #include #include "util/vm.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "util/v128.hpp" #include "util/simd.hpp" #include "util/sysinfo.hpp" @@ -448,7 +449,7 @@ mwaitx_func static void __mwaitx(u32 cycles, u32 cstate, const void* cline, cons // First bit indicates cstate, 0x0 for C.02 state (lower power) or 0x1 for C.01 state (higher power) waitpkg_func static void __tpause(u32 cycles, u32 cstate) { - const u64 tsc = utils::get_tsc() + cycles; + const u64 tsc = rx::get_tsc() + cycles; _tpause(cstate, tsc); } #endif @@ -522,7 +523,7 @@ namespace spu { // Slight pause if function is overburdened const auto count = atomic_instruction_table[pc_offset].observe() * 100ull; - busy_wait(count); + rx::busy_wait(count); } ensure(!spu.check_state()); @@ -1774,7 +1775,7 @@ void spu_thread::cpu_return() // Wait for all threads to have error codes if exited by sys_spu_thread_exit for (u32 status; !thread->exit_status.try_read(status) || status != thread->last_exit_status;) { - utils::pause(); + rx::pause(); } } } @@ -2307,60 +2308,6 @@ void spu_thread::push_snr(u32 number, u32 value) const u32 event_bit = SPU_EVENT_S1 >> (number & 1); const bool bitor_bit = !!((snr_config >> number) & 1); - // Redundant, g_use_rtm is checked inside tx_start now. 
- if (g_use_rtm && false) - { - bool channel_notify = false; - bool thread_notify = false; - - const bool ok = utils::tx_start([&] - { - channel_notify = (channel->data.raw() == spu_channel::bit_wait); - thread_notify = (channel->data.raw() & spu_channel::bit_count) == 0; - - if (channel_notify) - { - ensure(channel->jostling_value.raw() == spu_channel::bit_wait); - channel->jostling_value.raw() = value; - channel->data.raw() = 0; - } - else if (bitor_bit) - { - channel->data.raw() &= ~spu_channel::bit_wait; - channel->data.raw() |= spu_channel::bit_count | value; - } - else - { - channel->data.raw() = spu_channel::bit_count | value; - } - - if (thread_notify) - { - ch_events.raw().events |= event_bit; - - if (ch_events.raw().mask & event_bit) - { - ch_events.raw().count = 1; - thread_notify = ch_events.raw().waiting != 0; - } - else - { - thread_notify = false; - } - } - }); - - if (ok) - { - if (channel_notify) - channel->data.notify_one(); - if (thread_notify) - this->notify(); - - return; - } - } - // Lock event channel in case it needs event notification ch_events.atomic_op([](ch_events_t& ev) { @@ -2527,7 +2474,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* range_lock = _this->range_lock; } - utils::prefetch_write(range_lock); + rx::prefetch_write(range_lock); for (u32 size = args.size, size0; is_get; size -= size0, dst += size0, src += size0, eal += size0) { @@ -2541,7 +2488,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* } else if (++i < 25) [[likely]] { - busy_wait(300); + rx::busy_wait(300); } else { @@ -2706,7 +2653,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* if (true || ++i < 10) { - busy_wait(500); + rx::busy_wait(500); } else { @@ -2947,7 +2894,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8* } u32 range_addr = eal & -128; - u32 range_end = utils::align(eal + size, 128); + u32 range_end = rx::alignUp(eal + size, 128); // Handle the case of crossing 64K page borders (TODO: maybe split in 4K fragments?) 
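 // (Editor's note, not part of the patch: a minimal worked example of the border test below,
 // assuming rx::alignUp(x, a) rounds x up to the next multiple of a, matching the old utils::align.
 //   eal = 0x2FFC0, size = 0x100
 //   range_addr = eal & -128                   = 0x2FF80 -> 64K block index 0x2
 //   range_end  = rx::alignUp(eal + size, 128) = 0x30100 -> (range_end - 1) >> 16 = 0x3
 //   The two block indices differ, so the locked range straddles a 64K page border
 //   and takes the split path guarded by the check below.)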
if (range_addr >> 16 != (range_end - 1) >> 16) @@ -3131,7 +3078,7 @@ plain_access: bool spu_thread::do_dma_check(const spu_mfc_cmd& args) { - const u32 mask = utils::rol32(1, args.tag); + const u32 mask = rx::rol32(1, args.tag); if (mfc_barrier & mask || (args.cmd & (MFC_BARRIER_MASK | MFC_FENCE_MASK) && mfc_fence & mask)) [[unlikely]] { @@ -3147,13 +3094,13 @@ bool spu_thread::do_dma_check(const spu_mfc_cmd& args) if ((mfc_queue[i].cmd & ~0xc) == MFC_BARRIER_CMD) { mfc_barrier |= -1; - mfc_fence |= utils::rol32(1, mfc_queue[i].tag); + mfc_fence |= rx::rol32(1, mfc_queue[i].tag); continue; } if (true) { - const u32 _mask = utils::rol32(1u, mfc_queue[i].tag); + const u32 _mask = rx::rol32(1u, mfc_queue[i].tag); // A command with barrier hard blocks that tag until it's been dealt with if (mfc_queue[i].cmd & MFC_BARRIER_MASK) @@ -3258,7 +3205,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args) u8* dst = this->ls + arg_lsa; // Assume success, prepare the next elements - arg_lsa += fetch_size * utils::align(s_size, 16); + arg_lsa += fetch_size * rx::alignUp(s_size, 16); item_ptr += fetch_size; arg_size -= fetch_size * 8; @@ -3266,11 +3213,11 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args) constexpr usz _128 = 128; // This whole function relies on many constraints to be met (crashes real MFC), we can a have minor optimization assuming EA alignment to be +16 with +16 byte transfers -#define MOV_T(type, index, _ea) \ - { \ - const usz ea = _ea; \ - *reinterpret_cast(dst + index * utils::align(sizeof(type), 16) + ea % (sizeof(type) < 16 ? 16 : 1)) = *reinterpret_cast(src + ea); \ - } \ +#define MOV_T(type, index, _ea) \ + { \ + const usz ea = _ea; \ + *reinterpret_cast(dst + index * rx::alignUp(sizeof(type), 16) + ea % (sizeof(type) < 16 ? 16 : 1)) = *reinterpret_cast(src + ea); \ + } \ void() #define MOV_128(index, ea) mov_rdata(*reinterpret_cast(dst + index * _128), *reinterpret_cast(src + (ea))) @@ -3522,7 +3469,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args) #undef MOV_T #undef MOV_128 // Optimization miss, revert changes - arg_lsa -= fetch_size * utils::align(s_size, 16); + arg_lsa -= fetch_size * rx::alignUp(s_size, 16); item_ptr -= fetch_size; arg_size += fetch_size * 8; } @@ -3604,7 +3551,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args) } } - arg_lsa += utils::align(size, 16); + arg_lsa += rx::alignUp(size, 16); } // Avoid inlining huge transfers because it intentionally drops range lock unlock else if (optimization_compatible == MFC_PUT_CMD && ((addr >> 28 == rsx::constants::local_mem_base >> 28) || (addr < RAW_SPU_BASE_ADDR && size - 1 <= 0x400 - 1 && (addr % 0x10000 + (size - 1)) < 0x10000))) @@ -3615,7 +3562,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args) if (!g_use_rtm) { - vm::range_lock(range_lock, addr & -128, utils::align(addr + size, 128) - (addr & -128)); + vm::range_lock(range_lock, addr & -128, rx::alignUp(addr + size, 128) - (addr & -128)); } } else @@ -3690,7 +3637,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args) } } - arg_lsa += utils::align(size, 16); + arg_lsa += rx::alignUp(size, 16); } else if (size) { @@ -3703,7 +3650,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args) transfer.lsa = arg_lsa | (addr & 0xf); transfer.size = size; - arg_lsa += utils::align(size, 16); + arg_lsa += rx::alignUp(size, 16); do_dma_transfer(this, transfer, ls); } @@ -3721,14 +3668,14 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args) { range_lock->release(0); - ch_stall_mask |= utils::rol32(1, args.tag); + ch_stall_mask |= 
rx::rol32(1, args.tag); if (!ch_stall_stat.get_count()) { set_events(SPU_EVENT_SN); } - ch_stall_stat.set_value(utils::rol32(1, args.tag) | ch_stall_stat.get_value()); + ch_stall_stat.set_value(rx::rol32(1, args.tag) | ch_stall_stat.get_value()); args.tag |= 0x80; // Set stalled status args.eal = ::narrow(reinterpret_cast(item_ptr) - this->ls); @@ -3853,7 +3800,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args) return false; }); - const u64 count2 = utils::get_tsc() - perf2.get(); + const u64 count2 = rx::get_tsc() - perf2.get(); if (count2 > 20000 && g_cfg.core.perf_report) [[unlikely]] { @@ -3881,11 +3828,11 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args) return false; } - utils::prefetch_read(rdata); - utils::prefetch_read(rdata + 64); + rx::prefetch_read(rdata); + rx::prefetch_read(rdata + 64); last_faddr = addr; last_ftime = res.load() & -128; - last_ftsc = utils::get_tsc(); + last_ftsc = rx::get_tsc(); return false; } default: @@ -3973,7 +3920,7 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args) if (!vm::check_addr(addr, vm::page_writable)) { - utils::trigger_write_page_fault(vm::base(addr)); + rx::trigger_write_page_fault(vm::base(addr)); } raddr = 0; @@ -4036,7 +3983,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write) } else if (k < 15) { - busy_wait(500); + rx::busy_wait(500); } else { @@ -4053,7 +4000,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write) } else if (j < 15) { - busy_wait(500); + rx::busy_wait(500); } else { @@ -4075,7 +4022,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write) else if (!g_use_rtm) { // Provoke page fault - utils::trigger_write_page_fault(vm::base(addr)); + rx::trigger_write_page_fault(vm::base(addr)); // Hard lock auto spu = cpu ? cpu->try_get() : nullptr; @@ -4102,7 +4049,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write) }); vm::reservation_acquire(addr) += 32; - result = utils::get_tsc() - perf0.get(); + result = rx::get_tsc() - perf0.get(); } if (result > 20000 && g_cfg.core.perf_report) [[unlikely]] @@ -4150,7 +4097,7 @@ bool spu_thread::do_mfc(bool can_escape, bool must_finish) auto process_command = [&](spu_mfc_cmd& args) { // Select tag bit in the tag mask or the stall mask - const u32 mask = utils::rol32(1, args.tag); + const u32 mask = rx::rol32(1, args.tag); if ((args.cmd & ~0xc) == MFC_BARRIER_CMD) { @@ -4240,7 +4187,7 @@ bool spu_thread::do_mfc(bool can_escape, bool must_finish) { // Get commands' execution mask // Mask bits are always set when mfc_transfers_shuffling is 0 - return static_cast((0 - (1u << std::min(g_cfg.core.mfc_transfers_shuffling, size))) | utils::get_tsc()); + return static_cast((0 - (1u << std::min(g_cfg.core.mfc_transfers_shuffling, size))) | rx::get_tsc()); }; // Process enqueued commands @@ -4733,7 +4680,7 @@ bool spu_thread::process_mfc_cmd() else #endif { - busy_wait(300); + rx::busy_wait(300); } if (getllar_spin_count == 3) @@ -4875,7 +4822,7 @@ bool spu_thread::process_mfc_cmd() if (i < 24) [[likely]] { i++; - busy_wait(300); + rx::busy_wait(300); } else { @@ -5159,7 +5106,7 @@ bool spu_thread::process_mfc_cmd() std::memcpy(dump.data, _ptr(ch_mfc_cmd.lsa & 0x3ff80), 128); } - const u32 mask = utils::rol32(1, ch_mfc_cmd.tag); + const u32 mask = rx::rol32(1, ch_mfc_cmd.tag); if ((mfc_barrier | mfc_fence) & mask) [[unlikely]] { @@ -5214,11 +5161,11 @@ bool spu_thread::process_mfc_cmd() } mfc_queue[mfc_size++] = ch_mfc_cmd; - mfc_fence |= utils::rol32(1, ch_mfc_cmd.tag); + mfc_fence |= rx::rol32(1, ch_mfc_cmd.tag); if (ch_mfc_cmd.cmd & 
MFC_BARRIER_MASK) { - mfc_barrier |= utils::rol32(1, ch_mfc_cmd.tag); + mfc_barrier |= rx::rol32(1, ch_mfc_cmd.tag); } return true; @@ -5267,11 +5214,11 @@ bool spu_thread::process_mfc_cmd() } mfc_size++; - mfc_fence |= utils::rol32(1, cmd.tag); + mfc_fence |= rx::rol32(1, cmd.tag); if (cmd.cmd & MFC_BARRIER_MASK) { - mfc_barrier |= utils::rol32(1, cmd.tag); + mfc_barrier |= rx::rol32(1, cmd.tag); } if (check_mfc_interrupts(pc + 4)) @@ -5297,7 +5244,7 @@ bool spu_thread::process_mfc_cmd() { mfc_queue[mfc_size++] = ch_mfc_cmd; mfc_barrier |= -1; - mfc_fence |= utils::rol32(1, ch_mfc_cmd.tag); + mfc_fence |= rx::rol32(1, ch_mfc_cmd.tag); } return true; @@ -5592,7 +5539,7 @@ retry: if (reading && res.locks && mask_hint & (SPU_EVENT_S1 | SPU_EVENT_S2)) { - busy_wait(100); + rx::busy_wait(100); goto retry; } @@ -5899,7 +5846,7 @@ s64 spu_thread::get_ch_value(u32 ch) } } - const usz seed = (utils::get_tsc() >> 8) % 100; + const usz seed = (rx::get_tsc() >> 8) % 100; #ifdef __linux__ const bool reservation_busy_waiting = false; @@ -5998,7 +5945,7 @@ s64 spu_thread::get_ch_value(u32 ch) { if (u32 work_count = g_spu_work_count) { - const u32 true_free = utils::sub_saturate(utils::get_thread_count(), 10); + const u32 true_free = rx::sub_saturate(utils::get_thread_count(), 10); if (work_count > true_free) { @@ -6123,7 +6070,7 @@ s64 spu_thread::get_ch_value(u32 ch) } else { - busy_wait(); + rx::busy_wait(); } continue; @@ -6490,7 +6437,7 @@ bool spu_thread::set_ch_value(u32 ch, u32 value) value &= 0x1f; // Reset stall status for specified tag - const u32 tag_mask = utils::rol32(1, value); + const u32 tag_mask = rx::rol32(1, value); if (ch_stall_mask & tag_mask) { @@ -7320,7 +7267,7 @@ bool spu_thread::try_load_debug_capture() void spu_thread::wakeup_delay(u32 div) const { if (g_cfg.core.spu_wakeup_delay_mask & (1u << index)) - thread_ctrl::wait_for_accurate(utils::aligned_div(+g_cfg.core.spu_wakeup_delay, div)); + thread_ctrl::wait_for_accurate(rx::aligned_div(+g_cfg.core.spu_wakeup_delay, div)); } spu_function_logger::spu_function_logger(spu_thread& spu, const char* func) noexcept @@ -7397,7 +7344,7 @@ s64 spu_channel::pop_wait(cpu_thread& spu, bool pop) for (int i = 0; i < 10; i++) { - busy_wait(); + rx::busy_wait(); if (!(data & bit_wait)) { @@ -7473,7 +7420,7 @@ bool spu_channel::push_wait(cpu_thread& spu, u32 value, bool push) return true; } - busy_wait(); + rx::busy_wait(); state = data; } @@ -7528,7 +7475,7 @@ std::pair spu_channel_4_t::pop_wait(cpu_thread& spu, bool pop_value) for (int i = 0; i < 10; i++) { - busy_wait(); + rx::busy_wait(); if (!atomic_storage::load(values.raw().waiting)) { diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp index df398e6bc..d541b68c8 100644 --- a/rpcs3/Emu/Memory/vm.cpp +++ b/rpcs3/Emu/Memory/vm.cpp @@ -1,4 +1,6 @@ #include "stdafx.h" + +#include "rx/align.hpp" #include "vm_locking.h" #include "vm_ptr.h" #include "vm_ref.h" @@ -14,7 +16,8 @@ #include #include "util/vm.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" #include "util/simd.hpp" #include "util/serialization.hpp" @@ -245,7 +248,7 @@ namespace vm // Try triggering a page fault (write) // TODO: Read memory if needed - utils::trigger_write_page_fault(vm::base(test / 4096 == begin / 4096 ? begin : test)); + rx::trigger_write_page_fault(vm::base(test / 4096 == begin / 4096 ? 
begin : test)); continue; } } @@ -258,7 +261,7 @@ namespace vm perf0.restart(); } - busy_wait(200); + rx::busy_wait(200); if (i >= 2 && !_cpu) { @@ -339,9 +342,9 @@ namespace vm auto range_lock = &*std::prev(std::end(vm::g_range_lock_set)); *range_lock = addr | u64{size} << 32 | flags; - utils::prefetch_read(g_range_lock_set + 0); - utils::prefetch_read(g_range_lock_set + 2); - utils::prefetch_read(g_range_lock_set + 4); + rx::prefetch_read(g_range_lock_set + 0); + rx::prefetch_read(g_range_lock_set + 2); + rx::prefetch_read(g_range_lock_set + 4); const auto range = utils::address_range::start_length(addr, size); @@ -364,7 +367,7 @@ namespace vm break; } - utils::pause(); + rx::pause(); } return range_lock; @@ -407,7 +410,7 @@ namespace vm } if (i < 100) - busy_wait(200); + rx::busy_wait(200); else std::this_thread::yield(); @@ -516,12 +519,12 @@ namespace vm if (to_prepare_memory) { // We have some spare time, prepare cache lines (todo: reservation tests here) - utils::prefetch_write(vm::get_super_ptr(addr)); - utils::prefetch_write(vm::get_super_ptr(addr) + 64); + rx::prefetch_write(vm::get_super_ptr(addr)); + rx::prefetch_write(vm::get_super_ptr(addr) + 64); to_prepare_memory = false; } - busy_wait(200); + rx::busy_wait(200); } else { @@ -552,9 +555,9 @@ namespace vm addr1 = static_cast(addr) | is_shared; } - utils::prefetch_read(g_range_lock_set + 0); - utils::prefetch_read(g_range_lock_set + 2); - utils::prefetch_read(g_range_lock_set + 4); + rx::prefetch_read(g_range_lock_set + 0); + rx::prefetch_read(g_range_lock_set + 2); + rx::prefetch_read(g_range_lock_set + 4); u64 to_clear = get_range_lock_bits(false); @@ -568,7 +571,7 @@ namespace vm for (u64 hi = addr2 >> 16, max = (addr2 + size2 - 1) >> 16; hi <= max; hi++) { u64 addr3 = addr2; - u64 size3 = std::min(addr2 + size2, utils::align(addr2, 0x10000)) - addr2; + u64 size3 = std::min(addr2 + size2, rx::alignUp(addr2, 0x10000)) - addr2; if (u64 is_shared = g_shmem[hi]) [[unlikely]] { @@ -594,12 +597,12 @@ namespace vm if (to_prepare_memory) { - utils::prefetch_write(vm::get_super_ptr(addr)); - utils::prefetch_write(vm::get_super_ptr(addr) + 64); + rx::prefetch_write(vm::get_super_ptr(addr)); + rx::prefetch_write(vm::get_super_ptr(addr) + 64); to_prepare_memory = false; } - utils::pause(); + rx::pause(); } for (auto lock = g_locks.cbegin(), end = lock + g_cfg.core.ppu_threads; lock != end; lock++) @@ -610,12 +613,12 @@ namespace vm { if (to_prepare_memory) { - utils::prefetch_write(vm::get_super_ptr(addr)); - utils::prefetch_write(vm::get_super_ptr(addr) + 64); + rx::prefetch_write(vm::get_super_ptr(addr)); + rx::prefetch_write(vm::get_super_ptr(addr) + 64); to_prepare_memory = false; } - utils::pause(); + rx::pause(); } } } @@ -642,7 +645,7 @@ namespace vm } else if (i < 15) { - busy_wait(500); + rx::busy_wait(500); } else { @@ -683,7 +686,7 @@ namespace vm } else if (i < 15) { - busy_wait(500); + rx::busy_wait(500); } else { @@ -1078,13 +1081,13 @@ namespace vm if (state & page_1m_size) { - i = utils::align(i + 1, 0x100000 / 4096); + i = rx::alignUp(i + 1, 0x100000 / 4096); continue; } if (state & page_64k_size) { - i = utils::align(i + 1, 0x10000 / 4096); + i = rx::alignUp(i + 1, 0x10000 / 4096); continue; } @@ -1359,7 +1362,7 @@ namespace vm const u32 min_page_size = flags & page_size_4k ? 0x1000 : 0x10000; // Align to minimal page size - const u32 size = utils::align(orig_size, min_page_size) + (flags & stack_guarded ? 0x2000 : 0); + const u32 size = rx::alignUp(orig_size, min_page_size) + (flags & stack_guarded ? 
0x2000 : 0); // Check alignment (it's page allocation, so passing small values there is just silly) if (align < min_page_size || align != (0x80000000u >> std::countl_zero(align))) @@ -1387,7 +1390,7 @@ namespace vm const u32 max = (this->addr + this->size - size) & (0 - align); - u32 addr = utils::align(this->addr, align); + u32 addr = rx::alignUp(this->addr, align); if (this->addr > max || addr > max) { @@ -1434,7 +1437,7 @@ namespace vm const u32 size0 = orig_size + addr % min_page_size; // Align to minimal page size - const u32 size = utils::align(size0, min_page_size); + const u32 size = rx::alignUp(size0, min_page_size); // Return if addr or size is invalid // If shared memory is provided, addr/size must be aligned @@ -1870,7 +1873,7 @@ namespace vm return nullptr; } - for (u32 addr = utils::align(0x10000000, align);; addr += align) + for (u32 addr = rx::alignUp(0x10000000, align);; addr += align) { if (_test_map(addr, size)) { @@ -1950,7 +1953,7 @@ namespace vm vm::writer_lock lock; // Align to minimal page size - const u32 size = utils::align(orig_size, 0x10000); + const u32 size = rx::alignUp(orig_size, 0x10000); // Check alignment if (align < 0x10000 || align != (0x80000000u >> std::countl_zero(align))) @@ -2178,7 +2181,7 @@ namespace vm // Wait a bit before accessing global lock range_lock->release(0); - busy_wait(200); + rx::busy_wait(200); } const bool result = try_access_internal(begin, ptr, size, is_write); @@ -2399,7 +2402,7 @@ namespace vm // Prevent overflow const u32 size = 0 - max_size < addr ? (0 - addr) : max_size; - for (u32 i = addr, end = utils::align(addr + size, 4096) - 1; i <= end;) + for (u32 i = addr, end = rx::alignUp(addr + size, 4096) - 1; i <= end;) { if (check_pages && !vm::check_addr(i, vm::page_readable)) { diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h index fd34a36a7..920db1396 100644 --- a/rpcs3/Emu/Memory/vm_reservation.h +++ b/rpcs3/Emu/Memory/vm_reservation.h @@ -3,7 +3,7 @@ #include "vm.h" #include "vm_locking.h" #include "util/atomic.hpp" -#include "util/tsc.hpp" +#include "rx/tsc.hpp" #include extern bool g_use_rtm; @@ -209,7 +209,7 @@ namespace vm unsigned status = -1; u64 _old = 0; - auto stamp0 = utils::get_tsc(), stamp1 = stamp0, stamp2 = stamp0; + auto stamp0 = rx::get_tsc(), stamp1 = stamp0, stamp2 = stamp0; #ifndef _MSC_VER __asm__ goto("xbegin %l[stage2];" ::: "memory" : stage2); @@ -271,16 +271,16 @@ namespace vm #ifndef _MSC_VER __asm__ volatile("mov %%eax, %0;" : "=r"(status)::"memory"); #endif - stamp1 = utils::get_tsc(); + stamp1 = rx::get_tsc(); // Stage 2: try to lock reservation first _old = res.fetch_add(1); // Compute stamps excluding memory touch - stamp2 = utils::get_tsc() - (stamp1 - stamp0); + stamp2 = rx::get_tsc() - (stamp1 - stamp0); // Start lightened transaction - for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = utils::get_tsc()) + for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = rx::get_tsc()) { if (cpu.has_pause_flag()) { diff --git a/rpcs3/Emu/NP/np_allocator.h b/rpcs3/Emu/NP/np_allocator.h index 5915fb371..3eed6ba62 100644 --- a/rpcs3/Emu/NP/np_allocator.h +++ b/rpcs3/Emu/NP/np_allocator.h @@ -4,7 +4,8 @@ #include "Emu/Memory/vm_ptr.h" #include "util/mutex.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" #include "util/logs.hpp" LOG_CHANNEL(np_mem_allocator); @@ -52,7 +53,7 @@ namespace np } // Align allocs - const u32 alloc_size = utils::align(size, 4); + const u32 alloc_size = 
rx::alignUp(size, 4); if (alloc_size > m_avail) { np_mem_allocator.error("Not enough memory available in NP pool!"); diff --git a/rpcs3/Emu/NP/np_event_data.h b/rpcs3/Emu/NP/np_event_data.h index ba6d97588..dbf828dff 100644 --- a/rpcs3/Emu/NP/np_event_data.h +++ b/rpcs3/Emu/NP/np_event_data.h @@ -1,7 +1,8 @@ #pragma once #include "Emu/Memory/vm_ptr.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" namespace np { @@ -9,7 +10,7 @@ namespace np { public: event_data(u32 vm_addr, u32 initial_size, u32 max_size) - : m_max_size(max_size), m_cur_size(utils::align(initial_size, 4)) + : m_max_size(max_size), m_cur_size(rx::alignUp(initial_size, 4)) { m_data_ptr.set(vm_addr); } @@ -50,7 +51,7 @@ namespace np template T* allocate(u32 size, vm::bptr& dest) { - const u32 to_alloc = utils::align(size, 4); + const u32 to_alloc = rx::alignUp(size, 4); ensure((m_cur_size + to_alloc) <= m_max_size, "event_data::allocate: size would overflow the allocated buffer!"); u8* dest_ptr = reinterpret_cast(&dest); diff --git a/rpcs3/Emu/NP/np_gui_cache.cpp b/rpcs3/Emu/NP/np_gui_cache.cpp index fee08356d..39e85bdee 100644 --- a/rpcs3/Emu/NP/np_gui_cache.cpp +++ b/rpcs3/Emu/NP/np_gui_cache.cpp @@ -1,5 +1,6 @@ #include "stdafx.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" #include "np_gui_cache.h" LOG_CHANNEL(np_gui_cache); @@ -72,7 +73,7 @@ namespace np const auto& room = ::at32(rooms, room_id); - const u32 room_size = ::narrow(utils::align(sizeof(SceNpMatchingRoomStatus), 8) + (utils::align(sizeof(SceNpMatchingRoomMember), 8) * room.members.size())); + const u32 room_size = ::narrow(rx::alignUp(sizeof(SceNpMatchingRoomStatus), 8) + (rx::alignUp(sizeof(SceNpMatchingRoomMember), 8) * room.members.size())); if (!data) return not_an_error(room_size); @@ -94,12 +95,12 @@ namespace np { if (!cur_member_ptr) { - room_status->members = vm::cast(data.addr() + utils::align(sizeof(SceNpMatchingRoomStatus), 8)); + room_status->members = vm::cast(data.addr() + rx::alignUp(sizeof(SceNpMatchingRoomStatus), 8)); cur_member_ptr = room_status->members; } else { - cur_member_ptr->next = vm::cast(cur_member_ptr.addr() + utils::align(sizeof(SceNpMatchingRoomMember), 8)); + cur_member_ptr->next = vm::cast(cur_member_ptr.addr() + rx::alignUp(sizeof(SceNpMatchingRoomMember), 8)); cur_member_ptr = cur_member_ptr->next; } diff --git a/rpcs3/Emu/RSX/Capture/rsx_replay.cpp b/rpcs3/Emu/RSX/Capture/rsx_replay.cpp index 96bc2b230..a659295e0 100644 --- a/rpcs3/Emu/RSX/Capture/rsx_replay.cpp +++ b/rpcs3/Emu/RSX/Capture/rsx_replay.cpp @@ -7,7 +7,8 @@ #include "cellos/sys_memory.h" #include "Emu/RSX/RSXThread.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" #include @@ -26,7 +27,7 @@ namespace rsx } // User memory + fifo size - buffer_size = utils::align(buffer_size, 0x100000) + 0x10000000; + buffer_size = rx::alignUp(buffer_size, 0x100000) + 0x10000000; // We are not allowed to drain all memory so add a little g_fxo->init(buffer_size + 0x1000000); diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.cpp b/rpcs3/Emu/RSX/Common/TextureUtils.cpp index b31144b1d..0084e0df4 100644 --- a/rpcs3/Emu/RSX/Common/TextureUtils.cpp +++ b/rpcs3/Emu/RSX/Common/TextureUtils.cpp @@ -5,7 +5,8 @@ #include "../rsx_utils.h" #include "3rdparty/bcdec/bcdec.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" namespace utils { @@ -661,13 +662,13 @@ namespace } else if constexpr (block_edge_in_texel == 4) { - current_subresource_layout.width_in_block = 
utils::aligned_div(miplevel_width_in_texel, block_edge_in_texel); - current_subresource_layout.height_in_block = utils::aligned_div(miplevel_height_in_texel, block_edge_in_texel); + current_subresource_layout.width_in_block = rx::aligned_div(miplevel_width_in_texel, block_edge_in_texel); + current_subresource_layout.height_in_block = rx::aligned_div(miplevel_height_in_texel, block_edge_in_texel); } else { // Only the width is compressed - current_subresource_layout.width_in_block = utils::aligned_div(miplevel_width_in_texel, block_edge_in_texel); + current_subresource_layout.width_in_block = rx::aligned_div(miplevel_width_in_texel, block_edge_in_texel); current_subresource_layout.height_in_block = miplevel_height_in_texel; } @@ -699,7 +700,7 @@ namespace if (!padded_row) // Only swizzled textures obey this restriction { - offset_in_src = utils::align(offset_in_src, 128); + offset_in_src = rx::alignUp(offset_in_src, 128); } } @@ -1429,8 +1430,8 @@ namespace rsx usz result = 0; for (u16 i = 0; i < mipmap; ++i) { - usz rowPitch = utils::align(block_size_in_byte * width_in_blocks, row_pitch_alignment); - result += utils::align(rowPitch * height_in_blocks * depth, mipmap_alignment); + usz rowPitch = rx::alignUp(block_size_in_byte * width_in_blocks, row_pitch_alignment); + result += rx::alignUp(rowPitch * height_in_blocks * depth, mipmap_alignment); height_in_blocks = std::max(height_in_blocks / 2, 1); width_in_blocks = std::max(width_in_blocks / 2, 1); } diff --git a/rpcs3/Emu/RSX/Common/ring_buffer_helper.h b/rpcs3/Emu/RSX/Common/ring_buffer_helper.h index babc24595..db746d1a2 100644 --- a/rpcs3/Emu/RSX/Common/ring_buffer_helper.h +++ b/rpcs3/Emu/RSX/Common/ring_buffer_helper.h @@ -1,7 +1,8 @@ #pragma once #include "util/StrFmt.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" /** * Ring buffer memory helper : @@ -20,8 +21,8 @@ protected: template bool can_alloc(usz size) const { - usz alloc_size = utils::align(size, Alignment); - usz aligned_put_pos = utils::align(m_put_pos, Alignment); + usz alloc_size = rx::alignUp(size, Alignment); + usz aligned_put_pos = rx::alignUp(m_put_pos, Alignment); if (aligned_put_pos + alloc_size < m_size) { // range before get @@ -85,8 +86,8 @@ public: template usz alloc(usz size) { - const usz alloc_size = utils::align(size, Alignment); - const usz aligned_put_pos = utils::align(m_put_pos, Alignment); + const usz alloc_size = rx::alignUp(size, Alignment); + const usz aligned_put_pos = rx::alignUp(m_put_pos, Alignment); if (!can_alloc(size) && !grow(alloc_size)) { diff --git a/rpcs3/Emu/RSX/Common/surface_store.cpp b/rpcs3/Emu/RSX/Common/surface_store.cpp index a96ca453e..3d329e5f6 100644 --- a/rpcs3/Emu/RSX/Common/surface_store.cpp +++ b/rpcs3/Emu/RSX/Common/surface_store.cpp @@ -1,7 +1,8 @@ #include "stdafx.h" #include "surface_store.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" namespace rsx { @@ -39,20 +40,20 @@ namespace rsx { switch (format) { - case surface_color_format::b8: return utils::align(width, 256); + case surface_color_format::b8: return rx::alignUp(width, 256); case surface_color_format::g8b8: case surface_color_format::x1r5g5b5_o1r5g5b5: case surface_color_format::x1r5g5b5_z1r5g5b5: - case surface_color_format::r5g6b5: return utils::align(width * 2, 256); + case surface_color_format::r5g6b5: return rx::alignUp(width * 2, 256); case surface_color_format::a8b8g8r8: case surface_color_format::x8b8g8r8_o8b8g8r8: case surface_color_format::x8b8g8r8_z8b8g8r8: case 
surface_color_format::x8r8g8b8_o8r8g8b8: case surface_color_format::x8r8g8b8_z8r8g8b8: case surface_color_format::x32: - case surface_color_format::a8r8g8b8: return utils::align(width * 4, 256); - case surface_color_format::w16z16y16x16: return utils::align(width * 8, 256); - case surface_color_format::w32z32y32x32: return utils::align(width * 16, 256); + case surface_color_format::a8r8g8b8: return rx::alignUp(width * 4, 256); + case surface_color_format::w16z16y16x16: return rx::alignUp(width * 8, 256); + case surface_color_format::w32z32y32x32: return rx::alignUp(width * 16, 256); } fmt::throw_exception("Unknown color surface format"); } diff --git a/rpcs3/Emu/RSX/Common/surface_store.h b/rpcs3/Emu/RSX/Common/surface_store.h index 867ff1aeb..09a22bd9b 100644 --- a/rpcs3/Emu/RSX/Common/surface_store.h +++ b/rpcs3/Emu/RSX/Common/surface_store.h @@ -8,7 +8,8 @@ #include "../rsx_utils.h" #include -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" namespace rsx { @@ -806,7 +807,7 @@ namespace rsx continue; } - num_rows = utils::aligned_div(this_range.length(), rsx_pitch); + num_rows = rx::aligned_div(this_range.length(), rsx_pitch); } for (u32 row = 0, offset = (this_range.start - range.start), section_len = (this_range.end - range.start + 1); @@ -1186,7 +1187,7 @@ namespace rsx { // Width is calculated in the coordinate-space of the requester; normalize info.src_area.x = (info.src_area.x * required_bpp) / surface_bpp; - info.src_area.width = utils::align(width * required_bpp, surface_bpp) / surface_bpp; + info.src_area.width = rx::alignUp(width * required_bpp, surface_bpp) / surface_bpp; } else { diff --git a/rpcs3/Emu/RSX/Common/time.hpp b/rpcs3/Emu/RSX/Common/time.hpp index db176fb4c..433d3b4f6 100644 --- a/rpcs3/Emu/RSX/Common/time.hpp +++ b/rpcs3/Emu/RSX/Common/time.hpp @@ -1,4 +1,4 @@ #pragma once -#include +#include #include diff --git a/rpcs3/Emu/RSX/GL/GLCompute.cpp b/rpcs3/Emu/RSX/GL/GLCompute.cpp index 7fe9c342f..9bb290d14 100644 --- a/rpcs3/Emu/RSX/GL/GLCompute.cpp +++ b/rpcs3/Emu/RSX/GL/GLCompute.cpp @@ -1,6 +1,7 @@ #include "GLCompute.h" #include "GLTexture.h" #include "util/StrUtil.h" +#include "rx/align.hpp" namespace gl { @@ -196,7 +197,7 @@ namespace gl m_data_length = data_length; const auto num_bytes_per_invocation = optimal_group_size * kernel_size * 4; - const auto num_bytes_to_process = utils::align(data_length, num_bytes_per_invocation); + const auto num_bytes_to_process = rx::alignUp(data_length, num_bytes_per_invocation); const auto num_invocations = num_bytes_to_process / num_bytes_per_invocation; if ((num_bytes_to_process + data_offset) > data->size()) @@ -364,7 +365,7 @@ namespace gl dst->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(2), out_offset, row_pitch * 4 * region.height); - const int num_invocations = utils::aligned_div(region.width * region.height, optimal_kernel_size * optimal_group_size); + const int num_invocations = rx::aligned_div(region.width * region.height, optimal_kernel_size * optimal_group_size); compute_task::run(cmd, num_invocations); } @@ -411,7 +412,7 @@ namespace gl dst->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(1), out_offset, row_pitch * 4 * region.height); - const int num_invocations = utils::aligned_div(region.width * region.height, optimal_kernel_size * optimal_group_size); + const int num_invocations = rx::aligned_div(region.width * region.height, optimal_kernel_size * optimal_group_size); compute_task::run(cmd, num_invocations); } @@ -437,7 +438,7 @@ namespace gl void 
cs_ssbo_to_color_image::run(gl::command_context& cmd, const buffer* src, const texture_view* dst, const u32 src_offset, const coordu& dst_region, const pixel_buffer_layout& layout) { const u32 bpp = dst->image()->pitch() / dst->image()->width(); - const u32 row_length = utils::align(dst_region.width * bpp, std::max(layout.alignment, 1)) / bpp; + const u32 row_length = rx::alignUp(dst_region.width * bpp, std::max(layout.alignment, 1)) / bpp; m_program.uniforms["swap_bytes"] = layout.swap_bytes; m_program.uniforms["src_pitch"] = row_length; @@ -448,7 +449,7 @@ namespace gl src->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), src_offset, row_length * bpp * dst_region.height); glBindImageTexture(GL_COMPUTE_IMAGE_SLOT(0), dst->id(), 0, GL_FALSE, 0, GL_WRITE_ONLY, dst->view_format()); - const int num_invocations = utils::aligned_div(dst_region.width * dst_region.height, optimal_kernel_size * optimal_group_size); + const int num_invocations = rx::aligned_div(dst_region.width * dst_region.height, optimal_kernel_size * optimal_group_size); compute_task::run(cmd, num_invocations); } diff --git a/rpcs3/Emu/RSX/GL/GLCompute.h b/rpcs3/Emu/RSX/GL/GLCompute.h index 1bb59a311..4f83c3fa5 100644 --- a/rpcs3/Emu/RSX/GL/GLCompute.h +++ b/rpcs3/Emu/RSX/GL/GLCompute.h @@ -337,7 +337,7 @@ namespace gl set_parameters(cmd); const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size); - const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation); + const u32 linear_invocations = rx::aligned_div(data_length, num_bytes_per_invocation); compute_task::run(cmd, linear_invocations); } }; diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index c6b7ab50e..d6f118104 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -12,6 +12,8 @@ #include "Emu/RSX/Host/RSXDMAWriter.h" #include "Emu/RSX/NV47/HW/context_accessors.define.h" +#include "rx/align.hpp" + [[noreturn]] extern void report_fatal_error(std::string_view _text, bool is_html = false, bool include_help_text = true); namespace @@ -895,7 +897,7 @@ void GLGSRender::load_program_env() if (update_fragment_texture_env) m_texture_parameters_buffer->reserve_storage_on_heap(256); if (update_fragment_constants) - m_fragment_constants_buffer->reserve_storage_on_heap(utils::align(fragment_constants_size, 256)); + m_fragment_constants_buffer->reserve_storage_on_heap(rx::alignUp(fragment_constants_size, 256)); if (update_transform_constants) m_transform_constants_buffer->reserve_storage_on_heap(8192); if (update_raster_env) diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h index bc920493d..82b4f802e 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.h +++ b/rpcs3/Emu/RSX/GL/GLHelpers.h @@ -15,7 +15,7 @@ #include "util/geometry.h" #include "util/File.h" #include "util/logs.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "glutils/common.h" // TODO: Include on use diff --git a/rpcs3/Emu/RSX/GL/GLOverlays.cpp b/rpcs3/Emu/RSX/GL/GLOverlays.cpp index 3f90883a1..e6aaafe32 100644 --- a/rpcs3/Emu/RSX/GL/GLOverlays.cpp +++ b/rpcs3/Emu/RSX/GL/GLOverlays.cpp @@ -4,6 +4,8 @@ #include "../Program/RSXOverlay.h" #include "Emu/Cell/timers.hpp" +#include "rx/align.hpp" + namespace gl { // Lame @@ -544,7 +546,7 @@ namespace gl const pixel_buffer_layout& layout) { const u32 bpp = dst->image()->pitch() / dst->image()->width(); - const u32 row_length = utils::align(dst_region.width * bpp, std::max(layout.alignment, 1)) / bpp; + const u32 row_length = 
rx::alignUp(dst_region.width * bpp, std::max(layout.alignment, 1)) / bpp; program_handle.uniforms["src_pitch"] = row_length; program_handle.uniforms["swap_bytes"] = layout.swap_bytes; diff --git a/rpcs3/Emu/RSX/GL/GLResolveHelper.cpp b/rpcs3/Emu/RSX/GL/GLResolveHelper.cpp index 2601d5597..5b367b24f 100644 --- a/rpcs3/Emu/RSX/GL/GLResolveHelper.cpp +++ b/rpcs3/Emu/RSX/GL/GLResolveHelper.cpp @@ -2,6 +2,8 @@ #include "GLResolveHelper.h" #include "GLTexture.h" +#include "rx/align.hpp" + #include #include @@ -225,8 +227,8 @@ namespace gl multisampled = msaa_image; resolve = resolve_image; - const u32 invocations_x = utils::align(resolve_image->width(), cs_wave_x) / cs_wave_x; - const u32 invocations_y = utils::align(resolve_image->height(), cs_wave_y) / cs_wave_y; + const u32 invocations_x = rx::alignUp(resolve_image->width(), cs_wave_x) / cs_wave_x; + const u32 invocations_y = rx::alignUp(resolve_image->height(), cs_wave_y) / cs_wave_y; compute_task::run(cmd, invocations_x, invocations_y); } diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index 2193b6531..26cfdde9d 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -9,7 +9,8 @@ #include "../RSXThread.h" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" namespace gl { @@ -664,7 +665,7 @@ namespace gl u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format); u64 image_linear_size = staging_buffer.size(); - const auto min_required_buffer_size = std::max(utils::align(image_linear_size * 4, 0x100000), 16 * 0x100000); + const auto min_required_buffer_size = std::max(rx::alignUp(image_linear_size * 4, 0x100000), 16 * 0x100000); if (driver_caps.ARB_compute_shader_supported) { @@ -825,7 +826,7 @@ namespace gl } else { - const auto aligned_pitch = utils::align(dst->pitch(), 4); + const auto aligned_pitch = rx::alignUp(dst->pitch(), 4); const u32 texture_data_sz = dst->depth() * dst->height() * aligned_pitch; data_upload_buf.resize(texture_data_sz); } @@ -1002,7 +1003,7 @@ namespace gl u32 scratch_offset = 0; const u64 min_storage_requirement = src_mem.image_size_in_bytes + dst_mem.image_size_in_bytes; - const u64 min_required_buffer_size = utils::align(min_storage_requirement, 256); + const u64 min_required_buffer_size = rx::alignUp(min_storage_requirement, 256); if (g_typeless_transfer_buffer.size() >= min_required_buffer_size) [[likely]] { diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp index b6697cf14..5e793e9df 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.cpp +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.cpp @@ -3,7 +3,8 @@ #include "GLTextureCache.h" #include "../Common/BufferUtils.h" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" namespace gl { @@ -82,7 +83,7 @@ namespace gl } else { - const u32 num_rows = utils::align(valid_length, rsx_pitch) / rsx_pitch; + const u32 num_rows = rx::alignUp(valid_length, rsx_pitch) / rsx_pitch; u32* data = static_cast(dst); for (u32 row = 0; row < num_rows; ++row) { @@ -212,7 +213,7 @@ namespace gl // Dimensions were given in 'dst' space. 
Work out the real source coordinates const auto src_bpp = slice.src->pitch() / slice.src->width(); src_x = (src_x * dst_bpp) / src_bpp; - src_w = utils::aligned_div(src_w * dst_bpp, src_bpp); + src_w = rx::aligned_div(src_w * dst_bpp, src_bpp); } if (auto surface = dynamic_cast(slice.src)) diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index f6596da51..dcdb866b4 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -7,6 +7,8 @@ #include "../Common/texture_cache.h" +#include "rx/align.hpp" + #include #include @@ -49,7 +51,7 @@ namespace gl void init_buffer(const gl::texture* src) { const u32 vram_size = src->pitch() * src->height(); - const u32 buffer_size = utils::align(vram_size, 4096); + const u32 buffer_size = rx::alignUp(vram_size, 4096); if (pbo) { diff --git a/rpcs3/Emu/RSX/GL/glutils/capabilities.h b/rpcs3/Emu/RSX/GL/glutils/capabilities.h index 99af8172c..b62dfaaf4 100644 --- a/rpcs3/Emu/RSX/GL/glutils/capabilities.h +++ b/rpcs3/Emu/RSX/GL/glutils/capabilities.h @@ -2,7 +2,7 @@ #include "../OpenGL.h" #include -#include +#include #include namespace gl diff --git a/rpcs3/Emu/RSX/GL/glutils/image.cpp b/rpcs3/Emu/RSX/GL/glutils/image.cpp index e828ebdcd..c61ba3221 100644 --- a/rpcs3/Emu/RSX/GL/glutils/image.cpp +++ b/rpcs3/Emu/RSX/GL/glutils/image.cpp @@ -3,6 +3,7 @@ #include "buffer_object.h" #include "state_tracker.hpp" #include "pixel_settings.hpp" +#include "rx/align.hpp" namespace gl { @@ -119,14 +120,14 @@ namespace gl case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT: { m_compressed = true; - m_pitch = utils::align(width, 4) / 2; + m_pitch = rx::alignUp(width, 4) / 2; break; } case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT: case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT: { m_compressed = true; - m_pitch = utils::align(width, 4); + m_pitch = rx::alignUp(width, 4); break; } default: diff --git a/rpcs3/Emu/RSX/GL/glutils/ring_buffer.cpp b/rpcs3/Emu/RSX/GL/glutils/ring_buffer.cpp index 703e58197..f1ee3afd1 100644 --- a/rpcs3/Emu/RSX/GL/glutils/ring_buffer.cpp +++ b/rpcs3/Emu/RSX/GL/glutils/ring_buffer.cpp @@ -1,6 +1,8 @@ #include "stdafx.h" #include "ring_buffer.h" +#include "rx/align.hpp" + namespace gl { void ring_buffer::recreate(GLsizeiptr size, const void* data) @@ -37,7 +39,7 @@ namespace gl { u32 offset = m_data_loc; if (m_data_loc) - offset = utils::align(offset, alignment); + offset = rx::alignUp(offset, alignment); if ((offset + alloc_size) > m_size) { @@ -56,7 +58,7 @@ namespace gl } // Align data loc to 256; allows some "guard" region so we dont trample our own data inadvertently - m_data_loc = utils::align(offset + alloc_size, 256); + m_data_loc = rx::alignUp(offset + alloc_size, 256); return std::make_pair(static_cast(m_memory_mapping) + offset, offset); } @@ -108,9 +110,9 @@ namespace gl u32 offset = m_data_loc; if (m_data_loc) - offset = utils::align(offset, 256); + offset = rx::alignUp(offset, 256); - const u32 block_size = utils::align(alloc_size + 16, 256); // Overallocate just in case we need to realign base + const u32 block_size = rx::alignUp(alloc_size + 16, 256); // Overallocate just in case we need to realign base if ((offset + block_size) > m_size) { @@ -144,10 +146,10 @@ namespace gl { u32 offset = m_data_loc; if (m_data_loc) - offset = utils::align(offset, alignment); + offset = rx::alignUp(offset, alignment); u32 padding = (offset - m_data_loc); - u32 real_size = utils::align(padding + alloc_size, alignment); // Ensures we leave the loc pointer aligned after we exit + u32 real_size = rx::alignUp(padding + 
alloc_size, alignment); // Ensures we leave the loc pointer aligned after we exit if (real_size > m_mapped_bytes) { @@ -158,10 +160,10 @@ namespace gl offset = m_data_loc; if (m_data_loc) - offset = utils::align(offset, alignment); + offset = rx::alignUp(offset, alignment); padding = (offset - m_data_loc); - real_size = utils::align(padding + alloc_size, alignment); + real_size = rx::alignUp(padding + alloc_size, alignment); } m_data_loc = offset + real_size; @@ -270,7 +272,7 @@ namespace gl u32 scratch_ring_buffer::alloc(u32 size, u32 alignment) { - u64 start = utils::align(m_alloc_pointer, alignment); + u64 start = rx::alignUp(m_alloc_pointer, alignment); m_alloc_pointer = (start + size); if (static_cast(m_alloc_pointer) > m_storage.size()) diff --git a/rpcs3/Emu/RSX/GL/upscalers/fsr1/fsr_pass.cpp b/rpcs3/Emu/RSX/GL/upscalers/fsr1/fsr_pass.cpp index 3f23b082a..0c60b7c41 100644 --- a/rpcs3/Emu/RSX/GL/upscalers/fsr1/fsr_pass.cpp +++ b/rpcs3/Emu/RSX/GL/upscalers/fsr1/fsr_pass.cpp @@ -58,7 +58,7 @@ namespace gl m_src = fmt::replace_all(m_src, replacement_table); // Fill with 0 to avoid sending incomplete/unused variables to the GPU - m_constants_buf.resize(utils::rounded_div(push_constants_size, 4), 0); + m_constants_buf.resize(rx::rounded_div(push_constants_size, 4), 0); create(); @@ -106,8 +106,8 @@ namespace gl glBindImageTexture(GL_COMPUTE_IMAGE_SLOT(0), dst->id(), 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8); constexpr auto wg_size = 16; - const auto invocations_x = utils::aligned_div(output_size.width, wg_size); - const auto invocations_y = utils::aligned_div(output_size.height, wg_size); + const auto invocations_x = rx::aligned_div(output_size.width, wg_size); + const auto invocations_y = rx::aligned_div(output_size.height, wg_size); ensure(invocations_x == (output_size.width + (wg_size - 1)) / wg_size); ensure(invocations_y == (output_size.height + (wg_size - 1)) / wg_size); diff --git a/rpcs3/Emu/RSX/Host/RSXDMAWriter.cpp b/rpcs3/Emu/RSX/Host/RSXDMAWriter.cpp index 0afb82553..27be24372 100644 --- a/rpcs3/Emu/RSX/Host/RSXDMAWriter.cpp +++ b/rpcs3/Emu/RSX/Host/RSXDMAWriter.cpp @@ -2,7 +2,7 @@ #include "RSXDMAWriter.h" #include "util//Thread.h" -#include +#include namespace rsx { @@ -56,7 +56,7 @@ namespace rsx // FIXME: This is a busy wait, consider yield to improve responsiveness on weak devices. 
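 // (Editor's note, not part of the patch: rx::pause() is assumed to be a drop-in replacement for
 // utils::pause(), i.e. a CPU spin-wait hint rather than an OS-level yield, so the FIXME above
 // still applies after the rename. A minimal sketch of such a helper:
 //
 //   #if defined(_M_X64) || defined(__x86_64__)
 //   #include <emmintrin.h>
 //   inline void pause() noexcept { _mm_pause(); }                            // x86 PAUSE instruction
 //   #elif defined(__aarch64__)
 //   inline void pause() noexcept { __asm__ volatile("yield" ::: "memory"); } // ARM64 spin hint
 //   #else
 //   inline void pause() noexcept {}                                          // fallback: plain busy spin
 //   #endif
 // )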
while (!m_host_context_ptr->in_flight_commands_completed()) { - utils::pause(); + rx::pause(); if (thread_ctrl::state() == thread_state::aborting) { diff --git a/rpcs3/Emu/RSX/Host/RSXDMAWriter.h b/rpcs3/Emu/RSX/Host/RSXDMAWriter.h index 8fdc5b9df..72655d274 100644 --- a/rpcs3/Emu/RSX/Host/RSXDMAWriter.h +++ b/rpcs3/Emu/RSX/Host/RSXDMAWriter.h @@ -86,7 +86,7 @@ namespace rsx { public: RSXDMAWriter(void* mem) - : m_host_context_ptr(new(mem) host_gpu_context_t) + : m_host_context_ptr(new (mem) host_gpu_context_t) { } diff --git a/rpcs3/Emu/RSX/Overlays/Shaders/shader_loading_dialog.cpp b/rpcs3/Emu/RSX/Overlays/Shaders/shader_loading_dialog.cpp index e83c702eb..2c551c7d9 100644 --- a/rpcs3/Emu/RSX/Overlays/Shaders/shader_loading_dialog.cpp +++ b/rpcs3/Emu/RSX/Overlays/Shaders/shader_loading_dialog.cpp @@ -3,7 +3,7 @@ #include "Emu/System.h" #include "rpcsx/fw/ps3/cellMsgDialog.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" namespace rsx { @@ -36,7 +36,7 @@ namespace rsx while (ref_cnt.load() && !Emu.IsStopped()) { - utils::pause(); + rx::pause(); } } @@ -112,7 +112,7 @@ namespace rsx { while (ref_cnt.load() && !Emu.IsStopped()) { - utils::pause(); + rx::pause(); } } } // namespace rsx diff --git a/rpcs3/Emu/RSX/Overlays/overlay_manager.cpp b/rpcs3/Emu/RSX/Overlays/overlay_manager.cpp index b31adfa2b..9e66fbed3 100644 --- a/rpcs3/Emu/RSX/Overlays/overlay_manager.cpp +++ b/rpcs3/Emu/RSX/Overlays/overlay_manager.cpp @@ -1,7 +1,7 @@ #include "stdafx.h" #include "overlay_manager.h" #include "Emu/System.h" -#include +#include namespace rsx { @@ -37,7 +37,7 @@ namespace rsx *m_input_thread = thread_state::aborting; while (*m_input_thread <= thread_state::aborting) { - utils::pause(); + rx::pause(); } } } diff --git a/rpcs3/Emu/RSX/RSXFIFO.cpp b/rpcs3/Emu/RSX/RSXFIFO.cpp index 9b4c81146..b86e13e77 100644 --- a/rpcs3/Emu/RSX/RSXFIFO.cpp +++ b/rpcs3/Emu/RSX/RSXFIFO.cpp @@ -9,7 +9,8 @@ #include "cellos/sys_rsx.h" #include "NV47/HW/context.h" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" #include #include @@ -139,7 +140,7 @@ namespace rsx u32 bytes_read = 0; // Find the next set bit after every iteration - for (int i = 0;; i = (std::countr_zero(utils::rol8(to_fetch, 0 - i - 1)) + i + 1) % 8) + for (int i = 0;; i = (std::countr_zero(rx::rol8(to_fetch, 0 - i - 1)) + i + 1) % 8) { // If a reservation is being updated, try to load another const auto& res = vm::reservation_acquire(addr1 + i * 128); @@ -193,7 +194,7 @@ namespace rsx } else { - busy_wait(200); + rx::busy_wait(200); } if (strict_fetch_ordering) @@ -247,7 +248,7 @@ namespace rsx for (u32 remaining = size, addr = m_internal_get, ptr = from; remaining > 0;) { - const u32 next_block = utils::align(addr + 1, _1M); + const u32 next_block = rx::alignUp(addr + 1, _1M); const u32 available = (next_block - addr); if (remaining <= available) { diff --git a/rpcs3/Emu/RSX/RSXOffload.cpp b/rpcs3/Emu/RSX/RSXOffload.cpp index 588b35ad1..a03e7e3d6 100644 --- a/rpcs3/Emu/RSX/RSXOffload.cpp +++ b/rpcs3/Emu/RSX/RSXOffload.cpp @@ -9,7 +9,7 @@ #include "util/lockless.h" #include -#include "util/asm.hpp" +#include "rx/asm.hpp" namespace rsx { @@ -181,13 +181,13 @@ namespace rsx while (_thr.m_enqueued_count.load() > _thr.m_processed_count.load()) { rsxthr->on_semaphore_acquire_wait(); - utils::pause(); + rx::pause(); } } else { while (_thr.m_enqueued_count.load() > _thr.m_processed_count.load()) - utils::pause(); + rx::pause(); } return true; diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 3a8a45f8a..adbf67a19 
100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -27,7 +27,8 @@ #include "util/date_time.h" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" #include #include @@ -332,11 +333,11 @@ namespace rsx { // Division operator _min_index = std::min(_min_index, first / attrib.frequency); - _max_index = std::max(_max_index, utils::aligned_div(max_index, attrib.frequency)); + _max_index = std::max(_max_index, rx::aligned_div(max_index, attrib.frequency)); if (freq_count > 0 && freq_count != umax) { - const u32 max = utils::aligned_div(max_index, attrib.frequency); + const u32 max = rx::aligned_div(max_index, attrib.frequency); max_result_by_division = std::max(max_result_by_division, max); // Discard lower frequencies because it has been proven that there are indices higher than them @@ -365,7 +366,7 @@ namespace rsx // The alternative would be re-iterating again over all of them if (get_location(real_offset_address) == CELL_GCM_LOCATION_LOCAL) { - if (utils::add_saturate(real_offset_address - rsx::constants::local_mem_base, (_max_index + 1) * attribute_stride) <= render->local_mem_size) + if (rx::add_saturate(real_offset_address - rsx::constants::local_mem_base, (_max_index + 1) * attribute_stride) <= render->local_mem_size) { break; } @@ -734,7 +735,7 @@ namespace rsx { // Be compatible with previous bitwise serialization ar(std::span(reinterpret_cast(this), OFFSET_OF(avconf, scan_mode))); - ar.pos += utils::align(OFFSET_OF(avconf, scan_mode), alignof(avconf)) - OFFSET_OF(avconf, scan_mode); + ar.pos += rx::alignUp(OFFSET_OF(avconf, scan_mode), alignof(avconf)) - OFFSET_OF(avconf, scan_mode); return; } @@ -1169,7 +1170,7 @@ namespace rsx for (; t == now; now = get_time_ns()) { - utils::pause(); + rx::pause(); } timestamp_ctrl = now; @@ -2590,7 +2591,7 @@ namespace rsx { if (u32 advance = disasm.disasm(pcs_of_valid_cmds.back())) { - pcs_of_valid_cmds.push_back(utils::add_saturate(pcs_of_valid_cmds.back(), advance)); + pcs_of_valid_cmds.push_back(rx::add_saturate(pcs_of_valid_cmds.back(), advance)); } else { @@ -2722,7 +2723,7 @@ namespace rsx } // Some cases do not need full delay - remaining = utils::aligned_div(remaining, div); + remaining = rx::aligned_div(remaining, div); const u64 until = get_system_time() + remaining; while (true) @@ -2751,7 +2752,7 @@ namespace rsx } else { - busy_wait(100); + rx::busy_wait(100); } const u64 current = get_system_time(); @@ -2862,7 +2863,7 @@ namespace rsx for (u32 ea = address >> 20, end = ea + (size >> 20); ea < end; ea++) { - const u32 io = utils::rol32(iomap_table.io[ea], 32 - 20); + const u32 io = rx::rol32(iomap_table.io[ea], 32 - 20); if (io + 1) { @@ -2892,7 +2893,7 @@ namespace rsx while (to_unmap) { - bit = (std::countr_zero(utils::rol64(to_unmap, 0 - bit)) + bit); + bit = (std::countr_zero(rx::rol64(to_unmap, 0 - bit)) + bit); to_unmap &= ~(1ull << bit); constexpr u16 null_entry = 0xFFFF; @@ -2998,7 +2999,7 @@ namespace rsx while (!external_interrupt_ack && !is_stopped()) { - utils::pause(); + rx::pause(); } } @@ -3022,7 +3023,7 @@ namespace rsx while (external_interrupt_lock && (cpu_flag::ret - state)) { // TODO: Investigate non busy-spinning method - utils::pause(); + rx::pause(); } external_interrupt_ack.store(false); @@ -3364,7 +3365,7 @@ namespace rsx } const u64 current_time = get_system_time(); - const u64 current_tsc = utils::get_tsc(); + const u64 current_tsc = rx::get_tsc(); u64 preempt_count = 0; if (frame_times.size() >= 60) diff --git a/rpcs3/Emu/RSX/VK/VKCompute.cpp 
b/rpcs3/Emu/RSX/VK/VKCompute.cpp index 7775cd9b5..4e2558896 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.cpp +++ b/rpcs3/Emu/RSX/VK/VKCompute.cpp @@ -4,6 +4,8 @@ #include "vkutils/buffer_object.h" #include "VKPipelineCompiler.h" +#include "rx/align.hpp" + #define VK_MAX_COMPUTE_TASKS 8192 // Max number of jobs per frame namespace vk @@ -219,7 +221,7 @@ namespace vk #include "../Program/GLSLSnippets/ShuffleBytes.glsl" ; - const auto parameters_size = utils::align(push_constants_size, 16) / 16; + const auto parameters_size = rx::alignUp(push_constants_size, 16) / 16; const std::pair syntax_replace[] = { {"%loc", "0"}, @@ -387,7 +389,7 @@ namespace vk word_count = num_words; block_length = num_words * 4; - const u32 linear_invocations = utils::aligned_div(word_count, optimal_group_size); + const u32 linear_invocations = rx::aligned_div(word_count, optimal_group_size); compute_task::run(cmd, linear_invocations); } } // namespace vk diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h index 8874b2604..5a438a276 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.h +++ b/rpcs3/Emu/RSX/VK/VKCompute.h @@ -6,7 +6,8 @@ #include "Emu/IdManager.h" #include "util/StrUtil.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" #include @@ -484,7 +485,7 @@ namespace vk set_parameters(cmd); const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size); - const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation); + const u32 linear_invocations = rx::aligned_div(data_length, num_bytes_per_invocation); compute_task::run(cmd, linear_invocations); } }; @@ -602,8 +603,8 @@ namespace vk this->out_offset = config.dst_offset; const auto tile_aligned_height = std::min( - utils::align(config.image_height, 64), - utils::aligned_div(config.tile_size - config.tile_base_offset, config.tile_pitch)); + rx::alignUp(config.image_height, 64), + rx::aligned_div(config.tile_size - config.tile_base_offset, config.tile_pitch)); if constexpr (Op == RSX_detiler_op::decode) { @@ -656,7 +657,7 @@ namespace vk const u32 subtexels_per_invocation = (config.image_bpp < 4) ? 
(4 / config.image_bpp) : 1; const u32 virtual_width = config.image_width / subtexels_per_invocation; - const u32 invocations_x = utils::aligned_div(virtual_width, optimal_group_size); + const u32 invocations_x = rx::aligned_div(virtual_width, optimal_group_size); compute_task::run(cmd, invocations_x, config.image_height, 1); } }; diff --git a/rpcs3/Emu/RSX/VK/VKDMA.cpp b/rpcs3/Emu/RSX/VK/VKDMA.cpp index 21c361400..403cb4f7d 100644 --- a/rpcs3/Emu/RSX/VK/VKDMA.cpp +++ b/rpcs3/Emu/RSX/VK/VKDMA.cpp @@ -7,7 +7,9 @@ #include "Emu/RSX/RSXThread.h" #include "util/mutex.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" + #include namespace vk @@ -413,7 +415,7 @@ namespace vk std::lock_guard lock(g_dma_mutex); const u32 start = (local_address & s_dma_block_mask); - const u32 end = utils::align(local_address + length, static_cast(s_dma_block_length)); + const u32 end = rx::alignUp(local_address + length, static_cast(s_dma_block_length)); for (u32 block = start; block < end;) { diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index f54e503d3..32abde0c4 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -22,7 +22,8 @@ #include "../Program/SPIRVCommon.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" namespace vk { @@ -919,7 +920,7 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing) // Wait for deadlock to clear while (m_queue_status & flush_queue_state::deadlock) { - utils::pause(); + rx::pause(); } g_fxo->get().clear_mem_fault_flag(); @@ -2081,13 +2082,13 @@ void VKGSRender::load_program_env() rsx::io_buffer indirection_table_buf([&](usz size) -> std::pair { - indirection_table_offset = m_instancing_buffer_ring_info.alloc<1>(utils::align(size, alignment)); + indirection_table_offset = m_instancing_buffer_ring_info.alloc<1>(rx::alignUp(size, alignment)); return std::make_pair(m_instancing_buffer_ring_info.map(indirection_table_offset, size), size); }); rsx::io_buffer constants_array_buf([&](usz size) -> std::pair { - constants_data_table_offset = m_instancing_buffer_ring_info.alloc<1>(utils::align(size, alignment)); + constants_data_table_offset = m_instancing_buffer_ring_info.alloc<1>(rx::alignUp(size, alignment)); return std::make_pair(m_instancing_buffer_ring_info.map(constants_data_table_offset, size), size); }); @@ -2105,7 +2106,7 @@ void VKGSRender::load_program_env() auto alloc_storage = [&](usz size) -> std::pair { const auto alignment = m_device->gpu().get_limits().minUniformBufferOffsetAlignment; - mem_offset = m_transform_constants_ring_info.alloc<1>(utils::align(size, alignment)); + mem_offset = m_transform_constants_ring_info.alloc<1>(rx::alignUp(size, alignment)); return std::make_pair(m_transform_constants_ring_info.map(mem_offset, size), size); }; @@ -2921,7 +2922,7 @@ void VKGSRender::get_occlusion_query_result(rsx::reports::occlusion_query_info* } rsx_log.warning("[Performance warning] Unexpected ZCULL read caused a hard sync"); - busy_wait(); + rx::busy_wait(); } data.sync(); diff --git a/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp b/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp index c5b80ccbd..beaaa7a9f 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp +++ b/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp @@ -8,7 +8,7 @@ #include "Emu/RSX/rsx_utils.h" #include "Emu/RSX/rsx_cache.h" #include "util/mutex.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include #include @@ -289,7 +289,7 @@ namespace vk { while (num_waiters.load() != 0) { - utils::pause(); + rx::pause(); } } 
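For reference, the renames applied across these hunks are mechanical: each utils:: helper from the removed util/asm.hpp and util/tsc.hpp has a same-behaviour rx:: counterpart (utils::align -> rx::alignUp, utils::aligned_div -> rx::aligned_div, busy_wait/utils::pause -> rx::busy_wait/rx::pause, utils::get_tsc -> rx::get_tsc). The following is a minimal usage sketch, not taken from the patch; it assumes only the rx:: declarations visible elsewhere in this diff (rx/align.hpp, rx/asm.hpp, rx/tsc.hpp), and the numeric arguments are made up for illustration.

    #include "rx/align.hpp" // rx::alignUp, rx::alignDown   (replaces utils::align)
    #include "rx/asm.hpp"   // rx::aligned_div, rx::pause, rx::busy_wait (replaces util/asm.hpp)
    #include "rx/tsc.hpp"   // rx::get_tsc                   (replaces util/tsc.hpp)

    static void rx_helpers_example()
    {
        // was: utils::align(size, 0x10000) -- round a size up to a 64 KiB boundary
        const u64 aligned_size = rx::alignUp(u64{0x12345}, 0x10000u); // == 0x20000

        // was: utils::aligned_div(count, group) -- division rounded up, not truncated
        const u32 invocations = rx::aligned_div(1000u, 64u); // == 16

        // was: busy_wait(500) -- spin for roughly 500 TSC cycles, issuing pause hints
        const u64 start = rx::get_tsc();
        rx::busy_wait(500);
        const u64 spent = rx::get_tsc() - start; // expected to be >= 500

        (void)aligned_size; (void)invocations; (void)spent;
    }
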
diff --git a/rpcs3/Emu/RSX/VK/VKPresent.cpp b/rpcs3/Emu/RSX/VK/VKPresent.cpp index 8e8c8377c..8344be6c4 100644 --- a/rpcs3/Emu/RSX/VK/VKPresent.cpp +++ b/rpcs3/Emu/RSX/VK/VKPresent.cpp @@ -8,7 +8,8 @@ #include "upscalers/bilinear_pass.hpp" #include "upscalers/fsr_pass.h" #include "upscalers/nearest_pass.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" #include "util/video_provider.h" extern atomic_t g_user_asked_for_screenshot; @@ -762,7 +763,7 @@ void VKGSRender::flip(const rsx::display_flip_info_t& info) { const usz sshot_size = buffer_height * buffer_width * 4; - vk::buffer sshot_vkbuf(*m_device, utils::align(sshot_size, 0x100000), m_device->get_memory_mapping().host_visible_coherent, + vk::buffer sshot_vkbuf(*m_device, rx::alignUp(sshot_size, 0x100000), m_device->get_memory_mapping().host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0, VMM_ALLOCATION_POOL_UNDEFINED); VkBufferImageCopy copy_info; diff --git a/rpcs3/Emu/RSX/VK/VKQueryPool.cpp b/rpcs3/Emu/RSX/VK/VKQueryPool.cpp index f786495c0..ca154a212 100644 --- a/rpcs3/Emu/RSX/VK/VKQueryPool.cpp +++ b/rpcs3/Emu/RSX/VK/VKQueryPool.cpp @@ -4,7 +4,7 @@ #include "VKQueryPool.h" #include "VKRenderPass.h" #include "VKResourceManager.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "VKGSRender.h" namespace vk @@ -172,7 +172,7 @@ namespace vk while (!query_info.ready) { - utils::pause(); + rx::pause(); poke_query(query_info, index, result_flags); } } diff --git a/rpcs3/Emu/RSX/VK/VKResolveHelper.h b/rpcs3/Emu/RSX/VK/VKResolveHelper.h index 05148de93..dc274a4a5 100644 --- a/rpcs3/Emu/RSX/VK/VKResolveHelper.h +++ b/rpcs3/Emu/RSX/VK/VKResolveHelper.h @@ -4,6 +4,7 @@ #include "VKOverlays.h" #include "vkutils/image.h" +#include "rx/align.hpp" namespace vk { @@ -65,8 +66,8 @@ namespace vk multisampled = msaa_image; resolve = resolve_image; - const u32 invocations_x = utils::align(resolve_image->width(), cs_wave_x) / cs_wave_x; - const u32 invocations_y = utils::align(resolve_image->height(), cs_wave_y) / cs_wave_y; + const u32 invocations_x = rx::alignUp(resolve_image->width(), cs_wave_x) / cs_wave_x; + const u32 invocations_y = rx::alignUp(resolve_image->height(), cs_wave_y) / cs_wave_y; compute_task::run(cmd, invocations_x, invocations_y, 1); } diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index d73669d5e..5981f5bbf 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -13,7 +13,8 @@ #include "../GCM.h" #include "../rsx_utils.h" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" namespace vk { @@ -94,7 +95,7 @@ namespace vk ensure(dst->size() >= allocation_end); const auto data_offset = u32(region.bufferOffset); - const auto z32_offset = utils::align(data_offset + packed16_length, 256); + const auto z32_offset = rx::alignUp(data_offset + packed16_length, 256); // 1. Copy the depth to buffer VkBufferImageCopy region2; @@ -148,8 +149,8 @@ namespace vk ensure(dst->size() >= allocation_end); const auto data_offset = u32(region.bufferOffset); - const auto z_offset = utils::align(data_offset + packed_length, 256); - const auto s_offset = utils::align(z_offset + in_depth_size, 256); + const auto z_offset = rx::alignUp(data_offset + packed_length, 256); + const auto s_offset = rx::alignUp(z_offset + in_depth_size, 256); // 1. 
Copy the depth and stencil blocks to separate banks VkBufferImageCopy sub_regions[2]; @@ -246,7 +247,7 @@ namespace vk ensure(src->size() >= allocation_end); const auto data_offset = u32(region.bufferOffset); - const auto z32_offset = utils::align(data_offset + packed16_length, 256); + const auto z32_offset = rx::alignUp(data_offset + packed16_length, 256); // 1. Pre-compute barrier vk::insert_buffer_memory_barrier(cmd, src->value, z32_offset, packed32_length, @@ -281,11 +282,11 @@ namespace vk ensure(src->size() >= allocation_end); // "Out of memory (compute heap). Lower your resolution scale setting." const auto data_offset = u32(region.bufferOffset); - const auto z_offset = utils::align(data_offset + packed_length, 256); - const auto s_offset = utils::align(z_offset + in_depth_size, 256); + const auto z_offset = rx::alignUp(data_offset + packed_length, 256); + const auto s_offset = rx::alignUp(z_offset + in_depth_size, 256); // Zero out the stencil block - VK_GET_SYMBOL(vkCmdFillBuffer)(cmd, src->value, s_offset, utils::align(in_stencil_size, 4), 0); + VK_GET_SYMBOL(vkCmdFillBuffer)(cmd, src->value, s_offset, rx::alignUp(in_stencil_size, 4), 0); vk::insert_buffer_memory_barrier(cmd, src->value, s_offset, in_stencil_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, @@ -848,7 +849,7 @@ namespace vk const auto src_offset = section.bufferOffset; // Align output to 128-byte boundary to keep some drivers happy - dst_offset = utils::align(dst_offset, 128); + dst_offset = rx::alignUp(dst_offset, 128); u32 data_length = 0; for (unsigned i = 0, j = packet.first; i < packet.second; ++i, ++j) @@ -1124,7 +1125,7 @@ namespace vk if (layout.level == 0) { // Align mip0 on a 128-byte boundary - scratch_offset = utils::align(scratch_offset, 128); + scratch_offset = rx::alignUp(scratch_offset, 128); } // Copy from upload heap to scratch mem @@ -1254,7 +1255,7 @@ namespace vk { // Calculate the true length of the usable memory section const auto available_tile_size = tiled_region.tile->size - (range.start - tiled_region.base_address); - const auto max_content_size = tiled_region.tile->pitch * utils::align(height, 64); + const auto max_content_size = tiled_region.tile->pitch * rx::alignUp(height, 64); const auto section_length = std::min(max_content_size, available_tile_size); // Sync the DMA layer diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index bf2cd110e..72b6bcf40 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -4,7 +4,7 @@ #include "VKCompute.h" #include "VKAsyncScheduler.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" namespace vk { @@ -450,7 +450,7 @@ namespace vk // Dimensions were given in 'dst' space. 
Work out the real source coordinates const auto src_bpp = vk::get_format_texel_width(section.src->format()); src_x = (src_x * dst_bpp) / src_bpp; - src_w = utils::aligned_div(src_w * dst_bpp, src_bpp); + src_w = rx::aligned_div(src_w * dst_bpp, src_bpp); transform &= ~(rsx::surface_transform::coordinate_transform); } diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index 2aa6e221f..b4b05be4a 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -4,11 +4,14 @@ #include "VKRenderTargets.h" #include "VKResourceManager.h" #include "VKRenderPass.h" +#include "VKGSRenderTypes.hpp" #include "vkutils/image_helpers.h" #include "../Common/texture_cache.h" #include "../Common/tiled_dma_copy.hpp" +#include "rx/align.hpp" + #include #include @@ -289,7 +292,7 @@ namespace vk if (tiled_region) { const auto available_tile_size = tiled_region.tile->size - (range.start - tiled_region.base_address); - const auto max_content_size = tiled_region.tile->pitch * utils::align(height, 64); + const auto max_content_size = tiled_region.tile->pitch * rx::alignUp(height, 64); flush_length = std::min(max_content_size, available_tile_size); } diff --git a/rpcs3/Emu/RSX/VK/upscalers/fsr1/fsr_pass.cpp b/rpcs3/Emu/RSX/VK/upscalers/fsr1/fsr_pass.cpp index 1cea37d5e..e08195329 100644 --- a/rpcs3/Emu/RSX/VK/upscalers/fsr1/fsr_pass.cpp +++ b/rpcs3/Emu/RSX/VK/upscalers/fsr1/fsr_pass.cpp @@ -117,8 +117,8 @@ namespace vk configure(cmd); constexpr auto wg_size = 16; - const auto invocations_x = utils::aligned_div(output_size.width, wg_size); - const auto invocations_y = utils::aligned_div(output_size.height, wg_size); + const auto invocations_x = rx::aligned_div(output_size.width, wg_size); + const auto invocations_y = rx::aligned_div(output_size.height, wg_size); ensure(invocations_x == (output_size.width + (wg_size - 1)) / wg_size); ensure(invocations_y == (output_size.height + (wg_size - 1)) / wg_size); diff --git a/rpcs3/Emu/RSX/VK/vkutils/data_heap.cpp b/rpcs3/Emu/RSX/VK/vkutils/data_heap.cpp index d68b4ca55..a71b095b4 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/data_heap.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/data_heap.cpp @@ -6,6 +6,7 @@ #include "../VKHelpers.h" #include "../VKResourceManager.h" #include "Emu/IdManager.h" +#include "rx/align.hpp" #include @@ -60,7 +61,7 @@ namespace vk // Create new heap. 
All sizes are aligned up by 64M, upto 1GiB const usz size_limit = 1024 * 0x100000; - usz aligned_new_size = utils::align(m_size + size, 64 * 0x100000); + usz aligned_new_size = rx::alignUp(m_size + size, 64 * 0x100000); if (aligned_new_size >= size_limit) { diff --git a/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp b/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp index 9d369d2fa..346f51f9e 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/scratch.cpp @@ -4,7 +4,8 @@ #include "../VKResourceManager.h" -#include +#include +#include namespace vk { @@ -123,8 +124,8 @@ namespace vk { auto create_texture = [&]() { - u32 new_width = utils::align(requested_width, 256u); - u32 new_height = utils::align(requested_height, 256u); + u32 new_width = rx::alignUp(requested_width, 256u); + u32 new_height = rx::alignUp(requested_height, 256u); return new vk::image(*g_render_device, g_render_device->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_IMAGE_TYPE_2D, format, new_width, new_height, 1, 1, 1, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, @@ -165,7 +166,7 @@ namespace vk if (!scratch_buffer) { // Choose optimal size - const u64 alloc_size = utils::align(min_required_size, 0x100000); + const u64 alloc_size = rx::alignUp(min_required_size, 0x100000); scratch_buffer = std::make_unique(*g_render_device, alloc_size, g_render_device->get_memory_mapping().device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, @@ -184,7 +185,7 @@ namespace vk if (init_mem || zero_memory) { // Zero-initialize the allocated VRAM - const u64 zero_length = init_mem ? buf->size() : utils::align(min_required_size, 4); + const u64 zero_length = init_mem ? buf->size() : rx::alignUp(min_required_size, 4); VK_GET_SYMBOL(vkCmdFillBuffer)(cmd, buf->value, 0, zero_length, 0); insert_buffer_memory_barrier(cmd, buf->value, 0, zero_length, diff --git a/rpcs3/Emu/RSX/VK/vkutils/sync.cpp b/rpcs3/Emu/RSX/VK/vkutils/sync.cpp index 0f851d7b1..6df7e3334 100644 --- a/rpcs3/Emu/RSX/VK/vkutils/sync.cpp +++ b/rpcs3/Emu/RSX/VK/vkutils/sync.cpp @@ -9,7 +9,7 @@ #include "Emu/Cell/timers.hpp" #include "util/sysinfo.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" namespace vk { @@ -170,7 +170,7 @@ namespace vk { while (!flushed) { - utils::pause(); + rx::pause(); } } @@ -553,7 +553,7 @@ namespace vk switch (status) { case VK_NOT_READY: - utils::pause(); + rx::pause(); continue; default: die_with_error(status); @@ -592,7 +592,7 @@ namespace vk if (timeout) { - const auto now = freq ? utils::get_tsc() : get_system_time(); + const auto now = freq ? 
rx::get_tsc() : get_system_time(); if (!start) { @@ -608,7 +608,7 @@ namespace vk } } - utils::pause(); + rx::pause(); } } } // namespace vk diff --git a/rpcs3/Emu/System.cpp b/rpcs3/Emu/System.cpp index daa6c62cf..b30a6dad3 100644 --- a/rpcs3/Emu/System.cpp +++ b/rpcs3/Emu/System.cpp @@ -52,6 +52,7 @@ #include "util/logs.hpp" #include "util/init_mutex.hpp" #include "util/sysinfo.hpp" +#include "rx/asm.hpp" #include #include @@ -1266,7 +1267,7 @@ game_boot_result Emulator::Load(const std::string& title_id, bool is_disc_patch, auto load_tar = [&](const std::string& path, const std::string& special_file) { const usz size = m_ar->pop(); - const usz max_data_size = m_ar->get_size(utils::add_saturate(size, m_ar->pos)); + const usz max_data_size = m_ar->get_size(rx::add_saturate(size, m_ar->pos)); if (size % 512 || max_data_size < size || max_data_size - size < m_ar->pos) { @@ -3687,7 +3688,7 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s { // Write merged TTY output after emulation has been safely stopped - if (usz attempted_read_size = utils::sub_saturate(g_tty.pos(), m_tty_file_init_pos)) + if (usz attempted_read_size = rx::sub_saturate(g_tty.pos(), m_tty_file_init_pos)) { if (fs::file tty_read_fd{fs::get_log_dir() + "TTY.log"}) { @@ -3731,7 +3732,7 @@ void Emulator::Kill(bool allow_autoexit, bool savestate, savestate_stage* save_s { std::string_view to_log = not_logged; to_log = to_log.substr(0, 0x8000); - to_log = to_log.substr(0, utils::add_saturate(to_log.rfind("\n========== SPU BLOCK"sv), 1)); + to_log = to_log.substr(0, rx::add_saturate(to_log.rfind("\n========== SPU BLOCK"sv), 1)); to_remove = to_log.size(); std::string new_log(to_log); diff --git a/rpcs3/Emu/perf_meter.cpp b/rpcs3/Emu/perf_meter.cpp index f8b2dd243..c677ed7b3 100644 --- a/rpcs3/Emu/perf_meter.cpp +++ b/rpcs3/Emu/perf_meter.cpp @@ -3,7 +3,7 @@ #include "util/sysinfo.hpp" #include "util/fence.hpp" -#include "util/tsc.hpp" +#include "rx/tsc.hpp" #include "util/Thread.h" #include "util/mutex.h" @@ -75,7 +75,7 @@ SAFE_BUFFERS(void) perf_stat_base::push(u64 data[66], u64 start_time, const char* name) noexcept { // Event end - const u64 end_time = (utils::lfence(), utils::get_tsc()); + const u64 end_time = (utils::lfence(), rx::get_tsc()); // Compute difference in seconds const f64 diff = (end_time - start_time) * 1. 
/ utils::get_tsc_freq(); diff --git a/rpcs3/Emu/perf_meter.hpp b/rpcs3/Emu/perf_meter.hpp index d06e0f63c..88dadda22 100644 --- a/rpcs3/Emu/perf_meter.hpp +++ b/rpcs3/Emu/perf_meter.hpp @@ -2,7 +2,7 @@ #include "util/types.hpp" #include "util/logs.hpp" -#include "util/tsc.hpp" +#include "rx/tsc.hpp" #include "system_config.h" #include #include @@ -146,7 +146,7 @@ public: if constexpr (std::array{(SubEvents == Event)...}[Index]) { // Push actual timestamp into an array - m_timestamps[Index + 1] = utils::get_tsc(); + m_timestamps[Index + 1] = rx::get_tsc(); } else if constexpr (Index < sizeof...(SubEvents)) { @@ -170,7 +170,7 @@ public: // Re-initialize first timestamp FORCE_INLINE SAFE_BUFFERS(void) restart() noexcept { - m_timestamps[0] = utils::get_tsc(); + m_timestamps[0] = rx::get_tsc(); std::memset(m_timestamps + 1, 0, sizeof(m_timestamps) - sizeof(u64)); } diff --git a/rpcs3/Emu/system_progress.cpp b/rpcs3/Emu/system_progress.cpp index 780deb81a..e3f001baa 100644 --- a/rpcs3/Emu/system_progress.cpp +++ b/rpcs3/Emu/system_progress.cpp @@ -8,7 +8,7 @@ #include "Emu/RSX/Overlays/overlay_compile_notification.h" #include "Emu/System.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" LOG_CHANNEL(sys_log, "SYS"); @@ -226,7 +226,7 @@ void progress_dialog_server::operator()() if (pdone < ptotal && g_cfg.misc.show_ppu_compilation_hint) { const u64 passed_usec = (get_system_time() - start_time); - const u64 remaining_usec = pdone ? utils::rational_mul(passed_usec, static_cast(ptotal) - pdone, pdone) : (passed_usec * ptotal); + const u64 remaining_usec = pdone ? rx::rational_mul(passed_usec, static_cast(ptotal) - pdone, pdone) : (passed_usec * ptotal); // Only show compile notification if we estimate at least 100ms if (remaining_usec >= 100'000ULL) @@ -260,7 +260,7 @@ void progress_dialog_server::operator()() // Assume not all programs were found if files were not compiled (as it may contain more) const bool use_bits = fknown_bits && ftotal_bits; const u64 known_files = use_bits ? fknown_bits : ftotal; - const u64 total = utils::rational_mul(std::max(ptotal, 1), std::max(use_bits ? ftotal_bits : ftotal, 1), std::max(known_files, 1)); + const u64 total = rx::rational_mul(std::max(ptotal, 1), std::max(use_bits ? ftotal_bits : ftotal, 1), std::max(known_files, 1)); const u64 done = pdone; const u32 value = static_cast(done >= total ? 
100 : done * 100 / total); @@ -280,7 +280,7 @@ void progress_dialog_server::operator()() if (of_1000 >= 2) { const u64 passed = (get_system_time() - start_time); - const u64 total = utils::rational_mul(passed, 1000, of_1000); + const u64 total = rx::rational_mul(passed, 1000, of_1000); const u64 remaining = total - passed; // Stabilize the result by using the maximum one from the recent history diff --git a/rpcs3/Loader/PSF.cpp b/rpcs3/Loader/PSF.cpp index 4fe0897cc..eb8397e90 100644 --- a/rpcs3/Loader/PSF.cpp +++ b/rpcs3/Loader/PSF.cpp @@ -1,7 +1,8 @@ #include "stdafx.h" #include "PSF.h" -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" #include LOG_CHANNEL(psf_log, "PSF"); @@ -307,7 +308,7 @@ namespace psf } // Align next section (data) offset - key_offset = utils::align(key_offset, 4); + key_offset = rx::alignUp(key_offset, 4); // Generate header header_t header{}; diff --git a/rpcs3/Loader/TAR.cpp b/rpcs3/Loader/TAR.cpp index aba96d424..7d164e317 100644 --- a/rpcs3/Loader/TAR.cpp +++ b/rpcs3/Loader/TAR.cpp @@ -7,8 +7,8 @@ #include "TAR.h" -#include "util/asm.hpp" - +#include "rx/asm.hpp" +#include "rx/align.hpp" #include "util/serialization_ext.hpp" #include @@ -164,7 +164,7 @@ std::unique_ptr tar_object::get_file(const std::string& path, std const u64 size = emplace_single_entry(largest_offset, m_ar->get_size(umax) - m_ar_tar_start).first; // Advance offset to next block - largest_offset += utils::align(size, 512); + largest_offset += rx::alignUp(size, 512); } // Continue scanning from last file entered else if (m_file) @@ -181,7 +181,7 @@ std::unique_ptr tar_object::get_file(const std::string& path, std } // Advance offset to next block - largest_offset += utils::align(size, 512); + largest_offset += rx::alignUp(size, 512); if (!path.empty() && path == filename) { @@ -408,7 +408,7 @@ void tar_object::save_directory(const std::string& target_path, utils::serial& a return; } - ptr += utils::aligned_div(static_cast(std::bit_width(i)), 3) - 1; + ptr += rx::aligned_div(static_cast(std::bit_width(i)), 3) - 1; for (; i; ptr--, i /= 8) { @@ -425,7 +425,7 @@ void tar_object::save_directory(const std::string& target_path, utils::serial& a if (is_null && !func) { - ar.pos += utils::align(file_stat.size, 512); + ar.pos += rx::alignUp(file_stat.size, 512); return; } @@ -458,7 +458,7 @@ void tar_object::save_directory(const std::string& target_path, utils::serial& a if (is_null) { // Align - ar.pos += utils::align(ar.pos - old_pos, 512); + ar.pos += rx::alignUp(ar.pos - old_pos, 512); return; } } @@ -485,7 +485,7 @@ void tar_object::save_directory(const std::string& target_path, utils::serial& a // Align const usz diff = ar.pos - old_pos; - ar.data.resize(ar.data.size() + utils::align(diff, 512) - diff); + ar.data.resize(ar.data.size() + rx::alignUp(diff, 512) - diff); ar.seek_end(); fd.close(); diff --git a/rpcs3/util/File.cpp b/rpcs3/util/File.cpp index c93c4a007..c41e82ebf 100644 --- a/rpcs3/util/File.cpp +++ b/rpcs3/util/File.cpp @@ -9,7 +9,8 @@ #include #include -#include "util/asm.hpp" +#include "rx/align.hpp" +#include "rx/asm.hpp" #include "util/coro.hpp" using namespace std::literals::string_literals; @@ -2386,7 +2387,7 @@ u64 fs::get_dir_size(const std::string& path, u64 rounding_alignment, atomic_t @@ -158,8 +159,8 @@ static u8* add_jit_memory(usz size, usz align) // Simple allocation by incrementing pointer to the next free data const u64 pos = Ctr.atomic_op([&](u64& ctr) -> u64 { - const u64 _pos = utils::align(ctr & 0xffff'ffff, align); - const u64 _new = 
utils::align(_pos + size, align); + const u64 _pos = rx::alignUp(ctr & 0xffff'ffff, align); + const u64 _new = rx::alignUp(_pos + size, align); if (_new > 0x40000000) [[unlikely]] { @@ -175,7 +176,7 @@ static u8* add_jit_memory(usz size, usz align) // Check the necessity to commit more memory if (_new > olda) [[unlikely]] { - newa = utils::align(_new, 0x200000); + newa = rx::alignUp(_new, 0x200000); } ctr += _new - (ctr & 0xffff'ffff); @@ -237,9 +238,9 @@ void* jit_runtime_base::_add(asmjit::CodeHolder* code, usz align) noexcept for (asmjit::Section* section : code->_sections) { - if (section->offset() + section->bufferSize() > utils::align(codeSize, align)) + if (section->offset() + section->bufferSize() > rx::alignUp(codeSize, align)) { - fmt::throw_exception("CodeHolder section exceeds range: Section->offset: 0x%x, Section->bufferSize: 0x%x, alloted-memory=0x%x", section->offset(), section->bufferSize(), utils::align(codeSize, align)); + fmt::throw_exception("CodeHolder section exceeds range: Section->offset: 0x%x, Section->bufferSize: 0x%x, alloted-memory=0x%x", section->offset(), section->bufferSize(), rx::alignUp(codeSize, align)); } std::memcpy(p + section->offset(), section->data(), section->bufferSize()); @@ -365,7 +366,7 @@ jit_runtime_base& asmjit::get_global_runtime() { return m_pos.atomic_op([&](uchar*& pos) -> uchar* { - const auto r = reinterpret_cast(utils::align(uptr(pos), align)); + const auto r = reinterpret_cast(rx::alignUp(uptr(pos), align)); if (r >= pos && r + size > pos && r + size <= m_max) { diff --git a/rpcs3/util/JITLLVM.cpp b/rpcs3/util/JITLLVM.cpp index 77c44272d..b67d303a3 100644 --- a/rpcs3/util/JITLLVM.cpp +++ b/rpcs3/util/JITLLVM.cpp @@ -7,7 +7,8 @@ #include "util/logs.hpp" #include "mutex.h" #include "util/vm.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" #include "Crypto/unzip.h" #include @@ -216,7 +217,7 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager ~MemoryManager1() override { // Hack: don't release to prevent reuse of address space, see jit_announce - // constexpr auto how_much = [](u64 pos) { return utils::align(pos, pos < c_page_size ? c_page_size / 4 : c_page_size); }; + // constexpr auto how_much = [](u64 pos) { return rx::alignUp(pos, pos < c_page_size ? c_page_size / 4 : c_page_size); }; // utils::memory_decommit(m_code_mems, how_much(code_ptr)); // utils::memory_decommit(m_data_ro_mems, how_much(data_ro_ptr)); // utils::memory_decommit(m_data_rw_mems, how_much(data_rw_ptr)); @@ -249,7 +250,7 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager { align = align ? 
align : 16; - const u64 sizea = utils::align(size, align); + const u64 sizea = rx::alignUp(size, align); if (!size || align > c_page_size || sizea > c_max_size || sizea < size) { @@ -259,7 +260,7 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager u64 oldp = alloc_pos; - u64 olda = utils::align(oldp, align); + u64 olda = rx::alignUp(oldp, align); ensure(olda >= oldp); ensure(olda < ~sizea); @@ -285,8 +286,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager // Optimization: split the first allocation to 512 KiB for single-module compilers if (oldp < c_page_size && align < page_quarter && (std::min(newp, c_page_size) - 1) / page_quarter != (oldp - 1) / page_quarter) { - const u64 pagea = utils::align(oldp, page_quarter); - const u64 psize = utils::align(std::min(newp, c_page_size) - pagea, page_quarter); + const u64 pagea = rx::alignUp(oldp, page_quarter); + const u64 psize = rx::alignUp(std::min(newp, c_page_size) - pagea, page_quarter); utils::memory_commit(reinterpret_cast(block) + (pagea % c_max_size), psize, prot); // Advance @@ -296,8 +297,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager if ((newp - 1) / c_page_size != (oldp - 1) / c_page_size) { // Allocate pages on demand - const u64 pagea = utils::align(oldp, c_page_size); - const u64 psize = utils::align(newp - pagea, c_page_size); + const u64 pagea = rx::alignUp(oldp, c_page_size); + const u64 psize = rx::alignUp(newp - pagea, c_page_size); utils::memory_commit(reinterpret_cast(block) + (pagea % c_max_size), psize, prot); } diff --git a/rpcs3/util/Thread.cpp b/rpcs3/util/Thread.cpp index 343bff14f..b9c909bbb 100644 --- a/rpcs3/util/Thread.cpp +++ b/rpcs3/util/Thread.cpp @@ -1,4 +1,6 @@ #include "stdafx.h" + +#include "rx/debug.hpp" #include "Emu/Cell/timers.hpp" #include "Emu/System.h" #include "Emu/Cell/SPUThread.h" @@ -88,7 +90,7 @@ DYNAMIC_IMPORT_RENAME("Kernel32.dll", SetThreadDescriptionImport, "SetThreadDesc #include "util/vm.hpp" #include "util/logs.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "util/v128.hpp" #include "util/simd.hpp" #include "util/sysinfo.hpp" @@ -141,74 +143,11 @@ std::string dump_useful_thread_info() return result; } -#ifndef _WIN32 -bool IsDebuggerPresent() -{ -#if defined(__APPLE__) || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) - int mib[] = { - CTL_KERN, - KERN_PROC, - KERN_PROC_PID, - getpid(), -#if defined(__NetBSD__) || defined(__OpenBSD__) - sizeof(struct kinfo_proc), - 1, -#endif - }; - u_int miblen = std::size(mib); - struct kinfo_proc info; - usz size = sizeof(info); - - if (sysctl(mib, miblen, &info, &size, NULL, 0)) - { - return false; - } - - return info.KP_FLAGS & P_TRACED; -#else - char buf[4096]; - fs::file status_fd("/proc/self/status"); - if (!status_fd) - { - std::fprintf(stderr, "Failed to open /proc/self/status\n"); - return false; - } - - const auto num_read = status_fd.read(buf, sizeof(buf) - 1); - if (num_read == 0 || num_read == umax) - { - std::fprintf(stderr, "Failed to read /proc/self/status (%d)\n", errno); - return false; - } - - buf[num_read] = '\0'; - std::string_view status = buf; - - const auto found = status.find("TracerPid:"); - if (found == umax) - { - std::fprintf(stderr, "Failed to find 'TracerPid:' in /proc/self/status\n"); - return false; - } - - for (const char* cp = status.data() + found + 10; cp <= status.data() + num_read; ++cp) - { - if (!std::isspace(*cp)) - { - return std::isdigit(*cp) != 0 && *cp != '0'; - } - } - - return false; -#endif -} -#endif - bool is_debugger_present() { 
if (g_cfg.core.external_debugger) return true; - return IsDebuggerPresent(); + return rx::isDebuggerPresent(); } #if defined(ARCH_X64) @@ -2071,7 +2010,7 @@ static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept sys_log.notice("\n%s", dump_useful_thread_info()); logs::listener::sync_all(); - if (IsDebuggerPresent()) + if (rx::isDebuggerPresent()) { // Convert to SIGTRAP raise(SIGTRAP); @@ -2091,7 +2030,7 @@ static void sigill_handler(int /*sig*/, siginfo_t* info, void* /*uct*/) noexcept sys_log.notice("\n%s", dump_useful_thread_info()); logs::listener::sync_all(); - if (IsDebuggerPresent()) + if (rx::isDebuggerPresent()) { // Convert to SIGTRAP raise(SIGTRAP); @@ -2140,7 +2079,7 @@ const bool s_exception_handler_set = []() -> bool std::abort(); } - std::printf("Debugger: %d\n", +IsDebuggerPresent()); + std::printf("Debugger: %d\n", +rx::isDebuggerPresent()); return true; }(); @@ -2150,10 +2089,10 @@ const bool s_terminate_handler_set = []() -> bool { std::set_terminate([]() { - if (IsDebuggerPresent()) + if (rx::isDebuggerPresent()) { logs::listener::sync_all(); - utils::trap(); + rx::breakpoint(); } report_fatal_error("RPCS3 has abnormally terminated."); @@ -2214,7 +2153,7 @@ void thread_base::initialize(void (*error_cb)()) { if (attempts == umax) { - g_tls_wait_time += utils::get_tsc() - stamp0; + g_tls_wait_time += rx::get_tsc() - stamp0; } else if (attempts > 1) { @@ -2246,7 +2185,7 @@ void thread_base::set_name(std::string name) }; // Set thread name for VS debugger - if (IsDebuggerPresent()) + if (rx::isDebuggerPresent()) [&]() NEVER_INLINE { THREADNAME_INFO info; @@ -2527,7 +2466,7 @@ void thread_ctrl::wait_for(u64 usec, [[maybe_unused]] bool alert /* true */) void thread_ctrl::wait_until(u64* wait_time, u64 add_time, u64 min_wait, bool update_to_current_time) { - *wait_time = utils::add_saturate(*wait_time, add_time); + *wait_time = rx::add_saturate(*wait_time, add_time); // TODO: Implement proper support for "waiting until" inside atomic wait engine const u64 current_time = get_system_time(); @@ -2546,7 +2485,7 @@ void thread_ctrl::wait_until(u64* wait_time, u64 add_time, u64 min_wait, bool up if (min_wait) { - *wait_time = std::max(*wait_time, utils::add_saturate(current_time, min_wait)); + *wait_time = std::max(*wait_time, rx::add_saturate(current_time, min_wait)); } wait_for(*wait_time - current_time); @@ -2588,7 +2527,7 @@ void thread_ctrl::wait_for_accurate(u64 usec) } else { - busy_wait(100); + rx::busy_wait(100); } const auto current = std::chrono::steady_clock::now(); @@ -2663,7 +2602,7 @@ bool thread_base::join(bool dtor) const // Hacked for too sleepy threads (1ms) TODO: make sure it's unneeded and remove const auto timeout = dtor && Emu.IsStopped() ? atomic_wait_timeout{1'000'000} : atomic_wait_timeout::inf; - auto stamp0 = utils::get_tsc(); + auto stamp0 = rx::get_tsc(); for (u64 i = 0; (m_sync & 3) <= 1; i++) { @@ -2676,7 +2615,7 @@ bool thread_base::join(bool dtor) const if (i >= 16 && !(i & (i - 1)) && timeout != atomic_wait_timeout::inf) { - sig_log.error("Thread [%s] is too sleepy. Waiting for it %.3fus already!", *m_tname.load(), (utils::get_tsc() - stamp0) / (utils::get_tsc_freq() / 1000000.)); + sig_log.error("Thread [%s] is too sleepy. 
Waiting for it %.3fus already!", *m_tname.load(), (rx::get_tsc() - stamp0) / (utils::get_tsc_freq() / 1000000.)); } } @@ -2764,7 +2703,7 @@ void thread_base::exec() for (thread_future* prev{};;) { - utils::prefetch_exec(prev_head->exec.load()); + rx::prefetch_exec(prev_head->exec.load()); if (auto next = prev_head->next.get()) { @@ -2836,7 +2775,7 @@ void thread_base::exec() logs::listener::sync_all(); - if (IsDebuggerPresent()) + if (rx::isDebuggerPresent()) { // Prevent repeatedly halting the debugger in case multiple threads crashed at once static atomic_t s_last_break = 0; @@ -2861,7 +2800,7 @@ void thread_base::exec() }) .second) { - utils::trap(); + rx::breakpoint(); } } diff --git a/rpcs3/util/asm.hpp b/rpcs3/util/asm.hpp deleted file mode 100644 index f9183da9c..000000000 --- a/rpcs3/util/asm.hpp +++ /dev/null @@ -1,476 +0,0 @@ -#pragma once - -#include "util/types.hpp" -#include "util/tsc.hpp" -#include "util/atomic.hpp" -#include - -extern bool g_use_rtm; -extern u64 g_rtm_tx_limit1; - -#ifdef _M_X64 -#ifdef _MSC_VER -extern "C" -{ - u32 _xbegin(); - void _xend(); - void _mm_pause(); - void _mm_prefetch(const char*, int); - void _m_prefetchw(const volatile void*); - - uchar _rotl8(uchar, uchar); - ushort _rotl16(ushort, uchar); - u64 __popcnt64(u64); - - s64 __mulh(s64, s64); - u64 __umulh(u64, u64); - - s64 _div128(s64, s64, s64, s64*); - u64 _udiv128(u64, u64, u64, u64*); - void __debugbreak(); -} -#include -#else -#include -#endif -#endif - -namespace utils -{ - // Transaction helper (result = pair of success and op result, or just bool) - template > - inline auto tx_start(F op) - { -#if defined(ARCH_X64) - uint status = -1; - - for (auto stamp0 = get_tsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = get_tsc()) - { -#ifndef _MSC_VER - __asm__ goto("xbegin %l[retry];" ::: "memory" : retry); -#else - status = _xbegin(); - - if (status != _XBEGIN_STARTED) [[unlikely]] - { - goto retry; - } -#endif - - if constexpr (std::is_void_v) - { - std::invoke(op); -#ifndef _MSC_VER - __asm__ volatile("xend;" ::: "memory"); -#else - _xend(); -#endif - return true; - } - else - { - auto result = std::invoke(op); -#ifndef _MSC_VER - __asm__ volatile("xend;" ::: "memory"); -#else - _xend(); -#endif - return std::make_pair(true, std::move(result)); - } - - retry: -#ifndef _MSC_VER - __asm__ volatile("movl %%eax, %0;" : "=r"(status)::"memory"); -#endif - if (!status) [[unlikely]] - { - break; - } - } -#else - static_cast(op); -#endif - - if constexpr (std::is_void_v) - { - return false; - } - else - { - return std::make_pair(false, R()); - } - }; - - // Try to prefetch to Level 2 cache since it's not split to data/code on most processors - template - constexpr void prefetch_exec(T func) - { - if (std::is_constant_evaluated()) - { - return; - } - - const u64 value = reinterpret_cast(func); - const void* ptr = reinterpret_cast(value); - -#ifdef _M_X64 - return _mm_prefetch(static_cast(ptr), _MM_HINT_T1); -#else - return __builtin_prefetch(ptr, 0, 2); -#endif - } - - // Try to prefetch to Level 1 cache - constexpr void prefetch_read(const void* ptr) - { - if (std::is_constant_evaluated()) - { - return; - } - -#ifdef _M_X64 - return _mm_prefetch(static_cast(ptr), _MM_HINT_T0); -#else - return __builtin_prefetch(ptr, 0, 3); -#endif - } - - constexpr void prefetch_write(void* ptr) - { - if (std::is_constant_evaluated()) - { - return; - } - -#if defined(_M_X64) && !defined(__clang__) - return _m_prefetchw(ptr); -#else - return __builtin_prefetch(ptr, 1, 0); -#endif - } - 
- constexpr u8 rol8(u8 x, u8 n) - { - if (std::is_constant_evaluated()) - { - return (x << (n & 7)) | (x >> ((-n & 7))); - } - -#ifdef _MSC_VER - return _rotl8(x, n); -#elif defined(__clang__) - return __builtin_rotateleft8(x, n); -#elif defined(ARCH_X64) - return __builtin_ia32_rolqi(x, n); -#else - return (x << (n & 7)) | (x >> ((-n & 7))); -#endif - } - - constexpr u16 rol16(u16 x, u16 n) - { - if (std::is_constant_evaluated()) - { - return (x << (n & 15)) | (x >> ((-n & 15))); - } - -#ifdef _MSC_VER - return _rotl16(x, static_cast(n)); -#elif defined(__clang__) - return __builtin_rotateleft16(x, n); -#elif defined(ARCH_X64) - return __builtin_ia32_rolhi(x, n); -#else - return (x << (n & 15)) | (x >> ((-n & 15))); -#endif - } - - constexpr u32 rol32(u32 x, u32 n) - { - if (std::is_constant_evaluated()) - { - return (x << (n & 31)) | (x >> (((0 - n) & 31))); - } - -#ifdef _MSC_VER - return _rotl(x, n); -#elif defined(__clang__) - return __builtin_rotateleft32(x, n); -#else - return (x << (n & 31)) | (x >> (((0 - n) & 31))); -#endif - } - - constexpr u64 rol64(u64 x, u64 n) - { - if (std::is_constant_evaluated()) - { - return (x << (n & 63)) | (x >> (((0 - n) & 63))); - } - -#ifdef _MSC_VER - return _rotl64(x, static_cast(n)); -#elif defined(__clang__) - return __builtin_rotateleft64(x, n); -#else - return (x << (n & 63)) | (x >> (((0 - n) & 63))); -#endif - } - - constexpr u32 popcnt64(u64 v) - { -#if !defined(_MSC_VER) || defined(__SSE4_2__) - if (std::is_constant_evaluated()) -#endif - { - v = (v & 0xaaaaaaaaaaaaaaaa) / 2 + (v & 0x5555555555555555); - v = (v & 0xcccccccccccccccc) / 4 + (v & 0x3333333333333333); - v = (v & 0xf0f0f0f0f0f0f0f0) / 16 + (v & 0x0f0f0f0f0f0f0f0f); - v = (v & 0xff00ff00ff00ff00) / 256 + (v & 0x00ff00ff00ff00ff); - v = ((v & 0xffff0000ffff0000) >> 16) + (v & 0x0000ffff0000ffff); - return static_cast((v >> 32) + v); - } - -#if !defined(_MSC_VER) || defined(__SSE4_2__) -#ifdef _MSC_VER - return static_cast(__popcnt64(v)); -#else - return __builtin_popcountll(v); -#endif -#endif - } - - constexpr u32 popcnt128(const u128& v) - { -#ifdef _MSC_VER - return popcnt64(v.lo) + popcnt64(v.hi); -#else - return popcnt64(v) + popcnt64(v >> 64); -#endif - } - - constexpr u64 umulh64(u64 x, u64 y) - { -#ifdef _MSC_VER - if (std::is_constant_evaluated()) -#endif - { - return static_cast((u128{x} * u128{y}) >> 64); - } - -#ifdef _MSC_VER - return __umulh(x, y); -#endif - } - - inline s64 mulh64(s64 x, s64 y) - { -#ifdef _MSC_VER - return __mulh(x, y); -#else - return (s128{x} * s128{y}) >> 64; -#endif - } - - inline s64 div128(s64 high, s64 low, s64 divisor, s64* remainder = nullptr) - { -#ifdef _MSC_VER - s64 rem = 0; - s64 r = _div128(high, low, divisor, &rem); - - if (remainder) - { - *remainder = rem; - } -#else - const s128 x = (u128{static_cast(high)} << 64) | u64(low); - const s128 r = x / divisor; - - if (remainder) - { - *remainder = x % divisor; - } -#endif - return r; - } - - inline u64 udiv128(u64 high, u64 low, u64 divisor, u64* remainder = nullptr) - { -#ifdef _MSC_VER - u64 rem = 0; - u64 r = _udiv128(high, low, divisor, &rem); - - if (remainder) - { - *remainder = rem; - } -#else - const u128 x = (u128{high} << 64) | low; - const u128 r = x / divisor; - - if (remainder) - { - *remainder = x % divisor; - } -#endif - return r; - } - -#ifdef _MSC_VER - inline u128 operator/(u128 lhs, u64 rhs) - { - u64 rem = 0; - return _udiv128(lhs.hi, lhs.lo, rhs, &rem); - } -#endif - - constexpr u32 ctz128(u128 arg) - { -#ifdef _MSC_VER - if (!arg.lo) - return 
std::countr_zero(arg.hi) + 64u; - else - return std::countr_zero(arg.lo); -#else - if (u64 lo = static_cast(arg)) - return std::countr_zero(lo); - else - return std::countr_zero(arg >> 64) + 64; -#endif - } - - constexpr u32 clz128(u128 arg) - { -#ifdef _MSC_VER - if (arg.hi) - return std::countl_zero(arg.hi); - else - return std::countl_zero(arg.lo) + 64; -#else - if (u64 hi = static_cast(arg >> 64)) - return std::countl_zero(hi); - else - return std::countl_zero(arg) + 64; -#endif - } - - inline void pause() - { -#if defined(ARCH_ARM64) - __asm__ volatile("yield"); -#elif defined(_M_X64) - _mm_pause(); -#elif defined(ARCH_X64) - __builtin_ia32_pause(); -#else -#error "Missing utils::pause() implementation" -#endif - } - - // Synchronization helper (cache-friendly busy waiting) - inline void busy_wait(usz cycles = 3000) - { - const u64 stop = get_tsc() + cycles; - do - pause(); - while (get_tsc() < stop); - } - - // Align to power of 2 - template - requires std::is_unsigned_v - constexpr std::make_unsigned_t> align(T value, U align) - { - return static_cast>>((value + (align - 1)) & (T{0} - align)); - } - - // General purpose aligned division, the result is rounded up not truncated - template - requires std::is_unsigned_v - constexpr T aligned_div(T value, std::type_identity_t align) - { - return static_cast(value / align + T{!!(value % align)}); - } - - // General purpose aligned division, the result is rounded to nearest - template - requires std::is_integral_v - constexpr T rounded_div(T value, std::type_identity_t align) - { - if constexpr (std::is_unsigned_v) - { - return static_cast(value / align + T{(value % align) > (align / 2)}); - } - - return static_cast(value / align + (value > 0 ? T{(value % align) > (align / 2)} : 0 - T{(value % align) < (align / 2)})); - } - - // Multiplying by ratio, semi-resistant to overflows - template - constexpr T rational_mul(T value, std::type_identity_t numerator, std::type_identity_t denominator) - { - if constexpr (sizeof(T) <= sizeof(u64) / 2) - { - return static_cast(value * u64{numerator} / u64{denominator}); - } - -#if is_u128_emulated - if constexpr (sizeof(T) <= sizeof(u128) / 2) - { - return static_cast(u128_from_mul(value, numerator) / u64{denominator}); - } -#endif - - return static_cast(value / denominator * numerator + (value % denominator) * numerator / denominator); - } - - template - constexpr T add_saturate(T addend1, T addend2) - { - return static_cast(~addend1) < addend2 ? T{umax} : static_cast(addend1 + addend2); - } - - template - constexpr T sub_saturate(T minuend, T subtrahend) - { - return minuend < subtrahend ? T{0} : static_cast(minuend - subtrahend); - } - - template - constexpr T mul_saturate(T factor1, T factor2) - { - return factor1 > 0 && T{umax} / factor1 < factor2 ? 
T{umax} : static_cast(factor1 * factor2); - } - - inline void trigger_write_page_fault(void* ptr) - { -#if defined(ARCH_X64) && !defined(_MSC_VER) - __asm__ volatile("lock orl $0, 0(%0)" ::"r"(ptr)); -#elif defined(ARCH_ARM64) && !defined(ANDROID) - u32 value = 0; - u32* u32_ptr = static_cast(ptr); - __asm__ volatile("ldset %w0, %w0, %1" : "+r"(value), "=Q"(*u32_ptr) : "r"(value)); -#else - *static_cast*>(ptr) += 0; -#endif - } - - inline void trap() - { -#ifdef _M_X64 - __debugbreak(); -#elif defined(ARCH_X64) - __asm__ volatile("int3"); -#elif defined(ARCH_ARM64) - __asm__ volatile("brk 0x42"); -#else -#error "Missing utils::trap() implementation" -#endif - } -} // namespace utils - -using utils::busy_wait; - -#ifdef _MSC_VER -using utils::operator/; -#endif diff --git a/rpcs3/util/atomic.cpp b/rpcs3/util/atomic.cpp index d12c8b8f7..a04ad8cf5 100644 --- a/rpcs3/util/atomic.cpp +++ b/rpcs3/util/atomic.cpp @@ -50,9 +50,9 @@ static bool has_waitv() #include #include -#include "asm.hpp" +#include "rx/asm.hpp" #include "endian.hpp" -#include "tsc.hpp" +#include "rx/tsc.hpp" // Total number of entries. static constexpr usz s_hashtable_size = 1u << 17; @@ -402,7 +402,7 @@ static u32 cond_alloc(uptr iptr, u32 tls_slot = -1) constexpr u128 max_mask = dup8(8192); // Leave only bits indicating sub-semaphore is full, find free one - const u32 pos = utils::ctz128(~val & max_mask); + const u32 pos = rx::ctz128(~val & max_mask); if (pos == 128) [[unlikely]] { @@ -422,7 +422,7 @@ static u32 cond_alloc(uptr iptr, u32 tls_slot = -1) { constexpr u128 max_mask = dup8(1024); - const u32 pos = utils::ctz128(~val & max_mask); + const u32 pos = rx::ctz128(~val & max_mask); val += u128{1} << (pos / 11 * 11); @@ -433,7 +433,7 @@ static u32 cond_alloc(uptr iptr, u32 tls_slot = -1) { constexpr u128 max_mask = dup8(64) | (dup8(64) << 56); - const u32 pos = utils::ctz128(~val & max_mask); + const u32 pos = rx::ctz128(~val & max_mask); val += u128{1} << (pos / 7 * 7); @@ -495,15 +495,15 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1) } // Call the destructor if necessary - utils::prefetch_write(s_cond_bits + cond_id / 64); + rx::prefetch_write(s_cond_bits + cond_id / 64); const u32 level3 = cond_id / 64 % 16; const u32 level2 = cond_id / 1024 % 8; const u32 level1 = cond_id / 8192 % 8; - utils::prefetch_write(s_cond_sem3 + level2); - utils::prefetch_write(s_cond_sem2 + level1); - utils::prefetch_write(&s_cond_sem1); + rx::prefetch_write(s_cond_sem3 + level2); + rx::prefetch_write(s_cond_sem2 + level1); + rx::prefetch_write(&s_cond_sem1); cond->destroy(); @@ -676,7 +676,7 @@ namespace u64 utils::get_unique_tsc() { - const u64 stamp0 = utils::get_tsc(); + const u64 stamp0 = rx::get_tsc(); if (!s_min_tsc.fetch_op([=](u64& tsc) { @@ -832,7 +832,7 @@ FORCE_INLINE auto root_info::slot_search(uptr iptr, F func) noexcept { if (u16 cond_id = _this->slots[std::countr_zero(bits)]) { - utils::prefetch_read(s_cond_list + cond_id); + rx::prefetch_read(s_cond_list + cond_id); cond_ids[cond_count++] = cond_id; } } diff --git a/rpcs3/util/bin_patch.cpp b/rpcs3/util/bin_patch.cpp index 70cff3c27..2432de965 100644 --- a/rpcs3/util/bin_patch.cpp +++ b/rpcs3/util/bin_patch.cpp @@ -9,7 +9,8 @@ #include "Emu/VFS.h" #include "util/types.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" +#include "rx/align.hpp" #include #include @@ -972,10 +973,10 @@ static usz apply_modification(std::vector& applied, patch_engine::patch_inf // Do not allow null address or if resultant ptr is not a VM ptr if (const u32 alloc_at = (p.offset & 
-4096); alloc_at >> 16) { - const u32 alloc_size = utils::align(static_cast(p.value.long_value) + alloc_at % 4096, 4096); + const u32 alloc_size = rx::alignUp(static_cast(p.value.long_value) + alloc_at % 4096, 4096); // Allocate map if needed, if allocated flags will indicate that bit 62 is set (unique identifier) - auto alloc_map = vm::reserve_map(vm::any, alloc_at & -0x10000, utils::align(alloc_size, 0x10000), vm::page_size_64k | (1ull << 62)); + auto alloc_map = vm::reserve_map(vm::any, alloc_at & -0x10000, rx::alignUp(alloc_size, 0x10000), vm::page_size_64k | (1ull << 62)); u64 flags = vm::alloc_unwritable; @@ -1106,7 +1107,7 @@ static usz apply_modification(std::vector& applied, patch_engine::patch_inf } case patch_type::c_utf8: { - memory_size = utils::add_saturate(::size32(p.original_value), 1); + memory_size = rx::add_saturate(::size32(p.original_value), 1); break; } case patch_type::move_file: @@ -1165,7 +1166,7 @@ static usz apply_modification(std::vector& applied, patch_engine::patch_inf continue; } - const u32 alloc_size = utils::align(static_cast(p.value.long_value + 1) * 4, 0x10000); + const u32 alloc_size = rx::alignUp(static_cast(p.value.long_value + 1) * 4, 0x10000); // Check if should maybe reuse previous code cave allocation (0 size) if (alloc_size - 4 != 0) diff --git a/rpcs3/util/cfmt.h b/rpcs3/util/cfmt.h index cca564c15..0228df10f 100644 --- a/rpcs3/util/cfmt.h +++ b/rpcs3/util/cfmt.h @@ -3,7 +3,7 @@ #include "util/types.hpp" #include #include -#include "util/asm.hpp" +#include "rx/asm.hpp" /* C-style format parser. Appends formatted string to `out`, returns number of characters written. @@ -59,7 +59,7 @@ usz cfmt_append(Dst& out, const Char* fmt, Src&& src) { if constexpr (sizeof(value) == 16) { - out.resize(out.size() + std::max(min_num, 129 / 3 - (utils::clz128(value | 1) + 1) / 3), '0'); + out.resize(out.size() + std::max(min_num, 129 / 3 - (rx::clz128(value | 1) + 1) / 3), '0'); } else { @@ -77,7 +77,7 @@ usz cfmt_append(Dst& out, const Char* fmt, Src&& src) { if constexpr (sizeof(value) == 16) { - out.resize(out.size() + std::max(min_num, 128 / 4 - utils::clz128(value | 1) / 4), '0'); + out.resize(out.size() + std::max(min_num, 128 / 4 - rx::clz128(value | 1) / 4), '0'); } else { diff --git a/rpcs3/util/cpu_stats.cpp b/rpcs3/util/cpu_stats.cpp index 1fe48c969..69814e65f 100644 --- a/rpcs3/util/cpu_stats.cpp +++ b/rpcs3/util/cpu_stats.cpp @@ -7,7 +7,7 @@ #include #ifdef _WIN32 -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "windows.h" #include "tlhelp32.h" #ifdef _MSC_VER @@ -148,7 +148,7 @@ namespace utils status = PdhGetFormattedCounterArray(m_cpu_cores, PDH_FMT_DOUBLE, &dwBufferSize, &dwItemCount, nullptr); if (static_cast(PDH_MORE_DATA) == status) { - std::vector items(utils::aligned_div(dwBufferSize, sizeof(PDH_FMT_COUNTERVALUE_ITEM))); + std::vector items(rx::aligned_div(dwBufferSize, sizeof(PDH_FMT_COUNTERVALUE_ITEM))); if (items.size() >= dwItemCount) { status = PdhGetFormattedCounterArray(m_cpu_cores, PDH_FMT_DOUBLE, &dwBufferSize, &dwItemCount, items.data()); diff --git a/rpcs3/util/mutex.cpp b/rpcs3/util/mutex.cpp index ae0582c49..9da107a7e 100644 --- a/rpcs3/util/mutex.cpp +++ b/rpcs3/util/mutex.cpp @@ -1,6 +1,6 @@ #include "mutex.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" void shared_mutex::imp_lock_shared(u32 val) { @@ -26,7 +26,7 @@ void shared_mutex::imp_lock_shared(u32 val) return; } - busy_wait(); + rx::busy_wait(); } // Acquire writer lock and downgrade @@ -96,7 +96,7 @@ void shared_mutex::imp_lock(u32 val) for (int i = 0; i 
< 10; i++) { - busy_wait(); + rx::busy_wait(); const u32 old = m_value; @@ -138,7 +138,7 @@ void shared_mutex::imp_lock_upgrade() { for (int i = 0; i < 10; i++) { - busy_wait(); + rx::busy_wait(); if (try_lock_upgrade()) { @@ -178,7 +178,7 @@ void shared_mutex::imp_lock_unlock() _max = val / c_one; - busy_wait(1500); + rx::busy_wait(1500); } // Lock and unlock diff --git a/rpcs3/util/sema.cpp b/rpcs3/util/sema.cpp index 193924a72..162b204c9 100644 --- a/rpcs3/util/sema.cpp +++ b/rpcs3/util/sema.cpp @@ -1,12 +1,12 @@ #include "sema.h" -#include "util/asm.hpp" +#include "rx/asm.hpp" void semaphore_base::imp_wait() { for (int i = 0; i < 10; i++) { - busy_wait(); + rx::busy_wait(); const u32 value = m_value.load(); diff --git a/rpcs3/util/serialization_ext.cpp b/rpcs3/util/serialization_ext.cpp index 8bdbcf2f2..ea5220730 100644 --- a/rpcs3/util/serialization_ext.cpp +++ b/rpcs3/util/serialization_ext.cpp @@ -1,6 +1,6 @@ #include "util/types.hpp" #include "util/logs.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "util/sysinfo.hpp" #include "util/endian.hpp" #include "util/lockless.h" @@ -112,7 +112,7 @@ bool uncompressed_serialization_file_handler::handle_file_op(utils::serial& ar, ar.data_offset = pos; } - const usz read_pre_buffer = ar.data.empty() ? 0 : utils::sub_saturate(ar.data_offset, pos); + const usz read_pre_buffer = ar.data.empty() ? 0 : rx::sub_saturate(ar.data_offset, pos); if (read_pre_buffer) { @@ -128,8 +128,8 @@ bool uncompressed_serialization_file_handler::handle_file_op(utils::serial& ar, // Adjustment to prevent overflow const usz subtrahend = ar.data.empty() ? 0 : 1; - const usz read_past_buffer = utils::sub_saturate(pos + (size - subtrahend), ar.data_offset + (ar.data.size() - subtrahend)); - const usz read_limit = utils::sub_saturate(ar.m_max_data, ar.data_offset); + const usz read_past_buffer = rx::sub_saturate(pos + (size - subtrahend), ar.data_offset + (ar.data.size() - subtrahend)); + const usz read_limit = rx::sub_saturate(ar.m_max_data, ar.data_offset); if (read_past_buffer) { @@ -410,7 +410,7 @@ bool compressed_serialization_file_handler::handle_file_op(utils::serial& ar, us // ar.seek_pos(pos); // } - const usz read_pre_buffer = utils::sub_saturate(ar.data_offset, pos); + const usz read_pre_buffer = rx::sub_saturate(ar.data_offset, pos); if (read_pre_buffer) { @@ -421,8 +421,8 @@ bool compressed_serialization_file_handler::handle_file_op(utils::serial& ar, us // Adjustment to prevent overflow const usz subtrahend = ar.data.empty() ? 0 : 1; - const usz read_past_buffer = utils::sub_saturate(pos + (size - subtrahend), ar.data_offset + (ar.data.size() - subtrahend)); - const usz read_limit = utils::sub_saturate(ar.m_max_data, ar.data_offset); + const usz read_past_buffer = rx::sub_saturate(pos + (size - subtrahend), ar.data_offset + (ar.data.size() - subtrahend)); + const usz read_limit = rx::sub_saturate(ar.m_max_data, ar.data_offset); if (read_past_buffer) { @@ -506,7 +506,7 @@ usz compressed_serialization_file_handler::read_at(utils::serial& ar, usz read_p m_stream_data_index = m_zs.avail_in ? 
m_zs.next_in - m_stream_data.data() : m_stream_data.size(); // Adjust again in case the values simply did not fit into uInt - m_zs.avail_out = adjust_for_uint(utils::sub_saturate(total_to_read, read_size)); + m_zs.avail_out = adjust_for_uint(rx::sub_saturate(total_to_read, read_size)); m_zs.avail_in = adjust_for_uint(m_stream_data.size() - m_stream_data_index); if (need_more_file_memory) @@ -779,7 +779,7 @@ usz compressed_serialization_file_handler::get_size(const utils::serial& ar, usz return memory_available; } - return std::max(utils::mul_saturate(m_file->size(), 6), memory_available); + return std::max(rx::mul_saturate(m_file->size(), 6), memory_available); } struct compressed_zstd_stream_data @@ -973,7 +973,7 @@ bool compressed_zstd_serialization_file_handler::handle_file_op(utils::serial& a // ar.seek_pos(pos); // } - const usz read_pre_buffer = utils::sub_saturate(ar.data_offset, pos); + const usz read_pre_buffer = rx::sub_saturate(ar.data_offset, pos); if (read_pre_buffer) { @@ -984,8 +984,8 @@ bool compressed_zstd_serialization_file_handler::handle_file_op(utils::serial& a // Adjustment to prevent overflow const usz subtrahend = ar.data.empty() ? 0 : 1; - const usz read_past_buffer = utils::sub_saturate(pos + (size - subtrahend), ar.data_offset + (ar.data.size() - subtrahend)); - const usz read_limit = utils::sub_saturate(ar.m_max_data, ar.data_offset); + const usz read_past_buffer = rx::sub_saturate(pos + (size - subtrahend), ar.data_offset + (ar.data.size() - subtrahend)); + const usz read_limit = rx::sub_saturate(ar.m_max_data, ar.data_offset); if (read_past_buffer) { @@ -1326,7 +1326,7 @@ usz compressed_zstd_serialization_file_handler::get_size(const utils::serial& ar } return recommended; - // return std::max(utils::mul_saturate(ZSTD_decompressBound(m_file->size()), 2), memory_available); + // return std::max(rx::mul_saturate(ZSTD_decompressBound(m_file->size()), 2), memory_available); } bool null_serialization_file_handler::handle_file_op(utils::serial&, usz, usz, const void*) diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp index e0b082fff..4c4a7ab9e 100644 --- a/rpcs3/util/simd.hpp +++ b/rpcs3/util/simd.hpp @@ -4,7 +4,7 @@ #include "util/types.hpp" #include "util/v128.hpp" #include "util/sysinfo.hpp" -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "util/JIT.h" #include diff --git a/rpcs3/util/sysinfo.cpp b/rpcs3/util/sysinfo.cpp index 1367ac661..c69f07c36 100644 --- a/rpcs3/util/sysinfo.cpp +++ b/rpcs3/util/sysinfo.cpp @@ -18,14 +18,14 @@ #include #ifndef __APPLE__ #include -#include +#include #endif #endif #include #include -#include "util/asm.hpp" +#include "rx/asm.hpp" #include "util/fence.hpp" #if defined(_M_X64) && defined(_MSC_VER) @@ -790,7 +790,7 @@ static constexpr ullong round_tsc(ullong val, ullong known_error) known_error /= 10; } - return utils::rounded_div(val, by) * by; + return rx::rounded_div(val, by) * by; } namespace utils @@ -898,7 +898,7 @@ static const bool s_tsc_freq_evaluated = []() -> bool { for (usz i = 0; i < retry_count; i++) { - const u64 rdtsc_read = (utils::lfence(), utils::get_tsc()); + const u64 rdtsc_read = (utils::lfence(), rx::get_tsc()); #ifdef _WIN32 LARGE_INTEGER ctr; QueryPerformanceCounter(&ctr); @@ -906,7 +906,7 @@ static const bool s_tsc_freq_evaluated = []() -> bool struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); #endif - const u64 rdtsc_read2 = (utils::lfence(), utils::get_tsc()); + const u64 rdtsc_read2 = (utils::lfence(), rx::get_tsc()); #ifdef _WIN32 const u64 timer_read = ctr.QuadPart - time_base; @@ 
diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp
index e0b082fff..4c4a7ab9e 100644
--- a/rpcs3/util/simd.hpp
+++ b/rpcs3/util/simd.hpp
@@ -4,7 +4,7 @@
 #include "util/types.hpp"
 #include "util/v128.hpp"
 #include "util/sysinfo.hpp"
-#include "util/asm.hpp"
+#include "rx/asm.hpp"
 #include "util/JIT.h"
 #include
diff --git a/rpcs3/util/sysinfo.cpp b/rpcs3/util/sysinfo.cpp
index 1367ac661..c69f07c36 100644
--- a/rpcs3/util/sysinfo.cpp
+++ b/rpcs3/util/sysinfo.cpp
@@ -18,14 +18,14 @@
 #include
 #ifndef __APPLE__
 #include
-#include
+#include
 #endif
 #endif
 #include
 #include
-#include "util/asm.hpp"
+#include "rx/asm.hpp"
 #include "util/fence.hpp"
 #if defined(_M_X64) && defined(_MSC_VER)
@@ -790,7 +790,7 @@ static constexpr ullong round_tsc(ullong val, ullong known_error)
        known_error /= 10;
    }
-   return utils::rounded_div(val, by) * by;
+   return rx::rounded_div(val, by) * by;
 }
 namespace utils
@@ -898,7 +898,7 @@ static const bool s_tsc_freq_evaluated = []() -> bool
 {
    for (usz i = 0; i < retry_count; i++)
    {
-       const u64 rdtsc_read = (utils::lfence(), utils::get_tsc());
+       const u64 rdtsc_read = (utils::lfence(), rx::get_tsc());
 #ifdef _WIN32
        LARGE_INTEGER ctr;
        QueryPerformanceCounter(&ctr);
@@ -906,7 +906,7 @@ static const bool s_tsc_freq_evaluated = []() -> bool
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
 #endif
-       const u64 rdtsc_read2 = (utils::lfence(), utils::get_tsc());
+       const u64 rdtsc_read2 = (utils::lfence(), rx::get_tsc());
 #ifdef _WIN32
        const u64 timer_read = ctr.QuadPart - time_base;
@@ -961,10 +961,10 @@ static const bool s_tsc_freq_evaluated = []() -> bool
        const u128 data = u128_from_mul(rdtsc_data[1] - rdtsc_data[0], timer_freq);
-       const u64 res = utils::udiv128(static_cast<u64>(data >> 64), static_cast<u64>(data), (timer_data[1] - timer_data[0]));
+       const u64 res = rx::udiv128(static_cast<u64>(data >> 64), static_cast<u64>(data), (timer_data[1] - timer_data[0]));
        // Rounding
-       return round_tsc(res, utils::mul_saturate(utils::add_saturate(rdtsc_diff[0], rdtsc_diff[1]), utils::aligned_div(timer_freq, timer_data[1] - timer_data[0])));
+       return round_tsc(res, rx::mul_saturate(rx::add_saturate(rdtsc_diff[0], rdtsc_diff[1]), rx::aligned_div(timer_freq, timer_data[1] - timer_data[0])));
    }();
    atomic_storage<u64>::store(utils::s_tsc_freq, cal_tsc);
diff --git a/rpcs3/util/tsc.hpp b/rpcs3/util/tsc.hpp
deleted file mode 100644
index 5b38dd7e7..000000000
--- a/rpcs3/util/tsc.hpp
+++ /dev/null
@@ -1,29 +0,0 @@
-#pragma once
-
-#include "util/types.hpp"
-
-#ifdef _M_X64
-#ifdef _MSC_VER
-extern "C" u64 __rdtsc();
-#else
-#include
-#endif
-#endif
-
-namespace utils
-{
-   inline u64 get_tsc()
-   {
-#if defined(ARCH_ARM64)
-       u64 r = 0;
-       __asm__ volatile("mrs %0, cntvct_el0" : "=r"(r));
-       return r;
-#elif defined(_M_X64)
-       return __rdtsc();
-#elif defined(ARCH_X64)
-       return __builtin_ia32_rdtsc();
-#else
-#error "Missing utils::get_tsc() implementation"
-#endif
-   }
-} // namespace utils
diff --git a/rpcs3/util/vm_native.cpp b/rpcs3/util/vm_native.cpp
index 188857a03..aa52bdaf2 100644
--- a/rpcs3/util/vm_native.cpp
+++ b/rpcs3/util/vm_native.cpp
@@ -1,6 +1,8 @@
 #include "stdafx.h"
 #include "util/vm.hpp"
-#include "util/asm.hpp"
+#include "rx/asm.hpp"
+#include "rx/align.hpp"
+
 #ifdef _WIN32
 #include "util/File.h"
 #include "util/dyn_lib.hpp"
@@ -492,7 +494,7 @@ namespace utils
    }
    shm::shm(u64 size, u32 flags)
-       : m_flags(flags), m_size(utils::align(size, 0x10000))
+       : m_flags(flags), m_size(rx::alignUp(size, 0x10000))
    {
 #ifdef _WIN32
        const ULARGE_INTEGER max_size{.QuadPart = m_size};
@@ -535,7 +537,7 @@ namespace utils
    }
    shm::shm(u64 size, const std::string& storage)
-       : m_size(utils::align(size, 0x10000))
+       : m_size(rx::alignUp(size, 0x10000))
    {
 #ifdef _WIN32
        fs::file f;
@@ -857,7 +859,7 @@ namespace utils
    {
        const u64 res64 = reinterpret_cast<u64>(::mmap(reinterpret_cast<void*>(ptr64), m_size + 0xf000, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0));
-       const u64 aligned = utils::align(res64, 0x10000);
+       const u64 aligned = rx::alignUp(res64, 0x10000);
        const auto result = ::mmap(reinterpret_cast<void*>(aligned), m_size, +prot, (cow ? MAP_PRIVATE : MAP_SHARED) | MAP_FIXED, m_file, 0);
        // Now cleanup remnants
diff --git a/rpcsx/thread.cpp b/rpcsx/thread.cpp
index f0511a168..1b26de1c0 100644
--- a/rpcsx/thread.cpp
+++ b/rpcsx/thread.cpp
@@ -155,7 +155,7 @@ handleSigUser(int sig, siginfo_t *info, void *ucontext) {
 std::size_t rx::thread::getSigAltStackSize() {
   static auto sigStackSize = std::max(
-      SIGSTKSZ, ::rx::alignUp(64 * 1024 * 1024, rx::mem::pageSize));
+      SIGSTKSZ, rx::alignUp(64u * 1024 * 1024, rx::mem::pageSize));
   return sigStackSize;
 }
@@ -199,7 +199,8 @@ bool rx::thread::invokeSignalHandler(orbis::Thread *thread, int guestSignal,
   guestContext->uc_mcontext.gregs[REG_RCX] = 0; // arg4, si_addr
   guestContext->uc_mcontext.gregs[REG_RIP] = handlerPtr;
-  guestContext->uc_mcontext.gregs[REG_RSP] = rx::alignDown(rsp, 16);
+  guestContext->uc_mcontext.gregs[REG_RSP] =
+      rx::alignDown(static_cast<std::uint64_t>(rsp), 16);
   return true;
 }
diff --git a/rx/include/rx/SharedAtomic.hpp b/rx/include/rx/SharedAtomic.hpp
index 6422f50f4..f45dae627 100644
--- a/rx/include/rx/SharedAtomic.hpp
+++ b/rx/include/rx/SharedAtomic.hpp
@@ -6,19 +6,10 @@
 #include
 #include
 #include
-#include
 #include
+#include "asm.hpp"
 namespace rx {
-inline void yield() { std::this_thread::yield(); }
-inline void relax() {
-#if defined(__GNUC__) && (defined __i386__ || defined __x86_64__)
-  __builtin_ia32_pause();
-#else
-  yield();
-#endif
-}
-
 static constexpr auto kRelaxSpinCount = 12;
 static constexpr auto kSpinCount = 16;
@@ -31,7 +22,7 @@ bool try_spin_wait(auto &&pred) {
    }
    if (i < kRelaxSpinCount) {
-     relax();
+     pause();
    } else {
      yield();
    }
diff --git a/rx/include/rx/align.hpp b/rx/include/rx/align.hpp
index e7e0ced38..0c16994c5 100644
--- a/rx/include/rx/align.hpp
+++ b/rx/include/rx/align.hpp
@@ -1,14 +1,21 @@
 #pragma once
-#include <cstdint>
+#include <type_traits>
 namespace rx {
-inline constexpr std::uint64_t alignUp(std::uint64_t value,
-                                       std::uint64_t alignment) {
-  return (value + (alignment - 1)) & ~(alignment - 1);
+template <typename T, typename U>
+  requires std::is_unsigned_v<T>
+inline constexpr std::make_unsigned_t<std::common_type_t<T, U>>
+alignUp(T value, U alignment) {
+  return static_cast<std::make_unsigned_t<std::common_type_t<T, U>>>(
+      (value + (alignment - 1)) & ~(alignment - 1));
 }
-inline constexpr std::uint64_t alignDown(std::uint64_t value,
-                                         std::uint64_t alignment) {
-  return value & ~(alignment - 1);
+
+template <typename T, typename U>
+  requires std::is_unsigned_v<T>
+inline constexpr std::make_unsigned_t<std::common_type_t<T, U>>
+alignDown(T value, U alignment) {
+  return static_cast<std::make_unsigned_t<std::common_type_t<T, U>>>(
+      value & ~(alignment - 1));
 }
 } // namespace rx
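A short usage sketch for the generalized alignUp/alignDown shown above (not part of the patch; reserve_size is a hypothetical helper, and the expected values assume the usual power-of-two alignment semantics):

#include <cstdint>
#include "rx/align.hpp"

// rx::alignUp(0x1234u, 0x10000u)    -> 0x10000 (next 64 KiB boundary)
// rx::alignUp(0x10000u, 0x10000u)   -> 0x10000 (already aligned)
// rx::alignDown(0x1ffffu, 0x10000u) -> 0x10000 (previous 64 KiB boundary)
std::uint64_t reserve_size(std::uint64_t requested) {
  // Mirrors the shm constructors above: round the mapping size up to 64 KiB.
  return rx::alignUp(requested, 0x10000u);
}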
diff --git a/rx/include/rx/asm.hpp b/rx/include/rx/asm.hpp
index 2faed84ea..0bea59587 100644
--- a/rx/include/rx/asm.hpp
+++ b/rx/include/rx/asm.hpp
@@ -1,6 +1,8 @@
 #pragma once
+#include "rx/tsc.hpp"
 #include "types.hpp"
+#include <thread>
 #include
 extern bool g_use_rtm;
@@ -275,6 +277,16 @@ inline void pause() {
 #endif
 }
+inline void yield() { std::this_thread::yield(); }
+
+// Synchronization helper (cache-friendly busy waiting)
+inline void busy_wait(usz cycles = 3000) {
+  const u64 stop = get_tsc() + cycles;
+  do
+    pause();
+  while (get_tsc() < stop);
+}
+
 // Align to power of 2
 template <typename T>
   requires std::is_unsigned_v<T>
@@ -312,12 +324,6 @@ constexpr T rational_mul(T value, std::type_identity_t<T> numerator,
    return static_cast<T>(value * u64{numerator} / u64{denominator});
  }
-#if is_u128_emulated
-  if constexpr (sizeof(T) <= sizeof(u128) / 2) {
-    return static_cast<T>(u128_from_mul(value, numerator) / u64{denominator});
-  }
-#endif
-
  return static_cast<T>(value / denominator * numerator + (value % denominator) * numerator / denominator);
 }
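The busy_wait added above centralizes the pause-based spin loops that were previously open-coded at call sites (for example the local busy_wait removed from rx/src/SharedMutex.cpp further down). A rough usage sketch, assuming busy_wait lives in namespace rx alongside pause() as the rest of rx/asm.hpp does; poll_flag is a hypothetical caller:

#include <atomic>
#include "rx/asm.hpp"

// Poll a flag for a bounded number of attempts before the caller falls back
// to a real wait (futex, condition variable, ...).
inline bool poll_flag(const std::atomic<bool> &flag, int attempts = 50) {
  for (int i = 0; i < attempts; i++) {
    if (flag.load(std::memory_order_acquire))
      return true;
    rx::busy_wait(500); // roughly 500 TSC ticks of pause-based spinning
  }
  return false;
}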
diff --git a/rx/include/rx/tsc.hpp b/rx/include/rx/tsc.hpp
new file mode 100644
index 000000000..f604b184c
--- /dev/null
+++ b/rx/include/rx/tsc.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "types.hpp"
+#include <cstdint>
+
+#ifdef _M_X64
+#ifdef _MSC_VER
+extern "C" std::uint64_t __rdtsc();
+#else
+#include
+#endif
+#endif
+
+namespace rx {
+inline std::uint64_t get_tsc() {
+#if defined(ARCH_ARM64)
+  std::uint64_t r = 0;
+  __asm__ volatile("mrs %0, cntvct_el0" : "=r"(r));
+  return r;
+#elif defined(_M_X64)
+  return __rdtsc();
+#elif defined(ARCH_X64)
+  return __builtin_ia32_rdtsc();
+#else
+#error "Missing rx::get_tsc() implementation"
+#endif
+}
+} // namespace rx
diff --git a/rx/include/rx/types.hpp b/rx/include/rx/types.hpp
index c545a3c57..ae4298642 100644
--- a/rx/include/rx/types.hpp
+++ b/rx/include/rx/types.hpp
@@ -106,20 +106,6 @@ template <typename F> fn_helper(F &&f) -> fn_helper<F>;
                        [[maybe_unused]] auto &&z, \
                        [[maybe_unused]] auto &&w) { return (__VA_ARGS__); })
-#if __cpp_lib_bit_cast < 201806L
-namespace std {
-template <typename To, typename From>
-[[nodiscard]] constexpr To bit_cast(const From &from) noexcept {
-  return __builtin_bit_cast(To, from);
-}
-} // namespace std
-#endif
-
-#if defined(__INTELLISENSE__) || (defined(__clang__) && (__clang_major__ <= 16))
-#define consteval constexpr
-#define constinit
-#endif
-
 // FIXME: move to ps3 kernel implementation
 using schar = signed char;
 using uchar = unsigned char;
@@ -206,9 +192,9 @@ public:
 };
 #if defined(ARCH_X64) && !defined(_MSC_VER)
-using __m128i = long long __attribute__((vector_size(16)));
-using __m128d = double __attribute__((vector_size(16)));
-using __m128 = float __attribute__((vector_size(16)));
+using __m128i = long long __attribute__((vector_size(16), aligned(16)));
+using __m128d = double __attribute__((vector_size(16), aligned(16)));
+using __m128 = float __attribute__((vector_size(16), aligned(16)));
 #endif
 #ifndef _MSC_VER
diff --git a/rx/src/SharedMutex.cpp b/rx/src/SharedMutex.cpp
index a7d7458ab..087696a73 100644
--- a/rx/src/SharedMutex.cpp
+++ b/rx/src/SharedMutex.cpp
@@ -1,14 +1,7 @@
 #include "SharedMutex.hpp"
+#include "asm.hpp"
 #include
 #include
-#include
-
-static void busy_wait(unsigned long long cycles = 3000) {
-  const auto stop = __builtin_ia32_rdtsc() + cycles;
-  do
-    _mm_pause();
-  while (__builtin_ia32_rdtsc() < stop);
-}
 namespace rx {
 void shared_mutex::impl_lock_shared(unsigned val) {