Moved TSC and asm utilities to rx

This commit is contained in:
DH 2025-10-05 19:28:03 +03:00
parent bd215fab92
commit 640df36c48
121 changed files with 706 additions and 1225 deletions

View file

@ -9,7 +9,8 @@
#include <map>
#include <iostream>
#include "util/asm.hpp"
#include "rx/align.hpp"
#include "rx/asm.hpp"
#include "util/coro.hpp"
using namespace std::literals::string_literals;
@ -2386,7 +2387,7 @@ u64 fs::get_dir_size(const std::string& path, u64 rounding_alignment, atomic_t<b
if (!entry.is_directory)
{
result += utils::align(entry.size, rounding_alignment);
result += rx::alignUp(entry.size, rounding_alignment);
}
else
{

View file

@ -5,9 +5,10 @@
#include "File.h"
#include "util/logs.hpp"
#include "util/vm.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/v128.hpp"
#include "util/simd.hpp"
#include "rx/align.hpp"
#ifdef __linux__
#include <unistd.h>
@ -158,8 +159,8 @@ static u8* add_jit_memory(usz size, usz align)
// Simple allocation by incrementing pointer to the next free data
const u64 pos = Ctr.atomic_op([&](u64& ctr) -> u64
{
const u64 _pos = utils::align(ctr & 0xffff'ffff, align);
const u64 _new = utils::align(_pos + size, align);
const u64 _pos = rx::alignUp(ctr & 0xffff'ffff, align);
const u64 _new = rx::alignUp(_pos + size, align);
if (_new > 0x40000000) [[unlikely]]
{
@ -175,7 +176,7 @@ static u8* add_jit_memory(usz size, usz align)
// Check the necessity to commit more memory
if (_new > olda) [[unlikely]]
{
newa = utils::align(_new, 0x200000);
newa = rx::alignUp(_new, 0x200000);
}
ctr += _new - (ctr & 0xffff'ffff);
@ -237,9 +238,9 @@ void* jit_runtime_base::_add(asmjit::CodeHolder* code, usz align) noexcept
for (asmjit::Section* section : code->_sections)
{
if (section->offset() + section->bufferSize() > utils::align<usz>(codeSize, align))
if (section->offset() + section->bufferSize() > rx::alignUp<usz>(codeSize, align))
{
fmt::throw_exception("CodeHolder section exceeds range: Section->offset: 0x%x, Section->bufferSize: 0x%x, alloted-memory=0x%x", section->offset(), section->bufferSize(), utils::align<usz>(codeSize, align));
fmt::throw_exception("CodeHolder section exceeds range: Section->offset: 0x%x, Section->bufferSize: 0x%x, alloted-memory=0x%x", section->offset(), section->bufferSize(), rx::alignUp<usz>(codeSize, align));
}
std::memcpy(p + section->offset(), section->data(), section->bufferSize());
@ -365,7 +366,7 @@ jit_runtime_base& asmjit::get_global_runtime()
{
return m_pos.atomic_op([&](uchar*& pos) -> uchar*
{
const auto r = reinterpret_cast<uchar*>(utils::align(uptr(pos), align));
const auto r = reinterpret_cast<uchar*>(rx::alignUp(uptr(pos), align));
if (r >= pos && r + size > pos && r + size <= m_max)
{

View file

@ -7,7 +7,8 @@
#include "util/logs.hpp"
#include "mutex.h"
#include "util/vm.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#include "Crypto/unzip.h"
#include <charconv>
@ -216,7 +217,7 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
~MemoryManager1() override
{
// Hack: don't release to prevent reuse of address space, see jit_announce
// constexpr auto how_much = [](u64 pos) { return utils::align(pos, pos < c_page_size ? c_page_size / 4 : c_page_size); };
// constexpr auto how_much = [](u64 pos) { return rx::alignUp(pos, pos < c_page_size ? c_page_size / 4 : c_page_size); };
// utils::memory_decommit(m_code_mems, how_much(code_ptr));
// utils::memory_decommit(m_data_ro_mems, how_much(data_ro_ptr));
// utils::memory_decommit(m_data_rw_mems, how_much(data_rw_ptr));
@ -249,7 +250,7 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
{
align = align ? align : 16;
const u64 sizea = utils::align(size, align);
const u64 sizea = rx::alignUp(size, align);
if (!size || align > c_page_size || sizea > c_max_size || sizea < size)
{
@ -259,7 +260,7 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
u64 oldp = alloc_pos;
u64 olda = utils::align(oldp, align);
u64 olda = rx::alignUp(oldp, align);
ensure(olda >= oldp);
ensure(olda < ~sizea);
@ -285,8 +286,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
// Optimization: split the first allocation to 512 KiB for single-module compilers
if (oldp < c_page_size && align < page_quarter && (std::min(newp, c_page_size) - 1) / page_quarter != (oldp - 1) / page_quarter)
{
const u64 pagea = utils::align(oldp, page_quarter);
const u64 psize = utils::align(std::min(newp, c_page_size) - pagea, page_quarter);
const u64 pagea = rx::alignUp(oldp, page_quarter);
const u64 psize = rx::alignUp(std::min(newp, c_page_size) - pagea, page_quarter);
utils::memory_commit(reinterpret_cast<u8*>(block) + (pagea % c_max_size), psize, prot);
// Advance
@ -296,8 +297,8 @@ struct MemoryManager1 : llvm::RTDyldMemoryManager
if ((newp - 1) / c_page_size != (oldp - 1) / c_page_size)
{
// Allocate pages on demand
const u64 pagea = utils::align(oldp, c_page_size);
const u64 psize = utils::align(newp - pagea, c_page_size);
const u64 pagea = rx::alignUp(oldp, c_page_size);
const u64 psize = rx::alignUp(newp - pagea, c_page_size);
utils::memory_commit(reinterpret_cast<u8*>(block) + (pagea % c_max_size), psize, prot);
}

View file

@ -1,4 +1,6 @@
#include "stdafx.h"
#include "rx/debug.hpp"
#include "Emu/Cell/timers.hpp"
#include "Emu/System.h"
#include "Emu/Cell/SPUThread.h"
@ -88,7 +90,7 @@ DYNAMIC_IMPORT_RENAME("Kernel32.dll", SetThreadDescriptionImport, "SetThreadDesc
#include "util/vm.hpp"
#include "util/logs.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/v128.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
@ -141,74 +143,11 @@ std::string dump_useful_thread_info()
return result;
}
#ifndef _WIN32
bool IsDebuggerPresent()
{
#if defined(__APPLE__) || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)
int mib[] = {
CTL_KERN,
KERN_PROC,
KERN_PROC_PID,
getpid(),
#if defined(__NetBSD__) || defined(__OpenBSD__)
sizeof(struct kinfo_proc),
1,
#endif
};
u_int miblen = std::size(mib);
struct kinfo_proc info;
usz size = sizeof(info);
if (sysctl(mib, miblen, &info, &size, NULL, 0))
{
return false;
}
return info.KP_FLAGS & P_TRACED;
#else
char buf[4096];
fs::file status_fd("/proc/self/status");
if (!status_fd)
{
std::fprintf(stderr, "Failed to open /proc/self/status\n");
return false;
}
const auto num_read = status_fd.read(buf, sizeof(buf) - 1);
if (num_read == 0 || num_read == umax)
{
std::fprintf(stderr, "Failed to read /proc/self/status (%d)\n", errno);
return false;
}
buf[num_read] = '\0';
std::string_view status = buf;
const auto found = status.find("TracerPid:");
if (found == umax)
{
std::fprintf(stderr, "Failed to find 'TracerPid:' in /proc/self/status\n");
return false;
}
for (const char* cp = status.data() + found + 10; cp <= status.data() + num_read; ++cp)
{
if (!std::isspace(*cp))
{
return std::isdigit(*cp) != 0 && *cp != '0';
}
}
return false;
#endif
}
#endif
bool is_debugger_present()
{
if (g_cfg.core.external_debugger)
return true;
return IsDebuggerPresent();
return rx::isDebuggerPresent();
}
#if defined(ARCH_X64)
@ -2071,7 +2010,7 @@ static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept
sys_log.notice("\n%s", dump_useful_thread_info());
logs::listener::sync_all();
if (IsDebuggerPresent())
if (rx::isDebuggerPresent())
{
// Convert to SIGTRAP
raise(SIGTRAP);
@ -2091,7 +2030,7 @@ static void sigill_handler(int /*sig*/, siginfo_t* info, void* /*uct*/) noexcept
sys_log.notice("\n%s", dump_useful_thread_info());
logs::listener::sync_all();
if (IsDebuggerPresent())
if (rx::isDebuggerPresent())
{
// Convert to SIGTRAP
raise(SIGTRAP);
@ -2140,7 +2079,7 @@ const bool s_exception_handler_set = []() -> bool
std::abort();
}
std::printf("Debugger: %d\n", +IsDebuggerPresent());
std::printf("Debugger: %d\n", +rx::isDebuggerPresent());
return true;
}();
@ -2150,10 +2089,10 @@ const bool s_terminate_handler_set = []() -> bool
{
std::set_terminate([]()
{
if (IsDebuggerPresent())
if (rx::isDebuggerPresent())
{
logs::listener::sync_all();
utils::trap();
rx::breakpoint();
}
report_fatal_error("RPCS3 has abnormally terminated.");
@ -2214,7 +2153,7 @@ void thread_base::initialize(void (*error_cb)())
{
if (attempts == umax)
{
g_tls_wait_time += utils::get_tsc() - stamp0;
g_tls_wait_time += rx::get_tsc() - stamp0;
}
else if (attempts > 1)
{
@ -2246,7 +2185,7 @@ void thread_base::set_name(std::string name)
};
// Set thread name for VS debugger
if (IsDebuggerPresent())
if (rx::isDebuggerPresent())
[&]() NEVER_INLINE
{
THREADNAME_INFO info;
@ -2527,7 +2466,7 @@ void thread_ctrl::wait_for(u64 usec, [[maybe_unused]] bool alert /* true */)
void thread_ctrl::wait_until(u64* wait_time, u64 add_time, u64 min_wait, bool update_to_current_time)
{
*wait_time = utils::add_saturate<u64>(*wait_time, add_time);
*wait_time = rx::add_saturate<u64>(*wait_time, add_time);
// TODO: Implement proper support for "waiting until" inside atomic wait engine
const u64 current_time = get_system_time();
@ -2546,7 +2485,7 @@ void thread_ctrl::wait_until(u64* wait_time, u64 add_time, u64 min_wait, bool up
if (min_wait)
{
*wait_time = std::max<u64>(*wait_time, utils::add_saturate<u64>(current_time, min_wait));
*wait_time = std::max<u64>(*wait_time, rx::add_saturate<u64>(current_time, min_wait));
}
wait_for(*wait_time - current_time);
@ -2588,7 +2527,7 @@ void thread_ctrl::wait_for_accurate(u64 usec)
}
else
{
busy_wait(100);
rx::busy_wait(100);
}
const auto current = std::chrono::steady_clock::now();
@ -2663,7 +2602,7 @@ bool thread_base::join(bool dtor) const
// Hacked for too sleepy threads (1ms) TODO: make sure it's unneeded and remove
const auto timeout = dtor && Emu.IsStopped() ? atomic_wait_timeout{1'000'000} : atomic_wait_timeout::inf;
auto stamp0 = utils::get_tsc();
auto stamp0 = rx::get_tsc();
for (u64 i = 0; (m_sync & 3) <= 1; i++)
{
@ -2676,7 +2615,7 @@ bool thread_base::join(bool dtor) const
if (i >= 16 && !(i & (i - 1)) && timeout != atomic_wait_timeout::inf)
{
sig_log.error("Thread [%s] is too sleepy. Waiting for it %.3fus already!", *m_tname.load(), (utils::get_tsc() - stamp0) / (utils::get_tsc_freq() / 1000000.));
sig_log.error("Thread [%s] is too sleepy. Waiting for it %.3fus already!", *m_tname.load(), (rx::get_tsc() - stamp0) / (utils::get_tsc_freq() / 1000000.));
}
}
@ -2764,7 +2703,7 @@ void thread_base::exec()
for (thread_future* prev{};;)
{
utils::prefetch_exec(prev_head->exec.load());
rx::prefetch_exec(prev_head->exec.load());
if (auto next = prev_head->next.get())
{
@ -2836,7 +2775,7 @@ void thread_base::exec()
logs::listener::sync_all();
if (IsDebuggerPresent())
if (rx::isDebuggerPresent())
{
// Prevent repeatedly halting the debugger in case multiple threads crashed at once
static atomic_t<u64> s_last_break = 0;
@ -2861,7 +2800,7 @@ void thread_base::exec()
})
.second)
{
utils::trap();
rx::breakpoint();
}
}

View file

@ -1,476 +0,0 @@
#pragma once
#include "util/types.hpp"
#include "util/tsc.hpp"
#include "util/atomic.hpp"
#include <functional>
extern bool g_use_rtm;
extern u64 g_rtm_tx_limit1;
#ifdef _M_X64
#ifdef _MSC_VER
extern "C"
{
u32 _xbegin();
void _xend();
void _mm_pause();
void _mm_prefetch(const char*, int);
void _m_prefetchw(const volatile void*);
uchar _rotl8(uchar, uchar);
ushort _rotl16(ushort, uchar);
u64 __popcnt64(u64);
s64 __mulh(s64, s64);
u64 __umulh(u64, u64);
s64 _div128(s64, s64, s64, s64*);
u64 _udiv128(u64, u64, u64, u64*);
void __debugbreak();
}
#include <intrin.h>
#else
#include <immintrin.h>
#endif
#endif
namespace utils
{
// Transaction helper (result = pair of success and op result, or just bool)
template <typename F, typename R = std::invoke_result_t<F>>
inline auto tx_start(F op)
{
#if defined(ARCH_X64)
uint status = -1;
for (auto stamp0 = get_tsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = get_tsc())
{
#ifndef _MSC_VER
__asm__ goto("xbegin %l[retry];" ::: "memory" : retry);
#else
status = _xbegin();
if (status != _XBEGIN_STARTED) [[unlikely]]
{
goto retry;
}
#endif
if constexpr (std::is_void_v<R>)
{
std::invoke(op);
#ifndef _MSC_VER
__asm__ volatile("xend;" ::: "memory");
#else
_xend();
#endif
return true;
}
else
{
auto result = std::invoke(op);
#ifndef _MSC_VER
__asm__ volatile("xend;" ::: "memory");
#else
_xend();
#endif
return std::make_pair(true, std::move(result));
}
retry:
#ifndef _MSC_VER
__asm__ volatile("movl %%eax, %0;" : "=r"(status)::"memory");
#endif
if (!status) [[unlikely]]
{
break;
}
}
#else
static_cast<void>(op);
#endif
if constexpr (std::is_void_v<R>)
{
return false;
}
else
{
return std::make_pair(false, R());
}
};
// Try to prefetch to Level 2 cache since it's not split to data/code on most processors
template <typename T>
constexpr void prefetch_exec(T func)
{
if (std::is_constant_evaluated())
{
return;
}
const u64 value = reinterpret_cast<u64>(func);
const void* ptr = reinterpret_cast<const void*>(value);
#ifdef _M_X64
return _mm_prefetch(static_cast<const char*>(ptr), _MM_HINT_T1);
#else
return __builtin_prefetch(ptr, 0, 2);
#endif
}
// Try to prefetch to Level 1 cache
constexpr void prefetch_read(const void* ptr)
{
if (std::is_constant_evaluated())
{
return;
}
#ifdef _M_X64
return _mm_prefetch(static_cast<const char*>(ptr), _MM_HINT_T0);
#else
return __builtin_prefetch(ptr, 0, 3);
#endif
}
constexpr void prefetch_write(void* ptr)
{
if (std::is_constant_evaluated())
{
return;
}
#if defined(_M_X64) && !defined(__clang__)
return _m_prefetchw(ptr);
#else
return __builtin_prefetch(ptr, 1, 0);
#endif
}
constexpr u8 rol8(u8 x, u8 n)
{
if (std::is_constant_evaluated())
{
return (x << (n & 7)) | (x >> ((-n & 7)));
}
#ifdef _MSC_VER
return _rotl8(x, n);
#elif defined(__clang__)
return __builtin_rotateleft8(x, n);
#elif defined(ARCH_X64)
return __builtin_ia32_rolqi(x, n);
#else
return (x << (n & 7)) | (x >> ((-n & 7)));
#endif
}
constexpr u16 rol16(u16 x, u16 n)
{
if (std::is_constant_evaluated())
{
return (x << (n & 15)) | (x >> ((-n & 15)));
}
#ifdef _MSC_VER
return _rotl16(x, static_cast<uchar>(n));
#elif defined(__clang__)
return __builtin_rotateleft16(x, n);
#elif defined(ARCH_X64)
return __builtin_ia32_rolhi(x, n);
#else
return (x << (n & 15)) | (x >> ((-n & 15)));
#endif
}
constexpr u32 rol32(u32 x, u32 n)
{
if (std::is_constant_evaluated())
{
return (x << (n & 31)) | (x >> (((0 - n) & 31)));
}
#ifdef _MSC_VER
return _rotl(x, n);
#elif defined(__clang__)
return __builtin_rotateleft32(x, n);
#else
return (x << (n & 31)) | (x >> (((0 - n) & 31)));
#endif
}
constexpr u64 rol64(u64 x, u64 n)
{
if (std::is_constant_evaluated())
{
return (x << (n & 63)) | (x >> (((0 - n) & 63)));
}
#ifdef _MSC_VER
return _rotl64(x, static_cast<int>(n));
#elif defined(__clang__)
return __builtin_rotateleft64(x, n);
#else
return (x << (n & 63)) | (x >> (((0 - n) & 63)));
#endif
}
constexpr u32 popcnt64(u64 v)
{
#if !defined(_MSC_VER) || defined(__SSE4_2__)
if (std::is_constant_evaluated())
#endif
{
v = (v & 0xaaaaaaaaaaaaaaaa) / 2 + (v & 0x5555555555555555);
v = (v & 0xcccccccccccccccc) / 4 + (v & 0x3333333333333333);
v = (v & 0xf0f0f0f0f0f0f0f0) / 16 + (v & 0x0f0f0f0f0f0f0f0f);
v = (v & 0xff00ff00ff00ff00) / 256 + (v & 0x00ff00ff00ff00ff);
v = ((v & 0xffff0000ffff0000) >> 16) + (v & 0x0000ffff0000ffff);
return static_cast<u32>((v >> 32) + v);
}
#if !defined(_MSC_VER) || defined(__SSE4_2__)
#ifdef _MSC_VER
return static_cast<u32>(__popcnt64(v));
#else
return __builtin_popcountll(v);
#endif
#endif
}
constexpr u32 popcnt128(const u128& v)
{
#ifdef _MSC_VER
return popcnt64(v.lo) + popcnt64(v.hi);
#else
return popcnt64(v) + popcnt64(v >> 64);
#endif
}
constexpr u64 umulh64(u64 x, u64 y)
{
#ifdef _MSC_VER
if (std::is_constant_evaluated())
#endif
{
return static_cast<u64>((u128{x} * u128{y}) >> 64);
}
#ifdef _MSC_VER
return __umulh(x, y);
#endif
}
inline s64 mulh64(s64 x, s64 y)
{
#ifdef _MSC_VER
return __mulh(x, y);
#else
return (s128{x} * s128{y}) >> 64;
#endif
}
inline s64 div128(s64 high, s64 low, s64 divisor, s64* remainder = nullptr)
{
#ifdef _MSC_VER
s64 rem = 0;
s64 r = _div128(high, low, divisor, &rem);
if (remainder)
{
*remainder = rem;
}
#else
const s128 x = (u128{static_cast<u64>(high)} << 64) | u64(low);
const s128 r = x / divisor;
if (remainder)
{
*remainder = x % divisor;
}
#endif
return r;
}
inline u64 udiv128(u64 high, u64 low, u64 divisor, u64* remainder = nullptr)
{
#ifdef _MSC_VER
u64 rem = 0;
u64 r = _udiv128(high, low, divisor, &rem);
if (remainder)
{
*remainder = rem;
}
#else
const u128 x = (u128{high} << 64) | low;
const u128 r = x / divisor;
if (remainder)
{
*remainder = x % divisor;
}
#endif
return r;
}
#ifdef _MSC_VER
inline u128 operator/(u128 lhs, u64 rhs)
{
u64 rem = 0;
return _udiv128(lhs.hi, lhs.lo, rhs, &rem);
}
#endif
constexpr u32 ctz128(u128 arg)
{
#ifdef _MSC_VER
if (!arg.lo)
return std::countr_zero(arg.hi) + 64u;
else
return std::countr_zero(arg.lo);
#else
if (u64 lo = static_cast<u64>(arg))
return std::countr_zero<u64>(lo);
else
return std::countr_zero<u64>(arg >> 64) + 64;
#endif
}
constexpr u32 clz128(u128 arg)
{
#ifdef _MSC_VER
if (arg.hi)
return std::countl_zero(arg.hi);
else
return std::countl_zero(arg.lo) + 64;
#else
if (u64 hi = static_cast<u64>(arg >> 64))
return std::countl_zero<u64>(hi);
else
return std::countl_zero<u64>(arg) + 64;
#endif
}
inline void pause()
{
#if defined(ARCH_ARM64)
__asm__ volatile("yield");
#elif defined(_M_X64)
_mm_pause();
#elif defined(ARCH_X64)
__builtin_ia32_pause();
#else
#error "Missing utils::pause() implementation"
#endif
}
// Synchronization helper (cache-friendly busy waiting)
inline void busy_wait(usz cycles = 3000)
{
const u64 stop = get_tsc() + cycles;
do
pause();
while (get_tsc() < stop);
}
// Align to power of 2
template <typename T, typename U>
requires std::is_unsigned_v<T>
constexpr std::make_unsigned_t<std::common_type_t<T, U>> align(T value, U align)
{
return static_cast<std::make_unsigned_t<std::common_type_t<T, U>>>((value + (align - 1)) & (T{0} - align));
}
// General purpose aligned division, the result is rounded up not truncated
template <typename T>
requires std::is_unsigned_v<T>
constexpr T aligned_div(T value, std::type_identity_t<T> align)
{
return static_cast<T>(value / align + T{!!(value % align)});
}
// General purpose aligned division, the result is rounded to nearest
template <typename T>
requires std::is_integral_v<T>
constexpr T rounded_div(T value, std::type_identity_t<T> align)
{
if constexpr (std::is_unsigned_v<T>)
{
return static_cast<T>(value / align + T{(value % align) > (align / 2)});
}
return static_cast<T>(value / align + (value > 0 ? T{(value % align) > (align / 2)} : 0 - T{(value % align) < (align / 2)}));
}
// Multiplying by ratio, semi-resistant to overflows
template <UnsignedInt T>
constexpr T rational_mul(T value, std::type_identity_t<T> numerator, std::type_identity_t<T> denominator)
{
if constexpr (sizeof(T) <= sizeof(u64) / 2)
{
return static_cast<T>(value * u64{numerator} / u64{denominator});
}
#if is_u128_emulated
if constexpr (sizeof(T) <= sizeof(u128) / 2)
{
return static_cast<T>(u128_from_mul(value, numerator) / u64{denominator});
}
#endif
return static_cast<T>(value / denominator * numerator + (value % denominator) * numerator / denominator);
}
template <UnsignedInt T>
constexpr T add_saturate(T addend1, T addend2)
{
return static_cast<T>(~addend1) < addend2 ? T{umax} : static_cast<T>(addend1 + addend2);
}
template <UnsignedInt T>
constexpr T sub_saturate(T minuend, T subtrahend)
{
return minuend < subtrahend ? T{0} : static_cast<T>(minuend - subtrahend);
}
template <UnsignedInt T>
constexpr T mul_saturate(T factor1, T factor2)
{
return factor1 > 0 && T{umax} / factor1 < factor2 ? T{umax} : static_cast<T>(factor1 * factor2);
}
inline void trigger_write_page_fault(void* ptr)
{
#if defined(ARCH_X64) && !defined(_MSC_VER)
__asm__ volatile("lock orl $0, 0(%0)" ::"r"(ptr));
#elif defined(ARCH_ARM64) && !defined(ANDROID)
u32 value = 0;
u32* u32_ptr = static_cast<u32*>(ptr);
__asm__ volatile("ldset %w0, %w0, %1" : "+r"(value), "=Q"(*u32_ptr) : "r"(value));
#else
*static_cast<atomic_t<u32>*>(ptr) += 0;
#endif
}
inline void trap()
{
#ifdef _M_X64
__debugbreak();
#elif defined(ARCH_X64)
__asm__ volatile("int3");
#elif defined(ARCH_ARM64)
__asm__ volatile("brk 0x42");
#else
#error "Missing utils::trap() implementation"
#endif
}
} // namespace utils
using utils::busy_wait;
#ifdef _MSC_VER
using utils::operator/;
#endif

View file

@ -50,9 +50,9 @@ static bool has_waitv()
#include <array>
#include <random>
#include "asm.hpp"
#include "rx/asm.hpp"
#include "endian.hpp"
#include "tsc.hpp"
#include "rx/tsc.hpp"
// Total number of entries.
static constexpr usz s_hashtable_size = 1u << 17;
@ -402,7 +402,7 @@ static u32 cond_alloc(uptr iptr, u32 tls_slot = -1)
constexpr u128 max_mask = dup8(8192);
// Leave only bits indicating sub-semaphore is full, find free one
const u32 pos = utils::ctz128(~val & max_mask);
const u32 pos = rx::ctz128(~val & max_mask);
if (pos == 128) [[unlikely]]
{
@ -422,7 +422,7 @@ static u32 cond_alloc(uptr iptr, u32 tls_slot = -1)
{
constexpr u128 max_mask = dup8(1024);
const u32 pos = utils::ctz128(~val & max_mask);
const u32 pos = rx::ctz128(~val & max_mask);
val += u128{1} << (pos / 11 * 11);
@ -433,7 +433,7 @@ static u32 cond_alloc(uptr iptr, u32 tls_slot = -1)
{
constexpr u128 max_mask = dup8(64) | (dup8(64) << 56);
const u32 pos = utils::ctz128(~val & max_mask);
const u32 pos = rx::ctz128(~val & max_mask);
val += u128{1} << (pos / 7 * 7);
@ -495,15 +495,15 @@ static void cond_free(u32 cond_id, u32 tls_slot = -1)
}
// Call the destructor if necessary
utils::prefetch_write(s_cond_bits + cond_id / 64);
rx::prefetch_write(s_cond_bits + cond_id / 64);
const u32 level3 = cond_id / 64 % 16;
const u32 level2 = cond_id / 1024 % 8;
const u32 level1 = cond_id / 8192 % 8;
utils::prefetch_write(s_cond_sem3 + level2);
utils::prefetch_write(s_cond_sem2 + level1);
utils::prefetch_write(&s_cond_sem1);
rx::prefetch_write(s_cond_sem3 + level2);
rx::prefetch_write(s_cond_sem2 + level1);
rx::prefetch_write(&s_cond_sem1);
cond->destroy();
@ -676,7 +676,7 @@ namespace
u64 utils::get_unique_tsc()
{
const u64 stamp0 = utils::get_tsc();
const u64 stamp0 = rx::get_tsc();
if (!s_min_tsc.fetch_op([=](u64& tsc)
{
@ -832,7 +832,7 @@ FORCE_INLINE auto root_info::slot_search(uptr iptr, F func) noexcept
{
if (u16 cond_id = _this->slots[std::countr_zero(bits)])
{
utils::prefetch_read(s_cond_list + cond_id);
rx::prefetch_read(s_cond_list + cond_id);
cond_ids[cond_count++] = cond_id;
}
}

View file

@ -9,7 +9,8 @@
#include "Emu/VFS.h"
#include "util/types.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#include <charconv>
#include <regex>
@ -972,10 +973,10 @@ static usz apply_modification(std::vector<u32>& applied, patch_engine::patch_inf
// Do not allow null address or if resultant ptr is not a VM ptr
if (const u32 alloc_at = (p.offset & -4096); alloc_at >> 16)
{
const u32 alloc_size = utils::align(static_cast<u32>(p.value.long_value) + alloc_at % 4096, 4096);
const u32 alloc_size = rx::alignUp(static_cast<u32>(p.value.long_value) + alloc_at % 4096, 4096);
// Allocate map if needed, if allocated flags will indicate that bit 62 is set (unique identifier)
auto alloc_map = vm::reserve_map(vm::any, alloc_at & -0x10000, utils::align(alloc_size, 0x10000), vm::page_size_64k | (1ull << 62));
auto alloc_map = vm::reserve_map(vm::any, alloc_at & -0x10000, rx::alignUp(alloc_size, 0x10000), vm::page_size_64k | (1ull << 62));
u64 flags = vm::alloc_unwritable;
@ -1106,7 +1107,7 @@ static usz apply_modification(std::vector<u32>& applied, patch_engine::patch_inf
}
case patch_type::c_utf8:
{
memory_size = utils::add_saturate<u32>(::size32(p.original_value), 1);
memory_size = rx::add_saturate<u32>(::size32(p.original_value), 1);
break;
}
case patch_type::move_file:
@ -1165,7 +1166,7 @@ static usz apply_modification(std::vector<u32>& applied, patch_engine::patch_inf
continue;
}
const u32 alloc_size = utils::align(static_cast<u32>(p.value.long_value + 1) * 4, 0x10000);
const u32 alloc_size = rx::alignUp(static_cast<u32>(p.value.long_value + 1) * 4, 0x10000);
// Check if should maybe reuse previous code cave allocation (0 size)
if (alloc_size - 4 != 0)

View file

@ -3,7 +3,7 @@
#include "util/types.hpp"
#include <string>
#include <algorithm>
#include "util/asm.hpp"
#include "rx/asm.hpp"
/*
C-style format parser. Appends formatted string to `out`, returns number of characters written.
@ -59,7 +59,7 @@ usz cfmt_append(Dst& out, const Char* fmt, Src&& src)
{
if constexpr (sizeof(value) == 16)
{
out.resize(out.size() + std::max<u64>(min_num, 129 / 3 - (utils::clz128(value | 1) + 1) / 3), '0');
out.resize(out.size() + std::max<u64>(min_num, 129 / 3 - (rx::clz128(value | 1) + 1) / 3), '0');
}
else
{
@ -77,7 +77,7 @@ usz cfmt_append(Dst& out, const Char* fmt, Src&& src)
{
if constexpr (sizeof(value) == 16)
{
out.resize(out.size() + std::max<u64>(min_num, 128 / 4 - utils::clz128(value | 1) / 4), '0');
out.resize(out.size() + std::max<u64>(min_num, 128 / 4 - rx::clz128(value | 1) / 4), '0');
}
else
{

View file

@ -7,7 +7,7 @@
#include <algorithm>
#ifdef _WIN32
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "windows.h"
#include "tlhelp32.h"
#ifdef _MSC_VER
@ -148,7 +148,7 @@ namespace utils
status = PdhGetFormattedCounterArray(m_cpu_cores, PDH_FMT_DOUBLE, &dwBufferSize, &dwItemCount, nullptr);
if (static_cast<PDH_STATUS>(PDH_MORE_DATA) == status)
{
std::vector<PDH_FMT_COUNTERVALUE_ITEM> items(utils::aligned_div(dwBufferSize, sizeof(PDH_FMT_COUNTERVALUE_ITEM)));
std::vector<PDH_FMT_COUNTERVALUE_ITEM> items(rx::aligned_div(dwBufferSize, sizeof(PDH_FMT_COUNTERVALUE_ITEM)));
if (items.size() >= dwItemCount)
{
status = PdhGetFormattedCounterArray(m_cpu_cores, PDH_FMT_DOUBLE, &dwBufferSize, &dwItemCount, items.data());

View file

@ -1,6 +1,6 @@
#include "mutex.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
void shared_mutex::imp_lock_shared(u32 val)
{
@ -26,7 +26,7 @@ void shared_mutex::imp_lock_shared(u32 val)
return;
}
busy_wait();
rx::busy_wait();
}
// Acquire writer lock and downgrade
@ -96,7 +96,7 @@ void shared_mutex::imp_lock(u32 val)
for (int i = 0; i < 10; i++)
{
busy_wait();
rx::busy_wait();
const u32 old = m_value;
@ -138,7 +138,7 @@ void shared_mutex::imp_lock_upgrade()
{
for (int i = 0; i < 10; i++)
{
busy_wait();
rx::busy_wait();
if (try_lock_upgrade())
{
@ -178,7 +178,7 @@ void shared_mutex::imp_lock_unlock()
_max = val / c_one;
busy_wait(1500);
rx::busy_wait(1500);
}
// Lock and unlock

View file

@ -1,12 +1,12 @@
#include "sema.h"
#include "util/asm.hpp"
#include "rx/asm.hpp"
void semaphore_base::imp_wait()
{
for (int i = 0; i < 10; i++)
{
busy_wait();
rx::busy_wait();
const u32 value = m_value.load();

View file

@ -1,6 +1,6 @@
#include "util/types.hpp"
#include "util/logs.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/sysinfo.hpp"
#include "util/endian.hpp"
#include "util/lockless.h"
@ -112,7 +112,7 @@ bool uncompressed_serialization_file_handler::handle_file_op(utils::serial& ar,
ar.data_offset = pos;
}
const usz read_pre_buffer = ar.data.empty() ? 0 : utils::sub_saturate<usz>(ar.data_offset, pos);
const usz read_pre_buffer = ar.data.empty() ? 0 : rx::sub_saturate<usz>(ar.data_offset, pos);
if (read_pre_buffer)
{
@ -128,8 +128,8 @@ bool uncompressed_serialization_file_handler::handle_file_op(utils::serial& ar,
// Adjustment to prevent overflow
const usz subtrahend = ar.data.empty() ? 0 : 1;
const usz read_past_buffer = utils::sub_saturate<usz>(pos + (size - subtrahend), ar.data_offset + (ar.data.size() - subtrahend));
const usz read_limit = utils::sub_saturate<usz>(ar.m_max_data, ar.data_offset);
const usz read_past_buffer = rx::sub_saturate<usz>(pos + (size - subtrahend), ar.data_offset + (ar.data.size() - subtrahend));
const usz read_limit = rx::sub_saturate<usz>(ar.m_max_data, ar.data_offset);
if (read_past_buffer)
{
@ -410,7 +410,7 @@ bool compressed_serialization_file_handler::handle_file_op(utils::serial& ar, us
// ar.seek_pos(pos);
// }
const usz read_pre_buffer = utils::sub_saturate<usz>(ar.data_offset, pos);
const usz read_pre_buffer = rx::sub_saturate<usz>(ar.data_offset, pos);
if (read_pre_buffer)
{
@ -421,8 +421,8 @@ bool compressed_serialization_file_handler::handle_file_op(utils::serial& ar, us
// Adjustment to prevent overflow
const usz subtrahend = ar.data.empty() ? 0 : 1;
const usz read_past_buffer = utils::sub_saturate<usz>(pos + (size - subtrahend), ar.data_offset + (ar.data.size() - subtrahend));
const usz read_limit = utils::sub_saturate<usz>(ar.m_max_data, ar.data_offset);
const usz read_past_buffer = rx::sub_saturate<usz>(pos + (size - subtrahend), ar.data_offset + (ar.data.size() - subtrahend));
const usz read_limit = rx::sub_saturate<usz>(ar.m_max_data, ar.data_offset);
if (read_past_buffer)
{
@ -506,7 +506,7 @@ usz compressed_serialization_file_handler::read_at(utils::serial& ar, usz read_p
m_stream_data_index = m_zs.avail_in ? m_zs.next_in - m_stream_data.data() : m_stream_data.size();
// Adjust again in case the values simply did not fit into uInt
m_zs.avail_out = adjust_for_uint(utils::sub_saturate<usz>(total_to_read, read_size));
m_zs.avail_out = adjust_for_uint(rx::sub_saturate<usz>(total_to_read, read_size));
m_zs.avail_in = adjust_for_uint(m_stream_data.size() - m_stream_data_index);
if (need_more_file_memory)
@ -779,7 +779,7 @@ usz compressed_serialization_file_handler::get_size(const utils::serial& ar, usz
return memory_available;
}
return std::max<usz>(utils::mul_saturate<usz>(m_file->size(), 6), memory_available);
return std::max<usz>(rx::mul_saturate<usz>(m_file->size(), 6), memory_available);
}
struct compressed_zstd_stream_data
@ -973,7 +973,7 @@ bool compressed_zstd_serialization_file_handler::handle_file_op(utils::serial& a
// ar.seek_pos(pos);
// }
const usz read_pre_buffer = utils::sub_saturate<usz>(ar.data_offset, pos);
const usz read_pre_buffer = rx::sub_saturate<usz>(ar.data_offset, pos);
if (read_pre_buffer)
{
@ -984,8 +984,8 @@ bool compressed_zstd_serialization_file_handler::handle_file_op(utils::serial& a
// Adjustment to prevent overflow
const usz subtrahend = ar.data.empty() ? 0 : 1;
const usz read_past_buffer = utils::sub_saturate<usz>(pos + (size - subtrahend), ar.data_offset + (ar.data.size() - subtrahend));
const usz read_limit = utils::sub_saturate<usz>(ar.m_max_data, ar.data_offset);
const usz read_past_buffer = rx::sub_saturate<usz>(pos + (size - subtrahend), ar.data_offset + (ar.data.size() - subtrahend));
const usz read_limit = rx::sub_saturate<usz>(ar.m_max_data, ar.data_offset);
if (read_past_buffer)
{
@ -1326,7 +1326,7 @@ usz compressed_zstd_serialization_file_handler::get_size(const utils::serial& ar
}
return recommended;
// return std::max<usz>(utils::mul_saturate<usz>(ZSTD_decompressBound(m_file->size()), 2), memory_available);
// return std::max<usz>(rx::mul_saturate<usz>(ZSTD_decompressBound(m_file->size()), 2), memory_available);
}
bool null_serialization_file_handler::handle_file_op(utils::serial&, usz, usz, const void*)

View file

@ -4,7 +4,7 @@
#include "util/types.hpp"
#include "util/v128.hpp"
#include "util/sysinfo.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/JIT.h"
#include <rx/simd.hpp>

View file

@ -18,14 +18,14 @@
#include <sys/resource.h>
#ifndef __APPLE__
#include <sys/utsname.h>
#include <errno.h>
#include <cerrno>
#endif
#endif
#include <thread>
#include <fstream>
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "util/fence.hpp"
#if defined(_M_X64) && defined(_MSC_VER)
@ -790,7 +790,7 @@ static constexpr ullong round_tsc(ullong val, ullong known_error)
known_error /= 10;
}
return utils::rounded_div(val, by) * by;
return rx::rounded_div(val, by) * by;
}
namespace utils
@ -898,7 +898,7 @@ static const bool s_tsc_freq_evaluated = []() -> bool
{
for (usz i = 0; i < retry_count; i++)
{
const u64 rdtsc_read = (utils::lfence(), utils::get_tsc());
const u64 rdtsc_read = (utils::lfence(), rx::get_tsc());
#ifdef _WIN32
LARGE_INTEGER ctr;
QueryPerformanceCounter(&ctr);
@ -906,7 +906,7 @@ static const bool s_tsc_freq_evaluated = []() -> bool
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
#endif
const u64 rdtsc_read2 = (utils::lfence(), utils::get_tsc());
const u64 rdtsc_read2 = (utils::lfence(), rx::get_tsc());
#ifdef _WIN32
const u64 timer_read = ctr.QuadPart - time_base;
@ -961,10 +961,10 @@ static const bool s_tsc_freq_evaluated = []() -> bool
const u128 data = u128_from_mul(rdtsc_data[1] - rdtsc_data[0], timer_freq);
const u64 res = utils::udiv128(static_cast<u64>(data >> 64), static_cast<u64>(data), (timer_data[1] - timer_data[0]));
const u64 res = rx::udiv128(static_cast<u64>(data >> 64), static_cast<u64>(data), (timer_data[1] - timer_data[0]));
// Rounding
return round_tsc(res, utils::mul_saturate<u64>(utils::add_saturate<u64>(rdtsc_diff[0], rdtsc_diff[1]), utils::aligned_div(timer_freq, timer_data[1] - timer_data[0])));
return round_tsc(res, rx::mul_saturate<u64>(rx::add_saturate<u64>(rdtsc_diff[0], rdtsc_diff[1]), rx::aligned_div(timer_freq, timer_data[1] - timer_data[0])));
}();
atomic_storage<u64>::store(utils::s_tsc_freq, cal_tsc);

View file

@ -1,29 +0,0 @@
#pragma once
#include "util/types.hpp"
#ifdef _M_X64
#ifdef _MSC_VER
extern "C" u64 __rdtsc();
#else
#include <immintrin.h>
#endif
#endif
namespace utils
{
inline u64 get_tsc()
{
#if defined(ARCH_ARM64)
u64 r = 0;
__asm__ volatile("mrs %0, cntvct_el0" : "=r"(r));
return r;
#elif defined(_M_X64)
return __rdtsc();
#elif defined(ARCH_X64)
return __builtin_ia32_rdtsc();
#else
#error "Missing utils::get_tsc() implementation"
#endif
}
} // namespace utils

View file

@ -1,6 +1,8 @@
#include "stdafx.h"
#include "util/vm.hpp"
#include "util/asm.hpp"
#include "rx/asm.hpp"
#include "rx/align.hpp"
#ifdef _WIN32
#include "util/File.h"
#include "util/dyn_lib.hpp"
@ -492,7 +494,7 @@ namespace utils
}
shm::shm(u64 size, u32 flags)
: m_flags(flags), m_size(utils::align(size, 0x10000))
: m_flags(flags), m_size(rx::alignUp(size, 0x10000))
{
#ifdef _WIN32
const ULARGE_INTEGER max_size{.QuadPart = m_size};
@ -535,7 +537,7 @@ namespace utils
}
shm::shm(u64 size, const std::string& storage)
: m_size(utils::align(size, 0x10000))
: m_size(rx::alignUp(size, 0x10000))
{
#ifdef _WIN32
fs::file f;
@ -857,7 +859,7 @@ namespace utils
{
const u64 res64 = reinterpret_cast<u64>(::mmap(reinterpret_cast<void*>(ptr64), m_size + 0xf000, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0));
const u64 aligned = utils::align(res64, 0x10000);
const u64 aligned = rx::alignUp(res64, 0x10000);
const auto result = ::mmap(reinterpret_cast<void*>(aligned), m_size, +prot, (cow ? MAP_PRIVATE : MAP_SHARED) | MAP_FIXED, m_file, 0);
// Now cleanup remnants