#pragma once // No BOM and only basic ASCII in this header, or a neko will die

#include "util/types.hpp"

#include <functional> // std::invoke
#include <bit>        // std::bit_cast

#ifndef _MSC_VER
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif

#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4996)

extern "C"
{
	void _ReadWriteBarrier();
	void* _AddressOfReturnAddress();

	uchar _bittest(const long*, long);
	uchar _interlockedbittestandset(volatile long*, long);
	uchar _interlockedbittestandreset(volatile long*, long);

	char _InterlockedCompareExchange8(volatile char*, char, char);
	char _InterlockedExchange8(volatile char*, char);
	char _InterlockedExchangeAdd8(volatile char*, char);
	char _InterlockedAnd8(volatile char*, char);
	char _InterlockedOr8(volatile char*, char);
	char _InterlockedXor8(volatile char*, char);

	short _InterlockedCompareExchange16(volatile short*, short, short);
	short _InterlockedExchange16(volatile short*, short);
	short _InterlockedExchangeAdd16(volatile short*, short);
	short _InterlockedAnd16(volatile short*, short);
	short _InterlockedOr16(volatile short*, short);
	short _InterlockedXor16(volatile short*, short);
	short _InterlockedIncrement16(volatile short*);
	short _InterlockedDecrement16(volatile short*);

	long _InterlockedCompareExchange(volatile long*, long, long);
	long _InterlockedCompareExchange_HLEAcquire(volatile long*, long, long);
	long _InterlockedExchange(volatile long*, long);
	long _InterlockedExchangeAdd(volatile long*, long);
	long _InterlockedExchangeAdd_HLERelease(volatile long*, long);
	long _InterlockedAnd(volatile long*, long);
	long _InterlockedOr(volatile long*, long);
	long _InterlockedXor(volatile long*, long);
	long _InterlockedIncrement(volatile long*);
	long _InterlockedDecrement(volatile long*);

	s64 _InterlockedCompareExchange64(volatile s64*, s64, s64);
	s64 _InterlockedCompareExchange64_HLEAcquire(volatile s64*, s64, s64);
	s64 _InterlockedExchange64(volatile s64*, s64);
	s64 _InterlockedExchangeAdd64(volatile s64*, s64);
	s64 _InterlockedExchangeAdd64_HLERelease(volatile s64*, s64);
	s64 _InterlockedAnd64(volatile s64*, s64);
	s64 _InterlockedOr64(volatile s64*, s64);
	s64 _InterlockedXor64(volatile s64*, s64);
	s64 _InterlockedIncrement64(volatile s64*);
	s64 _InterlockedDecrement64(volatile s64*);

	uchar _InterlockedCompareExchange128(volatile s64*, s64, s64, s64*);
}

namespace utils
{
	u128 __vectorcall atomic_load16(const void*);
	void __vectorcall atomic_store16(void*, u128);
}
#endif

FORCE_INLINE void atomic_fence_consume()
{
#if defined(_M_X64) && defined(_MSC_VER)
	_ReadWriteBarrier();
#else
	__atomic_thread_fence(__ATOMIC_CONSUME);
#endif
}

FORCE_INLINE void atomic_fence_acquire()
{
#if defined(_M_X64) && defined(_MSC_VER)
	_ReadWriteBarrier();
#else
	__atomic_thread_fence(__ATOMIC_ACQUIRE);
#endif
}

FORCE_INLINE void atomic_fence_release()
{
#if defined(_M_X64) && defined(_MSC_VER)
	_ReadWriteBarrier();
#else
	__atomic_thread_fence(__ATOMIC_RELEASE);
#endif
}

FORCE_INLINE void atomic_fence_acq_rel()
{
#if defined(_M_X64) && defined(_MSC_VER)
	_ReadWriteBarrier();
#else
	__atomic_thread_fence(__ATOMIC_ACQ_REL);
#endif
}

FORCE_INLINE void atomic_fence_seq_cst()
{
#if defined(_M_X64) && defined(_MSC_VER)
	_ReadWriteBarrier();
	_InterlockedOr(static_cast<long*>(_AddressOfReturnAddress()), 0);
	_ReadWriteBarrier();
#elif defined(ARCH_X64)
	__asm__ volatile ("lock orl $0, 0(%%rsp);" ::: "cc", "memory");
#else
	__atomic_thread_fence(__ATOMIC_SEQ_CST);
#endif
}

#if defined(_M_X64) && defined(_MSC_VER)
#pragma warning(pop)
#endif

// Wait timeout extension (in nanoseconds)
enum class
atomic_wait_timeout : u64
{
	inf = 0xffffffffffffffff,
};

template <typename T>
class lf_queue;

namespace stx
{
	template <typename T>
	class atomic_ptr;
}

// Various extensions for atomic_t::wait
namespace atomic_wait
{
	// Max number of simultaneous atomic variables to wait on (can be extended if really necessary)
	constexpr uint max_list = 8;

	constexpr struct any_value_t
	{
		template <typename T>
		operator T() const noexcept
		{
			return T();
		}
	} any_value;

	struct info
	{
		const void* data;
		u32 old;
	};

	template <uint Max>
	class list
	{
		static_assert(Max <= max_list, "Too many elements in the atomic wait list.");

		// Null-terminated list of wait info
		info m_info[Max + 1]{};

	public:
		constexpr list() noexcept = default;

		constexpr list(const list&) noexcept = default;

		constexpr list& operator=(const list&) noexcept = default;

		template <typename... U, typename = std::void_t<decltype(std::declval<U>().wait(any_value))...>>
		constexpr list(U&... vars)
			: m_info{{&vars, 0}...}
		{
			static_assert(sizeof...(U) == Max, "Inconsistent amount of atomics.");
		}

		template <typename... U>
		constexpr list& values(U... values)
		{
			static_assert(sizeof...(U) == Max, "Inconsistent amount of values.");

			auto* ptr = m_info;
			(((ptr->old = std::bit_cast<u32>(values)), ptr++), ...);
			return *this;
		}

		template <uint Index, typename T2, typename U, typename = std::void_t<decltype(std::declval<T2>().wait(any_value))>>
		constexpr void set(T2& var, U value)
		{
			static_assert(Index < Max);

			m_info[Index].data = &var;
			m_info[Index].old = std::bit_cast<u32>(value);
		}

		template <uint Index, typename T2>
		constexpr void set(lf_queue<T2>& var, std::nullptr_t = nullptr)
		{
			static_assert(Index < Max);
			static_assert(sizeof(var) == sizeof(uptr));

			m_info[Index].data = reinterpret_cast<char*>(&var) + sizeof(u32);
			m_info[Index].old = 0;
		}

		template <uint Index, typename T2>
		constexpr void set(stx::atomic_ptr<T2>& var, std::nullptr_t = nullptr)
		{
			static_assert(Index < Max);
			static_assert(sizeof(var) == sizeof(uptr));

			m_info[Index].data = reinterpret_cast<char*>(&var) + sizeof(u32);
			m_info[Index].old = 0;
		}

		// Timeout is discouraged
		void wait(atomic_wait_timeout timeout = atomic_wait_timeout::inf);

		// Same as wait
		void start()
		{
			wait();
		}
	};

	template <typename... T, typename = std::void_t<decltype(std::declval<T>().wait(any_value))...>>
	list(T&...
vars) -> list; } namespace utils { // RDTSC with adjustment for being unique u64 get_unique_tsc(); } // Helper for waitable atomics (as in C++20 std::atomic) struct atomic_wait_engine { private: template friend class atomic_t; template friend class atomic_wait::list; static void wait(const void* data, u32 old_value, u64 timeout, atomic_wait::info* ext = nullptr); public: static void notify_one(const void* data); static void notify_all(const void* data); static void set_wait_callback(bool(*cb)(const void* data, u64 attempts, u64 stamp0)); static void set_notify_callback(void(*cb)(const void* data, u64 progress)); static void set_one_time_use_wait_callback(bool (*cb)(u64 progress)); }; template void atomic_wait::list::wait(atomic_wait_timeout timeout) { static_assert(!!Max, "Cannot initiate atomic wait with empty list."); atomic_wait_engine::wait(m_info[0].data, m_info[0].old, static_cast(timeout), m_info + 1); } // Helper class, provides access to compiler-specific atomic intrinsics template struct atomic_storage { /* First part: Non-MSVC intrinsics */ using type = get_uint_t; #if !defined(_MSC_VER) || !defined(_M_X64) #if defined(__ATOMIC_HLE_ACQUIRE) && defined(__ATOMIC_HLE_RELEASE) static constexpr int s_hle_ack = __ATOMIC_SEQ_CST | __ATOMIC_HLE_ACQUIRE; static constexpr int s_hle_rel = __ATOMIC_SEQ_CST | __ATOMIC_HLE_RELEASE; #else static constexpr int s_hle_ack = __ATOMIC_SEQ_CST; static constexpr int s_hle_rel = __ATOMIC_SEQ_CST; #endif // clang often thinks atomics are misaligned, GCC doesn't like reinterpret_cast for breaking strict aliasing #ifdef __clang__ #define MAYBE_CAST(...) (reinterpret_cast(__VA_ARGS__)) #else #define MAYBE_CAST(...) (__VA_ARGS__) #endif static inline bool compare_exchange(T& dest, T& comp, T exch) { return __atomic_compare_exchange(MAYBE_CAST(&dest), MAYBE_CAST(&comp), MAYBE_CAST(&exch), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } static inline bool compare_exchange_hle_acq(T& dest, T& comp, T exch) { static_assert(sizeof(T) == 4 || sizeof(T) == 8); return __atomic_compare_exchange(MAYBE_CAST(&dest), MAYBE_CAST(&comp), MAYBE_CAST(&exch), false, s_hle_ack, s_hle_ack); } static inline T load(const T& dest) { #ifdef __clang__ type result; __atomic_load(reinterpret_cast(&dest), MAYBE_CAST(&result), __ATOMIC_SEQ_CST); return std::bit_cast(result); #else alignas(sizeof(T)) T result; __atomic_load(&dest, &result, __ATOMIC_SEQ_CST); return result; #endif } static inline T observe(const T& dest) { #ifdef __clang__ type result; __atomic_load(reinterpret_cast(&dest), MAYBE_CAST(&result), __ATOMIC_RELAXED); return std::bit_cast(result); #else alignas(sizeof(T)) T result; __atomic_load(&dest, &result, __ATOMIC_RELAXED); return result; #endif } static inline void store(T& dest, T value) { static_cast(exchange(dest, value)); } static inline void release(T& dest, T value) { __atomic_store(MAYBE_CAST(&dest), MAYBE_CAST(&value), __ATOMIC_RELEASE); } static inline T exchange(T& dest, T value) { alignas(sizeof(T)) T result; __atomic_exchange(MAYBE_CAST(&dest), MAYBE_CAST(&value), MAYBE_CAST(&result), __ATOMIC_SEQ_CST); return result; } static inline T fetch_add(T& dest, T value) { return __atomic_fetch_add(&dest, value, __ATOMIC_SEQ_CST); } static inline T fetch_add_hle_rel(T& dest, T value) { static_assert(sizeof(T) == 4 || sizeof(T) == 8); return __atomic_fetch_add(&dest, value, s_hle_rel); } static inline T add_fetch(T& dest, T value) { return __atomic_add_fetch(&dest, value, __ATOMIC_SEQ_CST); } static inline T fetch_sub(T& dest, T value) { return 
__atomic_fetch_sub(&dest, value, __ATOMIC_SEQ_CST); } static inline T sub_fetch(T& dest, T value) { return __atomic_sub_fetch(&dest, value, __ATOMIC_SEQ_CST); } static inline T fetch_and(T& dest, T value) { return __atomic_fetch_and(&dest, value, __ATOMIC_SEQ_CST); } static inline T and_fetch(T& dest, T value) { return __atomic_and_fetch(&dest, value, __ATOMIC_SEQ_CST); } static inline T fetch_xor(T& dest, T value) { return __atomic_fetch_xor(&dest, value, __ATOMIC_SEQ_CST); } static inline T xor_fetch(T& dest, T value) { return __atomic_xor_fetch(&dest, value, __ATOMIC_SEQ_CST); } static inline T fetch_or(T& dest, T value) { return __atomic_fetch_or(&dest, value, __ATOMIC_SEQ_CST); } static inline T or_fetch(T& dest, T value) { return __atomic_or_fetch(&dest, value, __ATOMIC_SEQ_CST); } #endif /* Second part: MSVC-specific */ #if defined(_M_X64) && defined(_MSC_VER) static inline T add_fetch(T& dest, T value) { return atomic_storage::fetch_add(dest, value) + value; } static inline T fetch_sub(T& dest, T value) { return atomic_storage::fetch_add(dest, 0 - value); } static inline T sub_fetch(T& dest, T value) { return atomic_storage::fetch_add(dest, 0 - value) - value; } static inline T and_fetch(T& dest, T value) { return atomic_storage::fetch_and(dest, value) & value; } static inline T or_fetch(T& dest, T value) { return atomic_storage::fetch_or(dest, value) | value; } static inline T xor_fetch(T& dest, T value) { return atomic_storage::fetch_xor(dest, value) ^ value; } #undef MAYBE_CAST #endif /* Third part: fallbacks, may be hidden by subsequent atomic_storage<> specializations */ static inline T fetch_inc(T& dest) { return atomic_storage::fetch_add(dest, 1); } static inline T inc_fetch(T& dest) { return atomic_storage::add_fetch(dest, 1); } static inline T fetch_dec(T& dest) { return atomic_storage::fetch_sub(dest, 1); } static inline T dec_fetch(T& dest) { return atomic_storage::sub_fetch(dest, 1); } static inline bool bts(T& dest, uint bit) { #if defined(ARCH_X64) uchar* dst = reinterpret_cast(&dest); if constexpr (sizeof(T) < 4) { const uptr ptr = reinterpret_cast(dst); // Align the bit up and pointer down bit = bit + (ptr & 3) * 8; dst = reinterpret_cast(ptr & -4); } #endif #if defined(_M_X64) && defined(_MSC_VER) return _interlockedbittestandset((long*)dst, bit) != 0; #elif defined(ARCH_X64) bool result; __asm__ volatile ("lock btsl %2, 0(%1)\n" : "=@ccc" (result) : "r" (dst), "Ir" (bit) : "cc", "memory"); return result; #else const T value = static_cast(1) << bit; return (__atomic_fetch_or(&dest, value, __ATOMIC_SEQ_CST) & value) != 0; #endif } static inline bool btr(T& dest, uint bit) { #if defined(ARCH_X64) uchar* dst = reinterpret_cast(&dest); if constexpr (sizeof(T) < 4) { const uptr ptr = reinterpret_cast(dst); // Align the bit up and pointer down bit = bit + (ptr & 3) * 8; dst = reinterpret_cast(ptr & -4); } #endif #if defined(_M_X64) && defined(_MSC_VER) return _interlockedbittestandreset((long*)dst, bit) != 0; #elif defined(ARCH_X64) bool result; __asm__ volatile ("lock btrl %2, 0(%1)\n" : "=@ccc" (result) : "r" (dst), "Ir" (bit) : "cc", "memory"); return result; #else const T value = static_cast(1) << bit; return (__atomic_fetch_and(&dest, ~value, __ATOMIC_SEQ_CST) & value) != 0; #endif } static inline bool btc(T& dest, uint bit) { #if defined(ARCH_X64) uchar* dst = reinterpret_cast(&dest); if constexpr (sizeof(T) < 4) { const uptr ptr = reinterpret_cast(dst); // Align the bit up and pointer down bit = bit + (ptr & 3) * 8; dst = reinterpret_cast(ptr & -4); } #endif #if 
defined(_M_X64) && defined(_MSC_VER) while (true) { // Keep trying until we actually invert desired bit if (!_bittest((long*)dst, bit) && !_interlockedbittestandset((long*)dst, bit)) return false; if (_interlockedbittestandreset((long*)dst, bit)) return true; } #elif defined(ARCH_X64) bool result; __asm__ volatile ("lock btcl %2, 0(%1)\n" : "=@ccc" (result) : "r" (dst), "Ir" (bit) : "cc", "memory"); return result; #else const T value = static_cast(1) << bit; return (__atomic_fetch_xor(&dest, value, __ATOMIC_SEQ_CST) & value) != 0; #endif } }; /* The rest: ugly MSVC intrinsics + inline asm implementations */ template struct atomic_storage : atomic_storage { #if defined(_M_X64) && defined(_MSC_VER) static inline bool compare_exchange(T& dest, T& comp, T exch) { const char v = std::bit_cast(comp); const char r = _InterlockedCompareExchange8(reinterpret_cast(&dest), std::bit_cast(exch), v); comp = std::bit_cast(r); return r == v; } static inline T load(const T& dest) { atomic_fence_acquire(); const char value = *reinterpret_cast(&dest); atomic_fence_acquire(); return std::bit_cast(value); } static inline T observe(const T& dest) { const char value = *reinterpret_cast(&dest); return std::bit_cast(value); } static inline void release(T& dest, T value) { atomic_fence_release(); *reinterpret_cast(&dest) = std::bit_cast(value); atomic_fence_release(); } static inline T exchange(T& dest, T value) { const char r = _InterlockedExchange8(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline void store(T& dest, T value) { exchange(dest, value); } static inline T fetch_add(T& dest, T value) { const char r = _InterlockedExchangeAdd8(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_and(T& dest, T value) { const char r = _InterlockedAnd8(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_or(T& dest, T value) { const char r = _InterlockedOr8(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_xor(T& dest, T value) { const char r = _InterlockedXor8(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } #endif }; template struct atomic_storage : atomic_storage { #if defined(_M_X64) && defined(_MSC_VER) static inline bool compare_exchange(T& dest, T& comp, T exch) { const short v = std::bit_cast(comp); const short r = _InterlockedCompareExchange16(reinterpret_cast(&dest), std::bit_cast(exch), v); comp = std::bit_cast(r); return r == v; } static inline T load(const T& dest) { atomic_fence_acquire(); const short value = *reinterpret_cast(&dest); atomic_fence_acquire(); return std::bit_cast(value); } static inline T observe(const T& dest) { const short value = *reinterpret_cast(&dest); return std::bit_cast(value); } static inline void release(T& dest, T value) { atomic_fence_release(); *reinterpret_cast(&dest) = std::bit_cast(value); atomic_fence_release(); } static inline T exchange(T& dest, T value) { const short r = _InterlockedExchange16(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline void store(T& dest, T value) { exchange(dest, value); } static inline T fetch_add(T& dest, T value) { const short r = _InterlockedExchangeAdd16(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_and(T& dest, T value) { const short r = _InterlockedAnd16(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_or(T& 
dest, T value) { const short r = _InterlockedOr16(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_xor(T& dest, T value) { const short r = _InterlockedXor16(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T inc_fetch(T& dest) { const short r = _InterlockedIncrement16(reinterpret_cast(&dest)); return std::bit_cast(r); } static inline T dec_fetch(T& dest) { const short r = _InterlockedDecrement16(reinterpret_cast(&dest)); return std::bit_cast(r); } #endif }; template struct atomic_storage : atomic_storage { #if defined(_M_X64) && defined(_MSC_VER) static inline bool compare_exchange(T& dest, T& comp, T exch) { const long v = std::bit_cast(comp); const long r = _InterlockedCompareExchange(reinterpret_cast(&dest), std::bit_cast(exch), v); comp = std::bit_cast(r); return r == v; } static inline bool compare_exchange_hle_acq(T& dest, T& comp, T exch) { const long v = std::bit_cast(comp); const long r = _InterlockedCompareExchange_HLEAcquire(reinterpret_cast(&dest), std::bit_cast(exch), v); comp = std::bit_cast(r); return r == v; } static inline T load(const T& dest) { atomic_fence_acquire(); const long value = *reinterpret_cast(&dest); atomic_fence_acquire(); return std::bit_cast(value); } static inline T observe(const T& dest) { const long value = *reinterpret_cast(&dest); return std::bit_cast(value); } static inline void release(T& dest, T value) { atomic_fence_release(); *reinterpret_cast(&dest) = std::bit_cast(value); atomic_fence_release(); } static inline T exchange(T& dest, T value) { const long r = _InterlockedExchange(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline void store(T& dest, T value) { exchange(dest, value); } static inline T fetch_add(T& dest, T value) { const long r = _InterlockedExchangeAdd(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_add_hle_rel(T& dest, T value) { const long r = _InterlockedExchangeAdd_HLERelease(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_and(T& dest, T value) { long r = _InterlockedAnd(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_or(T& dest, T value) { const long r = _InterlockedOr(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_xor(T& dest, T value) { const long r = _InterlockedXor(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T inc_fetch(T& dest) { const long r = _InterlockedIncrement(reinterpret_cast(&dest)); return std::bit_cast(r); } static inline T dec_fetch(T& dest) { const long r = _InterlockedDecrement(reinterpret_cast(&dest)); return std::bit_cast(r); } #endif }; template struct atomic_storage : atomic_storage { #if defined(_M_X64) && defined(_MSC_VER) static inline bool compare_exchange(T& dest, T& comp, T exch) { const llong v = std::bit_cast(comp); const llong r = _InterlockedCompareExchange64(reinterpret_cast(&dest), std::bit_cast(exch), v); comp = std::bit_cast(r); return r == v; } static inline bool compare_exchange_hle_acq(T& dest, T& comp, T exch) { const llong v = std::bit_cast(comp); const llong r = _InterlockedCompareExchange64_HLEAcquire(reinterpret_cast(&dest), std::bit_cast(exch), v); comp = std::bit_cast(r); return r == v; } static inline T load(const T& dest) { atomic_fence_acquire(); const llong value = *reinterpret_cast(&dest); atomic_fence_acquire(); 
return std::bit_cast(value); } static inline T observe(const T& dest) { const llong value = *reinterpret_cast(&dest); return std::bit_cast(value); } static inline void release(T& dest, T value) { atomic_fence_release(); *reinterpret_cast(&dest) = std::bit_cast(value); atomic_fence_release(); } static inline T exchange(T& dest, T value) { const llong r = _InterlockedExchange64(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline void store(T& dest, T value) { exchange(dest, value); } static inline T fetch_add(T& dest, T value) { const llong r = _InterlockedExchangeAdd64(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_add_hle_rel(T& dest, T value) { const llong r = _InterlockedExchangeAdd64_HLERelease(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_and(T& dest, T value) { const llong r = _InterlockedAnd64(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_or(T& dest, T value) { const llong r = _InterlockedOr64(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T fetch_xor(T& dest, T value) { const llong r = _InterlockedXor64(reinterpret_cast(&dest), std::bit_cast(value)); return std::bit_cast(r); } static inline T inc_fetch(T& dest) { const llong r = _InterlockedIncrement64(reinterpret_cast(&dest)); return std::bit_cast(r); } static inline T dec_fetch(T& dest) { const llong r = _InterlockedDecrement64(reinterpret_cast(&dest)); return std::bit_cast(r); } #endif }; template struct atomic_storage : atomic_storage { #if defined(_M_X64) && defined(_MSC_VER) static inline T load(const T& dest) { atomic_fence_acquire(); u128 val = utils::atomic_load16(&dest); atomic_fence_acquire(); return std::bit_cast(val); } static inline T observe(const T& dest) { return load(dest); } static inline bool compare_exchange(T& dest, T& comp, T exch) { struct alignas(16) llong2 { llong ll[2]; }; const llong2 _exch = std::bit_cast(exch); return _InterlockedCompareExchange128(reinterpret_cast(&dest), _exch.ll[1], _exch.ll[0], reinterpret_cast(&comp)) != 0; } static inline T exchange(T& dest, T value) { struct alignas(16) llong2 { llong ll[2]; }; const llong2 _value = std::bit_cast(value); const auto llptr = reinterpret_cast(&dest); llong2 cmp{ llptr[0], llptr[1] }; while (!_InterlockedCompareExchange128(llptr, _value.ll[1], _value.ll[0], cmp.ll)); return std::bit_cast(cmp); } static inline void store(T& dest, T value) { atomic_fence_acq_rel(); release(dest, value); atomic_fence_seq_cst(); } static inline void release(T& dest, T value) { atomic_fence_release(); utils::atomic_store16(&dest, std::bit_cast(value)); atomic_fence_release(); } #elif defined(ARCH_X64) static inline T load(const T& dest) { alignas(16) T r; #ifdef __AVX__ __asm__ volatile("vmovdqa %1, %0;" : "=x" (r) : "m" (dest) : "memory"); #else __asm__ volatile("movdqa %1, %0;" : "=x" (r) : "m" (dest) : "memory"); #endif return r; } static inline T observe(const T& dest) { return load(dest); } static inline bool compare_exchange(T& dest, T& comp, T exch) { bool result; ullong cmp_lo = 0; ullong cmp_hi = 0; ullong exc_lo = 0; ullong exc_hi = 0; if constexpr (std::is_same_v || std::is_same_v) { cmp_lo = comp; cmp_hi = comp >> 64; exc_lo = exch; exc_hi = exch >> 64; } else { std::memcpy(&cmp_lo, reinterpret_cast(&comp) + 0, 8); std::memcpy(&cmp_hi, reinterpret_cast(&comp) + 8, 8); std::memcpy(&exc_lo, reinterpret_cast(&exch) + 0, 8); 
std::memcpy(&exc_hi, reinterpret_cast(&exch) + 8, 8); } __asm__ volatile("lock cmpxchg16b %1;" : "=@ccz" (result) , "+m" (dest) , "+d" (cmp_hi) , "+a" (cmp_lo) : "c" (exc_hi) , "b" (exc_lo) : "cc"); if constexpr (std::is_same_v || std::is_same_v) { comp = T{cmp_hi} << 64 | cmp_lo; } else { std::memcpy(reinterpret_cast(&comp) + 0, &cmp_lo, 8); std::memcpy(reinterpret_cast(&comp) + 8, &cmp_hi, 8); } return result; } static inline T exchange(T& dest, T value) { __atomic_thread_fence(__ATOMIC_ACQ_REL); return std::bit_cast(__sync_lock_test_and_set(reinterpret_cast(&dest), std::bit_cast(value))); } static inline void store(T& dest, T value) { release(dest, value); atomic_fence_seq_cst(); } static inline void release(T& dest, T value) { u128 val = std::bit_cast(value); #ifdef __AVX__ __asm__ volatile("vmovdqa %0, %1;" :: "x" (val), "m" (dest) : "memory"); #else __asm__ volatile("movdqa %0, %1;" :: "x" (val), "m" (dest) : "memory"); #endif } #elif defined(ARCH_ARM64) static inline T load(const T& dest) { #if defined(ARM_FEATURE_LSE2) u64 data[2]; __asm__ volatile("1:\n" "ldp %x[data0], %x[data1], %[dest]\n" "dmb ish\n" : [data0] "=r"(data[0]), [data1] "=r"(data[1]) : [dest] "Q"(dest) : "memory"); T result; std::memcpy(&result, data, 16); return result; #else u32 tmp; u64 data[2]; __asm__ volatile("1:\n" "ldaxp %x[data0], %x[data1], %[dest]\n" "stlxp %w[tmp], %x[data0], %x[data1], %[dest]\n" "cbnz %w[tmp], 1b\n" : [tmp] "=&r" (tmp), [data0] "=&r" (data[0]), [data1] "=&r" (data[1]) : [dest] "Q" (dest) : "memory" ); T result; std::memcpy(&result, data, 16); return result; #endif } static inline T observe(const T& dest) { // TODO return load(dest); } static inline bool compare_exchange(T& dest, T& comp, T exch) { bool result; u64 cmp[2]; std::memcpy(cmp, &comp, 16); u64 data[2]; std::memcpy(data, &exch, 16); u64 prev[2]; __asm__ volatile("1:\n" "ldaxp %x[prev0], %x[prev1], %[storage]\n" "cmp %x[prev0], %x[cmp0]\n" "ccmp %x[prev1], %x[cmp1], #0, eq\n" "b.ne 2f\n" "stlxp %w[result], %x[data0], %x[data1], %[storage]\n" "cbnz %w[result], 1b\n" "2:\n" "cset %w[result], eq\n" : [result] "=&r" (result), [storage] "+Q" (dest), [prev0] "=&r" (prev[0]), [prev1] "=&r" (prev[1]) : [data0] "r" (data[0]), [data1] "r" (data[1]), [cmp0] "r" (cmp[0]), [cmp1] "r" (cmp[1]) : "cc", "memory" ); if (result) { return true; } std::memcpy(&comp, prev, 16); return false; } static inline T exchange(T& dest, T value) { u32 tmp; u64 src[2]; u64 data[2]; std::memcpy(src, &value, 16); __asm__ volatile("1:\n" "ldaxp %x[data0], %x[data1], %[dest]\n" "stlxp %w[tmp], %x[src0], %x[src1], %[dest]\n" "cbnz %w[tmp], 1b\n" : [tmp] "=&r" (tmp), [dest] "+Q" (dest), [data0] "=&r" (data[0]), [data1] "=&r" (data[1]) : [src0] "r" (src[0]), [src1] "r" (src[1]) : "memory" ); T result; std::memcpy(&result, data, 16); return result; } static inline void store(T& dest, T value) { // TODO #if defined(ARM_FEATURE_LSE2) u64 src[2]; std::memcpy(src, &value, 16); __asm__ volatile("1:\n" "dmb ish\n" "stp %x[data0], %x[data1], %[dest]\n" "dmb ish\n" : [dest] "=Q" (dest) : [data0] "r" (src[0]), [data1] "r" (src[1]) : "memory" ); #else exchange(dest, value); #endif } static inline void release(T& dest, T value) { #if defined(ARM_FEATURE_LSE2) u64 src[2]; std::memcpy(src, &value, 16); __asm__ volatile("1:\n" "dmb ish\n" "stp %x[data0], %x[data1], %[dest]\n" : [dest] "=Q" (dest) : [data0] "r" (src[0]), [data1] "r" (src[1]) : "memory" ); #else // TODO exchange(dest, value); #endif } #endif // TODO }; #ifndef _MSC_VER #pragma GCC diagnostic push #pragma GCC 
diagnostic ignored "-Weffc++" #endif // Atomic type with lock-free and standard layout guarantees (and appropriate limitations) template class atomic_t { protected: using type = typename std::remove_cv::type; using ptr_rt = std::conditional_t, ullong, type>; static_assert((Align & (Align - 1)) == 0, "atomic_t<> error: unexpected Align parameter (not power of 2)."); static_assert(Align % sizeof(type) == 0, "atomic_t<> error: invalid type, must be power of 2."); static_assert(sizeof(type) <= 16, "atomic_t<> error: invalid type, too big (max supported size is 16)."); static_assert(Align >= sizeof(type), "atomic_t<> error: bad args, specify bigger alignment if necessary."); static_assert(std::is_trivially_copyable_v); static_assert(std::is_copy_constructible_v); static_assert(std::is_move_constructible_v); static_assert(std::is_copy_assignable_v); static_assert(std::is_move_assignable_v); alignas(Align) type m_data; public: static constexpr usz align = Align; ENABLE_BITWISE_SERIALIZATION; atomic_t() noexcept = default; atomic_t(const atomic_t&) = delete; atomic_t& operator =(const atomic_t&) = delete; constexpr atomic_t(const type& value) noexcept : m_data(value) { } // Unsafe direct access type& raw() { return m_data; } // Unsafe direct access const type& raw() const { return m_data; } // Atomically compare data with cmp, replace with exch if equal, return previous data value anyway type compare_and_swap(const type& cmp, const type& exch) { type old = cmp; atomic_storage::compare_exchange(m_data, old, exch); return old; } // Atomically compare data with cmp, replace with exch if equal, return true if data was replaced bool compare_and_swap_test(const type& cmp, const type& exch) { type old = cmp; return atomic_storage::compare_exchange(m_data, old, exch); } // As in std::atomic bool compare_exchange(type& cmp_and_old, const type& exch) { return atomic_storage::compare_exchange(m_data, cmp_and_old, exch); } // Atomic operation; returns old value, or pair of old value and return value (cancel op if evaluates to false) template > std::conditional_t, type, std::pair> fetch_op(F func) { type _new, old = atomic_storage::load(m_data); while (true) { _new = old; if constexpr (std::is_void_v) { std::invoke(func, _new); if (atomic_storage::compare_exchange(m_data, old, _new)) [[likely]] { return old; } } else { RT ret = std::invoke(func, _new); if (!ret || atomic_storage::compare_exchange(m_data, old, _new)) [[likely]] { return {old, std::move(ret)}; } } } } // Atomic operation; returns function result value, function is the lambda template > RT atomic_op(F func) { type _new, old = atomic_storage::load(m_data); while (true) { _new = old; if constexpr (std::is_void_v) { std::invoke(func, _new); if (atomic_storage::compare_exchange(m_data, old, _new)) [[likely]] { return; } } else { RT result = std::invoke(func, _new); if (atomic_storage::compare_exchange(m_data, old, _new)) [[likely]] { return result; } } } } // Atomically read data type load() const { return atomic_storage::load(m_data); } // Atomically read data operator std::common_type_t() const { return atomic_storage::load(m_data); } // Relaxed load type observe() const { return atomic_storage::observe(m_data); } // Atomically write data void store(const type& rhs) { atomic_storage::store(m_data, rhs); } type operator =(const type& rhs) { atomic_storage::store(m_data, rhs); return rhs; } // Atomically write data with release memory order (faster on x86) void release(const type& rhs) { atomic_storage::release(m_data, rhs); } // Atomically replace 
data with value, return previous data value type exchange(const type& rhs) { return atomic_storage::exchange(m_data, rhs); } auto fetch_add(const ptr_rt& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::fetch_add(m_data, rhs); } return fetch_op([&](T& v) { v += rhs; }); } auto add_fetch(const ptr_rt& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::add_fetch(m_data, rhs); } return atomic_op([&](T& v) { v += rhs; return v; }); } auto operator +=(const ptr_rt& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::add_fetch(m_data, rhs); } return atomic_op([&](T& v) { return v += rhs; }); } auto fetch_sub(const ptr_rt& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::fetch_sub(m_data, rhs); } return fetch_op([&](T& v) { v -= rhs; }); } auto sub_fetch(const ptr_rt& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::sub_fetch(m_data, rhs); } return atomic_op([&](T& v) { v -= rhs; return v; }); } auto operator -=(const ptr_rt& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::sub_fetch(m_data, rhs); } return atomic_op([&](T& v) { return v -= rhs; }); } auto fetch_and(const type& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::fetch_and(m_data, rhs); } return fetch_op([&](T& v) { v &= rhs; }); } auto and_fetch(const type& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::and_fetch(m_data, rhs); } return atomic_op([&](T& v) { v &= rhs; return v; }); } auto operator &=(const type& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::and_fetch(m_data, rhs); } return atomic_op([&](T& v) { return v &= rhs; }); } auto fetch_or(const type& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::fetch_or(m_data, rhs); } return fetch_op([&](T& v) { v |= rhs; }); } auto or_fetch(const type& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::or_fetch(m_data, rhs); } return atomic_op([&](T& v) { v |= rhs; return v; }); } auto operator |=(const type& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::or_fetch(m_data, rhs); } return atomic_op([&](T& v) { return v |= rhs; }); } auto fetch_xor(const type& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::fetch_xor(m_data, rhs); } return fetch_op([&](T& v) { v ^= rhs; }); } auto xor_fetch(const type& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::xor_fetch(m_data, rhs); } return atomic_op([&](T& v) { v ^= rhs; return v; }); } auto operator ^=(const type& rhs) { if constexpr(std::is_integral::value) { return atomic_storage::xor_fetch(m_data, rhs); } return atomic_op([&](T& v) { return v ^= rhs; }); } auto operator ++() { if constexpr(std::is_integral::value) { return atomic_storage::inc_fetch(m_data); } return atomic_op([](T& v) { return ++v; }); } auto operator --() { if constexpr(std::is_integral::value) { return atomic_storage::dec_fetch(m_data); } return atomic_op([](T& v) { return --v; }); } auto operator ++(int) { if constexpr(std::is_integral::value) { return atomic_storage::fetch_inc(m_data); } return atomic_op([](T& v) { return v++; }); } auto operator --(int) { if constexpr(std::is_integral::value) { return atomic_storage::fetch_dec(m_data); } return atomic_op([](T& v) { return v--; }); } // Conditionally decrement bool try_dec(std::common_type_t greater_than) { type _new, old = atomic_storage::load(m_data); while (true) { _new = old; if (!(_new > greater_than)) { return false; } _new -= 
1;

			if (atomic_storage<type>::compare_exchange(m_data, old, _new)) [[likely]]
			{
				return true;
			}
		}
	}

	// Conditionally increment
	bool try_inc(std::common_type_t<T> less_than)
	{
		type _new, old = atomic_storage<type>::load(m_data);

		while (true)
		{
			_new = old;

			if (!(_new < less_than))
			{
				return false;
			}

			_new += 1;

			if (atomic_storage<type>::compare_exchange(m_data, old, _new)) [[likely]]
			{
				return true;
			}
		}
	}

	bool bit_test_set(uint bit)
	{
		if constexpr (std::is_integral<type>::value)
		{
			return atomic_storage<type>::bts(m_data, bit & (sizeof(T) * 8 - 1));
		}

		return atomic_op([](type& v)
		{
			const auto old = v;
			const auto bit = type(1) << (sizeof(T) * 8 - 1);
			v |= bit;
			return !!(old & bit);
		});
	}

	bool bit_test_reset(uint bit)
	{
		if constexpr (std::is_integral<type>::value)
		{
			return atomic_storage<type>::btr(m_data, bit & (sizeof(T) * 8 - 1));
		}

		return atomic_op([](type& v)
		{
			const auto old = v;
			const auto bit = type(1) << (sizeof(T) * 8 - 1);
			v &= ~bit;
			return !!(old & bit);
		});
	}

	bool bit_test_invert(uint bit)
	{
		if constexpr (std::is_integral<type>::value)
		{
			return atomic_storage<type>::btc(m_data, bit & (sizeof(T) * 8 - 1));
		}

		return atomic_op([](type& v)
		{
			const auto old = v;
			const auto bit = type(1) << (sizeof(T) * 8 - 1);
			v ^= bit;
			return !!(old & bit);
		});
	}

	void wait(type old_value, atomic_wait_timeout timeout = atomic_wait_timeout::inf) const
		requires(sizeof(type) == 4)
	{
		atomic_wait_engine::wait(&m_data, std::bit_cast<u32>(old_value), static_cast<u64>(timeout));
	}

	[[deprecated]] void wait(type old_value, atomic_wait_timeout timeout = atomic_wait_timeout::inf) const
		requires(sizeof(type) == 8)
	{
		atomic_wait::info ext[2]{};
		ext[0].data = reinterpret_cast<const char*>(&m_data) + 4;
		ext[0].old = std::bit_cast<u64>(old_value) >> 32;
		atomic_wait_engine::wait(&m_data, static_cast<u32>(std::bit_cast<u64>(old_value)), static_cast<u64>(timeout), ext);
	}

	void notify_one()
		requires(sizeof(type) == 4 || sizeof(type) == 8)
	{
		atomic_wait_engine::notify_one(&m_data);
	}

	void notify_all()
		requires(sizeof(type) == 4 || sizeof(type) == 8)
	{
		atomic_wait_engine::notify_all(&m_data);
	}
};

template <usz Align>
class atomic_t<bool, Align> : private atomic_t<uchar, Align>
{
	using base = atomic_t<uchar, Align>;

public:
	static constexpr usz align = Align;

	atomic_t() noexcept = default;

	atomic_t(const atomic_t&) = delete;

	atomic_t& operator =(const atomic_t&) = delete;

	constexpr atomic_t(bool value) noexcept
		: base(value)
	{
	}

	bool load() const noexcept
	{
		return base::load() != 0;
	}

	// Override implicit conversion from the parent type
	explicit operator uchar() const = delete;

	operator bool() const noexcept
	{
		return base::load() != 0;
	}

	bool observe() const noexcept
	{
		return base::observe() != 0;
	}

	void store(bool value)
	{
		base::store(value);
	}

	bool operator =(bool value)
	{
		base::store(value);
		return value;
	}

	void release(bool value)
	{
		base::release(value);
	}

	bool exchange(bool value)
	{
		return base::exchange(value) != 0;
	}

	bool test_and_set()
	{
		return base::exchange(1) != 0;
	}

	bool test_and_reset()
	{
		return base::exchange(0) != 0;
	}

	bool test_and_invert()
	{
		return base::fetch_xor(1) != 0;
	}
};

// Specializations
template <typename T, usz Align, typename T2, usz Align2>
struct std::common_type<atomic_t<T, Align>, atomic_t<T2, Align2>> : std::common_type<T, T2> {};

template <typename T, usz Align, typename T2>
struct std::common_type<atomic_t<T, Align>, T2> : std::common_type<T, std::common_type_t<T2>> {};

template <typename T, typename T2, usz Align2>
struct std::common_type<T, atomic_t<T2, Align2>> : std::common_type<std::common_type_t<T>, T2> {};

#ifndef _MSC_VER
#pragma GCC diagnostic pop
#pragma GCC diagnostic pop
#endif
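
// ---------------------------------------------------------------------------
// Usage sketch (not part of the interface above; kept inert behind #if 0).
// A minimal illustration of the helpers declared in this header, assuming a
// translation unit that includes it and links the wait/notify engine
// implementation; the variable and function names below are made up.
#if 0
static atomic_t<u32> g_flags{0};
static atomic_t<u32> g_count{0};

void example_usage()
{
	// Lock-free bit operation: set bit 0 and learn whether it was already set
	const bool was_set = g_flags.bit_test_set(0);

	// fetch_op: the functor may cancel the store by evaluating to false;
	// the result is a pair of the previous value and the functor's result
	const auto [prev, ok] = g_count.fetch_op([](u32& v)
	{
		if (v >= 100)
		{
			return false; // Leave the value untouched
		}

		v++;
		return true;
	});

	// C++20-style waiting on a 4-byte atomic: sleep while the value is still 0
	if (g_flags.observe() == 0)
	{
		g_flags.wait(0);
	}

	// Waker side
	g_flags.store(1);
	g_flags.notify_all();

	// Wait on several variables at once (up to atomic_wait::max_list):
	// register the expected values, then sleep until one of the variables
	// changes or is notified
	atomic_wait::list<2> list(g_flags, g_count);
	list.values(1u, 100u);
	list.wait(atomic_wait_timeout::inf);

	static_cast<void>(was_set);
	static_cast<void>(prev);
	static_cast<void>(ok);
}
#endif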