rpcsx/rpcs3/util/atomic.cpp

#include "atomic.hpp"
#if defined(__linux__)
#define USE_FUTEX
#elif !defined(_WIN32)
#define USE_STD
#endif
#include "Utilities/sync.h"
#include <utility>
#include <mutex>
#include <condition_variable>
#include <chrono>
#include <iterator>
#include <memory>
#include <cstdlib>
// Hashtable size factor (can be set to 0 to stress-test collisions)
static constexpr uint s_hashtable_power = 16;
// Total number of entries, should be a power of 2.
static constexpr std::uintptr_t s_hashtable_size = 1u << s_hashtable_power;
// Pointer mask without bits used as hash, assuming signed 48-bit pointers.
static constexpr u64 s_pointer_mask = s_hashtable_power > 7 ? 0xffff'ffff'ffff & ~((s_hashtable_size - 1)) : 0xffff'ffff'ffff;
// Max number of waiters is 32767.
static constexpr u64 s_waiter_mask = s_hashtable_power > 7 ? 0x7fff'0000'0000'0000 : 0x7f00'0000'0000'0000;
// Bit indicates that more than one distinct address hashed to the slot (collision).
static constexpr u64 s_collision_bit = 0x8000'0000'0000'0000;
// Allocated slot with secondary table.
static constexpr u64 s_slot_mask = ~(s_waiter_mask | s_pointer_mask | s_collision_bit);
// Helper to get least significant set bit from 64-bit masks
template <u64 Mask>
static constexpr u64 one_v = Mask & (0 - Mask);
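// e.g. with the default s_hashtable_power, one_v<s_waiter_mask> == 0x0001'0000'0000'0000 (the step used to add one waiter)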
// Callback for wait() function, returns false if wait should return
static thread_local bool(*s_tls_wait_cb)(const void* data) = [](const void*){ return true; };
// Callback for notification functions (used for optimizations)
static thread_local void(*s_tls_notify_cb)(const void* data, u64 progress) = [](const void*, u64){};
// Compare data in memory with old value, and return true if they are equal
template <bool CheckCb = true>
static NEVER_INLINE bool
#ifdef _WIN32
__vectorcall
#endif
ptr_cmp(const void* data, u32 size, __m128i old128, __m128i mask128)
{
if constexpr (CheckCb)
{
if (!s_tls_wait_cb(data))
{
return false;
}
}
const u64 old_value = _mm_cvtsi128_si64(old128);
const u64 mask = _mm_cvtsi128_si64(mask128);
switch (size)
{
case 1: return (reinterpret_cast<const atomic_t<u8>*>(data)->load() & mask) == (old_value & mask);
case 2: return (reinterpret_cast<const atomic_t<u16>*>(data)->load() & mask) == (old_value & mask);
case 4: return (reinterpret_cast<const atomic_t<u32>*>(data)->load() & mask) == (old_value & mask);
case 8: return (reinterpret_cast<const atomic_t<u64>*>(data)->load() & mask) == (old_value & mask);
case 16:
{
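// XOR with the expected value, apply the mask, then do a saturating 16->8 bit pack:
// a 16-bit lane packs to zero iff it was zero, so the low 64 bits of the packed
// result summarize all 128 bits.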
const auto v0 = std::bit_cast<__m128i>(atomic_storage<u128>::load(*reinterpret_cast<const u128*>(data)));
const auto v1 = _mm_xor_si128(v0, old128);
const auto v2 = _mm_and_si128(v1, mask128);
const auto v3 = _mm_packs_epi16(v2, v2);
if (_mm_cvtsi128_si64(v3) == 0)
{
return true;
}
break;
}
default:
{
fprintf(stderr, "ptr_cmp(): bad size (size=%u)" HERE "\n", size);
std::abort();
}
}
return false;
}
// Returns true if the masks overlap on differing value bits (or simply overlap, when values are unavailable), or in forced wake-up mode (size 0)
static bool
#ifdef _WIN32
__vectorcall
#endif
cmp_mask(u32 size1, __m128i mask1, __m128i val1, u32 size2, __m128i mask2, __m128i val2)
{
// In force wake up, one of the size arguments is zero
const u32 size = std::min(size1, size2);
if (!size) [[unlikely]]
{
return true;
}
// Compare only masks, new value is not available in this mode
if ((size1 | size2) == umax)
{
// Simple mask overlap
const auto v0 = _mm_and_si128(mask1, mask2);
const auto v1 = _mm_packs_epi16(v0, v0);
return _mm_cvtsi128_si64(v1) != 0;
}
// Generate masked value inequality bits
const auto v0 = _mm_and_si128(_mm_and_si128(mask1, mask2), _mm_xor_si128(val1, val2));
if (size <= 8)
{
// Generate sized mask
const u64 mask = UINT64_MAX >> ((64 - size * 8) & 63);
if (!(_mm_cvtsi128_si64(v0) & mask))
{
return false;
}
}
else if (size == 16)
{
if (!_mm_cvtsi128_si64(_mm_packs_epi16(v0, v0)))
{
return false;
}
}
else
{
fprintf(stderr, "cmp_mask(): bad size (size1=%u, size2=%u)" HERE "\n", size1, size2);
std::abort();
}
return true;
}
namespace atomic_wait
{
// Essentially a fat semaphore
struct alignas(64) cond_handle
{
#ifdef _WIN32
u64 tid = GetCurrentThreadId();
#else
u64 tid = reinterpret_cast<u64>(pthread_self());
#endif
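// Wait state, as driven by the code below: 0 = not armed, 1 = armed/waiting, 2 = notified, 3 = forced wake-up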
atomic_t<u32> sync{};
u32 size{};
u64 tsc0{};
const void* ptr{};
__m128i mask{};
__m128i oldv{};
#ifdef USE_STD
// Standard CV/mutex pair (often contains pthread_cond_t/pthread_mutex_t)
std::condition_variable cond;
std::mutex mtx;
#endif
bool forced_wakeup()
{
const auto [_old, ok] = sync.fetch_op([](u32& val)
{
if (val == 1 || val == 2)
{
val = 3;
return true;
}
return false;
});
// Prevent collision between normal wake-up and forced one
return ok && _old == 1;
}
void alert_native()
{
#ifdef USE_FUTEX
// Use "wake all" arg for robustness, only 1 thread is expected
futex(&sync, FUTEX_WAKE_PRIVATE, 0x7fff'ffff);
#elif defined(USE_STD)
// Not super efficient: locking is required to avoid lost notifications
mtx.lock();
mtx.unlock();
cond.notify_all();
#elif defined(_WIN32)
if (NtWaitForAlertByThreadId)
{
// Sets some sticky alert bit, at least I believe so
NtAlertThreadByThreadId(tid);
}
else
{
// Can wait in rare cases, which is its annoying weakness
NtReleaseKeyedEvent(nullptr, &sync, 1, nullptr);
}
#endif
}
bool try_alert_native()
{
#if defined(USE_FUTEX)
return false;
#elif defined(USE_STD)
// Optimistic non-blocking path
if (mtx.try_lock())
{
mtx.unlock();
cond.notify_all();
return true;
}
return false;
#elif defined(_WIN32)
if (NtAlertThreadByThreadId)
{
// Don't notify prematurely with this API
return false;
}
static LARGE_INTEGER instant{};
if (NtReleaseKeyedEvent(nullptr, &sync, 1, &instant) != NTSTATUS_SUCCESS)
{
// Failed to notify immediately
return false;
}
return true;
#endif
}
};
#ifndef USE_STD
static_assert(sizeof(cond_handle) == 64);
#endif
}
// Max allowed thread number is chosen to fit in 16 bits
static std::aligned_storage_t<sizeof(atomic_wait::cond_handle), alignof(atomic_wait::cond_handle)> s_cond_list[UINT16_MAX + 1]{};
// Used to allow concurrent notifying
static atomic_t<u32> s_cond_refs[UINT16_MAX + 1]{};
// Allocation bits
static atomic_t<u64, 64> s_cond_bits[(UINT16_MAX + 1) / 64]{};
// Allocation semaphore
static atomic_t<u32, 64> s_cond_sema{0};
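// Handle lifetime: a waiter reserves an id with cond_alloc() (taking the first reference) and
// publishes it through a per-address semaphore slot; notifiers take temporary references via
// cond_lock()/cond_free(); the last reference dropped destroys the handle and frees the id.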
static u32 cond_alloc()
{
// Determine whether there is a free slot or not
if (!s_cond_sema.try_inc(UINT16_MAX + 1))
{
return 0;
}
// Diversify search start points to reduce contention and increase immediate success chance
#ifdef _WIN32
const u32 start = GetCurrentProcessorNumber();
#elif __linux__
const u32 start = sched_getcpu();
#else
const u32 start = __rdtsc();
#endif
for (u32 i = start;; i++)
{
const u32 group = i % ::size32(s_cond_bits);
const auto [bits, ok] = s_cond_bits[group].fetch_op([](u64& bits)
{
if (~bits)
{
// Set lowest clear bit
bits |= bits + 1;
return true;
}
return false;
});
if (ok) [[likely]]
{
// Find lowest clear bit
const u32 id = group * 64 + std::countr_one(bits);
if (id == 0) [[unlikely]]
{
// Id 0 is reserved as the "no id" value: leave its bit set and keep searching
continue;
}
// Construct inplace before it can be used
new (s_cond_list + id) atomic_wait::cond_handle();
// Add first reference
verify(HERE), !s_cond_refs[id]++;
return id;
}
}
// Unreachable
std::abort();
return 0;
}
static atomic_wait::cond_handle* cond_get(u32 cond_id)
{
if (cond_id - 1 < u32{UINT16_MAX}) [[likely]]
{
return std::launder(reinterpret_cast<atomic_wait::cond_handle*>(s_cond_list + cond_id));
}
return nullptr;
}
static void cond_free(u32 cond_id)
{
if (cond_id - 1 >= u32{UINT16_MAX})
{
fprintf(stderr, "cond_free(): bad id %u" HERE "\n", cond_id);
std::abort();
}
// Dereference, destroy on last ref
if (--s_cond_refs[cond_id])
{
return;
}
// Call the destructor
cond_get(cond_id)->~cond_handle();
// Remove the allocation bit
s_cond_bits[cond_id / 64] &= ~(1ull << (cond_id % 64));
// Release the semaphore
s_cond_sema--;
}
static u32 cond_lock(atomic_t<u16>* sema)
{
while (const u32 cond_id = sema->load())
{
const auto [old, ok] = s_cond_refs[cond_id].fetch_op([](u32& ref)
{
if (!ref || ref == UINT32_MAX)
{
// Don't reference already deallocated semaphore
return false;
}
ref++;
return true;
});
if (ok)
{
return cond_id;
}
if (old == UINT32_MAX)
{
fmt::raw_error("Thread limit " STRINGIZE(UINT32_MAX) " for a single address reached in atomic notifier.");
}
if (sema->load() != cond_id)
{
// Try again if it changed
continue;
}
else
{
break;
}
}
return 0;
}
namespace atomic_wait
{
#define MAX_THREADS (56)
struct alignas(128) sync_var
{
constexpr sync_var() noexcept = default;
// Reference counter, owning pointer, collision bit and optionally selected slot
atomic_t<u64> addr_ref{};
private:
// Semaphores (allocated in reverse order), empty are zeros
atomic_t<u16> sema_data[MAX_THREADS]{};
// Allocated semaphore bits (to make total size 128)
atomic_t<u64> sema_bits{};
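// Layout check: 8 (addr_ref) + 56 * 2 (sema_data) + 8 (sema_bits) = 128 bytes, which is why MAX_THREADS is fixed at 56.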
public:
atomic_t<u16>* sema_alloc()
{
const auto [bits, ok] = sema_bits.fetch_op([](u64& bits)
{
if (bits + 1 < (1ull << MAX_THREADS))
{
// Set lowest clear bit
bits |= bits + 1;
return true;
}
return false;
});
if (ok) [[likely]]
{
// Find lowest clear bit
return get_sema(std::countr_one(bits));
}
// TODO: support extension if reached
fmt::raw_error("Thread limit " STRINGIZE(MAX_THREADS) " for a single address reached in atomic wait.");
return nullptr;
}
atomic_t<u16>* get_sema(u32 id)
{
verify(HERE), id < MAX_THREADS;
return &sema_data[(MAX_THREADS - 1) - id];
}
u64 get_sema_bits() const
{
return sema_bits & ((1ull << MAX_THREADS) - 1);
}
void reset_sema_bit(atomic_t<u16>* sema)
{
verify(HERE), sema >= sema_data && sema < std::end(sema_data);
sema_bits &= ~(1ull << ((MAX_THREADS - 1) - (sema - sema_data)));
}
void sema_free(atomic_t<u16>* sema)
{
if (sema < sema_data || sema >= std::end(sema_data))
{
fprintf(stderr, "sema_free(): bad sema ptr %p" HERE "\n", sema);
std::abort();
}
// Try to deallocate semaphore (may be delegated to a notifier)
cond_free(sema->exchange(0));
// Clear sema bit
reset_sema_bit(sema);
}
};
static_assert(sizeof(sync_var) == 128);
#undef MAX_THREADS
}
// Main hashtable for atomic wait.
alignas(128) static atomic_wait::sync_var s_hashtable[s_hashtable_size]{};
namespace atomic_wait
{
struct slot_info
{
constexpr slot_info() noexcept = default;
// Branch extension
atomic_wait::sync_var branch[48 - s_hashtable_power]{};
};
}
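// Collisions form a small radix-like tree: when differently-addressed waiters land in the same
// primary slot, one of them allocates an extension slot, and lookups descend into branch[n],
// where n is the number of leading pointer bits shared with the current slot owner
// (see slot_get(), slot_free() and the install loop in wait()).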
// Max number of hashtable extension slots (must be a multiple of 64)
#define MAX_SLOTS (4096)
// Array of slot branch objects
alignas(128) static atomic_wait::slot_info s_slot_list[MAX_SLOTS]{};
// Allocation bits
static atomic_t<u64, 64> s_slot_bits[MAX_SLOTS / 64]{};
// Allocation semaphore
static atomic_t<u32, 64> s_slot_sema{0};
static_assert(MAX_SLOTS % 64 == 0);
static u64 slot_alloc()
{
// Determine whether there is a free slot or not
if (!s_slot_sema.try_inc(MAX_SLOTS + 1))
{
fmt::raw_error("Hashtable extension slot limit " STRINGIZE(MAX_SLOTS) " reached in atomic wait.");
return 0;
}
// Diversify search start points to reduce contention and increase immediate success chance
#ifdef _WIN32
const u32 start = GetCurrentProcessorNumber();
#elif __linux__
const u32 start = sched_getcpu();
#else
const u32 start = __rdtsc();
#endif
for (u32 i = start;; i++)
{
const u32 group = i % ::size32(s_slot_bits);
const auto [bits, ok] = s_slot_bits[group].fetch_op([](u64& bits)
{
if (~bits)
{
// Set lowest clear bit
bits |= bits + 1;
return true;
}
return false;
});
if (ok)
{
// Find lowest clear bit
return group * 64 + std::countr_one(bits);
}
}
// Unreachable
std::abort();
return 0;
}
#undef MAX_SLOTS
static atomic_wait::sync_var* slot_get(std::uintptr_t iptr, atomic_wait::sync_var* loc, u64 lv = 0)
{
if (!loc)
{
return nullptr;
}
const u64 value = loc->addr_ref.load();
if ((value & s_waiter_mask) == 0)
{
return nullptr;
}
if ((value & s_pointer_mask) == (iptr & s_pointer_mask))
{
return loc;
}
if ((value & s_collision_bit) == 0)
{
return nullptr;
}
// Get the number of leading equal bits to determine subslot
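// (| ~s_pointer_mask guarantees a bounding set bit, and << 16 aligns the 48-bit pointer to the MSB so only address bits are counted)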
const u64 eq_bits = std::countl_zero<u64>((((iptr ^ value) & (s_pointer_mask >> lv)) | ~s_pointer_mask) << 16);
// Proceed recursively, increment level
return slot_get(iptr, s_slot_list[(value & s_slot_mask) / one_v<s_slot_mask>].branch + eq_bits, eq_bits + 1);
}
static void slot_free(u64 id)
{
// Reset allocation bit
id = (id & s_slot_mask) / one_v<s_slot_mask>;
s_slot_bits[id / 64] &= ~(1ull << (id % 64));
// Reset semaphore
s_slot_sema--;
}
static void slot_free(std::uintptr_t iptr, atomic_wait::sync_var* loc, u64 lv = 0)
{
const u64 value = loc->addr_ref.load();
if ((value & s_pointer_mask) != (iptr & s_pointer_mask))
{
ASSERT(value & s_waiter_mask);
ASSERT(value & s_collision_bit);
// Get the number of leading equal bits to determine subslot
const u64 eq_bits = std::countl_zero<u64>((((iptr ^ value) & (s_pointer_mask >> lv)) | ~s_pointer_mask) << 16);
// Proceed recursively, to deallocate deepest branch first
slot_free(iptr, s_slot_list[(value & s_slot_mask) / one_v<s_slot_mask>].branch + eq_bits, eq_bits + 1);
}
// Actual cleanup in reverse order
auto [_old, ok] = loc->addr_ref.fetch_op([&](u64& value)
{
ASSERT(value & s_waiter_mask);
{
value -= one_v<s_waiter_mask>;
if (!(value & s_waiter_mask))
{
// Reset on last waiter
value = 0;
return 2;
}
return 1;
}
});
if (ok > 1 && _old & s_collision_bit)
{
// Deallocate slot on last waiter
slot_free(_old);
}
}
SAFE_BUFFERS void
#ifdef _WIN32
__vectorcall
#endif
atomic_wait_engine::wait(const void* data, u32 size, __m128i old_value, u64 timeout, __m128i mask)
{
const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
// Allocated slot index
u64 slot_a = -1;
// Found slot object
atomic_wait::sync_var* slot = nullptr;
auto install_op = [&](u64& value) -> u64
{
if ((value & s_waiter_mask) == s_waiter_mask)
{
// Return immediately on waiter overflow
return 0;
}
if (!value || (value & s_pointer_mask) == (iptr & s_pointer_mask))
{
// Store pointer bits
value |= (iptr & s_pointer_mask);
}
else
{
if ((value & s_collision_bit) == 0)
{
if (slot_a + 1 == 0)
{
// Second waiter: allocate slot and install it
slot_a = slot_alloc() * one_v<s_slot_mask>;
}
value |= slot_a;
}
// Set collision bit
value |= s_collision_bit;
}
// Add waiter
value += one_v<s_waiter_mask>;
return value;
};
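// install_op registers this waiter in a hashtable slot: it claims an empty slot, joins one already
// owned by the same pointer, or marks a collision (allocating an extension slot on demand), and
// bumps the packed waiter counter on success.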
// Search detail
u64 lv = 0;
for (atomic_wait::sync_var* ptr = &s_hashtable[iptr % s_hashtable_size];;)
{
auto [_old, ok] = ptr->addr_ref.fetch_op(install_op);
if (slot_a + 1)
{
if ((_old & s_collision_bit) == 0 && (ok & s_collision_bit) && (ok & s_slot_mask) == slot_a)
{
// Slot set successfully
slot_a = -1;
}
}
if (!ok)
{
// Expected only on top level
if (timeout + 1 || ptr_cmp<false>(data, size, old_value, mask))
{
return;
}
// TODO
busy_wait(30000);
continue;
}
if (!_old || (_old & s_pointer_mask) == (iptr & s_pointer_mask))
{
// Success
if (slot_a + 1)
{
// Cleanup slot if unused
slot_free(slot_a);
slot_a = -1;
}
slot = ptr;
break;
}
// Get the number of leading equal bits (between iptr and slot owner)
const u64 eq_bits = std::countl_zero<u64>((((iptr ^ ok) & (s_pointer_mask >> lv)) | ~s_pointer_mask) << 16);
// Collision; need to go deeper
ptr = s_slot_list[(ok & s_slot_mask) / one_v<s_slot_mask>].branch + eq_bits;
lv = eq_bits + 1;
}
const u32 cond_id = cond_alloc();
if (cond_id == 0)
{
fmt::raw_error("Thread limit " STRINGIZE(UINT16_MAX) " reached in atomic wait.");
}
auto sema = slot->sema_alloc();
while (!sema)
{
if (timeout + 1 || ptr_cmp<false>(data, size, old_value, mask))
{
cond_free(cond_id);
slot_free(iptr, &s_hashtable[iptr % s_hashtable_size]);
return;
}
// TODO
busy_wait(30000);
sema = slot->sema_alloc();
}
// Save for notifiers
const auto cond = cond_get(cond_id);
// Store some info for notifiers (some may be unused)
cond->size = size;
cond->mask = mask;
cond->oldv = old_value;
cond->ptr = data;
cond->tsc0 = __rdtsc();
cond->sync = 1;
sema->store(static_cast<u16>(cond_id));
#ifdef USE_STD
// Lock mutex
std::unique_lock lock(cond->mtx);
#endif
// True when the native notification has already been consumed, so the unqueue/reconciliation loop below can be skipped
#if defined(USE_FUTEX) || defined(USE_STD)
constexpr bool fallback = true;
#else
bool fallback = false;
#endif
while (ptr_cmp(data, size, old_value, mask))
{
#ifdef USE_FUTEX
struct timespec ts;
ts.tv_sec = timeout / 1'000'000'000;
ts.tv_nsec = timeout % 1'000'000'000;
if (cond->sync.load() > 1) [[unlikely]]
{
// Signaled prematurely
if (cond->sync.load() == 3 || !cond->sync.compare_and_swap_test(2, 1))
{
break;
}
}
else
{
futex(&cond->sync, FUTEX_WAIT_PRIVATE, 1, timeout + 1 ? &ts : nullptr);
}
#elif defined(USE_STD)
if (cond->sync.load() > 1) [[unlikely]]
{
if (cond->sync.load() == 3 || !cond->sync.compare_and_swap_test(2, 1))
{
break;
}
}
if (timeout + 1)
{
cond->cond.wait_for(lock, std::chrono::nanoseconds(timeout));
}
else
{
cond->cond.wait(lock);
}
#elif defined(_WIN32)
LARGE_INTEGER qw;
qw.QuadPart = -static_cast<s64>(timeout / 100);
if (timeout % 100)
{
// Round up to closest 100ns unit
qw.QuadPart -= 1;
}
if (fallback) [[unlikely]]
{
if (cond->sync.load() == 3 || !cond->sync.compare_and_swap_test(2, 1))
{
fallback = false;
break;
}
fallback = false;
}
if (NtWaitForAlertByThreadId)
{
switch (DWORD status = NtWaitForAlertByThreadId(cond, timeout + 1 ? &qw : nullptr))
{
case NTSTATUS_ALERTED: fallback = true; break;
case NTSTATUS_TIMEOUT: break;
default:
{
SetLastError(status);
fmt::raw_verify_error("Unexpected NtWaitForAlertByThreadId result.", nullptr, 0);
}
}
}
else
{
if (NtWaitForKeyedEvent(nullptr, &cond->sync, false, timeout + 1 ? &qw : nullptr) == NTSTATUS_SUCCESS)
{
// Error code assumed to be timeout
fallback = true;
}
}
#endif
if (timeout + 1)
{
// TODO: reduce timeout instead
break;
}
}
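// Windows-only reconciliation: either claim the handle ourselves (sync 1 -> 2) so that no late
// notifier will alert, or drain the alert/keyed event a committed notifier has already issued,
// so a NtReleaseKeyedEvent caller is never left blocked.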
while (!fallback)
{
#if defined(_WIN32)
static LARGE_INTEGER instant{};
if (cond->sync.compare_and_swap_test(1, 2))
{
// Succeeded in self-notifying
break;
}
if (NtWaitForAlertByThreadId)
{
if (NtWaitForAlertByThreadId(cond, &instant) == NTSTATUS_ALERTED)
{
break;
}
continue;
}
if (!NtWaitForKeyedEvent(nullptr, &cond->sync, false, &instant))
{
// Succeeded in obtaining an event without waiting
break;
}
continue;
#endif
}
#ifdef USE_STD
if (lock)
{
lock.unlock();
}
#endif
slot->sema_free(sema);
slot_free(iptr, &s_hashtable[iptr % s_hashtable_size]);
s_tls_wait_cb(nullptr);
}
// Platform specific wake-up function
static NEVER_INLINE bool
#ifdef _WIN32
__vectorcall
#endif
alert_sema(atomic_t<u16>* sema, const void* data, u64 info, u32 size, __m128i mask, __m128i new_value)
{
const u32 cond_id = cond_lock(sema);
if (!cond_id)
{
return false;
}
const auto cond = cond_get(cond_id);
verify(HERE), cond;
bool ok = false;
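// size == 0 means a forced (raw) notification: match by thread id only (if info is given).
// Otherwise require the same address and an overlapping mask/value before committing to a wake-up.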
if (cond->sync && (!size ? (!info || cond->tid == info) : cond->ptr == data && cmp_mask(size, mask, new_value, cond->size, cond->mask, cond->oldv)))
{
if ((!size && cond->forced_wakeup()) || (size && cond->sync.load() == 1 && cond->sync.compare_and_swap_test(1, 2)))
{
ok = true;
cond->alert_native();
}
}
// Remove lock, possibly deallocate cond
cond_free(cond_id);
return ok;
}
void atomic_wait_engine::set_wait_callback(bool(*cb)(const void* data))
{
if (cb)
{
s_tls_wait_cb = cb;
}
else
{
s_tls_wait_cb = [](const void*){ return true; };
}
}
void atomic_wait_engine::set_notify_callback(void(*cb)(const void*, u64))
{
if (cb)
{
s_tls_notify_cb = cb;
}
else
{
s_tls_notify_cb = [](const void*, u64){};
}
}
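// Usage sketch (hypothetical caller, not part of this file): a waiter snapshots the value and calls
// atomic_wait_engine::wait(&var, sizeof(var), old_value, timeout, mask); a writer that modifies the
// value then calls notify_one()/notify_all() with the same address and a mask covering the bits it
// changed, which wakes any waiter whose registered mask/old value indicate a relevant change.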
bool atomic_wait_engine::raw_notify(const void* data, u64 thread_id)
{
// Special operation mode. Note that this is not atomic.
if (!data)
{
// Special path: search thread_id without pointer information
for (u32 i = 1; i < UINT16_MAX; i++)
{
const auto [_, ok] = s_cond_refs[i].fetch_op([&](u32& ref)
{
if (!ref)
{
// Skip dead semaphores
return false;
}
if (thread_id)
{
u64 tid = 0;
std::memcpy(&tid, &cond_get(i)->tid, sizeof(tid));
if (tid != thread_id)
{
// Check thread first without locking (memory may be uninitialized)
return false;
}
}
if (ref < UINT32_MAX)
{
// Need to busy loop otherwise (TODO)
ref++;
}
return true;
});
if (ok) [[unlikely]]
{
const auto cond = cond_get(i);
if (!thread_id || cond->tid == thread_id)
{
if (cond->forced_wakeup())
{
cond->alert_native();
if (thread_id)
{
// Only if thread_id is specified, notify just that thread and return true.
cond_free(i);
return true;
}
}
}
cond_free(i);
}
}
return false;
}
const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
const auto slot = slot_get(iptr, &s_hashtable[(iptr) % s_hashtable_size]);
if (!slot)
{
return false;
}
s_tls_notify_cb(data, 0);
u64 progress = 0;
for (u64 bits = slot->get_sema_bits(); bits; bits &= bits - 1)
{
const auto sema = slot->get_sema(std::countr_zero(bits));
// Forced notification
if (alert_sema(sema, data, thread_id, 0, _mm_setzero_si128(), _mm_setzero_si128()))
{
s_tls_notify_cb(data, ++progress);
if (thread_id == 0)
{
// Works like notify_all in this case
continue;
}
break;
}
}
s_tls_notify_cb(data, -1);
return progress != 0;
}
void
#ifdef _WIN32
__vectorcall
#endif
atomic_wait_engine::notify_one(const void* data, u32 size, __m128i mask, __m128i new_value)
{
const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
const auto slot = slot_get(iptr, &s_hashtable[(iptr) % s_hashtable_size]);
if (!slot)
{
return;
}
s_tls_notify_cb(data, 0);
u64 progress = 0;
for (u64 bits = slot->get_sema_bits(); bits; bits &= bits - 1)
{
const auto sema = slot->get_sema(std::countr_zero(bits));
if (alert_sema(sema, data, progress, size, mask, new_value))
{
s_tls_notify_cb(data, ++progress);
break;
}
}
s_tls_notify_cb(data, -1);
}
SAFE_BUFFERS void
#ifdef _WIN32
__vectorcall
#endif
atomic_wait_engine::notify_all(const void* data, u32 size, __m128i mask, __m128i new_value)
{
const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
const auto slot = slot_get(iptr, &s_hashtable[(iptr) % s_hashtable_size]);
if (!slot)
{
return;
}
s_tls_notify_cb(data, 0);
u64 progress = 0;
{
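// Three phases: (1) lock every waiter and pre-commit the ones that match (sync 1 -> 2),
// (2) attempt cheap non-blocking alerts, (3) fall back to blocking alerts for the rest,
// then release all locked handles.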
// Make a copy to filter out waiters that fail some checks
u64 copy = slot->get_sema_bits();
u64 lock = 0;
u32 lock_ids[64]{};
for (u64 bits = copy; bits; bits &= bits - 1)
{
const u32 id = std::countr_zero(bits);
const auto sema = slot->get_sema(id);
if (const u32 cond_id = cond_lock(sema))
{
// Add lock bit for cleanup
lock |= 1ull << id;
lock_ids[id] = cond_id;
const auto cond = cond_get(cond_id);
verify(HERE), cond;
if (cond->sync && cond->ptr == data && cmp_mask(size, mask, new_value, cond->size, cond->mask, cond->oldv))
{
if (cond->sync.load() == 1 && cond->sync.compare_and_swap_test(1, 2))
{
// Ok.
continue;
}
}
}
// Remove the bit from next stage
copy &= ~(1ull << id);
}
// If only one waiter exists, there is no point in trying to optimize
if (copy & (copy - 1))
{
for (u64 bits = copy; bits; bits &= bits - 1)
{
const u32 id = std::countr_zero(bits);
if (cond_get(lock_ids[id])->try_alert_native())
{
s_tls_notify_cb(data, ++progress);
// Remove the bit from next stage
copy &= ~(1ull << id);
}
}
}
// Proceed with remaining bits using "normal" blocking waiting
for (u64 bits = copy; bits; bits &= bits - 1)
{
cond_get(lock_ids[std::countr_zero(bits)])->alert_native();
s_tls_notify_cb(data, ++progress);
}
// Cleanup locked notifiers
for (u64 bits = lock; bits; bits &= bits - 1)
{
cond_free(lock_ids[std::countr_zero(bits)]);
}
s_tls_notify_cb(data, -1);
return;
}
// Unused, let's keep for reference
for (u64 bits = slot->get_sema_bits(); bits; bits &= bits - 1)
{
const auto sema = slot->get_sema(std::countr_zero(bits));
if (alert_sema(sema, data, progress, size, mask, new_value))
{
s_tls_notify_cb(data, ++progress);
continue;
}
}
s_tls_notify_cb(data, -1);
}