rpcsx/rpcs3/util/atomic.cpp

#include "atomic.hpp"

// USE_FUTEX takes precedence over USE_POSIX

#ifdef __linux__
#define USE_FUTEX
#define USE_POSIX
#endif

#include "Utilities/sync.h"
#include "Utilities/asm.h"

#ifdef USE_POSIX
#include <semaphore.h>
#endif

#include <map>
#include <mutex>
#include <condition_variable>
#include <iterator>
#include <memory>

// Total number of entries, should be a power of 2.
static constexpr std::uintptr_t s_hashtable_size = 1u << 22;

// TODO: it's probably better to implement more effective futex emulation for OSX/BSD here.
static atomic_t<u64> s_hashtable[s_hashtable_size]{};

// Pointer mask without bits used as hash, assuming signed 48-bit pointers
static constexpr u64 s_pointer_mask = 0xffff'ffff'ffff & ~((s_hashtable_size - 1) << 2);

// Max number of waiters is 32767
static constexpr u64 s_waiter_mask = 0x7fff'0000'0000'0000;

//
static constexpr u64 s_collision_bit = 0x8000'0000'0000'0000;

#ifdef USE_FUTEX
static constexpr u64 s_sema_mask = 0;
#else
// Number of search groups (defines max semaphore count as gcount * 64)
static constexpr u32 s_sema_gcount = 64;

// Bits encoding allocated semaphore index (zero = not allocated yet)
static constexpr u64 s_sema_mask = (64 * s_sema_gcount - 1) << 2;
#endif

// Implementation detail (remaining bits out of 32 available for futex)
static constexpr u64 s_signal_mask = 0xffffffff & ~(s_waiter_mask | s_pointer_mask | s_collision_bit | s_sema_mask);

// Callback for wait() function, returns false if wait should return
static thread_local bool(*s_tls_wait_cb)(const void* data) = [](const void*)
{
	return true;
};

#ifndef USE_FUTEX

#ifdef USE_POSIX
using sema_handle = sem_t;
#elif defined(_WIN32)
using sema_handle = HANDLE;
#else
namespace
{
	struct dumb_sema
	{
		u64 count = 0;
		std::mutex mutex;
		std::condition_variable cond;
	};
}

using sema_handle = std::unique_ptr<dumb_sema>;
#endif

// Array of native semaphores
static sema_handle s_sema_list[64 * s_sema_gcount]{};

// Array of associated reference counters
static atomic_t<u64> s_sema_refs[64 * s_sema_gcount]{};

// Allocation bits (reserve first bit)
static atomic_t<u64> s_sema_bits[s_sema_gcount]{1};

static u32 sema_alloc()
{
	// Diversify search start points to reduce contention and increase immediate success chance
#ifdef _WIN32
	const u32 start = GetCurrentProcessorNumber();
#elif __linux__
	const u32 start = sched_getcpu();
#else
	const u32 start = __rdtsc();
#endif

	for (u32 i = 0; i < s_sema_gcount * 3; i++)
	{
		const u32 group = (i + start) % s_sema_gcount;

		const auto [bits, ok] = s_sema_bits[group].fetch_op([](u64& bits)
		{
			if (~bits)
			{
				// Set lowest clear bit
				bits |= bits + 1;
				return true;
			}

			return false;
		});

		if (ok)
		{
			// Find lowest clear bit
			const u32 id = group * 64 + utils::cnttz64(~bits, false);

#ifdef USE_POSIX
			// Initialize semaphore (should be very fast)
			sem_init(&s_sema_list[id], 0, 0);
#elif defined(_WIN32)
			if (!s_sema_list[id])
			{
				s_sema_list[id] = CreateSemaphoreW(nullptr, 0, 0x7fff'ffff, nullptr);
			}
#else
			if (!s_sema_list[id])
			{
				s_sema_list[id] = std::make_unique<dumb_sema>();
			}
#endif

			// Initialize ref counter
			if (s_sema_refs[id]++)
			{
				std::abort();
			}

			return id;
		}
	}

	return 0;
}

static void sema_free(u32 id)
{
	if (id && id < 64 * s_sema_gcount)
	{
		// Dereference first
		if (--s_sema_refs[id])
		{
			return;
		}

#ifdef USE_POSIX
		// Destroy semaphore (should be very fast)
		sem_destroy(&s_sema_list[id]);
#else
		// No action required
#endif

		// Reset allocation bit
		s_sema_bits[id / 64] &= ~(1ull << (id % 64));
	}
}

static bool sema_get(u32 id)
{
	if (id && id < 64 * s_sema_gcount)
	{
		// Increment only if the semaphore is allocated
		if (s_sema_refs[id].fetch_op([](u64& refs)
		{
			if (refs)
			{
				// Increase reference from non-zero value
				refs++;
			}
		}))
		{
			return true;
		}
	}

	return false;
}

#endif

static inline bool ptr_cmp(const void* data, std::size_t size, u64 old_value, u64 mask)
{
	switch (size)
	{
	case 1: return (reinterpret_cast<const atomic_t<u8>*>(data)->load() & mask) == (old_value & mask);
	case 2: return (reinterpret_cast<const atomic_t<u16>*>(data)->load() & mask) == (old_value & mask);
	case 4: return (reinterpret_cast<const atomic_t<u32>*>(data)->load() & mask) == (old_value & mask);
	case 8: return (reinterpret_cast<const atomic_t<u64>*>(data)->load() & mask) == (old_value & mask);
	}

	return false;
}

// Fallback implementation
namespace
{
	struct waiter
	{
		std::condition_variable cond;
		void* const tls_ptr;

		explicit waiter(void* tls_ptr)
			: tls_ptr(tls_ptr)
		{
		}
	};

	struct waiter_map
	{
		std::mutex mutex;
		std::multimap<const void*, waiter> list;
	};

	// Thread's unique node to insert without allocation
	thread_local std::multimap<const void*, waiter>::node_type s_tls_waiter = []()
	{
		// Initialize node from a dummy container (there is no separate node constructor)
		std::multimap<const void*, waiter> dummy;
		return dummy.extract(dummy.emplace(nullptr, &s_tls_waiter));
	}();

	waiter_map& get_fallback_map(const void* ptr)
	{
		static waiter_map s_waiter_maps[4096];

		return s_waiter_maps[std::hash<const void*>()(ptr) % std::size(s_waiter_maps)];
	}

	void fallback_wait(const void* data, std::size_t size, u64 old_value, u64 timeout, u64 mask)
	{
		auto& wmap = get_fallback_map(data);

		if (!timeout)
		{
			return;
		}

		// Update node key
		s_tls_waiter.key() = data;

		if (std::unique_lock lock(wmap.mutex); ptr_cmp(data, size, old_value, mask) && s_tls_wait_cb(data))
		{
			// Add node to the waiter list
			const auto iter = wmap.list.insert(std::move(s_tls_waiter));

			// Wait until the node is returned to its TLS location
			if (timeout + 1)
			{
				if (!iter->second.cond.wait_for(lock, std::chrono::nanoseconds(timeout), [&]
				{
					return 1 && s_tls_waiter;
				}))
				{
					// Put it back
					s_tls_waiter = wmap.list.extract(iter);
				}

				return;
			}

			while (!s_tls_waiter)
			{
				iter->second.cond.wait(lock);
			}
		}
	}

	void fallback_notify(waiter_map& wmap, std::multimap<const void*, waiter>::iterator found)
	{
		// Return notified node to its TLS location
		const auto ptls = static_cast<std::multimap<const void*, waiter>::node_type*>(found->second.tls_ptr);
		*ptls = wmap.list.extract(found);
		ptls->mapped().cond.notify_one();
	}

	void fallback_notify_one(const void* data)
	{
		auto& wmap = get_fallback_map(data);

		std::lock_guard lock(wmap.mutex);

		if (auto found = wmap.list.find(data); found != wmap.list.end())
		{
			fallback_notify(wmap, found);
		}
	}

	void fallback_notify_all(const void* data)
	{
		auto& wmap = get_fallback_map(data);

		std::lock_guard lock(wmap.mutex);

		for (auto it = wmap.list.lower_bound(data); it != wmap.list.end() && it->first == data;)
		{
			fallback_notify(wmap, it++);
		}
	}
}

void atomic_storage_futex::wait(const void* data, std::size_t size, u64 old_value, u64 timeout, u64 mask)
{
	if (!timeout)
	{
		return;
	}

	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);

	atomic_t<u64>& entry = s_hashtable[(iptr >> 2) % s_hashtable_size];

	u32 new_value = 0;

	bool fallback = false;

	u32 sema_id = -1;

	const auto [_, ok] = entry.fetch_op([&](u64& value)
	{
		if ((value & s_waiter_mask) == s_waiter_mask || (value & s_signal_mask) == s_signal_mask)
		{
			// Return immediately on waiter overflow or signal overflow
			return false;
		}

#ifndef USE_FUTEX
		sema_id = (value & s_sema_mask) >> 2;
#endif

		if (!value || (value & s_pointer_mask) == (iptr & s_pointer_mask))
		{
			// Store pointer bits
			value |= (iptr & s_pointer_mask);
			fallback = false;
		}
		else
		{
			// Set collision bit
			value |= s_collision_bit;
			fallback = true;
		}

		// Add waiter
		value += s_waiter_mask & -s_waiter_mask;
		new_value = static_cast<u32>(value);
		return true;
	});

	if (!ok)
	{
		return;
	}

#ifndef USE_FUTEX
	for (u32 loop_count = 0; !fallback && loop_count < 7; loop_count++)
	{
		// Try to allocate a semaphore
		if (!sema_id)
		{
			const u32 sema = sema_alloc();

			if (!sema)
			{
				break;
			}

			sema_id = entry.atomic_op([&](u64& value) -> u32
			{
				if (value & s_sema_mask)
				{
					return (value & s_sema_mask) >> 2;
				}

				// Insert allocated semaphore
				value += s_signal_mask & -s_signal_mask;
				value |= (u64{sema} << 2);
				return 0;
			});

			if (sema_id)
			{
				// Drop unnecessary allocation
				sema_free(sema);
			}
			else
			{
				sema_id = sema;
				break;
			}
		}

		if (!sema_get(sema_id))
		{
			sema_id = 0;
			continue;
		}

		// Try to increment sig (check semaphore validity)
		const auto [_old, ok] = entry.fetch_op([&](u64& value)
		{
			if ((value & s_signal_mask) == s_signal_mask)
			{
				return false;
			}

			if ((value & s_sema_mask) >> 2 != sema_id)
			{
				return false;
			}

			value += s_signal_mask & -s_signal_mask;
			return true;
		});

		if (!ok)
		{
			sema_free(sema_id);
			sema_id = 0;

			if ((_old & s_signal_mask) == s_signal_mask)
			{
				// Break on signal overflow
				break;
			}

			continue;
		}

		break;
	}
#endif

	if (fallback)
	{
		fallback_wait(data, size, old_value, timeout, mask);
	}
	else if (sema_id && ptr_cmp(data, size, old_value, mask) && s_tls_wait_cb(data))
	{
#ifndef USE_FUTEX
#if defined(_WIN32) && !defined(USE_POSIX)
		LARGE_INTEGER qw;
		qw.QuadPart = -static_cast<s64>(timeout / 100);

		if (timeout % 100)
		{
			// Round up to closest 100ns unit
			qw.QuadPart -= 1;
		}

		if (!NtWaitForSingleObject(s_sema_list[sema_id], false, timeout + 1 ? &qw : nullptr))
		{
			fallback = true;
		}
#elif defined(USE_POSIX)
		struct timespec ts;
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec  += timeout / 1'000'000'000;
		ts.tv_nsec += timeout % 1'000'000'000;
		ts.tv_sec  += ts.tv_nsec / 1'000'000'000;
		ts.tv_nsec %= 1'000'000'000;

		// It's pretty unreliable because it uses absolute time, which may jump backwards. Sigh.
		if (timeout + 1)
		{
			if (sem_timedwait(&s_sema_list[sema_id], &ts) == 0)
			{
				fallback = true;
			}
		}
		else
		{
			if (sem_wait(&s_sema_list[sema_id]) == 0)
			{
				fallback = true;
			}
		}
#else
		dumb_sema& sema = *s_sema_list[sema_id];

		std::unique_lock lock(sema.mutex);

		if (timeout + 1)
		{
			sema.cond.wait_for(lock, std::chrono::nanoseconds(timeout), [&]
			{
				return sema.count > 0;
			});
		}
		else
		{
			sema.cond.wait(lock, [&]
			{
				return sema.count > 0;
			});
		}

		if (sema.count > 0)
		{
			sema.count--;
			fallback = true;
		}
#endif
#else
		struct timespec ts;
		ts.tv_sec  = timeout / 1'000'000'000;
		ts.tv_nsec = timeout % 1'000'000'000;

		futex(reinterpret_cast<char*>(&entry) + 4 * IS_BE_MACHINE, FUTEX_WAIT_PRIVATE, new_value, timeout + 1 ? &ts : nullptr);
#endif
	}

	if (!sema_id)
	{
		fallback = true;
	}

	while (true)
	{
		// Try to decrement
		const auto [prev, ok] = entry.fetch_op([&](u64& value)
		{
			if (value & s_waiter_mask)
			{
#ifndef USE_FUTEX
				// If timeout
				if (!fallback)
				{
					if ((value & s_signal_mask) == 0 || (value & s_sema_mask) >> 2 != sema_id)
					{
						return false;
					}

					value -= s_signal_mask & -s_signal_mask;

					if ((value & s_signal_mask) == 0)
					{
						value &= ~s_sema_mask;
					}
				}
#endif

				value -= s_waiter_mask & -s_waiter_mask;

				if ((value & s_waiter_mask) == 0)
				{
					// Reset on last waiter
					value = 0;
				}

				return true;
			}

			return false;
		});

		if (ok || fallback)
		{
			break;
		}

#ifndef USE_FUTEX
#if defined(_WIN32) && !defined(USE_POSIX)
		static LARGE_INTEGER instant{};

		if (!NtWaitForSingleObject(s_sema_list[sema_id], false, &instant))
		{
			fallback = true;
		}
#elif defined(USE_POSIX)
		if (sem_trywait(&s_sema_list[sema_id]) == 0)
		{
			fallback = true;
		}
#else
		dumb_sema& sema = *s_sema_list[sema_id];

		std::unique_lock lock(sema.mutex);

		if (sema.count > 0)
		{
			sema.count--;
			fallback = true;
		}
#endif
#endif
	}

#ifndef USE_FUTEX
	if (sema_id)
	{
		sema_free(sema_id);
	}
#endif

	s_tls_wait_cb(nullptr);
}

#ifdef USE_FUTEX

void atomic_storage_futex::notify_one(const void* data)
{
	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);

	atomic_t<u64>& entry = s_hashtable[(iptr >> 2) % s_hashtable_size];

	const auto [prev, ok] = entry.fetch_op([&](u64& value)
	{
		if (value & s_waiter_mask && (value & s_pointer_mask) == (iptr & s_pointer_mask))
		{
			if ((value & s_signal_mask) == s_signal_mask)
			{
				// Signal overflow, do nothing
				return false;
			}

			value += s_signal_mask & -s_signal_mask;

			if ((value & s_signal_mask) == s_signal_mask)
			{
				// Signal will overflow, fallback to notify_all
				notify_all(data);
				return false;
			}

			return true;
		}
		else if (value & s_waiter_mask && value & s_collision_bit)
		{
			fallback_notify_one(data);
			return false;
		}

		return false;
	});

	if (ok)
	{
		futex(reinterpret_cast<char*>(&entry) + 4 * IS_BE_MACHINE, FUTEX_WAKE_PRIVATE, 1);
	}
}

void atomic_storage_futex::notify_all(const void* data)
{
	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);

	atomic_t<u64>& entry = s_hashtable[(iptr >> 2) % s_hashtable_size];

	const auto [_, ok] = entry.fetch_op([&](u64& value)
	{
		if (value & s_waiter_mask)
		{
			if ((value & s_signal_mask) == s_signal_mask)
			{
				// Signal overflow, do nothing
				return false;
			}

			if ((value & s_pointer_mask) == (iptr & s_pointer_mask))
			{
				value += s_signal_mask & -s_signal_mask;
				return true;
			}

			if (value & s_collision_bit)
			{
				fallback_notify_all(data);
				return false;
			}
		}

		return false;
	});

	if (ok)
	{
		futex(reinterpret_cast<char*>(&entry) + 4 * IS_BE_MACHINE, FUTEX_WAKE_PRIVATE, 0x7fffffff);
	}
}

#endif

void atomic_storage_futex::set_wait_callback(bool(*cb)(const void* data))
{
	if (cb)
	{
		s_tls_wait_cb = cb;
	}
}

void atomic_storage_futex::raw_notify(const void* data)
{
	if (data)
	{
		notify_all(data);
	}
}

#ifndef USE_FUTEX

void atomic_storage_futex::notify_one(const void* data)
{
	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);

	atomic_t<u64>& entry = s_hashtable[(iptr >> 2) % s_hashtable_size];

	const u64 value = entry;

	if (value & s_waiter_mask && (value & s_pointer_mask) == (iptr & s_pointer_mask))
	{
		if ((value & s_signal_mask) == 0 || (value & s_sema_mask) == 0)
		{
			// No relevant waiters, do nothing
			return;
		}
	}
	else if (value & s_waiter_mask && value & s_collision_bit)
	{
		fallback_notify_one(data);
		return;
	}
	else
	{
		return;
	}

	const u32 sema_id = (value & s_sema_mask) >> 2;

	if (!sema_get(sema_id))
	{
		return;
	}

	const auto [_, ok] = entry.fetch_op([&](u64& value)
	{
		if ((value & s_waiter_mask) == 0 || (value & s_pointer_mask) != (iptr & s_pointer_mask))
		{
			return false;
		}

		if ((value & s_signal_mask) == 0 || (value & s_sema_mask) >> 2 != sema_id)
		{
			return false;
		}

		value -= s_signal_mask & -s_signal_mask;

		// Reset allocated semaphore on last waiter
		if ((value & s_signal_mask) == 0)
		{
			value &= ~s_sema_mask;
		}

		return true;
	});

	if (ok)
	{
#ifdef USE_POSIX
		sem_post(&s_sema_list[sema_id]);
#elif defined(_WIN32)
		ReleaseSemaphore(s_sema_list[sema_id], 1, nullptr);
#else
		dumb_sema& sema = *s_sema_list[sema_id];

		sema.mutex.lock();
		sema.count += 1;
		sema.mutex.unlock();
		sema.cond.notify_one();
#endif
	}

	sema_free(sema_id);
}

void atomic_storage_futex::notify_all(const void* data)
{
	const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);

	atomic_t<u64>& entry = s_hashtable[(iptr >> 2) % s_hashtable_size];

	const u64 value = entry;

	if (value & s_waiter_mask && (value & s_pointer_mask) == (iptr & s_pointer_mask))
	{
		if ((value & s_signal_mask) == 0 || (value & s_sema_mask) == 0)
		{
			// No relevant waiters, do nothing
			return;
		}
	}
	else if (value & s_waiter_mask && value & s_collision_bit)
	{
		fallback_notify_all(data);
		return;
	}
	else
	{
		return;
	}

	const u32 sema_id = (value & s_sema_mask) >> 2;

	if (!sema_get(sema_id))
	{
		return;
	}

	const auto [_, count] = entry.fetch_op([&](u64& value) -> u32
	{
		if ((value & s_waiter_mask) == 0 || (value & s_pointer_mask) != (iptr & s_pointer_mask))
		{
			return 0;
		}

		if ((value & s_signal_mask) == 0 || (value & s_sema_mask) >> 2 != sema_id)
		{
			return 0;
		}

		const u32 r = (value & s_signal_mask) / (s_signal_mask & -s_signal_mask);
		value &= ~s_sema_mask;
		value &= ~s_signal_mask;
		return r;
	});

#ifdef USE_POSIX
	for (u32 i = 0; i < count; i++)
	{
		sem_post(&s_sema_list[sema_id]);
	}
#elif defined(_WIN32)
	if (count)
	{
		ReleaseSemaphore(s_sema_list[sema_id], count, nullptr);
	}
#else
	if (count)
	{
		dumb_sema& sema = *s_sema_list[sema_id];

		sema.mutex.lock();
		sema.count += count;
		sema.mutex.unlock();
		sema.cond.notify_all();
	}
#endif

	sema_free(sema_id);
}

#endif
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+								#include "atomic.hpp"
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								// USE_FUTEX takes precedence over USE_POSIX
 								#ifdef __linux__
 								#define USE_FUTEX
 								#define USE_POSIX
 								#endif
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+								#include "Utilities/sync.h"
-												atomic.hpp: fix pointer mask

											
										
										
											2019-09-21 20:20:48 +02:00
+								#include "Utilities/asm.h"
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								#ifdef USE_POSIX
 								#include <semaphore.h>
 								#endif
-												Add fallback implementation for waitable atomics

May improve perf on OSX/BSD

											
										
										
											2019-08-02 00:23:26 +02:00
+								#include <map>
 								#include <mutex>
 								#include <condition_variable>
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								#include <iterator>
 								#include <memory>
-												Add fallback implementation for waitable atomics

May improve perf on OSX/BSD

											
										
										
											2019-08-02 00:23:26 +02:00
-												atomic.hpp: increase hashtable capacity

Double size and ignore 2 lowest bits (effectively x8)

											
										
										
											2019-09-12 16:14:26 +02:00
+								// Total number of entries, should be a power of 2.
 								static constexpr std::uintptr_t s_hashtable_size = 1u << 22;
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
 								// TODO: it's probably better to implement more effective futex emulation for OSX/BSD here.
-												atomic.hpp: fix pointer mask

											
										
										
											2019-09-21 20:20:48 +02:00
+								static atomic_t<u64> s_hashtable[s_hashtable_size]{};
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+								// Pointer mask without bits used as hash, assuming signed 48-bit pointers
-												atomic.hpp: fix pointer mask

											
										
										
											2019-09-21 20:20:48 +02:00
+								static constexpr u64 s_pointer_mask = 0xffff'ffff'ffff & ~((s_hashtable_size - 1) << 2);
-												Change bits of waitable atomics

Reduce max waiter count to 2^14.
Refactor code to use waiter_mask and signal_mask constants.

											
										
										
											2019-07-29 19:28:20 +02:00
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+								// Max number of waiters is 32767
 								static constexpr u64 s_waiter_mask = 0x7fff'0000'0000'0000;
 								//
 								static constexpr u64 s_collision_bit = 0x8000'0000'0000'0000;
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								#ifdef USE_FUTEX
 								static constexpr u64 s_sema_mask = 0;
 								#else
 								// Number of search groups (defines max semaphore count as gcount * 64)
 								static constexpr u32 s_sema_gcount = 64;
 								// Bits encoding allocated semaphore index (zero = not allocated yet)
 								static constexpr u64 s_sema_mask = (64 * s_sema_gcount - 1) << 2;
 								#endif
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+								// Implementation detail (remaining bits out of 32 available for futex)
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								static constexpr u64 s_signal_mask = 0xffffffff & ~(s_waiter_mask | s_pointer_mask | s_collision_bit | s_sema_mask);
-												Change bits of waitable atomics

Reduce max waiter count to 2^14.
Refactor code to use waiter_mask and signal_mask constants.

											
										
										
											2019-07-29 19:28:20 +02:00
-												atomic.hpp: implement wait callback interface

Will be used to wake up threads uniformly.

											
										
										
											2019-09-08 21:48:26 +02:00
+								// Callback for wait() function, returns false if wait should return
 								static thread_local bool(*s_tls_wait_cb)(const void* data) = [](const void*)
 								{
 									return true;
 								};
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								#ifndef USE_FUTEX
 								#ifdef USE_POSIX
 								using sema_handle = sem_t;
 								#elif defined(_WIN32)
 								using sema_handle = HANDLE;
 								#else
 								namespace
 								{
 									struct dumb_sema
 									{
 										u64 count = 0;
 										std::mutex mutex;
 										std::condition_variable cond;
 									};
 								}
 								using sema_handle = std::unique_ptr<dumb_sema>;
 								#endif
 								// Array of native semaphores
 								static sema_handle s_sema_list[64 * s_sema_gcount]{};
 								// Array of associated reference counters
 								static atomic_t<u64> s_sema_refs[64 * s_sema_gcount]{};
 								// Allocation bits (reserve first bit)
 								static atomic_t<u64> s_sema_bits[s_sema_gcount]{1};
 								static u32 sema_alloc()
 								{
 									// Diversify search start points to reduce contention and increase immediate success chance
 								#ifdef _WIN32
 									const u32 start = GetCurrentProcessorNumber();
 								#elif __linux__
 									const u32 start = sched_getcpu();
 								#else
 									const u32 start = __rdtsc();
 								#endif
 									for (u32 i = 0; i < s_sema_gcount * 3; i++)
 									{
 										const u32 group = (i + start) % s_sema_gcount;
 										const auto [bits, ok] = s_sema_bits[group].fetch_op([](u64& bits)
 										{
 											if (~bits)
 											{
 												// Set lowest clear bit
 												bits |= bits + 1;
 												return true;
 											}
 											return false;
 										});
 										if (ok)
 										{
 											// Find lowest clear bit
 											const u32 id = group * 64 + utils::cnttz64(~bits, false);
 								#ifdef USE_POSIX
 											// Initialize semaphore (should be very fast)
 											sem_init(&s_sema_list[id], 0, 0);
 								#elif defined(_WIN32)
 											if (!s_sema_list[id])
 											{
 												s_sema_list[id] = CreateSemaphoreW(nullptr, 0, 0x7fff'ffff, nullptr);
 											}
 								#else
 											if (!s_sema_list[id])
 											{
 												s_sema_list[id] = std::make_unique<dumb_sema>();
 											}
 								#endif
 											// Initialize ref counter
 											if (s_sema_refs[id]++)
 											{
 												std::abort();
 											}
 											return id;
 										}
 									}
 									return 0;
 								}
 								static void sema_free(u32 id)
 								{
 									if (id && id < 64 * s_sema_gcount)
 									{
 										// Dereference first
 										if (--s_sema_refs[id])
 										{
 											return;
 										}
 								#ifdef USE_POSIX
 										// Destroy semaphore (should be very fast)
 										sem_destroy(&s_sema_list[id]);
 								#else
 										// No action required
 								#endif
 										// Reset allocation bit
 										s_sema_bits[id / 64] &= ~(1ull << (id % 64));
 									}
 								}
 								static bool sema_get(u32 id)
 								{
 									if (id && id < 64 * s_sema_gcount)
 									{
 										// Increment only if the semaphore is allocated
 										if (s_sema_refs[id].fetch_op([](u64& refs)
 										{
 											if (refs)
 											{
 												// Increase reference from non-zero value
 												refs++;
 											}
 										}))
 										{
 											return true;
 										}
 									}
 									return false;
 								}
 								#endif
-												atomic.hpp: add atomic wait mask support

											
										
										
											2019-09-12 14:43:26 +02:00
+								static inline bool ptr_cmp(const void* data, std::size_t size, u64 old_value, u64 mask)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+								{
 									switch (size)
 									{
-												atomic.hpp: add atomic wait mask support

											
										
										
											2019-09-12 14:43:26 +02:00
+									case 1: return (reinterpret_cast<const atomic_t<u8>*>(data)->load() & mask) == (old_value & mask);
 									case 2: return (reinterpret_cast<const atomic_t<u16>*>(data)->load() & mask) == (old_value & mask);
 									case 4: return (reinterpret_cast<const atomic_t<u32>*>(data)->load() & mask) == (old_value & mask);
 									case 8: return (reinterpret_cast<const atomic_t<u64>*>(data)->load() & mask) == (old_value & mask);
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+									}
 									return false;
 								}
-												Add fallback implementation for waitable atomics

May improve perf on OSX/BSD

											
										
										
											2019-08-02 00:23:26 +02:00
+								// Fallback implementation
 								namespace
 								{
 									struct waiter
 									{
 										std::condition_variable cond;
 										void* const tls_ptr;
 										explicit waiter(void* tls_ptr)
 											: tls_ptr(tls_ptr)
 										{
 										}
 									};
 									struct waiter_map
 									{
 										std::mutex mutex;
 										std::multimap<const void*, waiter> list;
 									};
 									// Thread's unique node to insert without allocation
 									thread_local std::multimap<const void*, waiter>::node_type s_tls_waiter = []()
 									{
 										// Initialize node from a dummy container (there is no separate node constructor)
 										std::multimap<const void*, waiter> dummy;
 										return dummy.extract(dummy.emplace(nullptr, &s_tls_waiter));
 									}();
 									waiter_map& get_fallback_map(const void* ptr)
 									{
 										static waiter_map s_waiter_maps[4096];
 										return s_waiter_maps[std::hash<const void*>()(ptr) % std::size(s_waiter_maps)];
 									}
-												atomic.hpp: add atomic wait mask support

											
										
										
											2019-09-12 14:43:26 +02:00
+									void fallback_wait(const void* data, std::size_t size, u64 old_value, u64 timeout, u64 mask)
-												Add fallback implementation for waitable atomics

May improve perf on OSX/BSD

											
										
										
											2019-08-02 00:23:26 +02:00
+									{
 										auto& wmap = get_fallback_map(data);
-												atomic.hpp: add timeout support

											
										
										
											2019-09-09 01:42:05 +02:00
+										if (!timeout)
 										{
 											return;
 										}
-												Add fallback implementation for waitable atomics

May improve perf on OSX/BSD

											
										
										
											2019-08-02 00:23:26 +02:00
+										// Update node key
 										s_tls_waiter.key() = data;
-												atomic.hpp: add atomic wait mask support

											
										
										
											2019-09-12 14:43:26 +02:00
+										if (std::unique_lock lock(wmap.mutex); ptr_cmp(data, size, old_value, mask) && s_tls_wait_cb(data))
-												Add fallback implementation for waitable atomics

May improve perf on OSX/BSD

											
										
										
											2019-08-02 00:23:26 +02:00
+										{
 											// Add node to the waiter list
-												atomic.hpp: add timeout support

											
										
										
											2019-09-09 01:42:05 +02:00
+											const auto iter = wmap.list.insert(std::move(s_tls_waiter));
-												Add fallback implementation for waitable atomics

May improve perf on OSX/BSD

											
										
										
											2019-08-02 00:23:26 +02:00
 											// Wait until the node is returned to its TLS location
-												atomic.hpp: add timeout support

											
										
										
											2019-09-09 01:42:05 +02:00
+											if (timeout + 1)
 											{
 												if (!iter->second.cond.wait_for(lock, std::chrono::nanoseconds(timeout), [&]
 												{
 													return 1 && s_tls_waiter;
 												}))
 												{
 													// Put it back
 													s_tls_waiter = wmap.list.extract(iter);
 												}
 												return;
 											}
-												Add fallback implementation for waitable atomics

May improve perf on OSX/BSD

											
										
										
											2019-08-02 00:23:26 +02:00
+											while (!s_tls_waiter)
 											{
-												atomic.hpp: add timeout support

											
										
										
											2019-09-09 01:42:05 +02:00
+												iter->second.cond.wait(lock);
-												Add fallback implementation for waitable atomics

May improve perf on OSX/BSD

											
										
										
											2019-08-02 00:23:26 +02:00
+											}
 										}
 									}
 									void fallback_notify(waiter_map& wmap, std::multimap<const void*, waiter>::iterator found)
 									{
 										// Return notified node to its TLS location
 										const auto ptls = static_cast<std::multimap<const void*, waiter>::node_type*>(found->second.tls_ptr);
 										*ptls = wmap.list.extract(found);
 										ptls->mapped().cond.notify_one();
 									}
 									void fallback_notify_one(const void* data)
 									{
 										auto& wmap = get_fallback_map(data);
 										std::lock_guard lock(wmap.mutex);
 										if (auto found = wmap.list.find(data); found != wmap.list.end())
 										{
 											fallback_notify(wmap, found);
 										}
 									}
 									void fallback_notify_all(const void* data)
 									{
 										auto& wmap = get_fallback_map(data);
 										std::lock_guard lock(wmap.mutex);
 										for (auto it = wmap.list.lower_bound(data); it != wmap.list.end() && it->first == data;)
 										{
 											fallback_notify(wmap, it++);
 										}
 									}
 								}
-												atomic.hpp: add atomic wait mask support

											
										
										
											2019-09-12 14:43:26 +02:00
+								void atomic_storage_futex::wait(const void* data, std::size_t size, u64 old_value, u64 timeout, u64 mask)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+								{
-												atomic.hpp: add timeout support

											
										
										
											2019-09-09 01:42:05 +02:00
+									if (!timeout)
 									{
 										return;
 									}
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+									const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
-												atomic.hpp: increase hashtable capacity

Double size and ignore 2 lowest bits (effectively x8)

											
										
										
											2019-09-12 16:14:26 +02:00
+									atomic_t<u64>& entry = s_hashtable[(iptr >> 2) % s_hashtable_size];
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
 									u32 new_value = 0;
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+									bool fallback = false;
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+									u32 sema_id = -1;
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+									const auto [_, ok] = entry.fetch_op([&](u64& value)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+									{
-												Change bits of waitable atomics

Reduce max waiter count to 2^14.
Refactor code to use waiter_mask and signal_mask constants.

											
										
										
											2019-07-29 19:28:20 +02:00
+										if ((value & s_waiter_mask) == s_waiter_mask || (value & s_signal_mask) == s_signal_mask)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										{
 											// Return immediately on waiter overflow or signal overflow
 											return false;
 										}
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								#ifndef USE_FUTEX
 										sema_id = (value & s_sema_mask) >> 2;
 								#endif
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+										if (!value || (value & s_pointer_mask) == (iptr & s_pointer_mask))
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										{
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+											// Store pointer bits
 											value |= (iptr & s_pointer_mask);
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+											fallback = false;
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										}
 										else
 										{
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+											// Set collision bit
 											value |= s_collision_bit;
 											fallback = true;
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										}
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+										// Add waiter
 										value += s_waiter_mask & -s_waiter_mask;
 										new_value = static_cast<u32>(value);
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										return true;
 									});
 									if (!ok)
 									{
 										return;
 									}
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								#ifndef USE_FUTEX
 									for (u32 loop_count = 0; !fallback && loop_count < 7; loop_count++)
 									{
 										// Try to allocate a semaphore
 										if (!sema_id)
 										{
 											const u32 sema = sema_alloc();
 											if (!sema)
 											{
 												break;
 											}
 											sema_id = entry.atomic_op([&](u64& value) -> u32
 											{
 												if (value & s_sema_mask)
 												{
 													return (value & s_sema_mask) >> 2;
 												}
 												// Insert allocated semaphore
 												value += s_signal_mask & -s_signal_mask;
 												value |= (u64{sema} << 2);
 												return 0;
 											});
 											if (sema_id)
 											{
 												// Drop unnecessary allocation
 												sema_free(sema);
 											}
 											else
 											{
 												sema_id = sema;
 												break;
 											}
 										}
 										if (!sema_get(sema_id))
 										{
 											sema_id = 0;
 											continue;
 										}
 										// Try to increment sig (check semaphore validity)
 										const auto [_old, ok] = entry.fetch_op([&](u64& value)
 										{
 											if ((value & s_signal_mask) == s_signal_mask)
 											{
 												return false;
 											}
 											if ((value & s_sema_mask) >> 2 != sema_id)
 											{
 												return false;
 											}
 											value += s_signal_mask & -s_signal_mask;
 											return true;
 										});
 										if (!ok)
 										{
 											sema_free(sema_id);
 											sema_id = 0;
 											if ((_old & s_signal_mask) == s_signal_mask)
 											{
 												// Break on signal overflow
 												break;
 											}
 											continue;
 										}
 										break;
 									}
 								#endif
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+									if (fallback)
 									{
-												atomic.hpp: add atomic wait mask support

											
										
										
											2019-09-12 14:43:26 +02:00
+										fallback_wait(data, size, old_value, timeout, mask);
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+									}
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+									else if (sema_id && ptr_cmp(data, size, old_value, mask) && s_tls_wait_cb(data))
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+									{
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								#ifndef USE_FUTEX
 								#if defined(_WIN32) && !defined(USE_POSIX)
-												atomic.hpp: add timeout support

											
										
										
											2019-09-09 01:42:05 +02:00
+										LARGE_INTEGER qw;
 										qw.QuadPart = -static_cast<s64>(timeout / 100);
 										if (timeout % 100)
 										{
 											// Round up to closest 100ns unit
 											qw.QuadPart -= 1;
 										}
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+										if (!NtWaitForSingleObject(s_sema_list[sema_id], false, timeout + 1 ? &qw : nullptr))
-												atomic.hpp: add timeout support

											
										
										
											2019-09-09 01:42:05 +02:00
+										{
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+											fallback = true;
 										}
 								#elif defined(USE_POSIX)
 										struct timespec ts;
 										clock_gettime(CLOCK_REALTIME, &ts);
 										ts.tv_sec  += timeout / 1'000'000'000;
 										ts.tv_nsec += timeout % 1'000'000'000;
 										ts.tv_sec  += ts.tv_nsec / 1'000'000'000;
 										ts.tv_nsec %= 1'000'000'000;
 										// It's pretty unreliable because it uses absolute time, which may jump backwards. Sigh.
 										if (timeout + 1)
 										{
 											if (sem_timedwait(&s_sema_list[sema_id], &ts) == 0)
 											{
 												fallback = true;
 											}
 										}
 										else
 										{
 											if (sem_wait(&s_sema_list[sema_id]) == 0)
 											{
 												fallback = true;
 											}
 										}
 								#else
 										dumb_sema& sema = *s_sema_list[sema_id];
 										std::unique_lock lock(sema.mutex);
 										if (timeout + 1)
 										{
 											sema.cond.wait_for(lock, std::chrono::nanoseconds(timeout), [&]
 											{
 												return sema.count > 0;
 											});
 										}
 										else
 										{
 											sema.cond.wait(lock, [&]
 											{
 												return sema.count > 0;
 											});
-												atomic.hpp: add timeout support

											
										
										
											2019-09-09 01:42:05 +02:00
+										}
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
 										if (sema.count > 0)
 										{
 											sema.count--;
 											fallback = true;
 										}
 								#endif
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+								#else
-												atomic.hpp: add timeout support

											
										
										
											2019-09-09 01:42:05 +02:00
+										struct timespec ts;
 										ts.tv_sec  = timeout / 1'000'000'000;
 										ts.tv_nsec = timeout % 1'000'000'000;
 										futex(reinterpret_cast<char*>(&entry) + 4 * IS_BE_MACHINE, FUTEX_WAIT_PRIVATE, new_value, timeout + 1 ? &ts : nullptr);
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+								#endif
 									}
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+									if (!sema_id)
 									{
 										fallback = true;
 									}
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+									while (true)
 									{
 										// Try to decrement
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+										const auto [prev, ok] = entry.fetch_op([&](u64& value)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										{
-												Change bits of waitable atomics

Reduce max waiter count to 2^14.
Refactor code to use waiter_mask and signal_mask constants.

											
										
										
											2019-07-29 19:28:20 +02:00
+											if (value & s_waiter_mask)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+											{
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								#ifndef USE_FUTEX
 												// If timeout
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+												if (!fallback)
 												{
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+													if ((value & s_signal_mask) == 0 || (value & s_sema_mask) >> 2 != sema_id)
-												atomic.hpp: fixup for atomic waiting

											
										
										
											2019-09-21 13:02:13 +02:00
+													{
 														return false;
 													}
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+													value -= s_signal_mask & -s_signal_mask;
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
 													if ((value & s_signal_mask) == 0)
 													{
 														value &= ~s_sema_mask;
 													}
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+												}
 								#endif
-												atomic.hpp: fixup for atomic waiting

											
										
										
											2019-09-21 13:02:13 +02:00
+												value -= s_waiter_mask & -s_waiter_mask;
-												Change bits of waitable atomics

Reduce max waiter count to 2^14.
Refactor code to use waiter_mask and signal_mask constants.

											
										
										
											2019-07-29 19:28:20 +02:00
+												if ((value & s_waiter_mask) == 0)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+												{
 													// Reset on last waiter
 													value = 0;
 												}
 												return true;
 											}
 											return false;
 										});
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+										if (ok || fallback)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										{
 											break;
 										}
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								#ifndef USE_FUTEX
 								#if defined(_WIN32) && !defined(USE_POSIX)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										static LARGE_INTEGER instant{};
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+										if (!NtWaitForSingleObject(s_sema_list[sema_id], false, &instant))
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										{
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+											fallback = true;
 										}
 								#elif defined(USE_POSIX)
 										if (sem_trywait(&s_sema_list[sema_id]) == 0)
 										{
 											fallback = true;
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										}
 								#else
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+										dumb_sema& sema = *s_sema_list[sema_id];
 										std::unique_lock lock(sema.mutex);
 										if (sema.count > 0)
 										{
 											sema.count--;
 											fallback = true;
 										}
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+								#endif
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								#endif
 									}
 								#ifndef USE_FUTEX
 									if (sema_id)
 									{
 										sema_free(sema_id);
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+									}
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								#endif
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
 									s_tls_wait_cb(nullptr);
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+								}
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
+								#ifdef USE_FUTEX
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+								void atomic_storage_futex::notify_one(const void* data)
 								{
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+									const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
-												atomic.hpp: increase hashtable capacity

Double size and ignore 2 lowest bits (effectively x8)

											
										
										
											2019-09-12 16:14:26 +02:00
+									atomic_t<u64>& entry = s_hashtable[(iptr >> 2) % s_hashtable_size];
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+									const auto [prev, ok] = entry.fetch_op([&](u64& value)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+									{
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+										if (value & s_waiter_mask && (value & s_pointer_mask) == (iptr & s_pointer_mask))
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										{
-												Change bits of waitable atomics

Reduce max waiter count to 2^14.
Refactor code to use waiter_mask and signal_mask constants.

											
										
										
											2019-07-29 19:28:20 +02:00
+											if ((value & s_signal_mask) == s_signal_mask)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+											{
 												// Signal overflow, do nothing
 												return false;
 											}
-												Change bits of waitable atomics

Reduce max waiter count to 2^14.
Refactor code to use waiter_mask and signal_mask constants.

											
										
										
											2019-07-29 19:28:20 +02:00
+											value += s_signal_mask & -s_signal_mask;
-												atomic.hpp: fix signal saturation logic

Make sure to notify_all at max signal count.

											
										
										
											2019-09-09 04:28:02 +02:00
 											if ((value & s_signal_mask) == s_signal_mask)
 											{
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+												// Signal will overflow, fallback to notify_all
 												notify_all(data);
-												atomic.hpp: fix signal saturation logic

Make sure to notify_all at max signal count.

											
										
										
											2019-09-09 04:28:02 +02:00
+												return false;
 											}
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+											return true;
 										}
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+										else if (value & s_waiter_mask && value & s_collision_bit)
-												atomic.hpp: fix signal saturation logic

Make sure to notify_all at max signal count.

											
										
										
											2019-09-09 04:28:02 +02:00
+										{
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+											fallback_notify_one(data);
 											return false;
-												atomic.hpp: fix signal saturation logic

Make sure to notify_all at max signal count.

											
										
										
											2019-09-09 04:28:02 +02:00
+										}
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										return false;
 									});
 									if (ok)
 									{
 										futex(reinterpret_cast<char*>(&entry) + 4 * IS_BE_MACHINE, FUTEX_WAKE_PRIVATE, 1);
 									}
 								}
 								void atomic_storage_futex::notify_all(const void* data)
 								{
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+									const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
-												atomic.hpp: increase hashtable capacity

Double size and ignore 2 lowest bits (effectively x8)

											
										
										
											2019-09-12 16:14:26 +02:00
+									atomic_t<u64>& entry = s_hashtable[(iptr >> 2) % s_hashtable_size];
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+									const auto [_, ok] = entry.fetch_op([&](u64& value)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+									{
-												Change bits of waitable atomics

Reduce max waiter count to 2^14.
Refactor code to use waiter_mask and signal_mask constants.

											
										
										
											2019-07-29 19:28:20 +02:00
+										if (value & s_waiter_mask)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										{
-												Change bits of waitable atomics

Reduce max waiter count to 2^14.
Refactor code to use waiter_mask and signal_mask constants.

											
										
										
											2019-07-29 19:28:20 +02:00
+											if ((value & s_signal_mask) == s_signal_mask)
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+											{
 												// Signal overflow, do nothing
 												return false;
 											}
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
+											if ((value & s_pointer_mask) == (iptr & s_pointer_mask))
-												atomic.hpp: optimize internal logic

Move waiter count to highest bits to prevent false futex wakeups.
Test pointer bits properly in notify_all to avoid false wakeups.

											
										
										
											2019-09-09 03:32:30 +02:00
+											{
 												value += s_signal_mask & -s_signal_mask;
 												return true;
 											}
-												atomic.hpp: implement collision fallback properly

Should prevent devastating effect of collisions

											
										
										
											2019-09-09 11:28:21 +02:00
 											if (value & s_collision_bit)
 											{
 												fallback_notify_all(data);
 												return false;
 											}
-												Implement waitable atomics

Moved Atomic.h to util/atomic.hpp
List source files in CMakeLists.txt

											
										
										
											2019-07-27 00:34:10 +02:00
+										}
 										return false;
 									});
 									if (ok)
 									{
 										futex(reinterpret_cast<char*>(&entry) + 4 * IS_BE_MACHINE, FUTEX_WAKE_PRIVATE, 0x7fffffff);
 									}
 								}
-												Add fallback implementation for waitable atomics

May improve perf on OSX/BSD

											
										
										
											2019-08-02 00:23:26 +02:00
 								#endif
-												atomic.hpp: implement wait callback interface

Will be used to wake up threads uniformly.

											
										
										
											2019-09-08 21:48:26 +02:00
 								void atomic_storage_futex::set_wait_callback(bool(*cb)(const void* data))
 								{
 									if (cb)
 									{
 										s_tls_wait_cb = cb;
 									}
 								}
 								void atomic_storage_futex::raw_notify(const void* data)
 								{
 									if (data)
 									{
 										notify_all(data);
 									}
 								}
-												atomic.hpp: use native semaphores on Windows

Windows: drop keyed events
Linux: keep using native futex
Implement unused POSIX semaphore path
Implement fallback semaphore with pure std (OSX, BSD, etc)

											
										
										
											2019-09-22 00:31:23 +02:00
 								#ifndef USE_FUTEX
 								void atomic_storage_futex::notify_one(const void* data)
 								{
 									const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
 									atomic_t<u64>& entry = s_hashtable[(iptr >> 2) % s_hashtable_size];
 									const u64 value = entry;
 									if (value & s_waiter_mask && (value & s_pointer_mask) == (iptr & s_pointer_mask))
 									{
 										if ((value & s_signal_mask) == 0 || (value & s_sema_mask) == 0)
 										{
 											// No relevant waiters, do nothing
 											return;
 										}
 									}
 									else if (value & s_waiter_mask && value & s_collision_bit)
 									{
 										fallback_notify_one(data);
 										return;
 									}
 									else
 									{
 										return;
 									}
 									const u32 sema_id = (value & s_sema_mask) >> 2;
 									if (!sema_get(sema_id))
 									{
 										return;
 									}
 									const auto [_, ok] = entry.fetch_op([&](u64& value)
 									{
 										if ((value & s_waiter_mask) == 0 || (value & s_pointer_mask) != (iptr & s_pointer_mask))
 										{
 											return false;
 										}
 										if ((value & s_signal_mask) == 0 || (value & s_sema_mask) >> 2 != sema_id)
 										{
 											return false;
 										}
 										value -= s_signal_mask & -s_signal_mask;
 										// Reset allocated semaphore on last waiter
 										if ((value & s_signal_mask) == 0)
 										{
 											value &= ~s_sema_mask;
 										}
 										return true;
 									});
 									if (ok)
 									{
 								#ifdef USE_POSIX
 										sem_post(&s_sema_list[sema_id]);
 								#elif defined(_WIN32)
 										ReleaseSemaphore(s_sema_list[sema_id], 1, nullptr);
 								#else
 										dumb_sema& sema = *s_sema_list[sema_id];
 										sema.mutex.lock();
 										sema.count += 1;
 										sema.mutex.unlock();
 										sema.cond.notify_one();
 								#endif
 									}
 									sema_free(sema_id);
 								}
 								void atomic_storage_futex::notify_all(const void* data)
 								{
 									const std::uintptr_t iptr = reinterpret_cast<std::uintptr_t>(data);
 									atomic_t<u64>& entry = s_hashtable[(iptr >> 2) % s_hashtable_size];
 									const u64 value = entry;
 									if (value & s_waiter_mask && (value & s_pointer_mask) == (iptr & s_pointer_mask))
 									{
 										if ((value & s_signal_mask) == 0 || (value & s_sema_mask) == 0)
 										{
 											// No relevant waiters, do nothing
 											return;
 										}
 									}
 									else if (value & s_waiter_mask && value & s_collision_bit)
 									{
 										fallback_notify_all(data);
 										return;
 									}
 									else
 									{
 										return;
 									}
 									const u32 sema_id = (value & s_sema_mask) >> 2;
 									if (!sema_get(sema_id))
 									{
 										return;
 									}
 									const auto [_, count] = entry.fetch_op([&](u64& value) -> u32
 									{
 										if ((value & s_waiter_mask) == 0 || (value & s_pointer_mask) != (iptr & s_pointer_mask))
 										{
 											return 0;
 										}
 										if ((value & s_signal_mask) == 0 || (value & s_sema_mask) >> 2 != sema_id)
 										{
 											return 0;
 										}
 										const u32 r = (value & s_signal_mask) / (s_signal_mask & -s_signal_mask);
 										value &= ~s_sema_mask;
 										value &= ~s_signal_mask;
 										return r;
 									});
 								#ifdef USE_POSIX
 									for (u32 i = 0; i < count; i++)
 									{
 										sem_post(&s_sema_list[sema_id]);
 									}
 								#elif defined(_WIN32)
 									if (count)
 									{
 										ReleaseSemaphore(s_sema_list[sema_id], count, nullptr);
 									}
 								#else
 									if (count)
 									{
 										dumb_sema& sema = *s_sema_list[sema_id];
 										sema.mutex.lock();
 										sema.count += count;
 										sema.mutex.unlock();
 										sema.cond.notify_all();
 									}
 								#endif
 									sema_free(sema_id);
 								}
 								#endif