SPU: multithread compilation

Allow parallel compilation of SPU code, both at startup and runtime
Remove 'SPU Shared Runtime' option (it became obsolete)
Refactor spu_runtime class (now common to both ASMJIT and LLVM)
Implement SPU ubertrampoline generation in raw assembly (LLVM)
Minor improvement of balanced_wait_until<> and balanced_awaken<>
Make JIT MemoryManager2 shared (global)
Fix wrong assertion in cond_variable
This commit is contained in:
Nekotekina 2019-01-21 21:04:32 +03:00
parent 8d5d44141e
commit 4f152ad126
9 changed files with 503 additions and 394 deletions

View file

@@ -95,6 +95,12 @@ static void* const s_memory = []() -> void*
return utils::memory_reserve(s_memory_size);
}();
// Reserve 2G of memory, should replace previous area for ASLR compatibility
static void* const s_memory2 = utils::memory_reserve(0x80000000);
static u64 s_code_pos = 0;
static u64 s_data_pos = 0;
static void* s_next = s_memory;
#ifdef _WIN32
@@ -129,6 +135,11 @@ extern void jit_finalize()
utils::memory_decommit(s_memory, s_memory_size);
s_next = s_memory;
utils::memory_decommit(s_memory2, 0x80000000);
s_code_pos = 0;
s_data_pos = 0;
}
// Helper class
@@ -311,24 +322,25 @@ struct MemoryManager : llvm::RTDyldMemoryManager
// Simple memory manager
struct MemoryManager2 : llvm::RTDyldMemoryManager
{
// Reserve 2 GiB
void* const m_memory = utils::memory_reserve(0x80000000);
// Patchwork again...
void* const m_memory = s_memory2;
u8* const m_code = static_cast<u8*>(m_memory) + 0x00000000;
u8* const m_data = static_cast<u8*>(m_memory) + 0x40000000;
u64 m_code_pos = 0;
u64 m_data_pos = 0;
u64& m_code_pos = s_code_pos;
u64& m_data_pos = s_data_pos;
MemoryManager2() = default;
~MemoryManager2() override
{
utils::memory_release(m_memory, 0x80000000);
}
u8* allocateCodeSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name) override
{
std::lock_guard lock(s_mutex);
// Simple allocation
const u64 old = m_code_pos;
const u64 pos = ::align(m_code_pos, align);
@@ -349,12 +361,20 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
utils::memory_commit(m_code + olda, newa - olda, utils::protection::wx);
}
if (!sec_id && sec_name.empty())
{
// Special case: don't log
return m_code + pos;
}
LOG_NOTICE(GENERAL, "LLVM: Code section %u '%s' allocated -> %p (size=0x%x, align=0x%x)", sec_id, sec_name.data(), m_code + pos, size, align);
return m_code + pos;
}
u8* allocateDataSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name, bool is_ro) override
{
std::lock_guard lock(s_mutex);
// Simple allocation
const u64 old = m_data_pos;
const u64 pos = ::align(m_data_pos, align);
@@ -642,33 +662,12 @@ u64 jit_compiler::get(const std::string& name)
return m_engine->getGlobalValueAddress(name);
}
std::unordered_map<std::string, u64> jit_compiler::add(std::unordered_map<std::string, std::string> data)
u8* jit_compiler::alloc(u32 size)
{
// Lock memory manager
std::lock_guard lock(s_mutex);
// Dummy memory manager object
MemoryManager2 mm;
std::unordered_map<std::string, u64> result;
std::size_t size = 0;
for (auto&& pair : data)
{
size += ::align(pair.second.size(), 16);
}
utils::memory_commit(s_next, size, utils::protection::wx);
std::memset(s_next, 0xc3, ::align(size, 4096));
for (auto&& pair : data)
{
std::memcpy(s_next, pair.second.data(), pair.second.size());
result.emplace(pair.first, (u64)s_next);
s_next = (void*)::align((u64)s_next + pair.second.size(), 16);
}
s_next = (void*)::align((u64)s_next, 4096);
return result;
return mm.allocateCodeSection(size, 16, 0, {});
}
#endif

View file

@@ -61,6 +61,7 @@ FT build_function_asm(F&& builder)
#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
#include "types.h"
@@ -129,8 +130,8 @@ public:
// Get compiled function address
u64 get(const std::string& name);
// Add functions directly to the memory manager (name -> code)
static std::unordered_map<std::string, u64> add(std::unordered_map<std::string, std::string>);
// Allocate writable executable memory (alignment is assumed 16)
static u8* alloc(u32 size);
// Get CPU info
static std::string cpu(const std::string& _cpu);

View file

@@ -10,7 +10,7 @@
bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
{
verify("cond_variable overflow" HERE), (_old & 0xffff) == 0; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
verify("cond_variable overflow" HERE), (_old & 0xffff) != 0xffff; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
return balanced_wait_until(m_value, _timeout, [&](u32& value, auto... ret) -> int
{
@@ -42,7 +42,8 @@ bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
void cond_variable::imp_wake(u32 _count) noexcept
{
balanced_awaken(m_value, m_value.atomic_op([&](u32& value) -> u32
// TODO (notify_one)
balanced_awaken<true>(m_value, m_value.atomic_op([&](u32& value) -> u32
{
// Subtract already signaled number from total amount of waiters
const u32 can_sig = (value & 0xffff) - (value >> 16);
@@ -266,7 +267,7 @@ void cond_x16::imp_notify() noexcept
return;
}
balanced_awaken(m_cvx16, utils::popcnt16(wait_mask));
balanced_awaken<true>(m_cvx16, utils::popcnt16(wait_mask));
}
bool lf_queue_base::wait(u64 _timeout)

View file

@@ -186,7 +186,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
{
if (OptWaitOnAddress(&var, &value, sizeof(T), is_inf ? INFINITE : usec_timeout / 1000))
{
if (!test_pred(value) && !test_pred(value, nullptr))
if (!test_pred(value, nullptr))
{
return false;
}
@@ -220,7 +220,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
return true;
}
if (!test_pred(value) && !test_pred(value, nullptr))
if (!test_pred(value, nullptr))
{
// Stolen notification: restore balance
NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
@@ -237,7 +237,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
{
if (futex(&var, FUTEX_WAIT_PRIVATE, static_cast<u32>(value), is_inf ? nullptr : &timeout) == 0)
{
if (!test_pred(value) && !test_pred(value, nullptr))
if (!test_pred(value, nullptr))
{
return false;
}
@@ -257,7 +257,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
#endif
}
template <typename T>
template <bool All = false, typename T>
void balanced_awaken(atomic_t<T>& var, u32 weight)
{
static_assert(sizeof(T) == 4 || sizeof(T) == 8);
@@ -265,11 +265,13 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
#ifdef _WIN32
if (OptWaitOnAddress)
{
if (weight > 1)
if (All || weight > 3)
{
OptWakeByAddressAll(&var);
return;
}
else if (weight == 1)
for (u32 i = 0; i < weight; i++)
{
OptWakeByAddressSingle(&var);
}
@@ -282,9 +284,9 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
}
#else
if (weight)
if (All || weight)
{
futex(&var, FUTEX_WAKE_PRIVATE, std::min<u32>(INT_MAX, weight));
futex(&var, FUTEX_WAKE_PRIVATE, All ? INT_MAX : std::min<u32>(INT_MAX, weight));
}
return;