SPU: multithread compilation

Allow parallel compilation of SPU code, both at startup and runtime
Remove 'SPU Shared Runtime' option (it became obsolete)
Refactor spu_runtime class (now common to both ASMJIT and LLVM)
Implement SPU ubertrampoline generation in raw assembly (LLVM)
Minor improvement of balanced_wait_until<> and balanced_awaken<>
Make JIT MemoryManager2 shared (global)
Fix wrong assertion in cond_variable
This commit is contained in:
Nekotekina 2019-01-21 21:04:32 +03:00
parent 8d5d44141e
commit 4f152ad126
9 changed files with 503 additions and 394 deletions

View file

@@ -95,6 +95,12 @@ static void* const s_memory = []() -> void*
return utils::memory_reserve(s_memory_size);
}();
// Reserve 2G of memory, should replace previous area for ASLR compatibility
static void* const s_memory2 = utils::memory_reserve(0x80000000);
static u64 s_code_pos = 0;
static u64 s_data_pos = 0;
static void* s_next = s_memory;
#ifdef _WIN32
@@ -129,6 +135,11 @@ extern void jit_finalize()
utils::memory_decommit(s_memory, s_memory_size);
s_next = s_memory;
utils::memory_decommit(s_memory2, 0x80000000);
s_code_pos = 0;
s_data_pos = 0;
}
// Helper class
@@ -311,24 +322,25 @@ struct MemoryManager : llvm::RTDyldMemoryManager
// Simple memory manager
struct MemoryManager2 : llvm::RTDyldMemoryManager
{
// Reserve 2 GiB
void* const m_memory = utils::memory_reserve(0x80000000);
// Patchwork again...
void* const m_memory = s_memory2;
u8* const m_code = static_cast<u8*>(m_memory) + 0x00000000;
u8* const m_data = static_cast<u8*>(m_memory) + 0x40000000;
u64 m_code_pos = 0;
u64 m_data_pos = 0;
u64& m_code_pos = s_code_pos;
u64& m_data_pos = s_data_pos;
MemoryManager2() = default;
~MemoryManager2() override
{
utils::memory_release(m_memory, 0x80000000);
}
u8* allocateCodeSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name) override
{
std::lock_guard lock(s_mutex);
// Simple allocation
const u64 old = m_code_pos;
const u64 pos = ::align(m_code_pos, align);
@@ -349,12 +361,20 @@ struct MemoryManager2 : llvm::RTDyldMemoryManager
utils::memory_commit(m_code + olda, newa - olda, utils::protection::wx);
}
if (!sec_id && sec_name.empty())
{
// Special case: don't log
return m_code + pos;
}
LOG_NOTICE(GENERAL, "LLVM: Code section %u '%s' allocated -> %p (size=0x%x, align=0x%x)", sec_id, sec_name.data(), m_code + pos, size, align);
return m_code + pos;
}
u8* allocateDataSection(std::uintptr_t size, uint align, uint sec_id, llvm::StringRef sec_name, bool is_ro) override
{
std::lock_guard lock(s_mutex);
// Simple allocation
const u64 old = m_data_pos;
const u64 pos = ::align(m_data_pos, align);
@@ -642,33 +662,12 @@ u64 jit_compiler::get(const std::string& name)
return m_engine->getGlobalValueAddress(name);
}
std::unordered_map<std::string, u64> jit_compiler::add(std::unordered_map<std::string, std::string> data)
u8* jit_compiler::alloc(u32 size)
{
// Lock memory manager
std::lock_guard lock(s_mutex);
// Dummy memory manager object
MemoryManager2 mm;
std::unordered_map<std::string, u64> result;
std::size_t size = 0;
for (auto&& pair : data)
{
size += ::align(pair.second.size(), 16);
}
utils::memory_commit(s_next, size, utils::protection::wx);
std::memset(s_next, 0xc3, ::align(size, 4096));
for (auto&& pair : data)
{
std::memcpy(s_next, pair.second.data(), pair.second.size());
result.emplace(pair.first, (u64)s_next);
s_next = (void*)::align((u64)s_next + pair.second.size(), 16);
}
s_next = (void*)::align((u64)s_next, 4096);
return result;
return mm.allocateCodeSection(size, 16, 0, {});
}
#endif

View file

@@ -61,6 +61,7 @@ FT build_function_asm(F&& builder)
#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
#include "types.h"
@@ -129,8 +130,8 @@ public:
// Get compiled function address
u64 get(const std::string& name);
// Add functions directly to the memory manager (name -> code)
static std::unordered_map<std::string, u64> add(std::unordered_map<std::string, std::string>);
// Allocate writable executable memory (alignment is assumed 16)
static u8* alloc(u32 size);
// Get CPU info
static std::string cpu(const std::string& _cpu);

View file

@@ -10,7 +10,7 @@
bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
{
verify("cond_variable overflow" HERE), (_old & 0xffff) == 0; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
verify("cond_variable overflow" HERE), (_old & 0xffff) != 0xffff; // Very unlikely: it requires 65535 distinct threads to wait simultaneously
return balanced_wait_until(m_value, _timeout, [&](u32& value, auto... ret) -> int
{
@@ -42,7 +42,8 @@ bool cond_variable::imp_wait(u32 _old, u64 _timeout) noexcept
void cond_variable::imp_wake(u32 _count) noexcept
{
balanced_awaken(m_value, m_value.atomic_op([&](u32& value) -> u32
// TODO (notify_one)
balanced_awaken<true>(m_value, m_value.atomic_op([&](u32& value) -> u32
{
// Subtract already signaled number from total amount of waiters
const u32 can_sig = (value & 0xffff) - (value >> 16);
@@ -266,7 +267,7 @@ void cond_x16::imp_notify() noexcept
return;
}
balanced_awaken(m_cvx16, utils::popcnt16(wait_mask));
balanced_awaken<true>(m_cvx16, utils::popcnt16(wait_mask));
}
bool lf_queue_base::wait(u64 _timeout)

View file

@@ -186,7 +186,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
{
if (OptWaitOnAddress(&var, &value, sizeof(T), is_inf ? INFINITE : usec_timeout / 1000))
{
if (!test_pred(value) && !test_pred(value, nullptr))
if (!test_pred(value, nullptr))
{
return false;
}
@@ -220,7 +220,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
return true;
}
if (!test_pred(value) && !test_pred(value, nullptr))
if (!test_pred(value, nullptr))
{
// Stolen notification: restore balance
NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
@@ -237,7 +237,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
{
if (futex(&var, FUTEX_WAIT_PRIVATE, static_cast<u32>(value), is_inf ? nullptr : &timeout) == 0)
{
if (!test_pred(value) && !test_pred(value, nullptr))
if (!test_pred(value, nullptr))
{
return false;
}
@@ -257,7 +257,7 @@ bool balanced_wait_until(atomic_t<T>& var, u64 usec_timeout, Pred&& pred)
#endif
}
template <typename T>
template <bool All = false, typename T>
void balanced_awaken(atomic_t<T>& var, u32 weight)
{
static_assert(sizeof(T) == 4 || sizeof(T) == 8);
@@ -265,11 +265,13 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
#ifdef _WIN32
if (OptWaitOnAddress)
{
if (weight > 1)
if (All || weight > 3)
{
OptWakeByAddressAll(&var);
return;
}
else if (weight == 1)
for (u32 i = 0; i < weight; i++)
{
OptWakeByAddressSingle(&var);
}
@@ -282,9 +284,9 @@ void balanced_awaken(atomic_t<T>& var, u32 weight)
NtReleaseKeyedEvent(nullptr, &var, false, nullptr);
}
#else
if (weight)
if (All || weight)
{
futex(&var, FUTEX_WAKE_PRIVATE, std::min<u32>(INT_MAX, weight));
futex(&var, FUTEX_WAKE_PRIVATE, All ? INT_MAX : std::min<u32>(INT_MAX, weight));
}
return;