Initial Linux Aarch64 support

* Update asmjit dependency (aarch64 branch)
* Disable USE_DISCORD_RPC by default
* Dump some JIT objects in rpcs3 cache dir
* Add SIGILL handler for all platforms
* Fix resetting zeroing denormals in thread pool
* Refactor most v128:: utils into global gv_** functions
* Refactor PPU interpreter (incomplete), remove "precise"
* - Instruction specializations with multiple accuracy flags
* - Adjust calling convention for speed
* - Removed precise/fast setting, replaced with static
* - Started refactoring interpreters for building at runtime JIT
*   (I got tired of poor compiler optimizations)
* - Expose some accuracy settings (SAT, NJ, VNAN, FPCC)
* - Add exec_bytes PPU thread variable (akin to cycle count)
* PPU LLVM: fix VCTUXS+VCTSXS instruction NaN results
* SPU interpreter: remove "precise" for now (extremely non-portable)
* - As with PPU, settings changed to static/dynamic for interpreters.
* - Precise options will be implemented later
* Fix termination after fatal error dialog
This commit is contained in:
Nekotekina 2021-12-30 19:39:18 +03:00
parent d6aa834b5f
commit 580bd2b25e
89 changed files with 20360 additions and 5612 deletions

View file

@ -18,6 +18,12 @@ LOG_CHANNEL(jit_log, "JIT");
void jit_announce(uptr func, usz size, std::string_view name)
{
if (!size)
{
jit_log.error("Empty function announced: %s (%p)", name, func);
return;
}
#ifdef __linux__
static const fs::file s_map(fmt::format("/tmp/perf-%d.map", getpid()), fs::rewrite + fs::append);
@ -124,15 +130,31 @@ void* jit_runtime_base::_add(asmjit::CodeHolder* code) noexcept
{
ensure(!code->flatten());
ensure(!code->resolveUnresolvedLinks());
usz codeSize = ensure(code->codeSize());
usz codeSize = code->codeSize();
if (!codeSize)
return nullptr;
auto p = ensure(this->_alloc(codeSize, 64));
ensure(!code->relocateToBase(uptr(p)));
asmjit::VirtMem::ProtectJitReadWriteScope rwScope(p, codeSize);
for (asmjit::Section* section : code->_sections)
{
std::memcpy(p + section->offset(), section->data(), section->bufferSize());
asmjit::VirtMem::ProtectJitReadWriteScope rwScope(p, codeSize);
for (asmjit::Section* section : code->_sections)
{
std::memcpy(p + section->offset(), section->data(), section->bufferSize());
}
}
if (!dump_name.empty())
{
// If directory ASMJIT doesn't exist, nothing will be written
fs::file dump(fmt::format("%s/ASMJIT/%s", fs::get_cache_dir(), dump_name), fs::rewrite);
if (dump)
{
dump.write(p, codeSize);
}
}
return p;
@ -349,8 +371,9 @@ static u64 make_null_function(const std::string& name)
using namespace asmjit;
// Build a "null" function that contains its name
const auto func = build_function_asm<void (*)()>("NULL", [&](x86::Assembler& c, auto& args)
const auto func = build_function_asm<void (*)()>("NULL", [&](native_asm& c, auto& args)
{
#if defined(ARCH_X64)
Label data = c.newLabel();
c.lea(args[0], x86::qword_ptr(data, 0));
c.jmp(Imm(&null));
@ -362,6 +385,7 @@ static u64 make_null_function(const std::string& name)
c.db(ch);
c.db(0);
c.align(AlignMode::kData, 16);
#endif
});
func_ptr = reinterpret_cast<u64>(func);

View file

@ -22,10 +22,17 @@
#pragma GCC diagnostic ignored "-Wredundant-decls"
#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
#pragma GCC diagnostic ignored "-Weffc++"
#ifndef __clang__
#ifdef __clang__
#pragma GCC diagnostic ignored "-Wdeprecated-anon-enum-enum-conversion"
#pragma GCC diagnostic ignored "-Wcast-qual"
#else
#pragma GCC diagnostic ignored "-Wduplicated-branches"
#pragma GCC diagnostic ignored "-Wdeprecated-enum-enum-conversion"
#endif
#include <asmjit/asmjit.h>
#if defined(ARCH_ARM64)
#include <asmjit/a64.h>
#endif
#pragma GCC diagnostic pop
#endif
@ -36,6 +43,14 @@
#include <string_view>
#include <unordered_map>
#if defined(ARCH_X64)
using native_asm = asmjit::x86::Assembler;
using native_args = std::array<asmjit::x86::Gp, 4>;
#elif defined(ARCH_ARM64)
using native_asm = asmjit::a64::Assembler;
using native_args = std::array<asmjit::a64::Gp, 4>;
#endif
void jit_announce(uptr func, usz size, std::string_view name);
void jit_announce(auto* func, usz size, std::string_view name)
@ -62,6 +77,8 @@ struct jit_runtime_base
const asmjit::Environment& environment() const noexcept;
void* _add(asmjit::CodeHolder* code) noexcept;
virtual uchar* _alloc(usz size, usz align) noexcept = 0;
std::string_view dump_name;
};
// ASMJIT runtime for emitting code in a single 2G region
@ -167,11 +184,39 @@ namespace asmjit
}
}
inline void build_init_args_from_ghc(native_asm& c, native_args& args)
{
#if defined(ARCH_X64)
// TODO: handle case when args don't overlap with r13/rbp/r12/rbx
c.mov(args[0], x86::r13);
c.mov(args[1], x86::rbp);
c.mov(args[2], x86::r12);
c.mov(args[3], x86::rbx);
#else
static_cast<void>(c);
static_cast<void>(args);
#endif
}
inline void build_init_ghc_args(native_asm& c, native_args& args)
{
#if defined(ARCH_X64)
// TODO: handle case when args don't overlap with r13/rbp/r12/rbx
c.mov(x86::r13, args[0]);
c.mov(x86::rbp, args[1]);
c.mov(x86::r12, args[2]);
c.mov(x86::rbx, args[3]);
#else
static_cast<void>(c);
static_cast<void>(args);
#endif
}
using imm_ptr = Imm;
}
// Build runtime function with asmjit::X86Assembler
template <typename FT, typename F>
template <typename FT, typename Asm = native_asm, typename F>
inline FT build_function_asm(std::string_view name, F&& builder)
{
using namespace asmjit;
@ -181,7 +226,8 @@ inline FT build_function_asm(std::string_view name, F&& builder)
CodeHolder code;
code.init(rt.environment());
std::array<x86::Gp, 4> args;
#if defined(ARCH_X64)
native_args args;
#ifdef _WIN32
args[0] = x86::rcx;
args[1] = x86::rdx;
@ -193,16 +239,27 @@ inline FT build_function_asm(std::string_view name, F&& builder)
args[2] = x86::rdx;
args[3] = x86::rcx;
#endif
#elif defined(ARCH_ARM64)
native_args args;
args[0] = a64::x0;
args[1] = a64::x1;
args[2] = a64::x2;
args[3] = a64::x3;
#endif
x86::Assembler compiler(&code);
Asm compiler(&code);
compiler.addEncodingOptions(EncodingOptions::kOptimizedAlign);
builder(std::ref(compiler), args);
if constexpr (std::is_invocable_v<F, Asm&, native_args&>)
builder(compiler, args);
else
builder(compiler);
rt.dump_name = name;
const auto result = rt._add(&code);
jit_announce(result, code.codeSize(), name);
return reinterpret_cast<FT>(uptr(result));
}
#ifdef __APPLE__
#if !defined(ARCH_X64) || defined(__APPLE__)
template <typename FT, usz = 4096>
class built_function
{
@ -213,9 +270,23 @@ public:
built_function& operator=(const built_function&) = delete;
template <typename F>
built_function(std::string_view name, F&& builder)
: m_func(ensure(build_function_asm<FT>(name, std::forward<F>(builder))))
template <typename F> requires (std::is_invocable_v<F, native_asm&, native_args&>)
built_function(std::string_view name, F&& builder,
u32 line = __builtin_LINE(),
u32 col = __builtin_COLUMN(),
const char* file = __builtin_FILE(),
const char* func = __builtin_FUNCTION())
: m_func(ensure(build_function_asm<FT>(name, std::forward<F>(builder)), const_str(), line, col, file, func))
{
}
template <typename F> requires (std::is_invocable_v<F>)
built_function(std::string_view, F&& getter,
u32 line = __builtin_LINE(),
u32 col = __builtin_COLUMN(),
const char* file = __builtin_FILE(),
const char* func = __builtin_FUNCTION())
: m_func(ensure(getter(), const_str(), line, col, file, func))
{
}
@ -251,7 +322,8 @@ public:
CodeHolder code;
code.init(rt.environment());
std::array<x86::Gp, 4> args;
#if defined(ARCH_X64)
native_args args;
#ifdef _WIN32
args[0] = x86::rcx;
args[1] = x86::rdx;
@ -263,10 +335,18 @@ public:
args[2] = x86::rdx;
args[3] = x86::rcx;
#endif
#elif defined(ARCH_ARM64)
native_args args;
args[0] = a64::x0;
args[1] = a64::x1;
args[2] = a64::x2;
args[3] = a64::x3;
#endif
x86::Assembler compiler(&code);
native_asm compiler(&code);
compiler.addEncodingOptions(EncodingOptions::kOptimizedAlign);
builder(std::ref(compiler), args);
builder(compiler, args);
rt.dump_name = name;
jit_announce(rt._add(&code), code.codeSize(), name);
}

View file

@ -239,7 +239,7 @@ struct fmt_class_string<T, void>
static void format(std::string& out, u64 arg)
{
const auto& obj = get_object(arg);
void format_byte_array(std::string&, const uchar*, usz);
format_byte_array(out, reinterpret_cast<const uchar*>(std::data(obj)), std::size(obj));
}

View file

@ -77,7 +77,7 @@
#include "util/logs.hpp"
#include "util/asm.hpp"
#include "util/v128.hpp"
#include "util/v128sse.hpp"
#include "util/simd.hpp"
#include "util/sysinfo.hpp"
#include "Emu/Memory/vm_locking.h"
@ -189,6 +189,7 @@ bool IsDebuggerPresent()
}
#endif
#if defined(ARCH_X64)
enum x64_reg_t : u32
{
X64R_RAX = 0,
@ -839,6 +840,7 @@ void decode_x64_reg_op(const u8* code, x64_op_t& out_op, x64_reg_t& out_reg, usz
#ifdef _WIN32
typedef CONTEXT x64_context;
typedef CONTEXT ucontext_t;
#define X64REG(context, reg) (&(&(context)->Rax)[reg])
#define XMMREG(context, reg) (reinterpret_cast<v128*>(&(&(context)->Xmm0)[reg]))
@ -1211,12 +1213,18 @@ usz get_x64_access_size(x64_context* context, x64_op_t op, x64_reg_t reg, usz d_
return d_size;
}
#elif defined(ARCH_ARM64)
#define RIP(context) ((context)->uc_mcontext.pc)
#endif /* ARCH_ */
namespace rsx
{
extern std::function<bool(u32 addr, bool is_writing)> g_access_violation_handler;
}
bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) noexcept
bool handle_access_violation(u32 addr, bool is_writing, ucontext_t* context) noexcept
{
g_tls_fault_all++;
@ -1243,6 +1251,7 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
}
}
#if defined(ARCH_X64)
const u8* const code = reinterpret_cast<u8*>(RIP(context));
x64_op_t op;
@ -1382,6 +1391,9 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
g_tls_fault_spu++;
return true;
} while (0);
#else
static_cast<void>(context);
#endif /* ARCH_ */
if (vm::check_addr(addr, is_writing ? vm::page_writable : vm::page_readable))
{
@ -1545,7 +1557,7 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
if (!g_tls_access_violation_recovered)
{
vm_log.notice("\n%s", dump_useful_thread_info());
vm_log.error("Access violation %s location 0x%x (%s) [type=u%u]", is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory", d_size * 8);
vm_log.error("Access violation %s location 0x%x (%s)", is_writing ? "writing" : "reading", addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory");
}
// TODO:
@ -1582,7 +1594,7 @@ bool handle_access_violation(u32 addr, bool is_writing, x64_context* context) no
// Do not log any further access violations in this case.
if (!g_tls_access_violation_recovered)
{
vm_log.fatal("Access violation %s location 0x%x (%s) [type=u%u]", is_writing ? "writing" : (cpu && cpu->id_type() == 1 && cpu->get_pc() == addr ? "executing" : "reading"), addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory", d_size * 8);
vm_log.fatal("Access violation %s location 0x%x (%s)", is_writing ? "writing" : (cpu && cpu->id_type() == 1 && cpu->get_pc() == addr ? "executing" : "reading"), addr, (is_writing && vm::check_addr(addr)) ? "read-only memory" : "unmapped memory");
}
while (Emu.IsPaused())
@ -1754,8 +1766,9 @@ const bool s_exception_handler_set = []() -> bool
static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept
{
x64_context* context = static_cast<ucontext_t*>(uct);
ucontext_t* context = static_cast<ucontext_t*>(uct);
#if defined(ARCH_X64)
#ifdef __APPLE__
const u64 err = context->uc_mcontext->__es.__err;
#elif defined(__DragonFly__) || defined(__FreeBSD__)
@ -1770,6 +1783,23 @@ static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept
const bool is_executing = err & 0x10;
const bool is_writing = err & 0x2;
#elif defined(ARCH_ARM64)
const bool is_executing = uptr(info->si_addr) == RIP(context);
const u32 insn = is_executing ? 0 : *reinterpret_cast<u32*>(RIP(context));
const bool is_writing = (insn & 0xbfff0000) == 0x0c000000
|| (insn & 0xbfe00000) == 0x0c800000
|| (insn & 0xbfdf0000) == 0x0d000000
|| (insn & 0xbfc00000) == 0x0d800000
|| (insn & 0x3f400000) == 0x08000000
|| (insn & 0x3bc00000) == 0x39000000
|| (insn & 0x3fc00000) == 0x3d800000
|| (insn & 0x3bc00000) == 0x38000000
|| (insn & 0x3fe00000) == 0x3c800000
|| (insn & 0x3a400000) == 0x28000000;
#else
#error "signal_handler not implemented"
#endif
const u64 exec64 = (reinterpret_cast<u64>(info->si_addr) - reinterpret_cast<u64>(vm::g_exec_addr)) / 2;
const auto cause = is_executing ? "executing" : is_writing ? "writing" : "reading";
@ -1809,6 +1839,26 @@ static void signal_handler(int /*sig*/, siginfo_t* info, void* uct) noexcept
thread_ctrl::emergency_exit(msg);
}
static void sigill_handler(int /*sig*/, siginfo_t* info, void* /*uct*/) noexcept
{
std::string msg = fmt::format("Illegal instruction at %p (%s).\n", info->si_addr, *reinterpret_cast<be_t<u128>*>(info->si_addr));
append_thread_name(msg);
if (IsDebuggerPresent())
{
sys_log.fatal("\n%s", msg);
sys_log.notice("\n%s", dump_useful_thread_info());
// Convert to SIGTRAP
raise(SIGTRAP);
return;
}
thread_ctrl::emergency_exit(msg);
}
void sigpipe_signaling_handler(int)
{
}
@ -1834,6 +1884,13 @@ const bool s_exception_handler_set = []() -> bool
}
#endif
sa.sa_sigaction = sigill_handler;
if (::sigaction(SIGILL, &sa, NULL) == -1)
{
std::fprintf(stderr, "sigaction(SIGILL) failed (%d).\n", errno);
std::abort();
}
sa.sa_handler = sigpipe_signaling_handler;
if (::sigaction(SIGPIPE, &sa, NULL) == -1)
{
@ -1852,11 +1909,7 @@ const bool s_terminate_handler_set = []() -> bool
std::set_terminate([]()
{
if (IsDebuggerPresent())
#ifdef _MSC_VER
__debugbreak();
#else
__asm("int3;");
#endif
utils::trap();
report_fatal_error("RPCS3 has abnormally terminated.");
});
@ -1935,7 +1988,7 @@ void thread_base::initialize(void (*error_cb)())
{
if (attempts == umax)
{
g_tls_wait_time += __rdtsc() - stamp0;
g_tls_wait_time += utils::get_tsc() - stamp0;
}
else if (attempts > 1)
{
@ -2096,6 +2149,8 @@ thread_base::native_entry thread_base::finalize(u64 _self) noexcept
std::fesetround(FE_TONEAREST);
gv_unset_zeroing_denormals();
static constexpr u64 s_stop_bit = 0x8000'0000'0000'0000ull;
static atomic_t<u64> s_pool_ctr = []
@ -2195,10 +2250,11 @@ thread_base::native_entry thread_base::finalize(u64 _self) noexcept
thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base* _base))
{
return build_function_asm<native_entry>("thread_base_trampoline", [&](asmjit::x86::Assembler& c, auto& args)
return build_function_asm<native_entry>("thread_base_trampoline", [&](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
Label _ret = c.newLabel();
c.push(x86::rbp);
c.sub(x86::rsp, 0x20);
@ -2222,6 +2278,7 @@ thread_base::native_entry thread_base::make_trampoline(u64(*entry)(thread_base*
c.bind(_ret);
c.add(x86::rsp, 0x28);
c.ret();
#endif
});
}
@ -2364,7 +2421,7 @@ bool thread_base::join(bool dtor) const
// Hacked for too sleepy threads (1ms) TODO: make sure it's unneeded and remove
const auto timeout = dtor && Emu.IsStopped() ? atomic_wait_timeout{1'000'000} : atomic_wait_timeout::inf;
auto stamp0 = __rdtsc();
auto stamp0 = utils::get_tsc();
for (u64 i = 0; (m_sync & 3) <= 1; i++)
{
@ -2377,7 +2434,7 @@ bool thread_base::join(bool dtor) const
if (i >= 16 && !(i & (i - 1)) && timeout != atomic_wait_timeout::inf)
{
sig_log.error(u8"Thread [%s] is too sleepy. Waiting for it %.3fµs already!", *m_tname.load(), (__rdtsc() - stamp0) / (utils::get_tsc_freq() / 1000000.));
sig_log.error(u8"Thread [%s] is too sleepy. Waiting for it %.3fµs already!", *m_tname.load(), (utils::get_tsc() - stamp0) / (utils::get_tsc_freq() / 1000000.));
}
}
@ -2522,17 +2579,8 @@ void thread_base::exec()
sig_log.fatal("Thread terminated due to fatal error: %s", reason);
#ifdef _WIN32
if (IsDebuggerPresent())
{
__debugbreak();
}
#else
if (IsDebuggerPresent())
{
__asm("int3;");
}
#endif
utils::trap();
if (const auto _this = g_tls_this_thread)
{

View file

@ -478,7 +478,19 @@ class named_thread final : public Context, result_storage<Context>, thread_base
return thread::finalize(thread_state::finished);
}
#if defined(ARCH_X64)
static inline thread::native_entry trampoline = thread::make_trampoline(entry_point);
#else
static void* trampoline(void* arg)
{
if (const auto next = thread_base::finalize(entry_point(static_cast<thread_base*>(arg))))
{
return next(thread_ctrl::get_current());
}
return nullptr;
}
#endif
friend class thread_ctrl;