Merge branch 'master' into nastys-patch-17

This commit is contained in:
nastys 2025-11-20 22:54:37 +01:00 committed by GitHub
commit 5372afa79f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
33 changed files with 262 additions and 1517 deletions

View file

@ -12,7 +12,7 @@ pkg info # debug
pkg install "llvm$LLVM_COMPILER_VER"
# Mandatory dependencies (qtX-base is pulled via qtX-multimedia)
pkg install git ccache cmake ninja "qt$QT_VER_MAIN-multimedia" "qt$QT_VER_MAIN-svg" glew openal-soft ffmpeg
pkg install git ccache cmake ninja "qt$QT_VER_MAIN-multimedia" "qt$QT_VER_MAIN-svg" glew openal-soft ffmpeg pcre2
# Optional dependencies (libevdev is pulled by qtX-base)
pkg install pkgconf alsa-lib pulseaudio sdl3 evdev-proto vulkan-headers vulkan-loader opencv

View file

@ -278,7 +278,7 @@ if(USE_FAUDIO)
target_compile_definitions(FAudio-static INTERFACE -DHAVE_FAUDIO)
set(FAUDIO_TARGET FAudio-static)
else()
message(FATAL_ERROR
message(WARNING
"-- RPCS3: 3rdparty FAudio requires SDL 3.2.0 or newer. Since a valid SDL3"
">=3.2.0 version cannot be found, building with FAudio will be skipped.")
set(USE_FAUDIO OFF CACHE BOOL "Disabled FAudio with SDL < 3.2.0" FORCE)

View file

@ -133,7 +133,7 @@ if(MSVC)
endif()
if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
message( FATAL_ERROR "RPCS3 can only be compiled on 64-bit platforms." )
message(FATAL_ERROR "RPCS3 can only be compiled on 64-bit platforms.")
endif()
if(APPLE AND CMAKE_OSX_ARCHITECTURES STREQUAL "arm64")

View file

@ -392,6 +392,10 @@ namespace fmt
{
raw_throw_exception(src_loc, reinterpret_cast<const char*>(fmt), type_info_v<Args...>, fmt_args_t<Args...>{fmt_unveil<Args>::get(args)...});
}
#if !defined(_MSC_VER) || defined(__clang__)
[[noreturn]] ~throw_exception();
#endif
};
template <typename CharT, usz N, typename... Args>

View file

@ -190,7 +190,7 @@ struct cpu_prof
reservation_samples = 0;
}
static std::string format(const std::multimap<u64, u64, std::greater<u64>>& chart, u64 samples, u64 idle, bool extended_print = false)
static std::string format(const std::multimap<u64, u64, std::greater<u64>>& chart, u64 samples, u64 idle, u32 type_id, bool extended_print = false)
{
// Print results
std::string results;
@ -204,11 +204,18 @@ struct cpu_prof
const f64 _frac = count / busy / samples;
// Print only 7 hash characters out of 11 (which covers roughly 48 bits)
fmt::append(results, "\n\t[%s", fmt::base57(be_t<u64>{name}));
results.resize(results.size() - 4);
if (type_id == 2)
{
fmt::append(results, "\n\t[%s", fmt::base57(be_t<u64>{name}));
results.resize(results.size() - 4);
// Print chunk address from lowest 16 bits
fmt::append(results, "...chunk-0x%05x]: %.4f%% (%u)", (name & 0xffff) * 4, _frac * 100., count);
// Print chunk address from lowest 16 bits
fmt::append(results, "...chunk-0x%05x]: %.4f%% (%u)", (name & 0xffff) * 4, _frac * 100., count);
}
else
{
fmt::append(results, "\n\t[0x%07x]: %.4f%% (%u)", name, _frac * 100., count);
}
if (results.size() >= (extended_print ? 10000 : 5000))
{
@ -257,27 +264,37 @@ struct cpu_prof
}
// Print results
const std::string results = format(chart, samples, idle);
const std::string results = format(chart, samples, idle, ptr->id_type());
profiler.notice("Thread \"%s\" [0x%08x]: %u samples (%.4f%% idle), %u new, %u reservation (%.4f%%):\n%s", ptr->get_name(), ptr->id, samples, get_percent(idle, samples), new_samples, reservation_samples, get_percent(reservation_samples, samples - idle), results);
new_samples = 0;
}
static void print_all(std::unordered_map<shared_ptr<cpu_thread>, sample_info>& threads, sample_info& all_info)
static void print_all(std::unordered_map<shared_ptr<cpu_thread>, sample_info>& threads, sample_info& all_info, u32 type_id)
{
u64 new_samples = 0;
// Print all results and cleanup
for (auto& [ptr, info] : threads)
{
if (ptr->id_type() != type_id)
{
continue;
}
new_samples += info.new_samples;
info.print(ptr);
}
std::multimap<u64, u64, std::greater<u64>> chart;
for (auto& [_, info] : threads)
for (auto& [ptr, info] : threads)
{
if (ptr->id_type() != type_id)
{
continue;
}
// This function collects thread information regardless of 'new_samples' member state
for (auto& [name, count] : info.freq)
{
@ -301,7 +318,7 @@ struct cpu_prof
if (new_samples < min_print_all_samples && thread_ctrl::state() != thread_state::aborting)
{
profiler.notice("All Threads: %u samples (%.4f%% idle), %u new, %u reservation (%.4f%%): Not enough new samples have been collected since the last print.", samples, get_percent(idle, samples), new_samples, reservation, get_percent(reservation, samples - idle));
profiler.notice("All %s Threads: %u samples (%.4f%% idle), %u new, %u reservation (%.4f%%): Not enough new samples have been collected since the last print.", type_id == 1 ? "PPU" : "SPU", samples, get_percent(idle, samples), new_samples, reservation, get_percent(reservation, samples - idle));
return;
}
@ -310,12 +327,13 @@ struct cpu_prof
chart.emplace(count, name);
}
const std::string results = format(chart, samples, idle, true);
profiler.notice("All Threads: %u samples (%.4f%% idle), %u new, %u reservation (%.4f%%):%s", samples, get_percent(idle, samples), new_samples, reservation, get_percent(reservation, samples - idle), results);
const std::string results = format(chart, samples, idle, type_id, true);
profiler.notice("All %s Threads: %u samples (%.4f%% idle), %u new, %u reservation (%.4f%%):%s", type_id == 1 ? "PPU" : "SPU", samples, get_percent(idle, samples), new_samples, reservation, get_percent(reservation, samples - idle), results);
}
};
sample_info all_threads_info{};
sample_info all_spu_threads_info{};
sample_info all_ppu_threads_info{};
void operator()()
{
@ -376,8 +394,11 @@ struct cpu_prof
{
if (auto state = +ptr->state; cpu_flag::exit - state)
{
const auto spu = ptr->try_get<spu_thread>();
const auto ppu = ptr->try_get<ppu_thread>();
// Get short function hash
const u64 name = atomic_storage<u64>::load(ptr->block_hash);
const u64 name = ppu ? atomic_storage<u32>::load(ppu->cia) : atomic_storage<u64>::load(ptr->block_hash);
// Append occurrence
info.samples++;
@ -387,17 +408,17 @@ struct cpu_prof
info.freq[name]++;
info.new_samples++;
if (auto spu = ptr->try_get<spu_thread>())
if (spu)
{
if (spu->raddr)
{
info.reservation_samples++;
}
}
// Append verification time to fixed common name 0000000...chunk-0x3fffc
if (name >> 16 && (name & 0xffff) == 0)
info.freq[0xffff]++;
// Append verification time to fixed common name 0000000...chunk-0x3fffc
if (name >> 16 && (name & 0xffff) == 0)
info.freq[0xffff]++;
}
}
else
{
@ -420,8 +441,10 @@ struct cpu_prof
{
profiler.success("Flushing profiling results...");
all_threads_info = {};
sample_info::print_all(threads, all_threads_info);
all_ppu_threads_info = {};
all_spu_threads_info = {};
sample_info::print_all(threads, all_ppu_threads_info, 1);
sample_info::print_all(threads, all_spu_threads_info, 2);
}
if (Emu.IsPaused())
@ -442,7 +465,8 @@ struct cpu_prof
}
// Print all remaining results
sample_info::print_all(threads, all_threads_info);
sample_info::print_all(threads, all_ppu_threads_info, 1);
sample_info::print_all(threads, all_spu_threads_info, 2);
}
static constexpr auto thread_name = "CPU Profiler"sv;
@ -459,7 +483,7 @@ extern f64 get_cpu_program_usage_percent(u64 hash)
{
u64 total = 0;
for (auto [name, count] : prof->all_threads_info.freq)
for (auto [name, count] : prof->all_spu_threads_info.freq)
{
if ((name & -65536) == hash)
{
@ -472,7 +496,7 @@ extern f64 get_cpu_program_usage_percent(u64 hash)
return 0;
}
return std::max<f64>(0.0001, static_cast<f64>(total) * 100 / (prof->all_threads_info.samples - prof->all_threads_info.idle));
return std::max<f64>(0.0001, static_cast<f64>(total) * 100 / (prof->all_spu_threads_info.samples - prof->all_spu_threads_info.idle));
}
}
@ -639,22 +663,17 @@ void cpu_thread::operator()()
thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(get_class()));
}
while (!g_fxo->is_init<cpu_profiler>())
{
if (Emu.IsStopped())
{
return;
}
// Can we have a little race, right? First thread is started concurrently with g_fxo->init()
thread_ctrl::wait_for(1000);
}
ensure(g_fxo->is_init<cpu_profiler>());
switch (get_class())
{
case thread_class::ppu:
{
//g_fxo->get<cpu_profiler>().registered.push(id);
if (g_cfg.core.ppu_prof)
{
g_fxo->get<cpu_profiler>().registered.push(id);
}
break;
}
case thread_class::spu:
@ -1546,7 +1565,7 @@ void cpu_thread::flush_profilers() noexcept
return;
}
if (g_cfg.core.spu_prof || g_cfg.core.spu_debug)
if (g_cfg.core.spu_prof || g_cfg.core.spu_debug || g_cfg.core.ppu_prof)
{
g_fxo->get<cpu_profiler>().registered.push(0);
}

View file

@ -1139,8 +1139,10 @@ error_code cellCameraGetBufferInfo(s32 dev_num, vm::ptr<CellCameraInfo> info)
return CELL_OK;
}
error_code cellCameraGetBufferInfoEx(s32 dev_num, vm::ptr<CellCameraInfoEx> info)
error_code cellCameraGetBufferInfoEx(ppu_thread& ppu, s32 dev_num, vm::ptr<CellCameraInfoEx> info)
{
ppu.state += cpu_flag::wait;
cellCamera.notice("cellCameraGetBufferInfoEx(dev_num=%d, info=0x%x)", dev_num, info);
// calls cellCameraGetBufferInfo
@ -1151,10 +1153,16 @@ error_code cellCameraGetBufferInfoEx(s32 dev_num, vm::ptr<CellCameraInfoEx> info
}
auto& g_camera = g_fxo->get<camera_thread>();
std::lock_guard lock(g_camera.mutex);
*info = g_camera.info;
CellCameraInfoEx info_out;
{
std::lock_guard lock(g_camera.mutex);
info_out = g_camera.info;
}
*info = info_out;
return CELL_OK;
}

View file

@ -2308,8 +2308,10 @@ error_code cellGemConvertVideoFinish(ppu_thread& ppu)
return CELL_OK;
}
error_code cellGemConvertVideoStart(vm::cptr<void> video_frame)
error_code cellGemConvertVideoStart(ppu_thread& ppu, vm::cptr<void> video_frame)
{
ppu.state += cpu_flag::wait;
cellGem.warning("cellGemConvertVideoStart(video_frame=*0x%x)", video_frame);
auto& gem = g_fxo->get<gem_config>();
@ -2461,6 +2463,8 @@ error_code cellGemEnableMagnetometer2(u32 gem_num, u32 enable)
error_code cellGemEnd(ppu_thread& ppu)
{
ppu.state += cpu_flag::wait;
cellGem.warning("cellGemEnd()");
auto& gem = g_fxo->get<gem_config>();
@ -3265,7 +3269,7 @@ error_code cellGemPrepareCamera(s32 max_exposure, f32 image_quality)
extern error_code cellCameraGetAttribute(s32 dev_num, s32 attrib, vm::ptr<u32> arg1, vm::ptr<u32> arg2);
extern error_code cellCameraSetAttribute(s32 dev_num, s32 attrib, u32 arg1, u32 arg2);
extern error_code cellCameraGetBufferInfoEx(s32 dev_num, vm::ptr<CellCameraInfoEx> info);
extern error_code cellCameraGetBufferInfoEx(ppu_thread&, s32 dev_num, vm::ptr<CellCameraInfoEx> info);
vm::var<CellCameraInfoEx> info = vm::make_var<CellCameraInfoEx>({});
vm::var<u32> arg1 = vm::make_var<u32>({});
@ -3273,7 +3277,7 @@ error_code cellGemPrepareCamera(s32 max_exposure, f32 image_quality)
cellCameraGetAttribute(0, 0x3e6, arg1, arg2);
cellCameraSetAttribute(0, 0x3e6, 0x3e, *arg2 | 0x80);
cellCameraGetBufferInfoEx(0, info);
cellCameraGetBufferInfoEx(*cpu_thread::get_current<ppu_thread>(), 0, info);
if (info->width == 640)
{
@ -3605,6 +3609,8 @@ error_code cellGemTrackHues(vm::cptr<u32> req_hues, vm::ptr<u32> res_hues)
error_code cellGemUpdateFinish(ppu_thread& ppu)
{
ppu.state += cpu_flag::wait;
cellGem.warning("cellGemUpdateFinish()");
auto& gem = g_fxo->get<gem_config>();

View file

@ -2308,7 +2308,7 @@ void ppu_thread::cpu_sleep()
raddr = 0;
// Setup wait flag and memory flags to relock itself
state += g_use_rtm ? cpu_flag::wait : cpu_flag::wait + cpu_flag::memory;
state += cpu_flag::wait + cpu_flag::memory;
if (auto ptr = vm::g_tls_locked)
{
@ -2454,10 +2454,8 @@ ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u3
// Trigger the scheduler
state += cpu_flag::suspend;
if (!g_use_rtm)
{
state += cpu_flag::memory;
}
// Acquire memory passive lock
state += cpu_flag::memory;
call_history.data.resize(g_cfg.core.ppu_call_history ? call_history_max_size : 1);
syscall_history.data.resize(g_cfg.core.ppu_call_history ? syscall_history_max_size : 1);
@ -2703,11 +2701,7 @@ ppu_thread::ppu_thread(utils::serial& ar)
// Trigger the scheduler
state += cpu_flag::suspend;
if (!g_use_rtm)
{
state += cpu_flag::memory;
}
state += cpu_flag::memory;
ppu_tname = make_single<std::string>(ar.pop<std::string>());
@ -3191,221 +3185,6 @@ extern u64 ppu_ldarx(ppu_thread& ppu, u32 addr)
return ppu_load_acquire_reservation<u64>(ppu, addr);
}
// JIT-assembled helper implementing the transactional (TSX/RTM) path of an
// 8-byte conditional store against a 128-byte reservation line.
//
// Arguments (as declared in the function-pointer type):
//   raddr - guest address of the 8-byte slot to store to
//   rtime - reservation timestamp (not read by the emitted x86-64 code itself)
//   _old  - pointer to the 128-byte snapshot to compare against; the emitted code
//           also reaches ppu_thread::state and ppu_thread::last_ftime through
//           offsets relative to ppu_thread::rdata, so this must point at that member
//   _new  - 8-byte value to store on success
//
// Value left in rax by the emitted code:
//   -1 (all ones) - transaction path abandoned (pause flag set, or the retry
//                   budget compared against g_rtm_tx_limit2 ran out)
//   0             - line contents differed from the snapshot; the current line
//                   contents were copied back into *_old and last_ftime set to -1
//   otherwise     - TSC delta since entry; the store was committed
//
// On non-x86-64 builds the function only emits a breakpoint.
const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, const void* _old, u64 _new)>("ppu_stcx_accurate_tx", [](native_asm& c, auto& args)
{
	using namespace asmjit;

#if defined(ARCH_X64)
	Label fall = c.newLabel();
	Label fail = c.newLabel();
	Label _ret = c.newLabel();
	Label load = c.newLabel();

	//if (utils::has_avx() && !s_tsx_avx)
	//{
	//	c.vzeroupper();
	//}

	// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
	c.push(x86::rbp);
	c.push(x86::r13);
	c.push(x86::r14);
	c.sub(x86::rsp, 48);
#ifdef _WIN32
	// The non-AVX path below clobbers xmm6/xmm7, which are callee-saved on Windows
	if (!s_tsx_avx)
	{
		c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
		c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
	}
#endif

	// Prepare registers
	build_swap_rdx_with(c, args, x86::r10);

	// rbp = (*vm::g_sudo_addr + raddr) rounded down to the 128-byte line
	c.movabs(x86::rbp, reinterpret_cast<u64>(&vm::g_sudo_addr));
	c.mov(x86::rbp, x86::qword_ptr(x86::rbp));
	c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
	c.and_(x86::rbp, -128);
	c.prefetchw(x86::byte_ptr(x86::rbp, 0));
	c.prefetchw(x86::byte_ptr(x86::rbp, 64));

	// r11 = 64-byte aligned slot in vm::g_reservations indexed by (raddr & 0xffff) / 2;
	// args[0] keeps ((raddr & 0xffff) / 2) & 63, i.e. half the byte offset inside the line
	c.movzx(args[0].r32(), args[0].r16());
	c.shr(args[0].r32(), 1);
	c.movabs(x86::r11, reinterpret_cast<u64>(+vm::g_reservations));
	c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
	c.and_(x86::r11, -128 / 2);
	c.and_(args[0].r32(), 63);

	// Prepare data
	if (s_tsx_avx)
	{
		c.vmovups(x86::ymm0, x86::ymmword_ptr(args[2], 0));
		c.vmovups(x86::ymm1, x86::ymmword_ptr(args[2], 32));
		c.vmovups(x86::ymm2, x86::ymmword_ptr(args[2], 64));
		c.vmovups(x86::ymm3, x86::ymmword_ptr(args[2], 96));
	}
	else
	{
		c.movaps(x86::xmm0, x86::oword_ptr(args[2], 0));
		c.movaps(x86::xmm1, x86::oword_ptr(args[2], 16));
		c.movaps(x86::xmm2, x86::oword_ptr(args[2], 32));
		c.movaps(x86::xmm3, x86::oword_ptr(args[2], 48));
		c.movaps(x86::xmm4, x86::oword_ptr(args[2], 64));
		c.movaps(x86::xmm5, x86::oword_ptr(args[2], 80));
		c.movaps(x86::xmm6, x86::oword_ptr(args[2], 96));
		c.movaps(x86::xmm7, x86::oword_ptr(args[2], 112));
	}

	// Alloc r14 to stamp0
	const auto stamp0 = x86::r14;
	build_get_tsc(c, stamp0);

	Label fail2 = c.newLabel();

	// tx1 is the label handed to xbegin below; the callback bails out to `fall`
	// once the TSC delta since stamp0 reaches g_rtm_tx_limit2
	Label tx1 = build_transaction_enter(c, fall, [&]()
	{
		build_get_tsc(c);
		c.sub(x86::rax, stamp0);
		c.movabs(x86::r13, reinterpret_cast<u64>(&g_rtm_tx_limit2));
		c.cmp(x86::rax, x86::qword_ptr(x86::r13));
		c.jae(fall);
	});

	// Check pause flag
	c.bt(x86::dword_ptr(args[2], ::offset32(&ppu_thread::state) - ::offset32(&ppu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
	c.jc(fall);
	c.xbegin(tx1);

	// XOR the snapshot with memory and OR-reduce: any non-zero bit means a mismatch
	if (s_tsx_avx)
	{
		c.vxorps(x86::ymm0, x86::ymm0, x86::ymmword_ptr(x86::rbp, 0));
		c.vxorps(x86::ymm1, x86::ymm1, x86::ymmword_ptr(x86::rbp, 32));
		c.vxorps(x86::ymm2, x86::ymm2, x86::ymmword_ptr(x86::rbp, 64));
		c.vxorps(x86::ymm3, x86::ymm3, x86::ymmword_ptr(x86::rbp, 96));
		c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
		c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
		c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
		c.vptest(x86::ymm0, x86::ymm0);
	}
	else
	{
		c.xorps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
		c.xorps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
		c.xorps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
		c.xorps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
		c.xorps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
		c.xorps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
		c.xorps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
		c.xorps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
		c.orps(x86::xmm0, x86::xmm1);
		c.orps(x86::xmm2, x86::xmm3);
		c.orps(x86::xmm4, x86::xmm5);
		c.orps(x86::xmm6, x86::xmm7);
		c.orps(x86::xmm0, x86::xmm2);
		c.orps(x86::xmm4, x86::xmm6);
		c.orps(x86::xmm0, x86::xmm4);
		c.ptest(x86::xmm0, x86::xmm0);
	}

	c.jnz(fail);

	// Store 8 bytes
	c.mov(x86::qword_ptr(x86::rbp, args[0], 1, 0), args[3]);

	c.xend();
	c.lock().add(x86::qword_ptr(x86::r11), 64);

	// Success: return the TSC delta since entry
	build_get_tsc(c);
	c.sub(x86::rax, stamp0);
	c.jmp(_ret);

	// XABORT is expensive so try to finish with xend instead
	c.bind(fail);

	// Load old data to store back in rdata
	if (s_tsx_avx)
	{
		c.vmovaps(x86::ymm0, x86::ymmword_ptr(x86::rbp, 0));
		c.vmovaps(x86::ymm1, x86::ymmword_ptr(x86::rbp, 32));
		c.vmovaps(x86::ymm2, x86::ymmword_ptr(x86::rbp, 64));
		c.vmovaps(x86::ymm3, x86::ymmword_ptr(x86::rbp, 96));
	}
	else
	{
		c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
		c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
		c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
		c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
		c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
		c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
		c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
		c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
	}

	c.xend();
	c.jmp(fail2);

	// Fallback exit: report -1 without touching rdata
	c.bind(fall);
	c.mov(x86::rax, -1);
	c.jmp(_ret);

	c.bind(fail2);
	c.lock().sub(x86::qword_ptr(x86::r11), 64);
	c.bind(load);

	// Store previous data back to rdata
	if (s_tsx_avx)
	{
		c.vmovaps(x86::ymmword_ptr(args[2], 0), x86::ymm0);
		c.vmovaps(x86::ymmword_ptr(args[2], 32), x86::ymm1);
		c.vmovaps(x86::ymmword_ptr(args[2], 64), x86::ymm2);
		c.vmovaps(x86::ymmword_ptr(args[2], 96), x86::ymm3);
	}
	else
	{
		c.movaps(x86::oword_ptr(args[2], 0), x86::xmm0);
		c.movaps(x86::oword_ptr(args[2], 16), x86::xmm1);
		c.movaps(x86::oword_ptr(args[2], 32), x86::xmm2);
		c.movaps(x86::oword_ptr(args[2], 48), x86::xmm3);
		c.movaps(x86::oword_ptr(args[2], 64), x86::xmm4);
		c.movaps(x86::oword_ptr(args[2], 80), x86::xmm5);
		c.movaps(x86::oword_ptr(args[2], 96), x86::xmm6);
		c.movaps(x86::oword_ptr(args[2], 112), x86::xmm7);
	}

	// last_ftime = -1, then return 0 (compare failed)
	c.mov(x86::rax, -1);
	c.mov(x86::qword_ptr(args[2], ::offset32(&ppu_thread::last_ftime) - ::offset32(&ppu_thread::rdata)), x86::rax);

	c.xor_(x86::eax, x86::eax);
	//c.jmp(_ret);

	c.bind(_ret);

#ifdef _WIN32
	// Restore with the same legacy-SSE encoding the prologue used to save:
	// this branch is the one taken when the AVX code path is not in use,
	// so it must not emit VEX-encoded moves.
	if (!s_tsx_avx)
	{
		c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
		c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
	}
#endif

	if (s_tsx_avx)
	{
		c.vzeroupper();
	}

	c.add(x86::rsp, 48);
	c.pop(x86::r14);
	c.pop(x86::r13);
	c.pop(x86::rbp);

	maybe_flush_lbr(c);
	c.ret();
#else
	UNUSED(args);

	// Unimplemented should fail.
	c.brk(Imm(0x42));
	c.ret(a64::x30);
#endif
});
template <typename T>
static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
{
@ -3486,77 +3265,6 @@ static bool ppu_store_reservation(ppu_thread& ppu, u32 addr, u64 reg_value)
return false;
}
if (g_use_rtm) [[likely]]
{
switch (u64 count = ppu_stcx_accurate_tx(addr & -8, rtime, ppu.rdata, std::bit_cast<u64>(new_data)))
{
case umax:
{
auto& all_data = *vm::get_super_ptr<spu_rdata_t>(addr & -128);
auto& sdata = *vm::get_super_ptr<atomic_be_t<u64>>(addr & -8);
const bool ok = cpu_thread::suspend_all<+3>(&ppu, {all_data, all_data + 64, &res}, [&]
{
if ((res & -128) == rtime && cmp_rdata(ppu.rdata, all_data))
{
sdata.release(new_data);
res += 64;
return true;
}
mov_rdata_nt(ppu.rdata, all_data);
res -= 64;
return false;
});
if (ok)
{
break;
}
ppu.last_ftime = -1;
[[fallthrough]];
}
case 0:
{
if (ppu.last_faddr == addr)
{
ppu.last_fail++;
}
if (ppu.last_ftime != umax)
{
ppu.last_faddr = 0;
return false;
}
utils::prefetch_read(ppu.rdata);
utils::prefetch_read(ppu.rdata + 64);
ppu.last_faddr = addr;
ppu.last_ftime = res.load() & -128;
ppu.last_ftsc = utils::get_tsc();
return false;
}
default:
{
if (count > 20000 && g_cfg.core.perf_report) [[unlikely]]
{
perf_log.warning("STCX: took too long: %.3fus (%u c)", count / (utils::get_tsc_freq() / 1000'000.), count);
}
break;
}
}
if (ppu.last_faddr == addr)
{
ppu.last_succ++;
}
ppu.last_faddr = 0;
return true;
}
// Align address: we do not need the lower 7 bits anymore
addr &= -128;

View file

@ -593,6 +593,11 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
{
callee = m_module->getOrInsertFunction(fmt::format("__0x%x", target_last - base), type);
cast<Function>(callee.getCallee())->setCallingConv(CallingConv::GHC);
if (g_cfg.core.ppu_prof)
{
m_ir->CreateStore(m_ir->getInt32(target_last), m_ir->CreateStructGEP(m_thread_type, m_thread, static_cast<uint>(&m_cia - m_locals)));
}
}
}
else

View file

@ -3991,7 +3991,7 @@ public:
bool must_use_cpp_functions = !!g_cfg.core.spu_accurate_dma;
if (u64 cmdh = ci->getZExtValue() & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_RESULT_MASK); g_cfg.core.rsx_fifo_accuracy || g_cfg.video.strict_rendering_mode || !g_use_rtm)
if (u64 cmdh = ci->getZExtValue() & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_RESULT_MASK); g_cfg.core.rsx_fifo_accuracy || g_cfg.video.strict_rendering_mode || /*!g_use_rtm*/ true)
{
// TODO: don't require TSX (current implementation is TSX-only)
if (cmdh == MFC_PUT_CMD || cmdh == MFC_SNDSIG_CMD)

View file

@ -639,549 +639,6 @@ std::array<u32, 2> op_branch_targets(u32 pc, spu_opcode_t op)
return res;
}
// JIT-assembled helper: transactional (TSX/RTM) compare-and-replace of a whole
// 128-byte line.
//
// Arguments (as declared in the function-pointer type):
//   raddr - guest address of the line
//   rtime - reservation timestamp (its register is reused as the host line pointer below)
//   _old  - 128-byte snapshot to compare against; spu_thread::ftx, ::stx, ::state and
//           ::last_ftime are reached via offsets relative to spu_thread::rdata, so this
//           must point at that member
//   _new  - 128 bytes to write on success
//
// Value left in rax by the emitted code:
//   -1 (all ones) - transaction path abandoned (pause flag set, or retry budget
//                   compared against g_rtm_tx_limit2 ran out)
//   0             - line differed from the snapshot; current contents copied to *_old,
//                   last_ftime set to -1
//   otherwise     - TSC delta since entry; the 128 bytes were committed
//
// On non-x86-64 builds the function only emits a breakpoint.
const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void* _old, const void* _new)>("spu_putllc_tx", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
Label fall = c.newLabel();
Label fail = c.newLabel();
Label _ret = c.newLabel();
Label load = c.newLabel();
//if (utils::has_avx() && !s_tsx_avx)
//{
// c.vzeroupper();
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::rbx);
#ifdef _WIN32
// 168-byte frame: room for the ten callee-saved registers xmm6..xmm15 (160 bytes) plus 8
c.sub(x86::rsp, 168);
if (s_tsx_avx)
{
// AVX path only uses ymm0..ymm7, so only xmm6/xmm7 need saving
c.vmovups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
c.vmovups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
}
else
{
// SSE path uses xmm0..xmm15; save every callee-saved one
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
c.movups(x86::oword_ptr(x86::rsp, 32), x86::xmm8);
c.movups(x86::oword_ptr(x86::rsp, 48), x86::xmm9);
c.movups(x86::oword_ptr(x86::rsp, 64), x86::xmm10);
c.movups(x86::oword_ptr(x86::rsp, 80), x86::xmm11);
c.movups(x86::oword_ptr(x86::rsp, 96), x86::xmm12);
c.movups(x86::oword_ptr(x86::rsp, 112), x86::xmm13);
c.movups(x86::oword_ptr(x86::rsp, 128), x86::xmm14);
c.movups(x86::oword_ptr(x86::rsp, 144), x86::xmm15);
}
#else
c.sub(x86::rsp, 40);
#endif
// Prepare registers
build_swap_rdx_with(c, args, x86::r10);
// args[1] = *vm::g_sudo_addr + raddr (the incoming rtime value in args[1] is overwritten)
c.movabs(args[1], reinterpret_cast<u64>(&vm::g_sudo_addr));
c.mov(args[1], x86::qword_ptr(args[1]));
c.lea(args[1], x86::qword_ptr(args[1], args[0]));
c.prefetchw(x86::byte_ptr(args[1], 0));
c.prefetchw(x86::byte_ptr(args[1], 64));
// r11 = vm::g_reservations + (raddr & 0xff80) / 2
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.movabs(x86::r11, reinterpret_cast<u64>(+vm::g_reservations));
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
// Prepare data
if (s_tsx_avx)
{
// ymm0..3 = snapshot (*_old), ymm4..7 = replacement (*_new)
c.vmovups(x86::ymm0, x86::ymmword_ptr(args[2], 0));
c.vmovups(x86::ymm1, x86::ymmword_ptr(args[2], 32));
c.vmovups(x86::ymm2, x86::ymmword_ptr(args[2], 64));
c.vmovups(x86::ymm3, x86::ymmword_ptr(args[2], 96));
c.vmovups(x86::ymm4, x86::ymmword_ptr(args[3], 0));
c.vmovups(x86::ymm5, x86::ymmword_ptr(args[3], 32));
c.vmovups(x86::ymm6, x86::ymmword_ptr(args[3], 64));
c.vmovups(x86::ymm7, x86::ymmword_ptr(args[3], 96));
}
else
{
// xmm0..7 = snapshot (*_old), xmm8..15 = replacement (*_new)
c.movaps(x86::xmm0, x86::oword_ptr(args[2], 0));
c.movaps(x86::xmm1, x86::oword_ptr(args[2], 16));
c.movaps(x86::xmm2, x86::oword_ptr(args[2], 32));
c.movaps(x86::xmm3, x86::oword_ptr(args[2], 48));
c.movaps(x86::xmm4, x86::oword_ptr(args[2], 64));
c.movaps(x86::xmm5, x86::oword_ptr(args[2], 80));
c.movaps(x86::xmm6, x86::oword_ptr(args[2], 96));
c.movaps(x86::xmm7, x86::oword_ptr(args[2], 112));
c.movaps(x86::xmm8, x86::oword_ptr(args[3], 0));
c.movaps(x86::xmm9, x86::oword_ptr(args[3], 16));
c.movaps(x86::xmm10, x86::oword_ptr(args[3], 32));
c.movaps(x86::xmm11, x86::oword_ptr(args[3], 48));
c.movaps(x86::xmm12, x86::oword_ptr(args[3], 64));
c.movaps(x86::xmm13, x86::oword_ptr(args[3], 80));
c.movaps(x86::xmm14, x86::oword_ptr(args[3], 96));
c.movaps(x86::xmm15, x86::oword_ptr(args[3], 112));
}
// Alloc args[0] to stamp0
const auto stamp0 = args[0];
build_get_tsc(c, stamp0);
Label fail2 = c.newLabel();
// tx1 is the label handed to xbegin below; the callback bumps spu_thread::ftx and
// bails out to `fall` once the TSC delta since stamp0 reaches g_rtm_tx_limit2
Label tx1 = build_transaction_enter(c, fall, [&]()
{
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.movabs(x86::rbx, reinterpret_cast<u64>(&g_rtm_tx_limit2));
c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
c.jae(fall);
});
// Check pause flag
c.bt(x86::dword_ptr(args[2], ::offset32(&spu_thread::state) - ::offset32(&spu_thread::rdata)), static_cast<u32>(cpu_flag::pause));
c.jc(fall);
c.xbegin(tx1);
// XOR the snapshot with memory and OR-reduce: any non-zero bit means a mismatch
if (s_tsx_avx)
{
c.vxorps(x86::ymm0, x86::ymm0, x86::ymmword_ptr(args[1], 0));
c.vxorps(x86::ymm1, x86::ymm1, x86::ymmword_ptr(args[1], 32));
c.vxorps(x86::ymm2, x86::ymm2, x86::ymmword_ptr(args[1], 64));
c.vxorps(x86::ymm3, x86::ymm3, x86::ymmword_ptr(args[1], 96));
c.vorps(x86::ymm0, x86::ymm0, x86::ymm1);
c.vorps(x86::ymm1, x86::ymm2, x86::ymm3);
c.vorps(x86::ymm0, x86::ymm1, x86::ymm0);
c.vptest(x86::ymm0, x86::ymm0);
}
else
{
c.xorps(x86::xmm0, x86::oword_ptr(args[1], 0));
c.xorps(x86::xmm1, x86::oword_ptr(args[1], 16));
c.xorps(x86::xmm2, x86::oword_ptr(args[1], 32));
c.xorps(x86::xmm3, x86::oword_ptr(args[1], 48));
c.xorps(x86::xmm4, x86::oword_ptr(args[1], 64));
c.xorps(x86::xmm5, x86::oword_ptr(args[1], 80));
c.xorps(x86::xmm6, x86::oword_ptr(args[1], 96));
c.xorps(x86::xmm7, x86::oword_ptr(args[1], 112));
c.orps(x86::xmm0, x86::xmm1);
c.orps(x86::xmm2, x86::xmm3);
c.orps(x86::xmm4, x86::xmm5);
c.orps(x86::xmm6, x86::xmm7);
c.orps(x86::xmm0, x86::xmm2);
c.orps(x86::xmm4, x86::xmm6);
c.orps(x86::xmm0, x86::xmm4);
c.ptest(x86::xmm0, x86::xmm0);
}
c.jnz(fail);
// Contents match: write the 128 replacement bytes inside the transaction
if (s_tsx_avx)
{
c.vmovaps(x86::ymmword_ptr(args[1], 0), x86::ymm4);
c.vmovaps(x86::ymmword_ptr(args[1], 32), x86::ymm5);
c.vmovaps(x86::ymmword_ptr(args[1], 64), x86::ymm6);
c.vmovaps(x86::ymmword_ptr(args[1], 96), x86::ymm7);
}
else
{
c.movaps(x86::oword_ptr(args[1], 0), x86::xmm8);
c.movaps(x86::oword_ptr(args[1], 16), x86::xmm9);
c.movaps(x86::oword_ptr(args[1], 32), x86::xmm10);
c.movaps(x86::oword_ptr(args[1], 48), x86::xmm11);
c.movaps(x86::oword_ptr(args[1], 64), x86::xmm12);
c.movaps(x86::oword_ptr(args[1], 80), x86::xmm13);
c.movaps(x86::oword_ptr(args[1], 96), x86::xmm14);
c.movaps(x86::oword_ptr(args[1], 112), x86::xmm15);
}
c.xend();
c.lock().add(x86::qword_ptr(x86::r11), 64);
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
// Success: return the TSC delta since entry
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.jmp(_ret);
// XABORT is expensive so try to finish with xend instead
c.bind(fail);
// Load previous data to store back to rdata
if (s_tsx_avx)
{
c.vmovaps(x86::ymm0, x86::ymmword_ptr(args[1], 0));
c.vmovaps(x86::ymm1, x86::ymmword_ptr(args[1], 32));
c.vmovaps(x86::ymm2, x86::ymmword_ptr(args[1], 64));
c.vmovaps(x86::ymm3, x86::ymmword_ptr(args[1], 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(args[1], 0));
c.movaps(x86::xmm1, x86::oword_ptr(args[1], 16));
c.movaps(x86::xmm2, x86::oword_ptr(args[1], 32));
c.movaps(x86::xmm3, x86::oword_ptr(args[1], 48));
c.movaps(x86::xmm4, x86::oword_ptr(args[1], 64));
c.movaps(x86::xmm5, x86::oword_ptr(args[1], 80));
c.movaps(x86::xmm6, x86::oword_ptr(args[1], 96));
c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
}
c.xend();
// NOTE(review): stx is incremented on this compare-failed path as well as on success;
// presumably it counts committed transactions rather than successful stores - confirm
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx) - ::offset32(&spu_thread::rdata)), 1);
c.jmp(fail2);
// Fallback exit: report -1 without touching rdata
c.bind(fall);
c.mov(x86::rax, -1);
c.jmp(_ret);
c.bind(fail2);
c.lock().sub(x86::qword_ptr(x86::r11), 64);
// `load` is bound here but no jump to it is emitted within this function
c.bind(load);
// Store previous data back to rdata
if (s_tsx_avx)
{
c.vmovaps(x86::ymmword_ptr(args[2], 0), x86::ymm0);
c.vmovaps(x86::ymmword_ptr(args[2], 32), x86::ymm1);
c.vmovaps(x86::ymmword_ptr(args[2], 64), x86::ymm2);
c.vmovaps(x86::ymmword_ptr(args[2], 96), x86::ymm3);
}
else
{
c.movaps(x86::oword_ptr(args[2], 0), x86::xmm0);
c.movaps(x86::oword_ptr(args[2], 16), x86::xmm1);
c.movaps(x86::oword_ptr(args[2], 32), x86::xmm2);
c.movaps(x86::oword_ptr(args[2], 48), x86::xmm3);
c.movaps(x86::oword_ptr(args[2], 64), x86::xmm4);
c.movaps(x86::oword_ptr(args[2], 80), x86::xmm5);
c.movaps(x86::oword_ptr(args[2], 96), x86::xmm6);
c.movaps(x86::oword_ptr(args[2], 112), x86::xmm7);
}
// last_ftime = -1, then return 0 (compare failed)
c.mov(x86::rax, -1);
c.mov(x86::qword_ptr(args[2], ::offset32(&spu_thread::last_ftime) - ::offset32(&spu_thread::rdata)), x86::rax);
c.xor_(x86::eax, x86::eax);
//c.jmp(_ret);
c.bind(_ret);
#ifdef _WIN32
// Restore the callee-saved vector registers saved in the prologue, mirroring its encoding
if (s_tsx_avx)
{
c.vmovups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.vmovups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
}
else
{
c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
c.movups(x86::xmm8, x86::oword_ptr(x86::rsp, 32));
c.movups(x86::xmm9, x86::oword_ptr(x86::rsp, 48));
c.movups(x86::xmm10, x86::oword_ptr(x86::rsp, 64));
c.movups(x86::xmm11, x86::oword_ptr(x86::rsp, 80));
c.movups(x86::xmm12, x86::oword_ptr(x86::rsp, 96));
c.movups(x86::xmm13, x86::oword_ptr(x86::rsp, 112));
c.movups(x86::xmm14, x86::oword_ptr(x86::rsp, 128));
c.movups(x86::xmm15, x86::oword_ptr(x86::rsp, 144));
}
// Frame size must match the platform-specific sub in the prologue
c.add(x86::rsp, 168);
#else
c.add(x86::rsp, 40);
#endif
c.pop(x86::rbx);
c.pop(x86::rbp);
if (s_tsx_avx)
{
c.vzeroupper();
}
maybe_flush_lbr(c);
c.ret();
#else
UNUSED(args);
// No implementation for this architecture: emit a breakpoint
c.brk(Imm(0x42));
c.ret(a64::x30);
#endif
});
// JIT-assembled helper: transactional (TSX/RTM) unconditional write of a whole
// 128-byte line.
//
// Arguments (as declared in the function-pointer type):
//   raddr - guest address of the line
//   rdata - 128 bytes to write
//   _stx  - counter incremented once when the transaction commits
//   _ftx  - counter incremented by the retry callback passed to build_transaction_enter
//
// Value left in rax by the emitted code:
//   0         - transaction path abandoned (retry budget compared against
//               g_rtm_tx_limit2 ran out); nothing was written by this function
//   otherwise - TSC delta since entry; the 128 bytes were committed
//
// On non-x86-64 builds the function only emits a breakpoint.
const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rdata, u64* _stx, u64* _ftx)>("spu_putlluc_tx", [](native_asm& c, auto& args)
{
	using namespace asmjit;

#if defined(ARCH_X64)
	Label fall = c.newLabel();
	Label _ret = c.newLabel();

	//if (utils::has_avx() && !s_tsx_avx)
	//{
	//	c.vzeroupper();
	//}

	// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
	c.push(x86::rbp);
	c.push(x86::rbx);
	c.sub(x86::rsp, 40);
#ifdef _WIN32
	// The non-AVX path below clobbers xmm6/xmm7, which are callee-saved on Windows
	if (!s_tsx_avx)
	{
		c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
		c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
	}
#endif

	// Prepare registers
	build_swap_rdx_with(c, args, x86::r10);

	// r11 = *vm::g_sudo_addr + raddr
	c.movabs(x86::r11, reinterpret_cast<u64>(&vm::g_sudo_addr));
	c.mov(x86::r11, x86::qword_ptr(x86::r11));
	c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
	c.prefetchw(x86::byte_ptr(x86::r11, 0));
	c.prefetchw(x86::byte_ptr(x86::r11, 64));

	// Prepare data
	if (s_tsx_avx)
	{
		c.vmovups(x86::ymm0, x86::ymmword_ptr(args[1], 0));
		c.vmovups(x86::ymm1, x86::ymmword_ptr(args[1], 32));
		c.vmovups(x86::ymm2, x86::ymmword_ptr(args[1], 64));
		c.vmovups(x86::ymm3, x86::ymmword_ptr(args[1], 96));
	}
	else
	{
		c.movaps(x86::xmm0, x86::oword_ptr(args[1], 0));
		c.movaps(x86::xmm1, x86::oword_ptr(args[1], 16));
		c.movaps(x86::xmm2, x86::oword_ptr(args[1], 32));
		c.movaps(x86::xmm3, x86::oword_ptr(args[1], 48));
		c.movaps(x86::xmm4, x86::oword_ptr(args[1], 64));
		c.movaps(x86::xmm5, x86::oword_ptr(args[1], 80));
		c.movaps(x86::xmm6, x86::oword_ptr(args[1], 96));
		c.movaps(x86::xmm7, x86::oword_ptr(args[1], 112));
	}

	// args[1] (data already loaded) is reused: vm::g_reservations + (raddr & 0xff80) / 2
	c.and_(args[0].r32(), 0xff80);
	c.shr(args[0].r32(), 1);
	c.movabs(args[1], reinterpret_cast<u64>(+vm::g_reservations));
	c.lea(args[1], x86::qword_ptr(args[1], args[0]));

	// Alloc args[0] to stamp0
	const auto stamp0 = args[0];
	build_get_tsc(c, stamp0);

	// tx1 is the label handed to xbegin below; the callback bails out to `fall`
	// once the TSC delta since stamp0 reaches g_rtm_tx_limit2
	Label tx1 = build_transaction_enter(c, fall, [&]()
	{
		// ftx++;
		c.add(x86::qword_ptr(args[3]), 1);
		build_get_tsc(c);
		c.sub(x86::rax, stamp0);
		c.movabs(x86::rbx, reinterpret_cast<u64>(&g_rtm_tx_limit2));
		c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
		c.jae(fall);
	});

	c.xbegin(tx1);

	// Write the 128 bytes inside the transaction
	if (s_tsx_avx)
	{
		c.vmovaps(x86::ymmword_ptr(x86::r11, 0), x86::ymm0);
		c.vmovaps(x86::ymmword_ptr(x86::r11, 32), x86::ymm1);
		c.vmovaps(x86::ymmword_ptr(x86::r11, 64), x86::ymm2);
		c.vmovaps(x86::ymmword_ptr(x86::r11, 96), x86::ymm3);
	}
	else
	{
		c.movaps(x86::oword_ptr(x86::r11, 0), x86::xmm0);
		c.movaps(x86::oword_ptr(x86::r11, 16), x86::xmm1);
		c.movaps(x86::oword_ptr(x86::r11, 32), x86::xmm2);
		c.movaps(x86::oword_ptr(x86::r11, 48), x86::xmm3);
		c.movaps(x86::oword_ptr(x86::r11, 64), x86::xmm4);
		c.movaps(x86::oword_ptr(x86::r11, 80), x86::xmm5);
		c.movaps(x86::oword_ptr(x86::r11, 96), x86::xmm6);
		c.movaps(x86::oword_ptr(x86::r11, 112), x86::xmm7);
	}

	c.xend();
	c.lock().add(x86::qword_ptr(args[1]), 32);

	// stx++
	c.add(x86::qword_ptr(args[2]), 1);

	// Success: return the TSC delta since entry
	build_get_tsc(c);
	c.sub(x86::rax, stamp0);
	c.jmp(_ret);

	// Fallback exit: report 0
	c.bind(fall);
	c.xor_(x86::eax, x86::eax);
	//c.jmp(_ret);

	c.bind(_ret);

#ifdef _WIN32
	if (!s_tsx_avx)
	{
		c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
		c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
	}
#endif

	if (s_tsx_avx)
	{
		c.vzeroupper();
	}

	// The prologue reserves 40 bytes on every platform, so release exactly 40 here
	// (a second platform-specific add would misalign the pops and the return address)
	c.add(x86::rsp, 40);
	c.pop(x86::rbx);
	c.pop(x86::rbp);

	maybe_flush_lbr(c);
	c.ret();
#else
	UNUSED(args);

	// No implementation for this architecture: emit a breakpoint
	c.brk(Imm(0x42));
	c.ret(a64::x30);
#endif
});
// Builds the native routine "spu_getllar_tx": u64(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime).
// x86-64: reads 128 bytes located at (value of vm::g_sudo_addr) + raddr into vector registers inside an
// RTM transaction (xbegin/xend) and, after xend, stores those registers to rdata.
// Result of the generated code:
//  - 0 (eax is zeroed) whenever control reaches `fall`: pause flag set in the thread state, reservation
//    value (masked with -128) different from rtime, or the elapsed-time check below exceeding g_rtm_tx_limit1.
//  - otherwise rax minus the start stamp taken before the transaction.
//    NOTE(review): presumably build_get_tsc(c) leaves the current TSC in rax - confirm in its definition.
// Other architectures: only a breakpoint stub is emitted.
// NOTE(review): args[0..3] are assumed to map to raddr, rdata, _cpu, rtime in that order - confirm against build_function_asm.
const auto spu_getllar_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, cpu_thread* _cpu, u64 rtime)>("spu_getllar_tx", [](native_asm& c, auto& args)
{
using namespace asmjit;
#if defined(ARCH_X64)
// fall: failure exit (result 0); _ret: common epilogue shared by both outcomes
Label fall = c.newLabel();
Label _ret = c.newLabel();
//if (utils::has_avx() && !s_tsx_avx)
//{
// c.vzeroupper();
//}
// Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
c.push(x86::rbp);
c.push(x86::rbx);
c.sub(x86::rsp, 40);
#ifdef _WIN32
// The SSE path below uses xmm6/xmm7, so save them in the reserved stack area (restored in the epilogue)
if (!s_tsx_avx)
{
c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
c.movups(x86::oword_ptr(x86::rsp, 16), x86::xmm7);
}
#endif
// Prepare registers
// NOTE(review): helper not visible here; presumably relocates whichever argument lives in rdx to r10 - confirm
build_swap_rdx_with(c, args, x86::r10);
// rbp = (value stored in vm::g_sudo_addr) + raddr: source address of the 128-byte read
c.movabs(x86::rbp, reinterpret_cast<u64>(&vm::g_sudo_addr));
c.mov(x86::rbp, x86::qword_ptr(x86::rbp));
c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
// r11 = vm::g_reservations + ((raddr & 0xff80) >> 1): address of the reservation value checked below
c.and_(args[0].r32(), 0xff80);
c.shr(args[0].r32(), 1);
c.movabs(x86::r11, reinterpret_cast<u64>(+vm::g_reservations));
c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
// Alloc args[0] to stamp0
// (raddr has already been folded into rbp and r11, so its register is reused for the start timestamp)
const auto stamp0 = args[0];
build_get_tsc(c, stamp0);
// Begin transaction
// The callback emits: increment spu_thread::ftx, compute the time elapsed since stamp0 and jump to `fall`
// once it is >= g_rtm_tx_limit1 (unsigned compare).
// NOTE(review): presumably this code runs when a transaction attempt aborts, and tx0 is the abort target - confirm in build_transaction_enter
Label tx0 = build_transaction_enter(c, fall, [&]()
{
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
c.movabs(x86::rbx, reinterpret_cast<u64>(&g_rtm_tx_limit1));
c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
c.jae(fall);
});
// Check pause flag
c.bt(x86::dword_ptr(args[2], ::offset32(&cpu_thread::state)), static_cast<u32>(cpu_flag::pause));
c.jc(fall);
// Fail unless the reservation value, with its low 7 bits cleared, still equals rtime
c.mov(x86::rax, x86::qword_ptr(x86::r11));
c.and_(x86::rax, -128);
c.cmp(x86::rax, args[3]);
c.jne(fall);
c.xbegin(tx0);
// Just read data to registers
// (4 x 32-byte ymm loads or 8 x 16-byte xmm loads: 128 bytes either way)
if (s_tsx_avx)
{
c.vmovups(x86::ymm0, x86::ymmword_ptr(x86::rbp, 0));
c.vmovups(x86::ymm1, x86::ymmword_ptr(x86::rbp, 32));
c.vmovups(x86::ymm2, x86::ymmword_ptr(x86::rbp, 64));
c.vmovups(x86::ymm3, x86::ymmword_ptr(x86::rbp, 96));
}
else
{
c.movaps(x86::xmm0, x86::oword_ptr(x86::rbp, 0));
c.movaps(x86::xmm1, x86::oword_ptr(x86::rbp, 16));
c.movaps(x86::xmm2, x86::oword_ptr(x86::rbp, 32));
c.movaps(x86::xmm3, x86::oword_ptr(x86::rbp, 48));
c.movaps(x86::xmm4, x86::oword_ptr(x86::rbp, 64));
c.movaps(x86::xmm5, x86::oword_ptr(x86::rbp, 80));
c.movaps(x86::xmm6, x86::oword_ptr(x86::rbp, 96));
c.movaps(x86::xmm7, x86::oword_ptr(x86::rbp, 112));
}
c.xend();
// Past xend: increment spu_thread::stx and compute the return value (rax - stamp0)
c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::stx)), 1);
build_get_tsc(c);
c.sub(x86::rax, stamp0);
// Store data
// (aligned stores are used, so rdata must be suitably aligned for vmovaps/movaps)
if (s_tsx_avx)
{
c.vmovaps(x86::ymmword_ptr(args[1], 0), x86::ymm0);
c.vmovaps(x86::ymmword_ptr(args[1], 32), x86::ymm1);
c.vmovaps(x86::ymmword_ptr(args[1], 64), x86::ymm2);
c.vmovaps(x86::ymmword_ptr(args[1], 96), x86::ymm3);
}
else
{
c.movaps(x86::oword_ptr(args[1], 0), x86::xmm0);
c.movaps(x86::oword_ptr(args[1], 16), x86::xmm1);
c.movaps(x86::oword_ptr(args[1], 32), x86::xmm2);
c.movaps(x86::oword_ptr(args[1], 48), x86::xmm3);
c.movaps(x86::oword_ptr(args[1], 64), x86::xmm4);
c.movaps(x86::oword_ptr(args[1], 80), x86::xmm5);
c.movaps(x86::oword_ptr(args[1], 96), x86::xmm6);
c.movaps(x86::oword_ptr(args[1], 112), x86::xmm7);
}
c.jmp(_ret);
// Failure exit: result is 0, then fall through into the shared epilogue
c.bind(fall);
c.xor_(x86::eax, x86::eax);
//c.jmp(_ret);
c.bind(_ret);
#ifdef _WIN32
// Restore the xmm6/xmm7 values saved in the prologue
if (!s_tsx_avx)
{
c.movups(x86::xmm6, x86::oword_ptr(x86::rsp, 0));
c.movups(x86::xmm7, x86::oword_ptr(x86::rsp, 16));
}
#endif
if (s_tsx_avx)
{
c.vzeroupper();
}
// Undo the prologue in reverse order: release the 40-byte area, then pop rbx and rbp
c.add(x86::rsp, 40);
c.pop(x86::rbx);
c.pop(x86::rbp);
maybe_flush_lbr(c);
c.ret();
#else
// No implementation outside x86-64: emit a breakpoint followed by a return
UNUSED(args);
c.brk(Imm(0x42));
c.ret(a64::x30);
#endif
});
void spu_int_ctrl_t::set(u64 ints)
{
// leave only enabled interrupts
@ -2396,60 +1853,6 @@ void spu_thread::push_snr(u32 number, u32 value)
const u32 event_bit = SPU_EVENT_S1 >> (number & 1);
const bool bitor_bit = !!((snr_config >> number) & 1);
// Redundant, g_use_rtm is checked inside tx_start now.
if (g_use_rtm && false)
{
bool channel_notify = false;
bool thread_notify = false;
const bool ok = utils::tx_start([&]
{
channel_notify = (channel->data.raw() == spu_channel::bit_wait);
thread_notify = (channel->data.raw() & spu_channel::bit_count) == 0;
if (channel_notify)
{
ensure(channel->jostling_value.raw() == spu_channel::bit_wait);
channel->jostling_value.raw() = value;
channel->data.raw() = 0;
}
else if (bitor_bit)
{
channel->data.raw() &= ~spu_channel::bit_wait;
channel->data.raw() |= spu_channel::bit_count | value;
}
else
{
channel->data.raw() = spu_channel::bit_count | value;
}
if (thread_notify)
{
ch_events.raw().events |= event_bit;
if (ch_events.raw().mask & event_bit)
{
ch_events.raw().count = 1;
thread_notify = ch_events.raw().waiting != 0;
}
else
{
thread_notify = false;
}
}
});
if (ok)
{
if (channel_notify)
channel->data.notify_one();
if (thread_notify)
this->notify();
return;
}
}
// Lock event channel in case it needs event notification
ch_events.atomic_op([](ch_events_t& ev)
{
@ -2590,7 +1993,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
rsx::reservation_lock<false, 1> rsx_lock(eal, args.size, !is_get && (g_cfg.video.strict_rendering_mode || (g_cfg.core.rsx_fifo_accuracy && !g_cfg.core.spu_accurate_dma && eal < rsx::constants::local_mem_base)));
if ((!g_use_rtm && !is_get) || g_cfg.core.spu_accurate_dma) [[unlikely]]
if (!is_get || g_cfg.core.spu_accurate_dma) [[unlikely]]
{
perf_meter<"ADMA_GET"_u64> perf_get = perf_;
perf_meter<"ADMA_PUT"_u64> perf_put = perf_;
@ -3697,10 +3100,7 @@ bool spu_thread::do_list_transfer(spu_mfc_cmd& args)
{
rsx_lock.update_if_enabled(addr, size, range_lock);
if (!g_use_rtm)
{
vm::range_lock(range_lock, addr & -128, utils::align<u32>(addr + size, 128) - (addr & -128));
}
vm::range_lock(range_lock, addr & -128, utils::align<u32>(addr + size, 128) - (addr & -128));
}
else
{
@ -3912,90 +3312,9 @@ bool spu_thread::do_putllc(const spu_mfc_cmd& args)
return true;
}
}
else if (!g_use_rtm)
else
{
*vm::_ptr<atomic_t<u32>>(addr) += 0;
}
if (g_use_rtm) [[likely]]
{
switch (u64 count = spu_putllc_tx(addr, rtime, rdata, to_write))
{
case umax:
{
auto& data = *vm::get_super_ptr<spu_rdata_t>(addr);
const bool ok = cpu_thread::suspend_all<+3>(this, {data, data + 64, &res}, [&]()
{
if ((res & -128) == rtime)
{
if (cmp_rdata(rdata, data))
{
mov_rdata(data, to_write);
res += 64;
return true;
}
}
// Save previous data
mov_rdata_nt(rdata, data);
res -= 64;
return false;
});
const u64 count2 = utils::get_tsc() - perf2.get();
if (count2 > 20000 && g_cfg.core.perf_report) [[unlikely]]
{
perf_log.warning("PUTLLC: took too long: %.3fus (%u c) (addr=0x%x) (S)", count2 / (utils::get_tsc_freq() / 1000'000.), count2, addr);
}
if (ok)
{
break;
}
last_ftime = -1;
[[fallthrough]];
}
case 0:
{
if (addr == last_faddr)
{
last_fail++;
}
if (last_ftime != umax)
{
last_faddr = 0;
return false;
}
utils::prefetch_read(rdata);
utils::prefetch_read(rdata + 64);
last_faddr = addr;
last_ftime = res.load() & -128;
last_ftsc = utils::get_tsc();
return false;
}
default:
{
if (count > 20000 && g_cfg.core.perf_report) [[unlikely]]
{
perf_log.warning("PUTLLC: took too long: %.3fus (%u c) (addr = 0x%x)", count / (utils::get_tsc_freq() / 1000'000.), count, addr);
}
break;
}
}
if (addr == last_faddr)
{
last_succ++;
}
last_faddr = 0;
return true;
utils::trigger_write_page_fault(vm::base(addr));
}
auto& super_data = *vm::get_super_ptr<spu_rdata_t>(addr);
@ -4189,7 +3508,7 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
{
result = 0;
}
else if (!g_use_rtm)
else
{
// Provoke page fault
utils::trigger_write_page_fault(vm::base(addr));
@ -4200,16 +3519,6 @@ void do_cell_atomic_128_store(u32 addr, const void* to_write)
mov_rdata(sdata, *static_cast<const spu_rdata_t*>(to_write));
vm::reservation_acquire(addr) += 32;
}
else if (cpu->get_class() != thread_class::spu)
{
u64 stx, ftx;
result = spu_putlluc_tx(addr, to_write, &stx, &ftx);
}
else
{
auto _spu = static_cast<spu_thread*>(cpu);
result = spu_putlluc_tx(addr, to_write, &_spu->stx, &_spu->ftx);
}
if (result == 0)
{
@ -4565,7 +3874,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span<const u8> ls_ptr, u32 base_add
return is_range_limited;
}
if (type == spu_itype::BRSL)
if (type == spu_itype::BRSL && op.rt == 0)
{
// Insert a virtual return-to-next, because it is usually a call
results[1] = addr + 4;
@ -4816,7 +4125,7 @@ bool spu_thread::process_mfc_cmd()
if (raddr != addr)
{
// Last check for event before we replace the reservation with a new one
if (reservation_check(raddr, rdata))
if (~ch_events.load().events & SPU_EVENT_LR && reservation_check(raddr, rdata, addr))
{
set_events(SPU_EVENT_LR);
}
@ -5104,29 +4413,15 @@ bool spu_thread::process_mfc_cmd()
{
ntime = vm::reservation_acquire(addr);
if (ntime & vm::rsrv_unique_lock)
if (ntime & 127)
{
// There's an on-going reservation store, wait
continue;
}
u64 test_mask = -1;
mov_rdata(rdata, data);
if (ntime & 127)
{
// Try to use TSX to obtain data atomically
if (!g_use_rtm || !spu_getllar_tx(addr, rdata, this, ntime & -128))
{
// See previous ntime check.
continue;
}
}
else
{
mov_rdata(rdata, data);
}
if (u64 time0 = vm::reservation_acquire(addr); (ntime & test_mask) != (time0 & test_mask))
if (u64 time0 = vm::reservation_acquire(addr); ntime != time0)
{
// Reservation data has been modified recently
if (time0 & vm::rsrv_unique_lock) i += 12;
@ -5526,7 +4821,7 @@ bool spu_thread::process_mfc_cmd()
ch_mfc_cmd.cmd, ch_mfc_cmd.lsa, ch_mfc_cmd.eal, ch_mfc_cmd.tag, ch_mfc_cmd.size);
}
bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data) const
bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data, u32 current_eal) const
{
if (!addr)
{
@ -5545,9 +4840,24 @@ bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data) const
return !cmp_rdata(data, *vm::get_super_ptr<decltype(rdata)>(addr));
}
if ((addr >> 20) == (current_eal >> 20))
{
if (vm::check_addr(addr, vm::page_1m_size))
{
// Same random-access-memory page as the current MFC command, assume allocated
return !cmp_rdata(data, vm::_ref<decltype(rdata)>(addr));
}
if ((addr >> 16) == (current_eal >> 16) && vm::check_addr(addr, vm::page_64k_size))
{
// Same random-access-memory page as the current MFC command, assume allocated
return !cmp_rdata(data, vm::_ref<decltype(rdata)>(addr));
}
}
// Ensure data is allocated (HACK: would raise LR event if not)
// Set range_lock first optimistically
range_lock->store(u64{128} << 32 | addr);
range_lock->store(u64{128} << 32 | addr | vm::range_readable);
u64 lock_val = *std::prev(std::end(vm::g_range_lock_set));
u64 old_lock = 0;
@ -5628,7 +4938,7 @@ bool spu_thread::reservation_check(u32 addr, u32 hash, atomic_t<u64, 64>* range_
// Ensure data is allocated (HACK: would raise LR event if not)
// Set range_lock first optimistically
range_lock->store(u64{128} << 32 | addr);
range_lock->store(u64{128} << 32 | addr | vm::range_readable);
u64 lock_val = *std::prev(std::end(vm::g_range_lock_set));
u64 old_lock = 0;

View file

@ -901,7 +901,8 @@ public:
// Returns true if reservation existed but was just discovered to be lost
// It is safe to use on any address, even if not directly accessed by SPU (so it's slower)
bool reservation_check(u32 addr, const decltype(rdata)& data) const;
// Optionally pass a known allocated address for internal optimization (the current Effective-Address of the MFC command)
bool reservation_check(u32 addr, const decltype(rdata)& data, u32 current_eal = 0) const;
static bool reservation_check(u32 addr, u32 hash, atomic_t<u64, 64>* range_lock);
usz register_cache_line_waiter(u32 addr);
void deregister_cache_line_waiter(usz index);

View file

@ -5,6 +5,7 @@
#include "Emu/CPU/CPUThread.h"
#include "Emu/Cell/ErrorCodes.h"
#include "Emu/Cell/SPUThread.h"
#include "Emu/Cell/PPUThread.h"
#include "Emu/IdManager.h"
#include "util/asm.hpp"
@ -249,17 +250,37 @@ error_code sys_memory_free(cpu_thread& cpu, u32 addr)
return CELL_OK;
}
error_code sys_memory_get_page_attribute(cpu_thread& cpu, u32 addr, vm::ptr<sys_page_attr_t> attr)
error_code sys_memory_get_page_attribute(ppu_thread& ppu, u32 addr, vm::ptr<sys_page_attr_t> attr)
{
cpu.state += cpu_flag::wait;
ppu.state += cpu_flag::wait;
sys_memory.trace("sys_memory_get_page_attribute(addr=0x%x, attr=*0x%x)", addr, attr);
vm::writer_lock rlock;
if (!vm::check_addr(addr) || addr >= SPU_FAKE_BASE_ADDR)
if ((addr >> 28) == (ppu.stack_addr >> 28))
{
return CELL_EINVAL;
// Stack address: fast path
if (!(addr >= ppu.stack_addr && addr < ppu.stack_addr + ppu.stack_size) && !vm::check_addr(addr))
{
return { CELL_EINVAL, addr };
}
if (!vm::check_addr(attr.addr(), vm::page_readable, attr.size()))
{
return CELL_EFAULT;
}
attr->attribute = 0x40000ull; // SYS_MEMORY_PROT_READ_WRITE
attr->access_right = SYS_MEMORY_ACCESS_RIGHT_PPU_THR;
attr->page_size = 4096;
attr->pad = 0; // Always write 0
return CELL_OK;
}
const auto [ok, vm_flags] = vm::get_addr_flags(addr);
if (!ok || addr >= SPU_FAKE_BASE_ADDR)
{
return { CELL_EINVAL, addr };
}
if (!vm::check_addr(attr.addr(), vm::page_readable, attr.size()))
@ -268,19 +289,20 @@ error_code sys_memory_get_page_attribute(cpu_thread& cpu, u32 addr, vm::ptr<sys_
}
attr->attribute = 0x40000ull; // SYS_MEMORY_PROT_READ_WRITE (TODO)
attr->access_right = addr >> 28 == 0xdu ? SYS_MEMORY_ACCESS_RIGHT_PPU_THR : SYS_MEMORY_ACCESS_RIGHT_ANY;// (TODO)
attr->access_right = SYS_MEMORY_ACCESS_RIGHT_ANY; // TODO: Report accurately
if (vm::check_addr(addr, vm::page_1m_size))
if (vm_flags & vm::page_1m_size)
{
attr->page_size = 0x100000;
}
else if (vm::check_addr(addr, vm::page_64k_size))
else if (vm_flags & vm::page_64k_size)
{
attr->page_size = 0x10000;
}
else
{
attr->page_size = 4096;
//attr->page_size = 4096;
fmt::throw_exception("Unreachable");
}
attr->pad = 0; // Always write 0

View file

@ -4,6 +4,7 @@
#include "Emu/Cell/ErrorCodes.h"
class cpu_thread;
class ppu_thread;
enum lv2_mem_container_id : u32
{
@ -131,7 +132,7 @@ struct sys_memory_user_memory_stat_t
error_code sys_memory_allocate(cpu_thread& cpu, u64 size, u64 flags, vm::ptr<u32> alloc_addr);
error_code sys_memory_allocate_from_container(cpu_thread& cpu, u64 size, u32 cid, u64 flags, vm::ptr<u32> alloc_addr);
error_code sys_memory_free(cpu_thread& cpu, u32 start_addr);
error_code sys_memory_get_page_attribute(cpu_thread& cpu, u32 addr, vm::ptr<sys_page_attr_t> attr);
error_code sys_memory_get_page_attribute(ppu_thread& cpu, u32 addr, vm::ptr<sys_page_attr_t> attr);
error_code sys_memory_get_user_memory_size(cpu_thread& cpu, vm::ptr<sys_memory_info_t> mem_info);
error_code sys_memory_get_user_memory_stat(cpu_thread& cpu, vm::ptr<sys_memory_user_memory_stat_t> mem_stat);
error_code sys_memory_container_create(cpu_thread& cpu, vm::ptr<u32> cid, u64 size);

View file

@ -547,6 +547,13 @@ namespace vm
{
to_clear = for_all_range_locks(to_clear & ~get_range_lock_bits(true), [&](u64 addr2, u32 size2)
{
constexpr u32 range_size_loc = vm::range_pos - 32;
if ((size2 >> range_size_loc) == (vm::range_readable >> vm::range_pos))
{
return 0;
}
// Split and check every 64K page separately
for (u64 hi = addr2 >> 16, max = (addr2 + size2 - 1) >> 16; hi <= max; hi++)
{

View file

@ -81,7 +81,7 @@ namespace vm
bool check_addr(u32 addr, u8 flags, u32 size);
template <u32 Size = 1>
bool check_addr(u32 addr, u8 flags = page_readable)
inline bool check_addr(u32 addr, u8 flags = page_readable)
{
extern std::array<memory_page, 0x100000000 / 4096> g_pages;
@ -94,6 +94,16 @@ namespace vm
return !(~g_pages[addr / 4096] & (flags | page_allocated));
}
// Like check_addr, but reports the raw flags of the 4096-byte page containing addr as well:
// returns {page_allocated bit is set, page flags}. Should only be used in lock-free context with care
inline std::pair<bool, u8> get_addr_flags(u32 addr) noexcept
{
	extern std::array<memory_page, 0x100000000 / 4096> g_pages;

	// Single load of the page entry; both results are derived from this one snapshot
	const u32 page_index = addr / 4096;
	const u8 page_flags = g_pages[page_index].load();
	const bool is_allocated = (page_flags & page_allocated) != 0;

	return {is_allocated, page_flags};
}
// Read string in a safe manner (page aware) (bool true = if null-termination)
bool read_string(u32 addr, u32 max_size, std::string& out_string, bool check_pages = true) noexcept;

View file

@ -6,9 +6,6 @@
#include "util/tsc.hpp"
#include <functional>
extern bool g_use_rtm;
extern u64 g_rtm_tx_limit2;
#ifdef _MSC_VER
extern "C"
{
@ -143,7 +140,7 @@ namespace vm
void reservation_op_internal(u32 addr, std::function<bool()> func);
template <bool Ack = false, typename CPU, typename T, typename AT = u32, typename F>
inline SAFE_BUFFERS(auto) reservation_op(CPU& cpu, _ptr_base<T, AT> ptr, F op)
inline SAFE_BUFFERS(auto) reservation_op(CPU& /*cpu*/, _ptr_base<T, AT> ptr, F op)
{
// Atomic operation will be performed on aligned 128 bytes of data, so the data size and alignment must comply
static_assert(sizeof(T) <= 128 && alignof(T) == sizeof(T), "vm::reservation_op: unsupported type");
@ -162,188 +159,6 @@ namespace vm
auto& res = vm::reservation_acquire(addr);
//_m_prefetchw(&res);
#if defined(ARCH_X64)
if (g_use_rtm)
{
// Stage 1: single optimistic transaction attempt
unsigned status = -1;
u64 _old = 0;
auto stamp0 = utils::get_tsc(), stamp1 = stamp0, stamp2 = stamp0;
#ifndef _MSC_VER
__asm__ goto ("xbegin %l[stage2];" ::: "memory" : stage2);
#else
status = _xbegin();
if (status == umax)
#endif
{
if (res & rsrv_unique_lock)
{
#ifndef _MSC_VER
__asm__ volatile ("xend; mov $-1, %%eax;" ::: "memory");
#else
_xend();
#endif
goto stage2;
}
if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
{
std::invoke(op, *sptr);
const u64 old_time = res.fetch_add(128);
#ifndef _MSC_VER
__asm__ volatile ("xend;" ::: "memory");
#else
_xend();
#endif
if constexpr (Ack)
reservation_notifier_notify(addr, old_time);
return;
}
else
{
if (auto result = std::invoke(op, *sptr))
{
const u64 old_time = res.fetch_add(128);
#ifndef _MSC_VER
__asm__ volatile ("xend;" ::: "memory");
#else
_xend();
#endif
if constexpr (Ack)
reservation_notifier_notify(addr, old_time);
return result;
}
else
{
#ifndef _MSC_VER
__asm__ volatile ("xend;" ::: "memory");
#else
_xend();
#endif
return result;
}
}
}
stage2:
#ifndef _MSC_VER
__asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory");
#endif
stamp1 = utils::get_tsc();
// Stage 2: try to lock reservation first
_old = res.fetch_add(1);
// Compute stamps excluding memory touch
stamp2 = utils::get_tsc() - (stamp1 - stamp0);
// Start lightened transaction
for (; !(_old & vm::rsrv_unique_lock) && stamp2 - stamp0 <= g_rtm_tx_limit2; stamp2 = utils::get_tsc())
{
if (cpu.has_pause_flag())
{
break;
}
#ifndef _MSC_VER
__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
#else
status = _xbegin();
if (status != umax) [[unlikely]]
{
goto retry;
}
#endif
if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
{
std::invoke(op, *sptr);
#ifndef _MSC_VER
__asm__ volatile ("xend;" ::: "memory");
#else
_xend();
#endif
res += 127;
if (Ack)
reservation_notifier_notify(addr, _old);
return;
}
else
{
if (auto result = std::invoke(op, *sptr))
{
#ifndef _MSC_VER
__asm__ volatile ("xend;" ::: "memory");
#else
_xend();
#endif
res += 127;
if (Ack)
reservation_notifier_notify(addr, _old);
return result;
}
else
{
#ifndef _MSC_VER
__asm__ volatile ("xend;" ::: "memory");
#else
_xend();
#endif
return result;
}
}
retry:
#ifndef _MSC_VER
__asm__ volatile ("mov %%eax, %0;" : "=r" (status) :: "memory");
#endif
if (!status)
{
break;
}
}
// Stage 3: all failed, heavyweight fallback (see comments at the bottom)
if constexpr (std::is_void_v<std::invoke_result_t<F, T&>>)
{
vm::reservation_op_internal(addr, [&]
{
std::invoke(op, *sptr);
return true;
});
if constexpr (Ack)
reservation_notifier_notify(addr, _old);
return;
}
else
{
auto result = std::invoke_result_t<F, T&>();
vm::reservation_op_internal(addr, [&]
{
if ((result = std::invoke(op, *sptr)))
{
return true;
}
else
{
return false;
}
});
if (Ack && result)
reservation_notifier_notify(addr, _old);
return result;
}
}
#else
static_cast<void>(cpu);
#endif /* ARCH_X64 */
// Lock reservation and perform heavyweight lock
reservation_shared_lock_internal(res);

View file

@ -607,7 +607,7 @@ namespace rsx
compiled_resources_temp.clear();
auto& cmd_text = compiled_resources_temp.append({});
cmd_text.config.set_font(font_ref ? font_ref : fontmgr::get("Arial", 12));
cmd_text.config.set_font(get_font());
cmd_text.config.color = fore_color;
cmd_text.verts = render_text(text.c_str(), static_cast<f32>(x), static_cast<f32>(y));

View file

@ -52,13 +52,13 @@ namespace rsx
return quad;
}
font::font(const char* ttf_name, f32 size)
font::font(std::string_view ttf_name, f32 size)
{
// Convert pt to px
size_px = ceilf(size * 96.f / 72.f);
size_pt = size;
font_name = ttf_name;
font_name = std::string(ttf_name);
initialized = true;
}
@ -135,10 +135,17 @@ namespace rsx
// Attempt to load a font from dev_flash before any other source
result.font_names.emplace_back("SCE-PS3-SR-R-JPN.TTF");
result.font_names.emplace_back("SCE-PS3-DH-R-CGB.TTF");
// Known system font as last fallback
result.font_names.emplace_back("Yu Gothic.ttf");
result.font_names.emplace_back("YuGothR.ttc");
#ifdef _WIN32
result.font_names.emplace_back("msyh.ttc");
result.font_names.emplace_back("simsunb.ttc");
result.font_names.emplace_back("simsun.ttc");
result.font_names.emplace_back("SimsunExtG.ttf");
#endif
break;
}
case language_class::hangul:
@ -159,25 +166,58 @@ namespace rsx
return result;
}
codepage* font::initialize_codepage(char32_t codepage_id)
codepage* font::initialize_codepage(char32_t c)
{
// Init glyph
const auto codepage_id = get_page_id(c);
const auto class_ = classify(codepage_id);
const auto fs_settings = get_glyph_files(class_);
// Attempt to load requested font
std::vector<u8> bytes;
std::string file_path;
std::vector<u8> fallback_bytes;
std::string fallback_file;
bool font_found = false;
const auto get_font = [&](const std::string& file_path) -> bool
{
// Read font
fs::file f(file_path);
f.read(bytes, f.size());
// Check if the character exists in the font
stbtt_fontinfo info;
if (stbtt_InitFont(&info, bytes.data(), stbtt_GetFontOffsetForIndex(bytes.data(), 0)) != 0)
{
font_found = stbtt_FindGlyphIndex(&info, c) != 0;
}
if (!font_found)
{
if (fallback_bytes.empty())
{
// Save this font as a fallback so we don't get a segfault or exception
fallback_bytes = std::move(bytes);
fallback_file = file_path;
}
bytes.clear();
}
return font_found;
};
for (const auto& font_file : fs_settings.font_names)
{
if (fs::is_file(font_file))
{
// Check for absolute paths or fonts 'installed' to executable folder
file_path = font_file;
font_found = true;
break;
if (get_font(font_file))
{
break;
}
continue;
}
std::string extension;
@ -196,11 +236,13 @@ namespace rsx
for (const auto& font_dir : fs_settings.lookup_font_dirs)
{
file_path = font_dir + file_name;
const std::string file_path = font_dir + file_name;
if (fs::is_file(file_path))
{
font_found = true;
break;
if (get_font(file_path))
{
break;
}
}
}
@ -210,16 +252,15 @@ namespace rsx
}
}
// Read font
if (font_found)
if (!font_found)
{
fs::file f(file_path);
f.read(bytes, f.size());
}
else
{
rsx_log.error("Failed to initialize font '%s.ttf' on codepage %d", font_name, static_cast<u32>(codepage_id));
return nullptr;
if (fallback_bytes.empty())
{
fmt::throw_exception("Failed to initialize font for character 0x%x on codepage %d.", static_cast<u32>(c), static_cast<u32>(codepage_id));
}
rsx_log.error("Failed to initialize font for character 0x%x on codepage %d. Falling back to font '%s'", static_cast<u32>(c), static_cast<u32>(codepage_id), fallback_file);
bytes = std::move(fallback_bytes);
}
codepage_cache.page = nullptr;
@ -245,7 +286,8 @@ namespace rsx
if (!initialized)
return {};
const auto page_id = (c >> 8);
const auto page_id = get_page_id(c);
if (codepage_cache.codepage_id == page_id && codepage_cache.page) [[likely]]
{
return codepage_cache.page->get_char(c, x_advance, y_advance);
@ -257,7 +299,7 @@ namespace rsx
for (const auto& e : m_glyph_map)
{
if (e.first == unsigned(page_id))
if (e.first == page_id)
{
codepage_cache.page = e.second.get();
break;
@ -266,7 +308,7 @@ namespace rsx
if (!codepage_cache.page) [[unlikely]]
{
codepage_cache.page = initialize_codepage(page_id);
codepage_cache.page = initialize_codepage(c);
}
return codepage_cache.page->get_char(c, x_advance, y_advance);

View file

@ -64,12 +64,13 @@ namespace rsx
}
codepage_cache;
// Maps a character to the id of its codepage: the code point with its low 8 bits dropped,
// so every run of 256 consecutive code points shares one page id
static char32_t get_page_id(char32_t c)
{
	constexpr unsigned bits_per_page = 8;
	return c >> bits_per_page;
}
static language_class classify(char32_t codepage_id);
glyph_load_setup get_glyph_files(language_class class_) const;
codepage* initialize_codepage(char32_t codepage_id);
codepage* initialize_codepage(char32_t c);
public:
font(const char* ttf_name, f32 size);
font(std::string_view ttf_name, f32 size);
stbtt_aligned_quad get_char(char32_t c, f32& x_advance, f32& y_advance);
@ -79,7 +80,7 @@ namespace rsx
std::pair<f32, f32> get_char_offset(const char32_t* text, usz max_length, u16 max_width = -1, bool wrap = false);
bool matches(const char* name, int size) const { return static_cast<int>(size_pt) == size && font_name == name; }
bool matches(std::string_view name, int size) const { return static_cast<int>(size_pt) == size && font_name == name; }
std::string_view get_name() const { return font_name; }
f32 get_size_pt() const { return size_pt; }
f32 get_size_px() const { return size_px; }
@ -97,7 +98,7 @@ namespace rsx
std::vector<std::unique_ptr<font>> fonts;
static fontmgr* m_instance;
font* find(const char* name, int size)
font* find(std::string_view name, int size)
{
for (const auto& f : fonts)
{
@ -121,7 +122,7 @@ namespace rsx
}
}
static font* get(const char* name, int size)
static font* get(std::string_view name, int size)
{
if (m_instance == nullptr)
m_instance = new fontmgr;

View file

@ -68,10 +68,6 @@ LOG_CHANNEL(sys_log, "SYS");
// Preallocate 32 MiB
stx::manual_typemap<void, 0x20'00000, 128> g_fixed_typemap;
bool g_use_rtm = false;
u64 g_rtm_tx_limit1 = 0;
u64 g_rtm_tx_limit2 = 0;
std::string g_cfg_defaults;
atomic_t<u64> g_watchdog_hold_ctr{0};
@ -1540,9 +1536,6 @@ game_boot_result Emulator::Load(const std::string& title_id, bool is_disc_patch,
m_localized_title = std::string(psf::get_string(_psf, fmt::format("TITLE_%02d", static_cast<s32>(g_cfg.sys.language.get())), m_title));
sys_log.notice("Localized Title: %s", GetLocalizedTitle());
// Set RTM usage
g_use_rtm = utils::has_rtm() && (((utils::has_mpx() && !utils::has_tsx_force_abort()) && g_cfg.core.enable_TSX == tsx_usage::enabled) || g_cfg.core.enable_TSX == tsx_usage::forced);
{
// Log some extra info in case of boot
#if defined(HAVE_VULKAN)
@ -1553,11 +1546,6 @@ game_boot_result Emulator::Load(const std::string& title_id, bool is_disc_patch,
#endif
sys_log.notice("Used configuration:\n%s\n", g_cfg.to_string());
if (g_use_rtm && (!utils::has_mpx() || utils::has_tsx_force_abort()))
{
sys_log.warning("TSX forced by User");
}
// Initialize patch engine
g_fxo->need<patch_engine>();
@ -1566,14 +1554,6 @@ game_boot_result Emulator::Load(const std::string& title_id, bool is_disc_patch,
g_fxo->get<patch_engine>().append_title_patches(m_title_id);
}
if (g_use_rtm)
{
// Update supplementary settings
const f64 _1ns = utils::get_tsc_freq() / 1000'000'000.;
g_rtm_tx_limit1 = static_cast<u64>(g_cfg.core.tx_limit1_ns * _1ns);
g_rtm_tx_limit2 = static_cast<u64>(g_cfg.core.tx_limit2_ns * _1ns);
}
// Set bdvd_dir
std::string bdvd_dir = g_cfg_vfs.get(g_cfg_vfs.dev_bdvd, rpcs3::utils::get_emu_dir());
{

View file

@ -478,7 +478,3 @@ public:
};
extern Emulator Emu;
extern bool g_use_rtm;
extern u64 g_rtm_tx_limit1;
extern u64 g_rtm_tx_limit2;

View file

@ -8,11 +8,6 @@
cfg_root g_cfg{};
cfg_root g_backup_cfg{};
// Decides the default of the TSX setting: true only when the CPU reports RTM and MPX
// and does not report TSX force-abort
bool cfg_root::node_core::enable_tsx_by_default()
{
	// Queries are made in the same order as before: RTM, then MPX, then force-abort
	if (!utils::has_rtm() || !utils::has_mpx())
	{
		return false;
	}

	return !utils::has_tsx_force_abort();
}
std::string cfg_root::node_sys::get_random_system_name()
{
std::srand(static_cast<u32>(std::time(nullptr)));

View file

@ -12,11 +12,6 @@ struct cfg_root : cfg::node
{
struct node_core : cfg::node
{
private:
/** We don't wanna include the sysinfo header here */
static bool enable_tsx_by_default();
public:
node_core(cfg::node* _this) : cfg::node(_this, "Core") {}
cfg::_enum<ppu_decoder_type> ppu_decoder{ this, "PPU Decoder", ppu_decoder_type::llvm };
@ -62,10 +57,10 @@ struct cfg_root : cfg::node
cfg::_bool spu_verification{ this, "SPU Verification", true }; // Should be enabled
cfg::_bool spu_cache{ this, "SPU Cache", true };
cfg::_bool spu_prof{ this, "SPU Profiler", false };
cfg::_bool ppu_prof{ this, "PPU Profiler", false };
cfg::uint<0, 16> mfc_transfers_shuffling{ this, "MFC Commands Shuffling Limit", 0 };
cfg::uint<0, 10000> mfc_transfers_timeout{ this, "MFC Commands Timeout", 0, true };
cfg::_bool mfc_shuffling_in_steps{ this, "MFC Commands Shuffling In Steps", false, true };
cfg::_enum<tsx_usage> enable_TSX{ this, "Enable TSX", enable_tsx_by_default() ? tsx_usage::enabled : tsx_usage::disabled }; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully
cfg::_enum<xfloat_accuracy> spu_xfloat_accuracy{ this, "XFloat Accuracy", xfloat_accuracy::approximate, false };
cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
@ -84,8 +79,6 @@ struct cfg_root : cfg::node
cfg::_bool hle_lwmutex{ this, "HLE lwmutex" }; // Force alternative lwmutex/lwcond implementation
cfg::uint64 spu_llvm_lower_bound{ this, "SPU LLVM Lower Bound" };
cfg::uint64 spu_llvm_upper_bound{ this, "SPU LLVM Upper Bound", 0xffffffffffffffff };
cfg::uint64 tx_limit1_ns{this, "TSX Transaction First Limit", 800}; // In nanoseconds
cfg::uint64 tx_limit2_ns{this, "TSX Transaction Second Limit", 2000}; // In nanoseconds
cfg::_int<10, 3000> clocks_scale{ this, "Clocks scale", 100 }; // Changing this from 100 (percentage) may affect game speed in unexpected ways
cfg::uint<0, 3000> spu_wakeup_delay{ this, "SPU Wake-Up Delay", 0, true };

View file

@ -196,22 +196,6 @@ void fmt_class_string<screen_quadrant>::format(std::string& out, u64 arg)
});
}
// Formats a tsx_usage value as its display name ("Disabled" / "Enabled" / "Forced")
template <>
void fmt_class_string<tsx_usage>::format(std::string& out, u64 arg)
{
	// Name lookup handed to format_enum; values outside the enum map to `unknown`
	const auto name_of = [](tsx_usage mode) -> const char*
	{
		if (mode == tsx_usage::disabled) return "Disabled";
		if (mode == tsx_usage::enabled) return "Enabled";
		if (mode == tsx_usage::forced) return "Forced";

		return unknown;
	};

	format_enum(out, arg, name_of);
}
template <>
void fmt_class_string<rsx_fifo_mode>::format(std::string& out, u64 arg)
{

View file

@ -248,13 +248,6 @@ enum class rsx_fifo_mode : unsigned
as_ps3,
};
// Value type of the "Enable TSX" core setting; it feeds the g_use_rtm decision made when a game is loaded
enum class tsx_usage
{
disabled, // RTM is never used
enabled, // RTM is used only if the CPU has RTM and MPX and does not report TSX force-abort
forced, // RTM is used whenever the CPU has RTM, even without MPX or with TSX force-abort reported
};
enum class enter_button_assign
{
circle, // CELL_SYSUTIL_ENTER_BUTTON_ASSIGN_CIRCLE

View file

@ -988,14 +988,6 @@ QString emu_settings::GetLocalizedSetting(const QString& original, emu_settings_
case thread_scheduler_mode::os: return tr("Operating System", "Thread Scheduler Mode");
}
break;
case emu_settings_type::EnableTSX:
switch (static_cast<tsx_usage>(index))
{
case tsx_usage::disabled: return tr("Disabled", "Enable TSX");
case tsx_usage::enabled: return tr("Enabled", "Enable TSX");
case tsx_usage::forced: return tr("Forced", "Enable TSX");
}
break;
case emu_settings_type::Renderer:
switch (static_cast<video_renderer>(index))
{

View file

@ -20,7 +20,6 @@ enum class emu_settings_type
MFCDebug,
MaxLLVMThreads,
LLVMPrecompilation,
EnableTSX,
AccurateSpuDMA,
AccurateClineStores,
AccurateRSXAccess,
@ -233,7 +232,6 @@ inline static const std::map<emu_settings_type, cfg_location> settings_location
{ emu_settings_type::MFCDebug, { "Core", "MFC Debug"}},
{ emu_settings_type::MaxLLVMThreads, { "Core", "Max LLVM Compile Threads"}},
{ emu_settings_type::LLVMPrecompilation, { "Core", "LLVM Precompilation"}},
{ emu_settings_type::EnableTSX, { "Core", "Enable TSX"}},
{ emu_settings_type::AccurateSpuDMA, { "Core", "Accurate SPU DMA"}},
{ emu_settings_type::AccurateClineStores, { "Core", "Accurate Cache Line Stores"}},
{ emu_settings_type::AccurateRSXAccess, { "Core", "Accurate RSX reservation access"}},

View file

@ -651,15 +651,29 @@ void kernel_explorer::update()
return fmt::format(" (%.1fs)", wait_time);
};
std::vector<std::pair<s32, std::string>> ppu_threads;
idm::select<named_thread<ppu_thread>>([&](u32 id, ppu_thread& ppu)
{
const auto func = ppu.last_function;
const ppu_thread_status status = lv2_obj::ppu_state(&ppu, false, false).first;
add_leaf(find_node(root, additional_nodes::ppu_threads), QString::fromStdString(fmt::format(u8"PPU 0x%07x: “%s”, PRIO: %d, Joiner: %s, Status: %s, State: %s, %s func: “%s”%s", id, *ppu.ppu_tname.load(), ppu.prio.load().prio, ppu.joiner.load(), status, ppu.state.load()
, ppu.ack_suspend ? "After" : (ppu.current_function ? "In" : "Last"), func ? func : "", get_wait_time_str(ppu.start_time))));
const s32 prio = ppu.prio.load().prio;
std::string prio_text = fmt::format("%4d", prio);
prio_text = fmt::replace_all(prio_text, " ", " ");
// Queue the (priority, description) pair for this PPU thread; the list is sorted and turned into tree leaves afterwards.
// Every field of the description is separated by ", ", including the quoted thread name and "Joiner".
ppu_threads.emplace_back(prio, fmt::format(u8"PPU 0x%07x: PRIO: %s, “%s”, Joiner: %s, Status: %s, State: %s, %s func: “%s”%s", id, prio_text, *ppu.ppu_tname.load(), ppu.joiner.load(), status, ppu.state.load()
, ppu.ack_suspend ? "After" : (ppu.current_function ? "In" : "Last"), func ? func : "", get_wait_time_str(ppu.start_time)));
}, idm::unlocked);
// Sort by priority
std::stable_sort(ppu_threads.begin(), ppu_threads.end(), FN(x.first < y.first));
for (const auto& [prio, text] : ppu_threads)
{
add_leaf(find_node(root, additional_nodes::ppu_threads), QString::fromStdString(text));
}
lock_idm_lv2.reset();
idm::select<named_thread<spu_thread>>([&](u32 /*id*/, spu_thread& spu)

View file

@ -288,79 +288,6 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
SubscribeTooltip(ui->gb_spu_threads, tooltips.settings.preferred_spu_threads);
ui->preferredSPUThreads->setItemText(ui->preferredSPUThreads->findData(0), tr("Auto", "Preferred SPU threads"));
// TSX combo box setup. When utils::has_rtm() reports support, the box is bound to the EnableTSX setting;
// otherwise the box is disabled and the setting is pinned to tsx_usage::disabled.
if (utils::has_rtm())
{
m_emu_settings->EnhanceComboBox(ui->enableTSX, emu_settings_type::EnableTSX);
SubscribeTooltip(ui->gb_tsx, tooltips.settings.enable_tsx);
// Without MPX, or with TSX force-abort, the plain "enabled" entry is taken out of the box.
// The last argument is the setting's default value; how remove_item() uses it is not visible here.
if (!utils::has_mpx() || utils::has_tsx_force_abort())
{
remove_item(ui->enableTSX, static_cast<int>(tsx_usage::enabled), static_cast<int>(g_cfg.core.enable_TSX.def));
}
// Ask for confirmation whenever "forced" gets selected on such a CPU
connect(ui->enableTSX, QOverload<int>::of(&QComboBox::currentIndexChanged), this, [this](int index)
{
// Qt reports a negative index when the box has no current item
if (index < 0) return;
if (const auto [text, value] = get_data(ui->enableTSX, index); value == static_cast<int>(tsx_usage::forced) &&
(!utils::has_mpx() || utils::has_tsx_force_abort()))
{
QString title;
QString message;
// Missing MPX selects the Haswell/Broadwell wording, otherwise the TSX-FA wording is used
if (!utils::has_mpx())
{
title = tr("Haswell/Broadwell TSX Warning");
message = gui::utils::make_paragraph(tr(
"RPCS3 has detected that you are using TSX functions on a Haswell or Broadwell CPU.\n"
"Intel has deactivated these functions in newer Microcode revisions, since they can lead to unpredicted behaviour.\n"
"That means using TSX may break games or even <font color=\"red\"><b>damage</b></font> your data.\n"
"We recommend to disable this feature and update your computer BIOS.\n"
"\n"
"Do you wish to use TSX anyway?"
));
}
else
{
title = tr("TSX-FA Warning");
message = gui::utils::make_paragraph(tr(
"RPCS3 has detected your CPU only supports TSX-FA.\n"
"That means using TSX may break games or even <font color=\"red\"><b>damage</b></font> your data.\n"
"We recommend to disable this feature.\n"
"\n"
"Do you wish to use TSX anyway?"
));
}
// Window-modal critical Yes/No prompt with a rich-text body; "No" is the default button
QMessageBox mb;
mb.setWindowModality(Qt::WindowModal);
mb.setWindowTitle(title);
mb.setIcon(QMessageBox::Critical);
mb.setTextFormat(Qt::RichText);
mb.setStandardButtons(QMessageBox::Yes | QMessageBox::No);
mb.setDefaultButton(QMessageBox::No);
mb.setText(message);
mb.layout()->setSizeConstraint(QLayout::SetFixedSize);
if (mb.exec() == QMessageBox::No)
{
// Reset if the messagebox was answered with no. This prevents the currentIndexChanged signal in EnhanceComboBox
// The box is moved back to the entry that holds the setting's default value
ui->enableTSX->setCurrentIndex(find_item(ui->enableTSX, static_cast<int>(g_cfg.core.enable_TSX.def)));
}
}
});
}
else
{
// No RTM: grey out the box, show a "Not supported" placeholder and store "disabled" as the setting
ui->enableTSX->setEnabled(false);
ui->enableTSX->setPlaceholderText(tr("Not supported", "Enable TSX"));
SubscribeTooltip(ui->enableTSX, tr("Unfortunately, your CPU model does not support this instruction set.", "Enable TSX"));
m_emu_settings->SetSetting(emu_settings_type::EnableTSX, fmt::format("%s", tsx_usage::disabled));
// Store "disabled" again whenever signal_restore_dependant_defaults is emitted
connect(this, &settings_dialog::signal_restore_dependant_defaults, [this]()
{
m_emu_settings->SetSetting(emu_settings_type::EnableTSX, fmt::format("%s", tsx_usage::disabled));
});
}
// PPU tool tips
SubscribeTooltip(ui->ppu__static, tooltips.settings.ppu__static);
SubscribeTooltip(ui->ppu_llvm, tooltips.settings.ppu_llvm);

View file

@ -266,24 +266,6 @@
</item>
<item>
<layout class="QVBoxLayout" name="coreTabRightLayout" stretch="0,0,0,0">
<item>
<widget class="QGroupBox" name="gb_tsx">
<property name="sizePolicy">
<sizepolicy hsizetype="Preferred" vsizetype="Minimum">
<horstretch>0</horstretch>
<verstretch>0</verstretch>
</sizepolicy>
</property>
<property name="title">
<string>TSX Instructions</string>
</property>
<layout class="QVBoxLayout" name="gb_tsx_layout">
<item>
<widget class="QComboBox" name="enableTSX"/>
</item>
</layout>
</widget>
</item>
<item>
<widget class="QGroupBox" name="gb_spu_threads">
<property name="sizePolicy">

View file

@ -91,7 +91,6 @@ public:
const QString xfloat = tr("Control accuracy to SPU float vectors processing.\nFixes bugs in various games at the cost of performance.\nThis setting is only applied when SPU Decoder is set to Dynamic or LLVM.");
const QString enable_thread_scheduler = tr("Control how RPCS3 utilizes the threads of your system.\nEach option heavily depends on the game and on your CPU. It's recommended to try each option to find out which performs the best.\nChanging the thread scheduler is not supported on CPUs with less than 12 threads.");
const QString spu_loop_detection = tr("Try to detect loop conditions in SPU kernels and use them as scheduling hints.\nImproves performance and reduces CPU usage.\nMay cause severe audio stuttering in rare cases.");
const QString enable_tsx = tr("Enable usage of TSX instructions.\nNeeds to be forced on some Haswell or Broadwell CPUs or CPUs with the TSX-FA instruction set.\nForcing TSX in these cases may lead to system and performance instability, use it with caution.");
const QString spu_block_size = tr("This option controls the SPU analyser, particularly the size of compiled units. The Mega and Giga modes may improve performance by tying smaller units together, decreasing the number of compiled units but increasing their size.\nUse the Safe mode for maximum compatibility.");
const QString preferred_spu_threads = tr("Some SPU stages are sensitive to race conditions and allowing a limited number at a time helps alleviate performance stalls.\nSetting this to a smaller value might improve performance and reduce stuttering in some games.\nLeave this on auto if performance is negatively affected when setting a small value.");
const QString max_cpu_preempt = tr("Reduces CPU usage and power consumption, improving battery life on mobile devices. (0 means disabled)\nHigher values cause a more pronounced effect, but may cause audio or performance issues. A value of 50 or less is recommended.\nThis option forces an FPS limit because it's active when framerate is stable.\nThe lighter the game is on the hardware, the more power is saved by it. (until the preemption count barrier is reached)");

View file

@ -5,9 +5,6 @@
#include "util/atomic.hpp"
#include <functional>
extern bool g_use_rtm;
extern u64 g_rtm_tx_limit1;
#ifdef ARCH_X64
#ifdef _MSC_VER
#include <intrin.h>
@ -19,70 +16,6 @@ extern u64 g_rtm_tx_limit1;
namespace utils
{
// Transaction helper (result = pair of success and op result, or just bool)
// Invokes `op` between an xbegin and an xend instruction. Attempts are repeated while g_use_rtm
// is set and the get_tsc() delta since the first attempt does not exceed g_rtm_tx_limit1.
// Returns true (void op) or {true, result} as soon as an attempt reaches xend, and false or
// {false, R()} once it gives up, so a non-void R has to be default-constructible.
// Without ARCH_X64 no attempt is made and the failure value is returned right away.
template <typename F, typename R = std::invoke_result_t<F>>
inline auto tx_start(F op)
{
#if defined(ARCH_X64)
// Abort status of the most recent failed attempt (filled in on the retry path)
uint status = -1;
for (auto stamp0 = get_tsc(), stamp1 = stamp0; g_use_rtm && stamp1 - stamp0 <= g_rtm_tx_limit1; stamp1 = get_tsc())
{
#ifndef _MSC_VER
// xbegin names the 'retry' label as its abort target
__asm__ goto ("xbegin %l[retry];" ::: "memory" : retry);
#else
// The intrinsic returns the status instead; anything but _XBEGIN_STARTED goes to 'retry'
status = _xbegin();
if (status != _XBEGIN_STARTED) [[unlikely]]
{
goto retry;
}
#endif
if constexpr (std::is_void_v<R>)
{
std::invoke(op);
// Commit
#ifndef _MSC_VER
__asm__ volatile ("xend;" ::: "memory");
#else
_xend();
#endif
return true;
}
else
{
auto result = std::invoke(op);
// Commit, then hand the result out together with the success flag
#ifndef _MSC_VER
__asm__ volatile ("xend;" ::: "memory");
#else
_xend();
#endif
return std::make_pair(true, std::move(result));
}
retry:
#ifndef _MSC_VER
// Abort path of the asm variant: copy the status out of EAX
__asm__ volatile ("movl %%eax, %0;" : "=r" (status) :: "memory");
#endif
// A zero status ends the retry loop early
// NOTE(review): presumably zero means the CPU gave no hint that a retry could succeed - confirm against the Intel SDM
if (!status) [[unlikely]]
{
break;
}
}
#else
// Non-x64 build: op is intentionally unused
static_cast<void>(op);
#endif
// Failure result: no attempt committed (or none was made)
if constexpr (std::is_void_v<R>)
{
return false;
}
else
{
return std::make_pair(false, R());
}
};
// Try to prefetch to Level 2 cache since it's not split to data/code on most processors
template <typename T>
constexpr void prefetch_exec(T func)