diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp index c9d784d3ac..d23903997f 100644 --- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp +++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp @@ -720,9 +720,19 @@ void spu_cache::initialize(bool build_existing_cache) } // SPU cache file (version + block size type) - const std::string loc = ppu_cache + "spu-" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v1-tane.dat"; + const std::string filename = "spu-" + fmt::to_lower(g_cfg.core.spu_block_size.to_string()) + "-v1-tane.dat"; + const std::string loc = ppu_cache + filename; + const std::string loc_debug = fs::get_cache_dir() + "DEBUG/" + filename; - spu_cache cache(loc); + bool is_debug = false; + + if (fs::is_file(loc_debug)) + { + spu_log.success("SPU Cache override applied!"); + is_debug = true; + } + + spu_cache cache(is_debug ? loc_debug : loc); if (!cache) { @@ -4963,6 +4973,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s u32 lsa_last_pc = SPU_LS_SIZE; // PC of first LSA write u32 get_pc = SPU_LS_SIZE; // PC of GETLLAR u32 put_pc = SPU_LS_SIZE; // PC of PUTLLC + u32 rdatomic_pc = SPU_LS_SIZE; // PC of last RdAtomcStat read reg_state_t ls{}; // state of LS load/store address register reg_state_t ls_offs = reg_state_t::from_value(0); // Added value to ls reg_state_t lsa{}; // state of LSA register on GETLLAR @@ -5008,7 +5019,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s ls_invalid = true; ls_write |= write; - if (write) + if (ls_write) { return discard(); } @@ -6323,6 +6334,8 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s break; } + atomic16->rdatomic_pc = pos; + const auto it = atomic16_all.find(pos); if (it == atomic16_all.end()) @@ -7263,7 +7276,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s for (const auto& [pc_commited, pattern] : atomic16_all) { - if (!pattern.active) + if (!pattern.active || pattern.lsa_pc >= pattern.rdatomic_pc) { continue; } @@ -7273,6 +7286,17 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s continue; } + std::string pattern_hash; + { + sha1_context ctx; + u8 output[20]{}; + + sha1_starts(&ctx); + sha1_update(&ctx, reinterpret_cast(result.data.data()) + (pattern.lsa_pc - result.lower_bound), pattern.rdatomic_pc - pattern.lsa_pc); + sha1_finish(&ctx, output); + fmt::append(pattern_hash, "%s", fmt::base57(output)); + } + union putllc16_or_0_info { u64 data; @@ -7295,8 +7319,8 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s value.required_pc = pattern.required_pc; } - spu_log.success("PUTLLC0 Pattern Detected! (put_pc=0x%x, %s) (putllc0=%d, putllc16+0=%d, all=%d)", pattern.put_pc, func_hash, ++stats.nowrite, ++stats.single, +stats.all); - add_pattern(false, inst_attr::putllc0, pattern.put_pc - lsa, value.data); + // spu_log.success("PUTLLC0 Pattern Detected! (put_pc=0x%x, %s) (putllc0=%d, putllc16+0=%d, all=%d)", pattern.put_pc, func_hash, ++stats.nowrite, ++stats.single, +stats.all); + // add_pattern(false, inst_attr::putllc0, pattern.put_pc - lsa, value.data); continue; } @@ -7363,16 +7387,35 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s value.reg2 = pattern.reg2; } + bool allow_pattern = true; + if (g_cfg.core.spu_accurate_reservations) { - // Because enabling it is a hack, as it turns out - // continue; + // The problem with PUTLLC16 optimization, that it is in theory correct at the bounds of the spu function. + // But if the SPU code reuses the cache line data observed, it is not truly atomic. + // So we may enable it only for known cases where SPU atomic data is not used after the function leaves. + + // So the two options are: + + // 1. Atomic compare exchange 16 bytes operation. (rest of data is not read) -> good for RPCS3 to optimize. + // 2. Fetch 128 bytes (read them later), modify only 16 bytes. -> Bad for RPCS3 to optimize. + + // This difference cannot be known at analyzer time but from observing callers. + static constexpr std::initializer_list allowed_patterns = + { + "620oYSe8uQqq9eTkhWfMqoEXX0us"sv, // CellSpurs JobChain acquire pattern + }; + + allow_pattern = std::any_of(allowed_patterns.begin(), allowed_patterns.end(), FN(pattern_hash == x)); } - add_pattern(false, inst_attr::putllc16, pattern.put_pc - result.entry_point, value.data); + if (allow_pattern) + { + add_pattern(false, inst_attr::putllc16, pattern.put_pc - result.entry_point, value.data); + } - spu_log.success("PUTLLC16 Pattern Detected! (mem_count=%d, put_pc=0x%x, pc_rel=%d, offset=0x%x, const=%u, two_regs=%d, reg=%u, runtime=%d, 0x%x-%s) (putllc0=%d, putllc16+0=%d, all=%d)" - , pattern.mem_count, pattern.put_pc, value.type == v_relative, value.off18, value.type == v_const, value.type == v_reg2, value.reg, value.runtime16_select, entry_point, func_hash, +stats.nowrite, ++stats.single, +stats.all); + spu_log.success("PUTLLC16 Pattern Detected! (mem_count=%d, put_pc=0x%x, pc_rel=%d, offset=0x%x, const=%u, two_regs=%d, reg=%u, runtime=%d, 0x%x-%s, pattern-hash=%s) (putllc0=%d, putllc16+0=%d, all=%d)" + , pattern.mem_count, pattern.put_pc, value.type == v_relative, value.off18, value.type == v_const, value.type == v_reg2, value.reg, value.runtime16_select, entry_point, func_hash, pattern_hash, +stats.nowrite, ++stats.single, +stats.all); } for (const auto& [read_pc, pattern] : rchcnt_loop_all)