diff --git a/rpcs3/Emu/Cell/PPUTranslator.cpp b/rpcs3/Emu/Cell/PPUTranslator.cpp
index e59f14892b..6d4766a947 100644
--- a/rpcs3/Emu/Cell/PPUTranslator.cpp
+++ b/rpcs3/Emu/Cell/PPUTranslator.cpp
@@ -595,7 +595,7 @@ void PPUTranslator::CallFunction(u64 target, Value* indirect)
 
         if (g_cfg.core.ppu_prof)
         {
-            m_ir->CreateStore(m_ir->getInt32(target_last), m_ir->CreateStructGEP(m_thread_type, m_thread, static_cast(&m_cia - m_locals)));
+            m_ir->CreateStore(GetAddr(target_last - m_addr), m_ir->CreateStructGEP(m_thread_type, m_thread, static_cast(&m_cia - m_locals)));
         }
     }
 }
diff --git a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
index 9a192989e0..e3455fcb2d 100644
--- a/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPUCommonRecompiler.cpp
@@ -3069,6 +3069,39 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
             values[op.rt] = pos + 4;
         }
 
+        const u32 pos_next = wa;
+
+        bool is_no_return = false;
+
+        if (pos_next >= lsa && pos_next < limit)
+        {
+            const u32 data_next = ls[pos_next / 4];
+            const auto type_next = g_spu_itype.decode(data_next);
+            const auto flag_next = g_spu_iflag.decode(data_next);
+            const auto op_next = spu_opcode_t{data_next};
+
+            if (!(type_next & spu_itype::zregmod) && !(type_next & spu_itype::branch))
+            {
+                if (auto iflags = g_spu_iflag.decode(data_next))
+                {
+                    if (+flag_next & +spu_iflag::use_ra)
+                    {
+                        is_no_return = is_no_return || (op_next.ra >= 4 && op_next.ra < 10);
+                    }
+
+                    if (+flag_next & +spu_iflag::use_rb)
+                    {
+                        is_no_return = is_no_return || (op_next.rb >= 4 && op_next.rb < 10);
+                    }
+
+                    if (type_next & spu_itype::_quadrop && +iflags & +spu_iflag::use_rc)
+                    {
+                        is_no_return = is_no_return || (op_next.ra >= 4 && op_next.rb < 10);
+                    }
+                }
+            }
+        }
+
         if (af & vf::is_const)
         {
             const u32 target = spu_branch_target(av);
@@ -3105,7 +3138,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
                 limit = std::min(limit, target);
             }
 
-            if (sl && g_cfg.core.spu_block_size != spu_block_size_type::safe)
+            if (!is_no_return && sl && g_cfg.core.spu_block_size != spu_block_size_type::safe)
             {
                 m_ret_info[pos / 4 + 1] = true;
                 m_entry_info[pos / 4 + 1] = true;
@@ -3294,9 +3327,9 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
                 spu_log.notice("[0x%x] At 0x%x: ignoring indirect branch (SYNC)", entry_point, pos);
             }
 
-            if (type == spu_itype::BI || sl)
+            if (type == spu_itype::BI || sl || is_no_return)
             {
-                if (type == spu_itype::BI || g_cfg.core.spu_block_size == spu_block_size_type::safe)
+                if (type == spu_itype::BI || g_cfg.core.spu_block_size == spu_block_size_type::safe || is_no_return)
                 {
                     m_targets[pos];
                 }
@@ -3333,9 +3366,42 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
                     break;
                 }
 
+                const u32 pos_next = wa;
+
+                bool is_no_return = false;
+
+                if (pos_next >= lsa && pos_next < limit)
+                {
+                    const u32 data_next = ls[pos_next / 4];
+                    const auto type_next = g_spu_itype.decode(data_next);
+                    const auto flag_next = g_spu_iflag.decode(data_next);
+                    const auto op_next = spu_opcode_t{data_next};
+
+                    if (!(type_next & spu_itype::zregmod) && !(type_next & spu_itype::branch))
+                    {
+                        if (auto iflags = g_spu_iflag.decode(data_next))
+                        {
+                            if (+flag_next & +spu_iflag::use_ra)
+                            {
+                                is_no_return = is_no_return || (op_next.ra >= 4 && op_next.ra < 10);
+                            }
+
+                            if (+flag_next & +spu_iflag::use_rb)
+                            {
+                                is_no_return = is_no_return || (op_next.rb >= 4 && op_next.rb < 10);
+                            }
+
+                            if (type_next & spu_itype::_quadrop && +iflags & +spu_iflag::use_rc)
+                            {
+                                is_no_return = is_no_return || (op_next.rc >= 4 && op_next.rc < 10);
+                            }
+                        }
+                    }
+                }
+
                 m_targets[pos].push_back(target);
 
-                if (g_cfg.core.spu_block_size != spu_block_size_type::safe)
+                if (!is_no_return && g_cfg.core.spu_block_size != spu_block_size_type::safe)
                 {
                     m_ret_info[pos / 4 + 1] = true;
                     m_entry_info[pos / 4 + 1] = true;
@@ -3343,7 +3409,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
                     add_block(pos + 4);
                 }
 
-                if (g_cfg.core.spu_block_size == spu_block_size_type::giga && !sync)
+                if (!is_no_return && g_cfg.core.spu_block_size == spu_block_size_type::giga && !sync)
                 {
                     m_entry_info[target / 4] = true;
                     add_block(target);
@@ -4903,20 +4969,27 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
         bool select_16_or_0_at_runtime = false;
         bool put_active = false; // PUTLLC happened
         bool get_rdatomic = false; // True if MFC_RdAtomicStat was read after GETLLAR
+        u32 required_pc = SPU_LS_SIZE; // Require program to be location specific for this optimization (SPU_LS_SIZE - no requirement)
         u32 mem_count = 0;
+        u32 break_cause = 100;
+        u32 break_pc = SPU_LS_SIZE;
 
         // Return old state for error reporting
         atomic16_t discard()
         {
             const u32 pc = lsa_pc;
             const u32 last_pc = lsa_last_pc;
+            const u32 cause = break_cause;
+            const u32 break_pos = break_pc;
 
             const atomic16_t old = *this;
             *this = atomic16_t{};
 
             // Keep some members
-            lsa_pc = pc;
-            lsa_last_pc = last_pc;
+            this->lsa_pc = pc;
+            this->lsa_last_pc = last_pc;
+            this->break_cause = cause;
+            this->break_pc = break_pos;
 
             return old;
         }
@@ -5123,15 +5196,17 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
         {
             if (previous.active && likely_putllc_loop && getllar_starts.contains(previous.lsa_pc))
             {
-                const bool is_first = !std::exchange(getllar_starts[previous.lsa_pc], true);
+                had_putllc_evaluation = true;
 
-                if (!is_first)
+                if (cause != 24)
                 {
+                    atomic16->break_cause = cause;
+                    atomic16->break_pc = pos;
                     return;
                 }
 
-                had_putllc_evaluation = true;
-
+                cause = atomic16->break_cause;
+                getllar_starts[previous.lsa_pc] = true;
                 g_fxo->get().breaking_reason[cause]++;
 
                 if (!spu_log.notice)
                 {
@@ -5139,7 +5214,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
                     return;
                 }
 
-                std::string break_error = fmt::format("PUTLLC pattern breakage [%x mem=%d lsa_const=%d cause=%u] (lsa_pc=0x%x)", pos, previous.mem_count, u32{!previous.ls_offs.is_const()} * 2 + previous.lsa.is_const(), cause, previous.lsa_pc);
+                std::string break_error = fmt::format("PUTLLC pattern breakage [%x mem=%d lsa_const=%d cause=%u] (lsa_pc=0x%x)", atomic16->break_pc, previous.mem_count, u32{!previous.ls_offs.is_const()} * 2 + previous.lsa.is_const(), cause, previous.lsa_pc);
 
                 const auto values = sort_breakig_reasons(g_fxo->get().breaking_reason);
@@ -6301,6 +6376,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
                 existing.ls_invalid |= atomic16->ls_invalid;
                 existing.ls_access |= atomic16->ls_access;
                 existing.mem_count = std::max(existing.mem_count, atomic16->mem_count);
+                existing.required_pc = std::min(existing.required_pc, atomic16->required_pc);
                 existing.select_16_or_0_at_runtime |= atomic16->select_16_or_0_at_runtime;
             }
@@ -6315,6 +6391,24 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
                     invalidate = false;
                 }
             }
+            else if (atomic16->break_cause != 100 && atomic16->lsa_pc != SPU_LS_SIZE)
+            {
+                const auto it = atomic16_all.find(pos);
+
+                if (it == atomic16_all.end())
+                {
+                    // Ensure future failure
+                    atomic16_all.emplace(pos, *atomic16);
+                    break_putllc16(24, FN(x.active = true, x)(as_rvalue(*atomic16)));
+                }
+                else if (it->second.active && atomic16->break_cause != 100)
+                {
+                    it->second = *atomic16;
+                    break_putllc16(24, FN(x.active = true, x)(as_rvalue(*atomic16)));
+                }
+
+                atomic16->break_cause = 100;
+            }
 
             break;
         }
@@ -6385,6 +6479,10 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
                 // Do not clear lower 16 bytes addressing because the program can move on 4-byte basis
                 const u32 offs = spu_branch_target(pos - result.lower_bound, op.si16);
+                const u32 true_offs = spu_branch_target(pos, op.si16);
+
+                // Make this optimization depend on the location of the program
+                atomic16->required_pc = result.lower_bound;
 
                 if (atomic16->lsa.is_const() && [&]()
                 {
@@ -6409,6 +6507,10 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
                 {
                     // Ignore memory access in this case
                 }
+                else if (atomic16->lsa.is_const() && !atomic16->lsa.compare_with_mask_indifference(true_offs, SPU_LS_MASK_128))
+                {
+                    // Same
+                }
                 else if (atomic16->ls_invalid && is_store)
                 {
                     break_putllc16(35, atomic16->set_invalid_ls(is_store));
@@ -7162,27 +7264,33 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
                 continue;
             }
 
+            union putllc16_or_0_info
+            {
+                u64 data;
+                bf_t required_pc;
+                bf_t type;
+                bf_t runtime16_select;
+                bf_t no_notify;
+                bf_t reg;
+                bf_t off18;
+                bf_t reg2;
+            } value{};
+
             auto& stats = g_fxo->get();
 
             had_putllc_evaluation = true;
 
             if (!pattern.ls_write)
             {
+                if (pattern.required_pc != SPU_LS_SIZE)
+                {
+                    value.required_pc = pattern.required_pc;
+                }
+
                 spu_log.success("PUTLLC0 Pattern Detected! (put_pc=0x%x, %s) (putllc0=%d, putllc16+0=%d, all=%d)", pattern.put_pc, func_hash, ++stats.nowrite, ++stats.single, +stats.all);
-                add_pattern(false, inst_attr::putllc0, pattern.put_pc - lsa);
+                add_pattern(false, inst_attr::putllc0, pattern.put_pc - lsa, value.data);
                 continue;
             }
 
-            union putllc16_info
-            {
-                u32 data;
-                bf_t type;
-                bf_t runtime16_select;
-                bf_t no_notify;
-                bf_t reg;
-                bf_t off18;
-                bf_t reg2;
-            } value{};
-
             enum : u32
             {
                 v_const = 0,
@@ -7213,6 +7321,11 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
             value.runtime16_select = pattern.select_16_or_0_at_runtime;
             value.reg = s_reg_max;
 
+            if (pattern.required_pc != SPU_LS_SIZE)
+            {
+                value.required_pc = pattern.required_pc;
+            }
+
             if (pattern.ls.is_const())
             {
                 ensure(pattern.reg == s_reg_max && pattern.reg2 == s_reg_max && pattern.ls_offs.is_const(), "Unexpected register usage");
@@ -7244,7 +7357,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
             if (g_cfg.core.spu_accurate_reservations)
             {
                 // Because enabling it is a hack, as it turns out
-                continue;
+                // continue;
             }
 
             add_pattern(false, inst_attr::putllc16, pattern.put_pc - result.entry_point, value.data);
@@ -7268,7 +7381,7 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
 
             if (inst_attr attr = m_inst_attrs[(read_pc - entry_point) / 4]; attr == inst_attr::none)
             {
-                add_pattern(false, inst_attr::rchcnt_loop, read_pc - result.entry_point);
+                add_pattern(false, inst_attr::rchcnt_loop, read_pc - result.entry_point, 0);
 
                 spu_log.error("Channel Loop Pattern Detected! Report to developers! (read_pc=0x%x, branch_pc=0x%x, branch_target=0x%x, 0x%x-%s)", read_pc, pattern.branch_pc, pattern.branch_target, entry_point, func_hash);
             }
@@ -7284,6 +7397,13 @@ spu_program spu_recompiler_base::analyse(const be_t* ls, u32 entry_point, s
         // Blocks starting from 0x0 or invalid instruction won't be compiled, may need special interpreter fallback
     }
 
+    if (!m_patterns.empty())
+    {
+        std::string out_dump;
+        dump(result, out_dump);
+        spu_log.notice("Dump SPU Function with pattern(s):\n%s", out_dump);
+    }
+
     for (u32 i = 0; i < result.data.size(); i++)
     {
         const be_t ls_val = ls[result.lower_bound / 4 + i];
@@ -8346,19 +8466,10 @@ std::array& block_reg_info::evaluate_start_state(const s
     return walkby_state;
 }
 
-void spu_recompiler_base::add_pattern(bool fill_all, inst_attr attr, u32 start, u32 end)
+void spu_recompiler_base::add_pattern(bool fill_all, inst_attr attr, u32 start, u64 info)
 {
-    if (end == umax)
-    {
-        end = start;
-    }
-
-    m_patterns[start] = pattern_info{utils::address_range32::start_end(start, end)};
-
-    for (u32 i = start; i <= (fill_all ? end : start); i += 4)
-    {
-        m_inst_attrs[i / 4] = attr;
-    }
+    m_patterns[start] = pattern_info{info};
+    m_inst_attrs[start / 4] = attr;
 }
 
 extern std::string format_spu_func_info(u32 addr, cpu_thread* spu)
diff --git a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
index 8b820ca600..989ba2e84f 100644
--- a/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
+++ b/rpcs3/Emu/Cell/SPULLVMRecompiler.cpp
@@ -1080,7 +1080,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
         m_ir->SetInsertPoint(_body);
     }
 
-    void putllc16_pattern(const spu_program& /*prog*/, utils::address_range32 range)
+    void putllc16_pattern(const spu_program& /*prog*/, u64 pattern_info)
     {
         // Prevent store elimination
         m_block->store_context_ctr[s_reg_mfc_eal]++;
@@ -1109,16 +1109,17 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
             }
         };
 
-        const union putllc16_info
+        const union putllc16_or_0_info
         {
-            u32 data;
-            bf_t type;
-            bf_t runtime16_select;
-            bf_t no_notify;
-            bf_t reg;
-            bf_t off18;
-            bf_t reg2;
-        } info = std::bit_cast(range.end);
+            u64 data;
+            bf_t required_pc;
+            bf_t type;
+            bf_t runtime16_select;
+            bf_t no_notify;
+            bf_t reg;
+            bf_t off18;
+            bf_t reg2;
+        } info = std::bit_cast(pattern_info);
 
         enum : u32
         {
@@ -1150,8 +1151,10 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
         value_t eal_val;
         eal_val.value = _eal;
 
-        auto get_reg32 = [&](u32 reg)
+        auto get_reg32 = [&](u64 reg_)
         {
+            const u32 reg = static_cast(reg_);
+
             if (get_reg_type(reg) != get_type())
             {
                 return get_reg_fixed(reg, get_type());
@@ -1170,6 +1173,19 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
         }
         else if (info.type == v_relative)
         {
+            if (info.required_pc && info.required_pc != SPU_LS_SIZE)
+            {
+                const auto short_op = llvm::BasicBlock::Create(m_context, "__putllc16_short_op", m_function);
+                const auto heavy_op = llvm::BasicBlock::Create(m_context, "__putllc16_heavy_op", m_function);
+
+                m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->getInt32(info.required_pc), m_base_pc), heavy_op, short_op);
+                m_ir->SetInsertPoint(heavy_op);
+                update_pc();
+                call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread);
+                m_ir->CreateBr(_final);
+                m_ir->SetInsertPoint(short_op);
+            }
+
             dest = m_ir->CreateAnd(get_pc(spu_branch_target(info.off18 + m_base)), 0x3fff0);
         }
         else if (info.type == v_reg_offs)
@@ -1268,17 +1284,18 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
         const auto _new = m_ir->CreateAlignedLoad(get_type(), _ptr(m_lsptr, dest), llvm::MaybeAlign{16});
         const auto _rdata = m_ir->CreateAlignedLoad(get_type(), _ptr(spu_ptr(&spu_thread::rdata), m_ir->CreateAnd(diff, 0x70)), llvm::MaybeAlign{16});
 
-        const bool is_accurate_op = !!g_cfg.core.spu_accurate_reservations;
+        const bool is_accurate_op = true || !!g_cfg.core.spu_accurate_reservations;
 
-        const auto compare_data_change_res = is_accurate_op ? m_ir->getTrue() : m_ir->CreateICmpNE(_new, _rdata);
+        const auto compare_data_change_res = m_ir->CreateICmpNE(_new, _rdata);
+        const auto second_test_for_complete_op = is_accurate_op ? m_ir->getTrue() : compare_data_change_res;
 
         if (info.runtime16_select)
         {
-            m_ir->CreateCondBr(m_ir->CreateAnd(m_ir->CreateICmpULT(diff, m_ir->getInt64(128)), compare_data_change_res), _begin_op, _inc_res, m_md_likely);
+            m_ir->CreateCondBr(m_ir->CreateAnd(m_ir->CreateICmpULT(diff, m_ir->getInt64(128)), second_test_for_complete_op), _begin_op, _inc_res, m_md_likely);
         }
         else
         {
-            m_ir->CreateCondBr(compare_data_change_res, _begin_op, _inc_res, m_md_unlikely);
+            m_ir->CreateCondBr(second_test_for_complete_op, _begin_op, _inc_res, m_md_unlikely);
         }
 
         m_ir->SetInsertPoint(_begin_op);
@@ -1323,7 +1340,14 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 
         if (!info.no_notify)
         {
+            const auto notify_block = llvm::BasicBlock::Create(m_context, "__putllc16_block_notify", m_function);
+            const auto notify_next = llvm::BasicBlock::Create(m_context, "__putllc16_block_notify_next", m_function);
+
+            m_ir->CreateCondBr(compare_data_change_res, notify_block, notify_next);
+            m_ir->SetInsertPoint(notify_block);
             call("atomic_wait_engine::notify_all", static_cast(atomic_wait_engine::notify_all), rptr);
+            m_ir->CreateBr(notify_next);
+            m_ir->SetInsertPoint(notify_next);
         }
 
         m_ir->CreateBr(_success);
@@ -1373,7 +1397,7 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
         m_ir->SetInsertPoint(_final);
     }
 
-    void putllc0_pattern(const spu_program& /*prog*/, utils::address_range32 /*range*/)
+    void putllc0_pattern(const spu_program& /*prog*/, u64 pattern_info)
     {
         // Prevent store elimination
         m_block->store_context_ctr[s_reg_mfc_eal]++;
@@ -1401,6 +1425,18 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
             }
         };
 
+        const union putllc16_or_0_info
+        {
+            u64 data;
+            bf_t required_pc;
+            bf_t type;
+            bf_t runtime16_select;
+            bf_t no_notify;
+            bf_t reg;
+            bf_t off18;
+            bf_t reg2;
+        } info = std::bit_cast(pattern_info);
+
         const auto _next = llvm::BasicBlock::Create(m_context, "", m_function);
         const auto _next0 = llvm::BasicBlock::Create(m_context, "", m_function);
         const auto _fail = llvm::BasicBlock::Create(m_context, "", m_function);
@@ -1409,6 +1445,19 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
         const auto _eal = (get_reg_fixed(s_reg_mfc_eal) & -128).eval(m_ir);
         const auto _raddr = m_ir->CreateLoad(get_type(), spu_ptr(&spu_thread::raddr));
 
+        if (info.required_pc && info.required_pc != SPU_LS_SIZE)
+        {
+            const auto short_op = llvm::BasicBlock::Create(m_context, "__putllc0_short_op", m_function);
+            const auto heavy_op = llvm::BasicBlock::Create(m_context, "__putllc0_heavy_op", m_function);
+
+            m_ir->CreateCondBr(m_ir->CreateICmpNE(m_ir->getInt32(info.required_pc), m_base_pc), heavy_op, short_op);
+            m_ir->SetInsertPoint(heavy_op);
+            update_pc();
+            call("spu_exec_mfc_cmd", &exec_mfc_cmd, m_thread);
+            m_ir->CreateBr(_final);
+            m_ir->SetInsertPoint(short_op);
+        }
+
         m_ir->CreateCondBr(m_ir->CreateAnd(m_ir->CreateICmpEQ(_eal, _raddr), m_ir->CreateIsNotNull(_raddr)), _next, _fail, m_md_likely);
         m_ir->SetInsertPoint(_next);
@@ -2143,12 +2192,12 @@ public:
             {
             case inst_attr::putllc0:
             {
-                putllc0_pattern(func, m_patterns.at(m_pos - start).range);
+                putllc0_pattern(func, m_patterns.at(m_pos - start).info);
                 continue;
             }
            case inst_attr::putllc16:
            {
-                putllc16_pattern(func, m_patterns.at(m_pos - start).range);
+                putllc16_pattern(func, m_patterns.at(m_pos - start).info);
                 continue;
            }
            case inst_attr::omit:
diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h
index ddee888b1e..6bddb5a035 100644
--- a/rpcs3/Emu/Cell/SPURecompiler.h
+++ b/rpcs3/Emu/Cell/SPURecompiler.h
@@ -397,12 +397,12 @@ protected:
 
     struct pattern_info
     {
-        utils::address_range32 range;
+        u64 info;
     };
 
     std::unordered_map m_patterns;
 
-    void add_pattern(bool fill_all, inst_attr attr, u32 start, u32 end = -1);
+    void add_pattern(bool fill_all, inst_attr attr, u32 start, u64 info);
 
 private:
     // For private use
diff --git a/rpcs3/tests/test_simple_array.cpp b/rpcs3/tests/test_simple_array.cpp
index 05bba60f4a..8d64599b96 100644
--- a/rpcs3/tests/test_simple_array.cpp
+++ b/rpcs3/tests/test_simple_array.cpp
@@ -341,7 +341,6 @@ namespace rsx
     {
         const auto ptr_value = reinterpret_cast(ptr2);
         rsx::aligned_allocator::free(ptr2);
 
-        EXPECT_NE(ptr, ptr2);
         EXPECT_NE(ptr_value, 0);
         EXPECT_EQ(ptr_value % 256, 0);
     }