From 1e6fe1f4ab406a8980695eddc8178dfe28931e09 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 7 Dec 2025 23:59:48 +0300 Subject: [PATCH] rsx/cfg/fp: Add delay-slot detection to remove unnecessary barriers - Reduces emitted barriers by like 99% --- .../Passes/FP/RegisterAnnotationPass.cpp | 42 ++++++++++++++++++- .../Passes/FP/RegisterAnnotationPass.h | 12 +++++- .../RSX/Program/FragmentProgramDecompiler.cpp | 2 +- rpcs3/tests/test_rsx_fp_asm.cpp | 26 ++++++++++++ 4 files changed, 77 insertions(+), 5 deletions(-) diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp index b96856b7a6..58a589c6f7 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp +++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.cpp @@ -1,6 +1,7 @@ #include "stdafx.h" #include "RegisterAnnotationPass.h" #include "Emu/RSX/Program/Assembler/FPOpcodes.h" +#include "Emu/RSX/Program/RSXFragmentProgram.h" #include #include @@ -13,6 +14,38 @@ namespace rsx::assembler::FP static constexpr char content_float16 = 'H'; static constexpr char content_dual = 'D'; + bool is_delay_slot(const Instruction& instruction) + { + OPDEST dst{ .HEX = instruction.bytecode[0] }; + SRC0 src0{ .HEX = instruction.bytecode[1] }; + SRC1 src1{ .HEX = instruction.bytecode[2] }; + + if (dst.opcode != RSX_FP_OPCODE_MOV || // These slots are always populated with MOV + dst.no_dest || // Must have a sink + src0.reg_type != RSX_FP_REGISTER_TYPE_TEMP || // Must read from reg + dst.dest_reg != src0.tmp_reg_index || // Must be a write-to-self. NOTE(review): only the register indices are compared; src0.fp16 is not examined here - confirm a half-register source with a matching index cannot pass this test + dst.fp16 || // Always full lane. 
We need to collect more data on this but it won't matter + dst.saturate || // Precision modifier + (dst.prec != RSX_FP_PRECISION_REAL && + dst.prec != RSX_FP_PRECISION_UNKNOWN)) // Cannot have precision modifiers. NOTE(review): a condition-code update flag on the destination (if OPDEST carries one) is not examined here - confirm such a MOV is still safe to skip + { + return false; + } + + // Check if we have precision modifiers on the source + if (src0.abs || src0.neg || src1.scale) + { + return false; + } + + if (dst.mask_x && src0.swizzle_x != 0) return false; + if (dst.mask_y && src0.swizzle_y != 1) return false; + if (dst.mask_z && src0.swizzle_z != 2) return false; + if (dst.mask_w && src0.swizzle_w != 3) return false; + + return true; + } + std::vector compile_register_file(const std::array& file) { std::vector results; @@ -90,10 +123,15 @@ namespace rsx::assembler::FP } // Decay instructions into register references - void annotate_instructions(BasicBlock* block, const RSXFragmentProgram& prog) + void annotate_instructions(BasicBlock* block, const RSXFragmentProgram& prog, bool skip_delay_slots) { for (auto& instruction : block->instructions) { + if (skip_delay_slots && is_delay_slot(instruction)) + { + continue; + } + const u32 operand_count = get_operand_count(static_cast(instruction.opcode)); for (u32 i = 0; i < operand_count; i++) { @@ -178,7 +216,7 @@ namespace rsx::assembler::FP { for (auto& block : graph.blocks) { - annotate_instructions(&block, m_prog); + annotate_instructions(&block, m_prog, m_config.skip_delay_slots); annotate_block_io(&block); } } diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h index c719a80381..b5cab3da85 100644 --- a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h +++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h @@ -6,6 +6,11 @@ struct RSXFragmentProgram; namespace rsx::assembler::FP { + struct RegisterAnnotationPassOptions + { + bool skip_delay_slots = false; // When enabled, detect delay slots and skip annotating them. 
+ }; + // The annotation pass annotates each basic block with 2 pieces of information: // 1. The "input" register list for a block. // 2. The "output" register list for a block (clobber list). @@ -14,13 +19,16 @@ namespace rsx::assembler::FP class RegisterAnnotationPass : public CFGPass { public: - RegisterAnnotationPass(const RSXFragmentProgram& prog) - : m_prog(prog) + RegisterAnnotationPass( + const RSXFragmentProgram& prog, + const RegisterAnnotationPassOptions& options = {}) + : m_prog(prog), m_config(options) {} void run(FlowGraph& graph) override; private: const RSXFragmentProgram& m_prog; + RegisterAnnotationPassOptions m_config; }; } diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp index 73e907130e..a524773e83 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp @@ -1294,7 +1294,7 @@ std::string FragmentProgramDecompiler::Decompile() const auto rop_inputs = get_fragment_program_output_set(m_prog.ctrl, m_prog.mrt_buffers_count); rop_block->input_list.insert(rop_block->input_list.end(), rop_inputs.begin(), rop_inputs.end()); - FP::RegisterAnnotationPass annotation_pass{ m_prog }; + FP::RegisterAnnotationPass annotation_pass{ m_prog, { .skip_delay_slots = true } }; FP::RegisterDependencyPass dependency_pass{}; annotation_pass.run(graph); diff --git a/rpcs3/tests/test_rsx_fp_asm.cpp b/rpcs3/tests/test_rsx_fp_asm.cpp index be4f19abc6..c30f5ff172 100644 --- a/rpcs3/tests/test_rsx_fp_asm.cpp +++ b/rpcs3/tests/test_rsx_fp_asm.cpp @@ -568,4 +568,30 @@ namespace rsx::assembler EXPECT_EQ(src1.fp16, 1); EXPECT_EQ(src1.swizzle_x, 1); } + + TEST(TestFPIR, RegisterDependencyPass_SkipDelaySlots) + { + // Instruction 2 clobbers H1 which in turn clobbers R0. + // Instruction 3 reads from R0 but is a delay slot that does nothing and can be NOPed. NOTE(review): the `MOV H1, R1` source line below has no trailing ';' - confirm CFG_from_source accepts it. 
+ auto graph = CFG_from_source(R"( + ADD R1, R0, R1; + MOV H1, R1 + MOV R0, R0; + )"); + + ASSERT_EQ(graph.blocks.size(), 1); + ASSERT_EQ(graph.blocks.front().instructions.size(), 3); + + auto& block = graph.blocks.front(); + RSXFragmentProgram prog{}; + + FP::RegisterAnnotationPass annotation_pass{ prog, { .skip_delay_slots = true } }; + FP::RegisterDependencyPass deps_pass{}; + + annotation_pass.run(graph); + deps_pass.run(graph); + + // Delay slot detection will cause no dependency injection + ASSERT_EQ(block.instructions.size(), 3); + } }