diff --git a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp
index 1e60c85519..33bad149ea 100644
--- a/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp
+++ b/rpcs3/Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.cpp
@@ -1,10 +1,301 @@
 #include "stdafx.h"
 #include "RegisterDependencyPass.h"
+#include "Emu/RSX/Program/Assembler/FPOpcodes.h"
+#include "Emu/RSX/Program/RSXFragmentProgram.h"
+
+#include <unordered_set>
 
 namespace rsx::assembler::FP
 {
+	static constexpr u32 register_file_length = 48 * 8; // 24 F32 or 48 F16 registers
+	static constexpr char content_unknown = 0;
+	static constexpr char content_float32 = 'R';
+	static constexpr char content_float16 = 'H';
+	static constexpr char content_dual = 'D';
+
+	// Groups register-file byte offsets into one RegisterRef per affected 16-bit (H) register.
+	// Mask bit N is set when the half-float lane at byte offset N * 2 of that register is listed.
+	std::vector<RegisterRef> decode_lanes16(const std::unordered_set<u32>& lanes)
+	{
+		std::vector<RegisterRef> result;
+
+		for (u32 index = 0, file_offset = 0; index < 48; ++index, file_offset += 8)
+		{
+			// Each register has 4 16-bit lanes
+			u32 mask = 0;
+			if (lanes.contains(file_offset + 0)) mask |= (1u << 0);
+			if (lanes.contains(file_offset + 2)) mask |= (1u << 1);
+			if (lanes.contains(file_offset + 4)) mask |= (1u << 2);
+			if (lanes.contains(file_offset + 6)) mask |= (1u << 3);
+
+			if (mask == 0)
+			{
+				continue;
+			}
+
+			RegisterRef ref{ .reg{.id = static_cast<decltype(Register::id)>(index), .f16 = true } };
+			ref.mask = mask;
+			result.push_back(ref);
+		}
+		return result;
+	}
+
+	// Groups register-file byte offsets into one RegisterRef per affected 32-bit (R) register.
+	// Mask bit N is set when either half of component N is listed. Only 24 R registers fit in the file.
+	std::vector<RegisterRef> decode_lanes32(const std::unordered_set<u32>& lanes)
+	{
+		std::vector<RegisterRef> result;
+
+		for (u32 index = 0, file_offset = 0; index < 24; ++index, file_offset += 16)
+		{
+			// Each register has 8 16-bit lanes
+
+			u32 mask = 0;
+			if (lanes.contains(file_offset + 0) || lanes.contains(file_offset + 2)) mask |= (1u << 0);
+			if (lanes.contains(file_offset + 4) || lanes.contains(file_offset + 6)) mask |= (1u << 1);
+			if (lanes.contains(file_offset + 8) || lanes.contains(file_offset + 10)) mask |= (1u << 2);
+			if (lanes.contains(file_offset + 12) || lanes.contains(file_offset + 14)) mask |= (1u << 3);
+
+			if (mask == 0)
+			{
+				continue;
+			}
+
+			RegisterRef ref{ .reg{.id = static_cast<decltype(Register::id)>(index), .f16 = false } };
+			ref.mask = mask;
+			result.push_back(ref);
+		}
+
+		return result;
+	}
+
+	// Emits PK2 instructions that rebuild the masked components of a 32-bit register from the aliased H registers.
+	std::vector<Instruction> build_barrier32(const RegisterRef& reg)
+	{
+		// Up to 4 instructions are needed per 32-bit register
+		// R0.x = packHalf2x16(H0.xy)
+		// R0.y = packHalf2x16(H0.zw);
+		// R0.z = packHalf2x16(H1.xy);
+		// R0.w = packHalf2x16(H1.zw);
+
+		std::vector<Instruction> result;
+
+		for (u32 mask = reg.mask, ch = 0; mask > 0; mask >>= 1, ++ch)
+		{
+			if (!(mask & 1))
+			{
+				continue;
+			}
+
+			Instruction instruction{};
+			OPDEST dst{};
+			dst.opcode = RSX_FP_OPCODE_PK2;
+			dst.prec = RSX_FP_PRECISION_REAL;
+			dst.fp16 = 0;
+			dst.dest_reg = reg.reg.id;
+			dst.write_mask = (1u << ch);
+
+			const u32 src_reg_id = (ch / 2) + (reg.reg.id * 2);
+			const bool is_word0 = !(ch & 1); // Only even
+
+			SRC0 src0{};
+			src0.exec_if_eq = src0.exec_if_gr = src0.exec_if_lt = 1;
+			src0.fp16 = 1;
+
+			if (is_word0)
+			{
+				src0.swizzle_x = 0;
+				src0.swizzle_y = 1;
+			}
+			else
+			{
+				src0.swizzle_x = 2;
+				src0.swizzle_y = 3;
+			}
+
+			src0.swizzle_z = 2;
+			src0.swizzle_w = 3;
+			src0.reg_type = RSX_FP_REGISTER_TYPE_TEMP;
+			src0.tmp_reg_index = src_reg_id;
+
+			instruction.opcode = dst.opcode;
+			instruction.bytecode[0] = dst.HEX;
+			instruction.bytecode[1] = src0.HEX;
+
+			Register src_reg{ .id = static_cast<decltype(Register::id)>(src_reg_id), .f16 = true };
+			instruction.srcs.push_back({ .reg=src_reg, .mask=0xF });
+			instruction.dsts.push_back({ .reg{ .id = reg.reg.id, .f16 = false }, .mask = (1u << ch) });
+			result.push_back(instruction);
+		}
+
+		return result;
+	}
+
+	// Emits UP2 instructions that rebuild the masked components of a 16-bit register from the aliased R register.
+	std::vector<Instruction> build_barrier16(const RegisterRef& reg)
+	{
+		// H0.xy = unpackHalf2x16(R0.x)
+		// H0.zw = unpackHalf2x16(R0.y)
+		// H1.xy = unpackHalf2x16(R0.z)
+		// H1.zw = unpackHalf2x16(R0.w)
+
+		std::vector<Instruction> result;
+
+		for (u32 mask = reg.mask, ch = 0; mask > 0; mask >>= 1, ++ch)
+		{
+			if (!(mask & 1))
+			{
+				continue;
+			}
+
+			Instruction instruction{};
+			OPDEST dst{};
+			dst.opcode = RSX_FP_OPCODE_UP2;
+			dst.prec = RSX_FP_PRECISION_HALF;
+			dst.fp16 = 1;
+			dst.dest_reg = reg.reg.id;
+			dst.write_mask = 1u << ch;
+
+			const u32 src_reg_id = reg.reg.id / 2;
+			const bool is_odd_reg = !!(reg.reg.id & 1);
+			const bool is_word0 = ch < 2;
+
+			// If we're a non-odd register, we should also write the next channel (y/w)
+			if (!is_odd_reg && (mask & 2))
+			{
+				mask >>= 1;
+				++ch;
+				dst.write_mask |= (1u << ch);
+			}
+
+			SRC0 src0{};
+			src0.exec_if_eq = src0.exec_if_gr = src0.exec_if_lt = 1;
+
+			if (is_word0)
+			{
+				src0.swizzle_x = is_odd_reg ? 2 : 0;
+			}
+			else
+			{
+				src0.swizzle_x = is_odd_reg ? 3 : 1;
+			}
+
+			src0.swizzle_y = 1;
+			src0.swizzle_z = 2;
+			src0.swizzle_w = 3;
+			src0.reg_type = RSX_FP_REGISTER_TYPE_TEMP;
+			src0.tmp_reg_index = src_reg_id;
+
+			instruction.opcode = dst.opcode;
+			instruction.bytecode[0] = dst.HEX;
+			instruction.bytecode[1] = src0.HEX;
+
+			Register src_reg{ .id = static_cast<decltype(Register::id)>(src_reg_id), .f16 = false }; // Source is the 32-bit view (src0.fp16 == 0)
+			instruction.srcs.push_back({ .reg = src_reg, .mask = 0xF });
+			instruction.dsts.push_back({ .reg{.id = reg.reg.id, .f16 = true }, .mask = dst.write_mask }); // Destination is the 16-bit view (dst.fp16 == 1)
+			result.push_back(instruction);
+		}
+
+		return result;
+	}
+
+	// Walks the block in order, tracking whether each register-file byte was last written as F16 or F32,
+	// and injects pack/unpack barriers ahead of any instruction that reads a lane through the other view.
+	void insert_dependency_barriers(BasicBlock* block)
+	{
+		std::array<char, register_file_length> register_file;
+		register_file.fill(content_unknown);
+
+		std::unordered_set<u32> barrier16;
+		std::unordered_set<u32> barrier32;
+
+		// This subpass does not care about the prologue and epilogue and assumes each block is unique.
+		for (auto it = block->instructions.begin(); it != block->instructions.end();)
+		{
+			auto& inst = *it;
+
+			barrier16.clear();
+			barrier32.clear();
+
+			for (const auto& src : inst.srcs)
+			{
+				const auto read_bytes = get_register_file_range(src);
+				const char expected_type = src.reg.f16 ? content_float16 : content_float32;
+				for (const auto& index : read_bytes)
+				{
+					if (register_file[index] == content_unknown)
+					{
+						// Skip input
+						continue;
+					}
+
+					if (register_file[index] == expected_type || register_file[index] == content_dual)
+					{
+						// Match - nothing to do
+						continue;
+					}
+
+					// Collision on the lane
+					register_file[index] = content_dual;
+					(src.reg.f16 ? barrier16 : barrier32).insert(index);
+				}
+			}
+
+			for (const auto& dst : inst.dsts)
+			{
+				const auto write_bytes = get_register_file_range(dst);
+				const char expected_type = dst.reg.f16 ? content_float16 : content_float32;
+
+				for (const auto& index : write_bytes)
+				{
+					register_file[index] = expected_type;
+				}
+			}
+
+			if (barrier16.empty() && barrier32.empty())
+			{
+				++it;
+				continue;
+			}
+
+			// We need to inject some barrier instructions.
+			// Every barrier has to land ahead of the consuming instruction, so after each insert
+			// 'it' is moved back onto the consumer rather than past it.
+			if (!barrier16.empty())
+			{
+				auto barrier16_in = decode_lanes16(barrier16);
+				for (const auto& reg : barrier16_in)
+				{
+					auto instructions = build_barrier16(reg);
+					it = block->instructions.insert(it, instructions.begin(), instructions.end());
+					std::advance(it, instructions.size());
+				}
+			}
+
+			if (!barrier32.empty())
+			{
+				auto barrier32_in = decode_lanes32(barrier32);
+				for (const auto& reg : barrier32_in)
+				{
+					auto instructions = build_barrier32(reg);
+					it = block->instructions.insert(it, instructions.begin(), instructions.end());
+					std::advance(it, instructions.size());
+				}
+			}
+
+			// All barriers are in place, step over the consumer itself
+			++it;
+		}
+	}
+
 	void RegisterDependencyPass::run(FlowGraph& graph)
 	{
-		// TODO
+		// First, run intra-block dependency
+		for (auto& block : graph.blocks)
+		{
+			insert_dependency_barriers(&block);
+		}
+
+		// TODO: Create prologue/epilogue instructions
 	}
 }
diff --git a/rpcs3/tests/test_rsx_fp_asm.cpp b/rpcs3/tests/test_rsx_fp_asm.cpp
index b09088c80c..61202aa3e1 100644
--- a/rpcs3/tests/test_rsx_fp_asm.cpp
+++ b/rpcs3/tests/test_rsx_fp_asm.cpp
@@ -3,6 +3,7 @@
 #include "Emu/RSX/Common/simple_array.hpp"
 #include "Emu/RSX/Program/Assembler/FPASM.h"
 #include "Emu/RSX/Program/Assembler/Passes/FP/RegisterAnnotationPass.h"
+#include "Emu/RSX/Program/Assembler/Passes/FP/RegisterDependencyPass.h"
 #include "Emu/RSX/Program/RSXFragmentProgram.h"
 
 namespace rsx::assembler
@@ -85,7 +86,7 @@ namespace rsx::assembler
 		auto& block = graph.blocks.front();
 		RSXFragmentProgram prog{};
 
-		FP::RegisterAnnotationPass annotation_pass(prog);
+		FP::RegisterAnnotationPass annotation_pass{ prog };
 
 		annotation_pass.run(graph);
 
@@ -115,7 +116,7 @@ namespace rsx::assembler
 		auto& block = graph.blocks.front();
 		RSXFragmentProgram prog{};
 
-		FP::RegisterAnnotationPass annotation_pass(prog);
+		FP::RegisterAnnotationPass annotation_pass{ prog };
 
 		annotation_pass.run(graph);
 
@@ -129,4 +130,110 @@ namespace rsx::assembler
 		EXPECT_EQ(block.input_list[0].reg, R0);
 		EXPECT_EQ(block.input_list[1].reg, R1);
 	}
+
+	TEST(TestFPIR, RegisterDependencyPass_Simple16)
+	{
+		// Instruction 2 clobbers R0 which in turn clobbers H0.
+		// Instruction 3 reads from H0 so a barrier16 is needed between them.
+		auto graph = CFG_from_source(R"(
+			ADD R1, R0, R1;
+			PK8U R0, R1;
+			MOV H2, H0;
+		)");
+
+		ASSERT_EQ(graph.blocks.size(), 1);
+		ASSERT_EQ(graph.blocks.front().instructions.size(), 3);
+
+		auto& block = graph.blocks.front();
+		RSXFragmentProgram prog{};
+
+		FP::RegisterAnnotationPass annotation_pass{ prog };
+		FP::RegisterDependencyPass deps_pass{};
+
+		annotation_pass.run(graph);
+		deps_pass.run(graph);
+
+		ASSERT_EQ(block.instructions.size(), 5);
+
+		// H0.xy = unpackHalf2(r0.x);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.opcode, RSX_FP_OPCODE_UP2);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.fp16, 1);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_x, true);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_y, true);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_z, false);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_w, false);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.tmp_reg_index, 0);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.fp16, 0);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.swizzle_x, 0);
+
+		// H0.zw = unpackHalf2(r0.y);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.opcode, RSX_FP_OPCODE_UP2);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_x, false);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_y, false);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_z, true);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_w, true);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.tmp_reg_index, 0);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.fp16, 0);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.swizzle_x, 1);
+	}
+
+	TEST(TestFPIR, RegisterDependencyPass_Simple32)
+	{
+		// Instruction 2 clobbers H1 which in turn clobbers R0.
+		// Instruction 3 reads from R0 so a barrier32 is needed between them.
+		auto graph = CFG_from_source(R"(
+			ADD R1, R0, R1;
+			MOV H1, R1;
+			MOV R2, R0;
+		)");
+
+		ASSERT_EQ(graph.blocks.size(), 1);
+		ASSERT_EQ(graph.blocks.front().instructions.size(), 3);
+
+		auto& block = graph.blocks.front();
+		RSXFragmentProgram prog{};
+
+		FP::RegisterAnnotationPass annotation_pass{ prog };
+		FP::RegisterDependencyPass deps_pass{};
+
+		annotation_pass.run(graph);
+		deps_pass.run(graph);
+
+		ASSERT_EQ(block.instructions.size(), 5);
+
+		// R0.z = packHalf2(H1.xy);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.fp16, 0);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.dest_reg, 0);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_x, false);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_y, false);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_z, true);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[2].bytecode[0] }.mask_w, false);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.tmp_reg_index, 1);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.fp16, 1);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.swizzle_x, 0);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[2].bytecode[1] }.swizzle_y, 1);
+
+		// R0.w = packHalf2(H1.zw);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.opcode, RSX_FP_OPCODE_PK2);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.fp16, 0);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.dest_reg, 0);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_x, false);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_y, false);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_z, false);
+		EXPECT_EQ(OPDEST{ .HEX = block.instructions[3].bytecode[0] }.mask_w, true);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.reg_type, RSX_FP_REGISTER_TYPE_TEMP);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.tmp_reg_index, 1);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.fp16, 1);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.swizzle_x, 2);
+		EXPECT_EQ(SRC0{ .HEX = block.instructions[3].bytecode[1] }.swizzle_y, 3);
+	}
+
+	TEST(TestFPIR, RegisterDependencyPass_Complex)
+	{
+		// TODO: Multi-level block structure with nested IFs/LOOPs
+	}
 }