From c00eeb5b10a14f5edbc0489ec5a75afad7afb156 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 16 Nov 2025 14:25:14 +0300 Subject: [PATCH 1/8] rsx: Implement register lane gathering for operations other than UPX --- .../RSX/Program/FragmentProgramDecompiler.cpp | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp index e5742fffda..f49ceb3742 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp @@ -4,6 +4,7 @@ #include "ProgramStateCache.h" #include +#include namespace rsx { @@ -527,21 +528,48 @@ template std::string FragmentProgramDecompiler::GetSRC(T src) if (!src.fp16) { + // We may need to perform gather on all the f32 lanes. + // First, confirm that we're actually reading all the lanes + // TODO: GetSRC should take a lane count argument + std::unordered_set to_gather; + + // Unpack instructions always read the whole vector if (dst.opcode == RSX_FP_OPCODE_UP16 || dst.opcode == RSX_FP_OPCODE_UP2 || dst.opcode == RSX_FP_OPCODE_UP4 || dst.opcode == RSX_FP_OPCODE_UPB || dst.opcode == RSX_FP_OPCODE_UPG) { - auto &reg = temp_registers[src.tmp_reg_index]; - if (reg.requires_gather(src.swizzle_x)) + to_gather.insert(src.swizzle_x); + } + else if (!dst.no_dest) + { + // FIXME: No_DST inputs also require lane gather. A regalloc pre-pass will solve this. + // We try to guess which channels will be read. 
Since there is no lane count, mask input on output mask + if (dst.mask_x) to_gather.insert(src.swizzle_x); + if (dst.mask_y) to_gather.insert(src.swizzle_y); + if (dst.mask_z) to_gather.insert(src.swizzle_z); + if (dst.mask_w) to_gather.insert(src.swizzle_w); + } + + auto& reg = temp_registers[src.tmp_reg_index]; + bool skip_reg_assign = false; + for (const auto& ch : to_gather) + { + if (reg.requires_gather(ch)) { properties.has_gather_op = true; AddReg(src.tmp_reg_index, src.fp16); ret = getFloatTypeName(4) + reg.gather_r(); + skip_reg_assign = true; break; } } + + if (skip_reg_assign) + { + break; + } } else if (precision_modifier == RSX_FP_PRECISION_HALF) { From f64db8b5edbe8bc072882035770c097238dd0eaa Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 16 Nov 2025 14:25:46 +0300 Subject: [PATCH 2/8] rsx: Fix convolution filter decoding --- rpcs3/Emu/RSX/RSXTexture.cpp | 2 +- rpcs3/Emu/RSX/gcm_enums.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/rpcs3/Emu/RSX/RSXTexture.cpp b/rpcs3/Emu/RSX/RSXTexture.cpp index 9b8f4bfb2e..52f8183545 100644 --- a/rpcs3/Emu/RSX/RSXTexture.cpp +++ b/rpcs3/Emu/RSX/RSXTexture.cpp @@ -251,7 +251,7 @@ namespace rsx u8 fragment_texture::convolution_filter() const { - return ((registers[NV4097_SET_TEXTURE_FILTER + (m_index * 8)] >> 13) & 0xf); + return ((registers[NV4097_SET_TEXTURE_FILTER + (m_index * 8)] >> 13) & 0x7); } u8 fragment_texture::argb_signed() const diff --git a/rpcs3/Emu/RSX/gcm_enums.h b/rpcs3/Emu/RSX/gcm_enums.h index 61b51c3857..4c88176fa4 100644 --- a/rpcs3/Emu/RSX/gcm_enums.h +++ b/rpcs3/Emu/RSX/gcm_enums.h @@ -954,6 +954,8 @@ namespace gcm CELL_GCM_TEXTURE_LINEAR_LINEAR = 6, CELL_GCM_TEXTURE_CONVOLUTION_MIN = 7, CELL_GCM_TEXTURE_CONVOLUTION_MAG = 4, + + // Convolution mode CELL_GCM_TEXTURE_CONVOLUTION_QUINCUNX = 1, CELL_GCM_TEXTURE_CONVOLUTION_GAUSSIAN = 2, CELL_GCM_TEXTURE_CONVOLUTION_QUINCUNX_ALT = 3, From 3102ae4b5213f4fd2bc66514193763b7e08f64c2 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 
16 Nov 2025 14:26:41 +0300 Subject: [PATCH 3/8] vk: Include GCM format in debug name for temp texture-cache resources --- rpcs3/Emu/RSX/VK/VKTextureCache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index 454038c962..1e910f0f81 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -716,7 +716,7 @@ namespace vk view_swizzle = source->native_component_map; } - image->set_debug_name("Temp view"); + image->set_debug_name(fmt::format("Temp view, fmt=0x%x", gcm_format)); image->set_native_component_layout(view_swizzle); auto view = image->get_view(remap_vector); From c2a894996aa3a6a9793a99c1e132dd5c9ffa0307 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 16 Nov 2025 17:56:27 +0300 Subject: [PATCH 4/8] rsx: Implement more robust FP register file aliasing resolution --- .../RSX/Program/FragmentProgramDecompiler.cpp | 213 ++++++++++++++++-- .../RSX/Program/FragmentProgramDecompiler.h | 6 + 2 files changed, 198 insertions(+), 21 deletions(-) diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp index f49ceb3742..33dfafe78e 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp @@ -32,6 +32,174 @@ enum VectorLane : u8 W = 3, }; +u32 FragmentProgramDecompiler::get_src_vector_mask(u32 opcode, int operand) +{ + auto decode = [&](const std::string& expr) -> u32 + { + const auto ops = fmt::split(expr, { "," }); + u32 ret = 0; + + if (ops.size() <= operand) + { + return 0; + } + + const auto& m = ops[operand]; + if (m.find("x") != std::string::npos) ret &= 1; + if (m.find("y") != std::string::npos) ret &= (1 << 1); + if (m.find("z") != std::string::npos) ret &= (1 << 2); + if (m.find("w") != std::string::npos) ret &= (1 << 3); + + return ret; + }; + + constexpr u32 x = 0b1; + constexpr u32 xy = 0b11; + constexpr 
u32 xyz = 0b111; + constexpr u32 xyzw = 0b1111; + constexpr u32 use_dst_mask = 1u << 31; + + u32 temp = 0; + + switch (opcode) + { + case RSX_FP_OPCODE_NOP: + return 0; + case RSX_FP_OPCODE_MOV: + case RSX_FP_OPCODE_MUL: + case RSX_FP_OPCODE_ADD: + case RSX_FP_OPCODE_MAD: + return xyzw | use_dst_mask; + case RSX_FP_OPCODE_DP3: + return xyz; + case RSX_FP_OPCODE_DP4: + return xyzw; + case RSX_FP_OPCODE_DST: + return decode("yz, yw"); + case RSX_FP_OPCODE_MIN: + case RSX_FP_OPCODE_MAX: + return xyzw | use_dst_mask; + case RSX_FP_OPCODE_SLT: + case RSX_FP_OPCODE_SGE: + case RSX_FP_OPCODE_SLE: + case RSX_FP_OPCODE_SGT: + case RSX_FP_OPCODE_SNE: + case RSX_FP_OPCODE_SEQ: + return xyzw | use_dst_mask; + case RSX_FP_OPCODE_FRC: + case RSX_FP_OPCODE_FLR: + return xyzw | use_dst_mask; + case RSX_FP_OPCODE_KIL: + return 0; + case RSX_FP_OPCODE_PK4: + return xyzw; + case RSX_FP_OPCODE_UP4: + return x; + case RSX_FP_OPCODE_DDX: + case RSX_FP_OPCODE_DDY: + return xyzw | use_dst_mask; + case RSX_FP_OPCODE_TEX: + case RSX_FP_OPCODE_TXD: + switch (m_prog.get_texture_dimension(dst.tex_num)) + { + case rsx::texture_dimension_extended::texture_dimension_1d: + return x; + case rsx::texture_dimension_extended::texture_dimension_2d: + return xy; + case rsx::texture_dimension_extended::texture_dimension_3d: + case rsx::texture_dimension_extended::texture_dimension_cubemap: + return xyz; + default: + return 0; + } + case RSX_FP_OPCODE_TXP: + switch (m_prog.get_texture_dimension(dst.tex_num)) + { + case rsx::texture_dimension_extended::texture_dimension_1d: + return xy; + case rsx::texture_dimension_extended::texture_dimension_2d: + return xyz; + case rsx::texture_dimension_extended::texture_dimension_3d: + case rsx::texture_dimension_extended::texture_dimension_cubemap: + return xyzw; + default: + return 0; + } + case RSX_FP_OPCODE_RCP: + case RSX_FP_OPCODE_RSQ: + case RSX_FP_OPCODE_EX2: + case RSX_FP_OPCODE_LG2: + return x; + case RSX_FP_OPCODE_LIT: + return xyzw; + case 
RSX_FP_OPCODE_LRP: + return xyzw | use_dst_mask; + case RSX_FP_OPCODE_STR: + case RSX_FP_OPCODE_SFL: + return xyzw | use_dst_mask; + case RSX_FP_OPCODE_COS: + case RSX_FP_OPCODE_SIN: + return x; + case RSX_FP_OPCODE_PK2: + return xy; + case RSX_FP_OPCODE_UP2: + return x; + case RSX_FP_OPCODE_POW: + fmt::throw_exception("Unimplemented POW instruction."); // Unused ?? + case RSX_FP_OPCODE_PKB: + return xyzw; + case RSX_FP_OPCODE_UPB: + return x; + case RSX_FP_OPCODE_PK16: + return xy; + case RSX_FP_OPCODE_UP16: + return x; + case RSX_FP_OPCODE_BEM: + return decode("xy, xy, xyzw"); + case RSX_FP_OPCODE_PKG: + return xyzw; + case RSX_FP_OPCODE_UPG: + return x; + case RSX_FP_OPCODE_DP2A: + return decode("xy, xy, x"); + case RSX_FP_OPCODE_TXL: + case RSX_FP_OPCODE_TXB: + return decode("xy, x"); + case RSX_FP_OPCODE_TEXBEM: + case RSX_FP_OPCODE_TXPBEM: + return decode("xy, xy, xyzw"); // Coordinate generated from BEM operation + case RSX_FP_OPCODE_BEMLUM: + fmt::throw_exception("Unimplemented BEMLUM instruction"); // Unused + case RSX_FP_OPCODE_REFL: + return xyzw; + case RSX_FP_OPCODE_TIMESWTEX: + fmt::throw_exception("Unimplemented TIMESWTEX instruction"); // Unused + case RSX_FP_OPCODE_DP2: + return xy; + case RSX_FP_OPCODE_NRM: + return xyz; + case RSX_FP_OPCODE_DIV: + case RSX_FP_OPCODE_DIVSQ: + return decode("xyzw, x"); + case RSX_FP_OPCODE_LIF: + return decode("yw"); + case RSX_FP_OPCODE_FENCT: + case RSX_FP_OPCODE_FENCB: + case RSX_FP_OPCODE_BRK: + case RSX_FP_OPCODE_CAL: + case RSX_FP_OPCODE_IFE: + case RSX_FP_OPCODE_LOOP: + case RSX_FP_OPCODE_REP: + case RSX_FP_OPCODE_RET: + // Flow control. 
Special registers are provided for these outside the common file + return 0; + default: + break; + } + + return 0; +} + FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size) : m_size(size) , m_prog(prog) @@ -508,53 +676,56 @@ template std::string FragmentProgramDecompiler::GetSRC(T src) { std::string ret; u32 precision_modifier = 0; + int operand_idx = -1; if constexpr (std::is_same_v) { precision_modifier = src1.src0_prec_mod; + operand_idx = 0; } else if constexpr (std::is_same_v) { precision_modifier = src1.src1_prec_mod; + operand_idx = 1; } else if constexpr (std::is_same_v) { precision_modifier = src1.src2_prec_mod; + operand_idx = 2; } + ensure(operand_idx != -1); + switch (src.reg_type) { case RSX_FP_REGISTER_TYPE_TEMP: if (!src.fp16) { - // We may need to perform gather on all the f32 lanes. - // First, confirm that we're actually reading all the lanes - // TODO: GetSRC should take a lane count argument - std::unordered_set to_gather; + // We need to determine if any vector lanes need a gather op + // In theory, splitting can also be required, but that is currently unsupported + u32 src_lane_mask = get_src_vector_mask(dst.opcode, operand_idx); + std::unordered_set lanes_to_gather; - // Unpack instructions always read the whole vector - if (dst.opcode == RSX_FP_OPCODE_UP16 || - dst.opcode == RSX_FP_OPCODE_UP2 || - dst.opcode == RSX_FP_OPCODE_UP4 || - dst.opcode == RSX_FP_OPCODE_UPB || - dst.opcode == RSX_FP_OPCODE_UPG) + const bool apply_dst_mask = src_lane_mask & (1u << 31); + src_lane_mask &= ~(1u << 31); + + if (apply_dst_mask && !dst.no_dest) { - to_gather.insert(src.swizzle_x); - } - else if (!dst.no_dest) - { - // FIXME: No_DST inputs also require lane gather. A regalloc pre-pass will solve this. - // We try to guess which channels will be read. 
Since there is no lane count, mask input on output mask - if (dst.mask_x) to_gather.insert(src.swizzle_x); - if (dst.mask_y) to_gather.insert(src.swizzle_y); - if (dst.mask_z) to_gather.insert(src.swizzle_z); - if (dst.mask_w) to_gather.insert(src.swizzle_w); + if (!dst.mask_x) src_lane_mask &= ~(1u << 0); + if (!dst.mask_y) src_lane_mask &= ~(1u << 1); + if (!dst.mask_z) src_lane_mask &= ~(1u << 2); + if (!dst.mask_w) src_lane_mask &= ~(1u << 3); } + if (src_lane_mask & (1u << 0)) lanes_to_gather.insert(src.swizzle_x); + if (src_lane_mask & (1u << 1)) lanes_to_gather.insert(src.swizzle_y); + if (src_lane_mask & (1u << 2)) lanes_to_gather.insert(src.swizzle_z); + if (src_lane_mask & (1u << 3)) lanes_to_gather.insert(src.swizzle_w); + auto& reg = temp_registers[src.tmp_reg_index]; bool skip_reg_assign = false; - for (const auto& ch : to_gather) + for (const auto& ch : lanes_to_gather) { if (reg.requires_gather(ch)) { diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h index 467c6f3ac7..2fea76ef9e 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h @@ -99,6 +99,12 @@ class FragmentProgramDecompiler */ bool handle_tex_srb(u32 opcode); + /** + * Calculates the lane mask for a given input + * This is a temporary workaround until the decompiler is rewritten with some IR to allow granular split/gather passes + */ + u32 get_src_vector_mask(u32 opcode, int operand); + protected: const RSXFragmentProgram &m_prog; u32 m_ctrl = 0; From b32ccacc57725def08258cb3d3c5cf8c00494ee3 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 16 Nov 2025 18:13:32 +0300 Subject: [PATCH 5/8] rsx: Fix build --- .../RSX/Program/FragmentProgramDecompiler.cpp | 17 ++++++++++------- .../Emu/RSX/Program/FragmentProgramDecompiler.h | 8 ++++++-- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp 
b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp index 33dfafe78e..0778141e1f 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp @@ -32,7 +32,7 @@ enum VectorLane : u8 W = 3, }; -u32 FragmentProgramDecompiler::get_src_vector_mask(u32 opcode, int operand) +u32 FragmentProgramDecompiler::get_src_vector_mask(u32 opcode, u32 operand) { auto decode = [&](const std::string& expr) -> u32 { @@ -59,8 +59,6 @@ u32 FragmentProgramDecompiler::get_src_vector_mask(u32 opcode, int operand) constexpr u32 xyzw = 0b1111; constexpr u32 use_dst_mask = 1u << 31; - u32 temp = 0; - switch (opcode) { case RSX_FP_OPCODE_NOP: @@ -672,11 +670,13 @@ void FragmentProgramDecompiler::AddCodeCond(const std::string& lhs, const std::s AddCode(lhs + " = _select(" + lhs + ", " + src_prefix + rhs + ", " + cond + ");"); } -template std::string FragmentProgramDecompiler::GetSRC(T src) +template + requires std::is_same_v || std::is_same_v || std::is_same_v +std::string FragmentProgramDecompiler::GetSRC(T src) { std::string ret; u32 precision_modifier = 0; - int operand_idx = -1; + u32 operand_idx = umax; if constexpr (std::is_same_v) { @@ -693,8 +693,11 @@ template std::string FragmentProgramDecompiler::GetSRC(T src) precision_modifier = src1.src2_prec_mod; operand_idx = 2; } - - ensure(operand_idx != -1); + else + { + // Unreachable unless we add another SRC type + fmt::throw_exception("Invalid SRC input"); + } switch (src.reg_type) { diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h index 2fea76ef9e..60b493d3a6 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h @@ -80,7 +80,11 @@ class FragmentProgramDecompiler void AddCodeCond(const std::string& lhs, const std::string& rhs); std::string GetRawCond(); std::string GetCond(); - template std::string GetSRC(T src); + + template + requires 
std::is_same_v || std::is_same_v || std::is_same_v + std::string GetSRC(T src); + std::string BuildCode(); static u32 GetData(const u32 d) { return d << 16 | d >> 16; } @@ -103,7 +107,7 @@ class FragmentProgramDecompiler * Calculates the lane mask for a given input * This is a temporary workaround until the decompiler is rewritten with some IR to allow granular split/gather passes */ - u32 get_src_vector_mask(u32 opcode, int operand); + u32 get_src_vector_mask(u32 opcode, u32 operand); protected: const RSXFragmentProgram &m_prog; From b65687e1e9c6365f802fd73ef4a0075ef368b2ff Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 16 Nov 2025 20:20:50 +0300 Subject: [PATCH 6/8] rsx: Do not emit gather for FP delay slots --- .../RSX/Program/FragmentProgramDecompiler.cpp | 36 +++++++++++++++++-- .../RSX/Program/FragmentProgramDecompiler.h | 7 +++- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp index 0778141e1f..56b207a7e9 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp @@ -32,7 +32,7 @@ enum VectorLane : u8 W = 3, }; -u32 FragmentProgramDecompiler::get_src_vector_mask(u32 opcode, u32 operand) +u32 FragmentProgramDecompiler::get_src_vector_mask(u32 opcode, u32 operand) const { auto decode = [&](const std::string& expr) -> u32 { @@ -198,6 +198,33 @@ u32 FragmentProgramDecompiler::get_src_vector_mask(u32 opcode, u32 operand) return 0; } +bool FragmentProgramDecompiler::is_delay_slot() const +{ + if (dst.opcode != RSX_FP_OPCODE_MOV || // These slots are always populated with MOV + dst.no_dest || // Must have a sink + src0.reg_type != RSX_FP_REGISTER_TYPE_TEMP || // Must read from reg + dst.fp16 || // Always full lane. 
We need to collect more data on this but it won't matter + dst.saturate || // Precision modifier + (dst.prec != RSX_FP_PRECISION_REAL && + dst.prec != RSX_FP_PRECISION_UNKNOWN)) // Cannot have precision modifiers + { + return false; + } + + // Check if we have precision modifiers on the source + if (src0.abs || src0.neg || src1.scale) + { + return false; + } + + if (dst.mask_x && src0.swizzle_x != 0) return false; + if (dst.mask_y && src0.swizzle_y != 1) return false; + if (dst.mask_z && src0.swizzle_z != 2) return false; + if (dst.mask_w && src0.swizzle_w != 3) return false; + + return true; +} + FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size) : m_size(size) , m_prog(prog) @@ -333,7 +360,10 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags) } } - temp_registers[reg_index].tag(dst.dest_reg, !!dst.fp16, dst.mask_x, dst.mask_y, dst.mask_z, dst.mask_w); + if (!is_delay_slot()) + { + temp_registers[reg_index].tag(dst.dest_reg, !!dst.fp16, dst.mask_x, dst.mask_y, dst.mask_z, dst.mask_w); + } } void FragmentProgramDecompiler::AddFlowOp(const std::string& code) @@ -707,7 +737,7 @@ std::string FragmentProgramDecompiler::GetSRC(T src) { // We need to determine if any vector lanes need a gather op // In theory, splitting can also be required, but that is currently unsupported - u32 src_lane_mask = get_src_vector_mask(dst.opcode, operand_idx); + u32 src_lane_mask = is_delay_slot() ? 
0u : get_src_vector_mask(dst.opcode, operand_idx); std::unordered_set lanes_to_gather; const bool apply_dst_mask = src_lane_mask & (1u << 31); diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h index 60b493d3a6..10cf35af59 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h @@ -107,7 +107,12 @@ class FragmentProgramDecompiler * Calculates the lane mask for a given input * This is a temporary workaround until the decompiler is rewritten with some IR to allow granular split/gather passes */ - u32 get_src_vector_mask(u32 opcode, u32 operand); + u32 get_src_vector_mask(u32 opcode, u32 operand) const; + + /** + * Detects delay slots. These evaluate to a NOP so we don't actually need to emit them + */ + bool is_delay_slot() const; protected: const RSXFragmentProgram &m_prog; From a914fc8ca13ee434474d76e0ee3b344937ad3760 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 17 Nov 2025 21:38:57 +0300 Subject: [PATCH 7/8] rsx/fp: Use precomputed masks instead of translating from strings --- .../RSX/Program/FragmentProgramDecompiler.cpp | 50 ++++++++----------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp index 56b207a7e9..af34157663 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp @@ -2,6 +2,7 @@ #include "FragmentProgramDecompiler.h" #include "ProgramStateCache.h" +#include "Emu/RSX/Common/simple_array.hpp" #include #include @@ -34,31 +35,22 @@ enum VectorLane : u8 u32 FragmentProgramDecompiler::get_src_vector_mask(u32 opcode, u32 operand) const { - auto decode = [&](const std::string& expr) -> u32 - { - const auto ops = fmt::split(expr, { "," }); - u32 ret = 0; - - if (ops.size() <= operand) - { - return 0; - } - - const auto& m = ops[operand]; - if 
(m.find("x") != std::string::npos) ret &= 1; - if (m.find("y") != std::string::npos) ret &= (1 << 1); - if (m.find("z") != std::string::npos) ret &= (1 << 2); - if (m.find("w") != std::string::npos) ret &= (1 << 3); - - return ret; - }; - - constexpr u32 x = 0b1; - constexpr u32 xy = 0b11; - constexpr u32 xyz = 0b111; + constexpr u32 x = 0b0001; + constexpr u32 y = 0b0010; + constexpr u32 z = 0b0100; + constexpr u32 w = 0b1000; + constexpr u32 xy = 0b0011; + constexpr u32 xyz = 0b0111; constexpr u32 xyzw = 0b1111; constexpr u32 use_dst_mask = 1u << 31; + const auto decode = [&](const rsx::simple_array& masks) -> u32 + { + return operand < masks.size() + ? masks[operand] + : 0u; + }; + switch (opcode) { case RSX_FP_OPCODE_NOP: @@ -73,7 +65,7 @@ u32 FragmentProgramDecompiler::get_src_vector_mask(u32 opcode, u32 operand) cons case RSX_FP_OPCODE_DP4: return xyzw; case RSX_FP_OPCODE_DST: - return decode("yz, yw"); + return decode({ y|z, y|w }); case RSX_FP_OPCODE_MIN: case RSX_FP_OPCODE_MAX: return xyzw | use_dst_mask; @@ -153,19 +145,19 @@ u32 FragmentProgramDecompiler::get_src_vector_mask(u32 opcode, u32 operand) cons case RSX_FP_OPCODE_UP16: return x; case RSX_FP_OPCODE_BEM: - return decode("xy, xy, xyzw"); + return decode({ xy, xy, xyzw }); case RSX_FP_OPCODE_PKG: return xyzw; case RSX_FP_OPCODE_UPG: return x; case RSX_FP_OPCODE_DP2A: - return decode("xy, xy, x"); + return decode({ xy, xy, x }); case RSX_FP_OPCODE_TXL: case RSX_FP_OPCODE_TXB: - return decode("xy, x"); + return decode({ xy, x }); case RSX_FP_OPCODE_TEXBEM: case RSX_FP_OPCODE_TXPBEM: - return decode("xy, xy, xyzw"); // Coordinate generated from BEM operation + return decode({ xy, xy, xyzw }); // Coordinate generated from BEM operation case RSX_FP_OPCODE_BEMLUM: fmt::throw_exception("Unimplemented BEMLUM instruction"); // Unused case RSX_FP_OPCODE_REFL: @@ -178,9 +170,9 @@ u32 FragmentProgramDecompiler::get_src_vector_mask(u32 opcode, u32 operand) cons return xyz; case RSX_FP_OPCODE_DIV: case 
RSX_FP_OPCODE_DIVSQ: - return decode("xyzw, x"); + return decode({ xyzw, x }); case RSX_FP_OPCODE_LIF: - return decode("yw"); + return decode({ y|w }); case RSX_FP_OPCODE_FENCT: case RSX_FP_OPCODE_FENCB: case RSX_FP_OPCODE_BRK: From 4a4569fa28dc73a513020bb819f8ebf7e6be09ca Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 17 Nov 2025 22:04:44 +0300 Subject: [PATCH 8/8] rsx/fp: Fix the delay slot detection --- rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp index af34157663..c3b4a57517 100644 --- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp +++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp @@ -195,6 +195,7 @@ bool FragmentProgramDecompiler::is_delay_slot() const if (dst.opcode != RSX_FP_OPCODE_MOV || // These slots are always populated with MOV dst.no_dest || // Must have a sink src0.reg_type != RSX_FP_REGISTER_TYPE_TEMP || // Must read from reg + dst.dest_reg != src0.tmp_reg_index || // Must be a write-to-self dst.fp16 || // Always full lane. We need to collect more data on this but it won't matter dst.saturate || // Precision modifier (dst.prec != RSX_FP_PRECISION_REAL &&