NOTE(review): this patch was reconstructed from a flattened copy in which newlines,
indentation and every <...> token (include targets, template parameter/argument lists)
had been lost. Line breaks were restored so that each hunk matches its @@ line counts.
The <...> contents, the tab depth and the position of blank context lines are
best-effort reconstructions -- confirm against the upstream commit before applying.

diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp
index 2ebfd7d8d7..2a79f68e8d 100644
--- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp
+++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.cpp
@@ -2,8 +2,10 @@
 
 #include "FragmentProgramDecompiler.h"
 #include "ProgramStateCache.h"
+#include "Emu/RSX/Common/simple_array.hpp"
 
 #include <algorithm>
+#include <unordered_set>
 
 namespace rsx
 {
@@ -31,6 +33,191 @@ enum VectorLane : u8
 	W = 3,
 };
 
+u32 FragmentProgramDecompiler::get_src_vector_mask(u32 opcode, u32 operand) const
+{
+	constexpr u32 x = 0b0001;
+	constexpr u32 y = 0b0010;
+	constexpr u32 z = 0b0100;
+	constexpr u32 w = 0b1000;
+	constexpr u32 xy = 0b0011;
+	constexpr u32 xyz = 0b0111;
+	constexpr u32 xyzw = 0b1111;
+	constexpr u32 use_dst_mask = 1u << 31;
+
+	const auto decode = [&](const rsx::simple_array<u32>& masks) -> u32
+	{
+		return operand < masks.size()
+			? masks[operand]
+			: 0u;
+	};
+
+	switch (opcode)
+	{
+	case RSX_FP_OPCODE_NOP:
+		return 0;
+	case RSX_FP_OPCODE_MOV:
+	case RSX_FP_OPCODE_MUL:
+	case RSX_FP_OPCODE_ADD:
+	case RSX_FP_OPCODE_MAD:
+		return xyzw | use_dst_mask;
+	case RSX_FP_OPCODE_DP3:
+		return xyz;
+	case RSX_FP_OPCODE_DP4:
+		return xyzw;
+	case RSX_FP_OPCODE_DST:
+		return decode({ y|z, y|w });
+	case RSX_FP_OPCODE_MIN:
+	case RSX_FP_OPCODE_MAX:
+		return xyzw | use_dst_mask;
+	case RSX_FP_OPCODE_SLT:
+	case RSX_FP_OPCODE_SGE:
+	case RSX_FP_OPCODE_SLE:
+	case RSX_FP_OPCODE_SGT:
+	case RSX_FP_OPCODE_SNE:
+	case RSX_FP_OPCODE_SEQ:
+		return xyzw | use_dst_mask;
+	case RSX_FP_OPCODE_FRC:
+	case RSX_FP_OPCODE_FLR:
+		return xyzw | use_dst_mask;
+	case RSX_FP_OPCODE_KIL:
+		return 0;
+	case RSX_FP_OPCODE_PK4:
+		return xyzw;
+	case RSX_FP_OPCODE_UP4:
+		return x;
+	case RSX_FP_OPCODE_DDX:
+	case RSX_FP_OPCODE_DDY:
+		return xyzw | use_dst_mask;
+	case RSX_FP_OPCODE_TEX:
+	case RSX_FP_OPCODE_TXD:
+		switch (m_prog.get_texture_dimension(dst.tex_num))
+		{
+		case rsx::texture_dimension_extended::texture_dimension_1d:
+			return x;
+		case rsx::texture_dimension_extended::texture_dimension_2d:
+			return xy;
+		case rsx::texture_dimension_extended::texture_dimension_3d:
+		case rsx::texture_dimension_extended::texture_dimension_cubemap:
+			return xyz;
+		default:
+			return 0;
+		}
+	case RSX_FP_OPCODE_TXP:
+		switch (m_prog.get_texture_dimension(dst.tex_num))
+		{
+		case rsx::texture_dimension_extended::texture_dimension_1d:
+			return xy;
+		case rsx::texture_dimension_extended::texture_dimension_2d:
+			return xyz;
+		case rsx::texture_dimension_extended::texture_dimension_3d:
+		case rsx::texture_dimension_extended::texture_dimension_cubemap:
+			return xyzw;
+		default:
+			return 0;
+		}
+	case RSX_FP_OPCODE_RCP:
+	case RSX_FP_OPCODE_RSQ:
+	case RSX_FP_OPCODE_EX2:
+	case RSX_FP_OPCODE_LG2:
+		return x;
+	case RSX_FP_OPCODE_LIT:
+		return xyzw;
+	case RSX_FP_OPCODE_LRP:
+		return xyzw | use_dst_mask;
+	case RSX_FP_OPCODE_STR:
+	case RSX_FP_OPCODE_SFL:
+		return xyzw | use_dst_mask;
+	case RSX_FP_OPCODE_COS:
+	case RSX_FP_OPCODE_SIN:
+		return x;
+	case RSX_FP_OPCODE_PK2:
+		return xy;
+	case RSX_FP_OPCODE_UP2:
+		return x;
+	case RSX_FP_OPCODE_POW:
+		fmt::throw_exception("Unimplemented POW instruction."); // Unused ??
+	case RSX_FP_OPCODE_PKB:
+		return xyzw;
+	case RSX_FP_OPCODE_UPB:
+		return x;
+	case RSX_FP_OPCODE_PK16:
+		return xy;
+	case RSX_FP_OPCODE_UP16:
+		return x;
+	case RSX_FP_OPCODE_BEM:
+		return decode({ xy, xy, xyzw });
+	case RSX_FP_OPCODE_PKG:
+		return xyzw;
+	case RSX_FP_OPCODE_UPG:
+		return x;
+	case RSX_FP_OPCODE_DP2A:
+		return decode({ xy, xy, x });
+	case RSX_FP_OPCODE_TXL:
+	case RSX_FP_OPCODE_TXB:
+		return decode({ xy, x });
+	case RSX_FP_OPCODE_TEXBEM:
+	case RSX_FP_OPCODE_TXPBEM:
+		return decode({ xy, xy, xyzw }); // Coordinate generated from BEM operation
+	case RSX_FP_OPCODE_BEMLUM:
+		fmt::throw_exception("Unimplemented BEMLUM instruction"); // Unused
+	case RSX_FP_OPCODE_REFL:
+		return xyzw;
+	case RSX_FP_OPCODE_TIMESWTEX:
+		fmt::throw_exception("Unimplemented TIMESWTEX instruction"); // Unused
+	case RSX_FP_OPCODE_DP2:
+		return xy;
+	case RSX_FP_OPCODE_NRM:
+		return xyz;
+	case RSX_FP_OPCODE_DIV:
+	case RSX_FP_OPCODE_DIVSQ:
+		return decode({ xyzw, x });
+	case RSX_FP_OPCODE_LIF:
+		return decode({ y|w });
+	case RSX_FP_OPCODE_FENCT:
+	case RSX_FP_OPCODE_FENCB:
+	case RSX_FP_OPCODE_BRK:
+	case RSX_FP_OPCODE_CAL:
+	case RSX_FP_OPCODE_IFE:
+	case RSX_FP_OPCODE_LOOP:
+	case RSX_FP_OPCODE_REP:
+	case RSX_FP_OPCODE_RET:
+		// Flow control. Special registers are provided for these outside the common file
+		return 0;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+bool FragmentProgramDecompiler::is_delay_slot() const
+{
+	if (dst.opcode != RSX_FP_OPCODE_MOV ||             // These slots are always populated with MOV
+		dst.no_dest ||                                 // Must have a sink
+		src0.reg_type != RSX_FP_REGISTER_TYPE_TEMP ||  // Must read from reg
+		dst.dest_reg != src0.tmp_reg_index ||          // Must be a write-to-self
+		dst.fp16 ||                                    // Always full lane. We need to collect more data on this but it won't matter
+		dst.saturate ||                                // Precision modifier
+		(dst.prec != RSX_FP_PRECISION_REAL &&
+		 dst.prec != RSX_FP_PRECISION_UNKNOWN))        // Cannot have precision modifiers
+	{
+		return false;
+	}
+
+	// Check if we have precision modifiers on the source
+	if (src0.abs || src0.neg || src1.scale)
+	{
+		return false;
+	}
+
+	if (dst.mask_x && src0.swizzle_x != 0) return false;
+	if (dst.mask_y && src0.swizzle_y != 1) return false;
+	if (dst.mask_z && src0.swizzle_z != 2) return false;
+	if (dst.mask_w && src0.swizzle_w != 3) return false;
+
+	return true;
+}
+
 FragmentProgramDecompiler::FragmentProgramDecompiler(const RSXFragmentProgram &prog, u32& size)
 	: m_size(size)
 	, m_prog(prog)
@@ -166,7 +353,10 @@ void FragmentProgramDecompiler::SetDst(std::string code, u32 flags)
 		}
 	}
 
-	temp_registers[reg_index].tag(dst.dest_reg, !!dst.fp16, dst.mask_x, dst.mask_y, dst.mask_z, dst.mask_w);
+	if (!is_delay_slot())
+	{
+		temp_registers[reg_index].tag(dst.dest_reg, !!dst.fp16, dst.mask_x, dst.mask_y, dst.mask_z, dst.mask_w);
+	}
 }
 
 void FragmentProgramDecompiler::AddFlowOp(const std::string& code)
@@ -501,22 +691,33 @@ void FragmentProgramDecompiler::AddCodeCond(const std::string& lhs, const std::s
 	AddCode(lhs + " = _select(" + lhs + ", " + src_prefix + rhs + ", " + cond + ");");
 }
 
-template<class T> std::string FragmentProgramDecompiler::GetSRC(T src)
+template <typename T>
+	requires std::is_same_v<T, SRC0> || std::is_same_v<T, SRC1> || std::is_same_v<T, SRC2>
+std::string FragmentProgramDecompiler::GetSRC(T src)
 {
 	std::string ret;
 	u32 precision_modifier = 0;
+	u32 operand_idx = umax;
 
 	if constexpr (std::is_same_v<T, SRC0>)
 	{
 		precision_modifier = src1.src0_prec_mod;
+		operand_idx = 0;
 	}
 	else if constexpr (std::is_same_v<T, SRC1>)
 	{
 		precision_modifier = src1.src1_prec_mod;
+		operand_idx = 1;
 	}
 	else if constexpr (std::is_same_v<T, SRC2>)
 	{
 		precision_modifier = src1.src2_prec_mod;
+		operand_idx = 2;
+	}
+	else
+	{
+		// Unreachable unless we add another SRC type
+		fmt::throw_exception("Invalid SRC input");
 	}
 
 	switch (src.reg_type)
@@ -525,21 +726,45 @@ template<class T> std::string FragmentProgramDecompiler::GetSRC(T src)
 
 		if (!src.fp16)
 		{
-			if (dst.opcode == RSX_FP_OPCODE_UP16 ||
-				dst.opcode == RSX_FP_OPCODE_UP2 ||
-				dst.opcode == RSX_FP_OPCODE_UP4 ||
-				dst.opcode == RSX_FP_OPCODE_UPB ||
-				dst.opcode == RSX_FP_OPCODE_UPG)
+			// We need to determine if any vector lanes need a gather op
+			// In theory, splitting can also be required, but that is currently unsupported
+			u32 src_lane_mask = is_delay_slot() ? 0u : get_src_vector_mask(dst.opcode, operand_idx);
+			std::unordered_set<u8> lanes_to_gather;
+
+			const bool apply_dst_mask = src_lane_mask & (1u << 31);
+			src_lane_mask &= ~(1u << 31);
+
+			if (apply_dst_mask && !dst.no_dest)
 			{
-				auto &reg = temp_registers[src.tmp_reg_index];
-				if (reg.requires_gather(src.swizzle_x))
+				if (!dst.mask_x) src_lane_mask &= ~(1u << 0);
+				if (!dst.mask_y) src_lane_mask &= ~(1u << 1);
+				if (!dst.mask_z) src_lane_mask &= ~(1u << 2);
+				if (!dst.mask_w) src_lane_mask &= ~(1u << 3);
+			}
+
+			if (src_lane_mask & (1u << 0)) lanes_to_gather.insert(src.swizzle_x);
+			if (src_lane_mask & (1u << 1)) lanes_to_gather.insert(src.swizzle_y);
+			if (src_lane_mask & (1u << 2)) lanes_to_gather.insert(src.swizzle_z);
+			if (src_lane_mask & (1u << 3)) lanes_to_gather.insert(src.swizzle_w);
+
+			auto& reg = temp_registers[src.tmp_reg_index];
+			bool skip_reg_assign = false;
+			for (const auto& ch : lanes_to_gather)
+			{
+				if (reg.requires_gather(ch))
 				{
 					properties.has_gather_op = true;
 					AddReg(src.tmp_reg_index, src.fp16);
 					ret = getFloatTypeName(4) + reg.gather_r();
+					skip_reg_assign = true;
 					break;
 				}
 			}
+
+			if (skip_reg_assign)
+			{
+				break;
+			}
 		}
 		else if (precision_modifier == RSX_FP_PRECISION_HALF)
 		{
diff --git a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h
index b68750bdfc..c4a46f590a 100644
--- a/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h
+++ b/rpcs3/Emu/RSX/Program/FragmentProgramDecompiler.h
@@ -81,7 +81,11 @@ class FragmentProgramDecompiler
 	void AddCodeCond(const std::string& lhs, const std::string& rhs);
 	std::string GetRawCond();
 	std::string GetCond();
-	template<typename T> std::string GetSRC(T src);
+
+	template <typename T>
+		requires std::is_same_v<T, SRC0> || std::is_same_v<T, SRC1> || std::is_same_v<T, SRC2>
+	std::string GetSRC(T src);
+
 	std::string BuildCode();
 
 	static u32 GetData(const u32 d) { return d << 16 | d >> 16; }
@@ -100,6 +104,17 @@ class FragmentProgramDecompiler
 	 */
 	bool handle_tex_srb(u32 opcode);
 
+	/**
+	 * Calculates the lane mask for a given input
+	 * This is a temporary workaround until the decompiler is rewritten with some IR to allow granular split/gather passes
+	 */
+	u32 get_src_vector_mask(u32 opcode, u32 operand) const;
+
+	/**
+	 * Detects delay slots. These evaluate to a NOP so we don't actually need to emit them
+	 */
+	bool is_delay_slot() const;
+
 protected:
 	const RSXFragmentProgram &m_prog;
 	u32 m_ctrl = 0;
diff --git a/rpcs3/Emu/RSX/RSXTexture.cpp b/rpcs3/Emu/RSX/RSXTexture.cpp
index 9b8f4bfb2e..52f8183545 100644
--- a/rpcs3/Emu/RSX/RSXTexture.cpp
+++ b/rpcs3/Emu/RSX/RSXTexture.cpp
@@ -251,7 +251,7 @@ namespace rsx
 
 	u8 fragment_texture::convolution_filter() const
 	{
-		return ((registers[NV4097_SET_TEXTURE_FILTER + (m_index * 8)] >> 13) & 0xf);
+		return ((registers[NV4097_SET_TEXTURE_FILTER + (m_index * 8)] >> 13) & 0x7);
 	}
 
 	u8 fragment_texture::argb_signed() const
diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
index 454038c962..1e910f0f81 100644
--- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
+++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp
@@ -716,7 +716,7 @@ namespace vk
 			view_swizzle = source->native_component_map;
 		}
 
-		image->set_debug_name("Temp view");
+		image->set_debug_name(fmt::format("Temp view, fmt=0x%x", gcm_format));
 		image->set_native_component_layout(view_swizzle);
 
 		auto view = image->get_view(remap_vector);
diff --git a/rpcs3/Emu/RSX/gcm_enums.h b/rpcs3/Emu/RSX/gcm_enums.h
index 61b51c3857..4c88176fa4 100644
--- a/rpcs3/Emu/RSX/gcm_enums.h
+++ b/rpcs3/Emu/RSX/gcm_enums.h
@@ -954,6 +954,8 @@ namespace gcm
 		CELL_GCM_TEXTURE_LINEAR_LINEAR = 6,
 		CELL_GCM_TEXTURE_CONVOLUTION_MIN = 7,
 		CELL_GCM_TEXTURE_CONVOLUTION_MAG = 4,
+
+		// Convolution mode
 		CELL_GCM_TEXTURE_CONVOLUTION_QUINCUNX = 1,
 		CELL_GCM_TEXTURE_CONVOLUTION_GAUSSIAN = 2,
 		CELL_GCM_TEXTURE_CONVOLUTION_QUINCUNX_ALT = 3,