#include "stdafx.h"
#include "ProgramStateCache.h"
#include "Emu/system_config.h"
#include "Emu/RSX/Core/RSXDriverState.h"
#include "util/sysinfo.hpp"

#include <stack>

#if defined(ARCH_X64)
#include "emmintrin.h"
#include "immintrin.h"
#endif

#ifdef ARCH_ARM64
#ifndef _MSC_VER
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif
#include "Emu/CPU/sse2neon.h"
#ifndef _MSC_VER
#pragma GCC diagnostic pop
#endif
#endif
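
// AVX512_ICL_FUNC marks the AVX-512 (Ice Lake feature level) helpers below. MSVC emits AVX-512
// intrinsics without a target attribute, so it expands to nothing there; GCC/Clang need the explicit
// __target__ list because the rest of the file is built for the baseline architecture. The helpers are
// only called after a runtime utils::has_avx512_icl() check.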
#ifdef _MSC_VER
#define AVX512_ICL_FUNC
#else
#define AVX512_ICL_FUNC __attribute__((__target__("avx512f,avx512bw,avx512dq,avx512cd,avx512vl,avx512bitalg,avx512ifma,avx512vbmi,avx512vbmi2,avx512vnni,avx512vpopcntdq")))
#endif

using namespace program_hash_util;

#ifdef ARCH_X64
AVX512_ICL_FUNC usz get_vertex_program_ucode_hash_512(const RSXVertexProgram& program)
{
	// Load all elements of the instruction_mask bitset
	const __m512i* instMask512 = reinterpret_cast<const __m512i*>(&program.instruction_mask);
	const __m128i* instMask128 = reinterpret_cast<const __m128i*>(&program.instruction_mask);

	const __m512i lowerMask = _mm512_loadu_si512(instMask512);
	const __m128i upper128 = _mm_loadu_si128(instMask128 + 4);
	const __m512i upperMask = _mm512_zextsi128_si512(upper128);
	__m512i maskIndex = _mm512_setzero_si512();
	const __m512i negativeOnes = _mm512_set1_epi64(-1);

	// Special masks to test against bitset
	const __m512i testMask0 = _mm512_set_epi64(
		0x0808080808080808,
		0x0808080808080808,
		0x0404040404040404,
		0x0404040404040404,
		0x0202020202020202,
		0x0202020202020202,
		0x0101010101010101,
		0x0101010101010101);

	const __m512i testMask1 = _mm512_set_epi64(
		0x8080808080808080,
		0x8080808080808080,
		0x4040404040404040,
		0x4040404040404040,
		0x2020202020202020,
		0x2020202020202020,
		0x1010101010101010,
		0x1010101010101010);
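
	// Each loop iteration consumes one byte of the bitset and 8 instructions (2x 64 bytes) of ucode.
	// maskIndex broadcasts bitset byte N to every byte lane via vpermi2b; testMask0/testMask1 then probe one
	// bit per pair of 64-bit lanes, so an instruction's two qwords are only load-enabled when its
	// instruction_mask bit is set (bits 0-3 gate the first 64-byte load, bits 4-7 the second).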

	const __m512i* instBuffer = reinterpret_cast<const __m512i*>(program.data.data());
	__m512i acc0 = _mm512_setzero_si512();
	__m512i acc1 = _mm512_setzero_si512();

	__m512i rotMask0 = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
	__m512i rotMask1 = _mm512_set_epi64(15, 14, 13, 12, 11, 10, 9, 8);
	const __m512i rotMaskAdd = _mm512_set_epi64(16, 16, 16, 16, 16, 16, 16, 16);
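
	// Per-lane rotation amounts mirror the scalar fallback below: instruction i rotates its low qword
	// right by 2*i and its high qword by 2*i + 1, so this path and get_vertex_program_ucode_hash()
	// should produce the same value for the same program.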

	u32 instIndex = 0;

	// If there is a remainder, add an extra (masked) iteration
	const u32 extraIteration = (program.data.size() % 32 != 0) ? 1 : 0;
	const u32 length = static_cast<u32>(program.data.size() / 32) + extraIteration;

	// The instruction mask will prevent us from reading out of bounds, so we do not need a separate masked loop
	// for the remainder, or a scalar loop.
	while (instIndex < length)
	{
		const __m512i masks = _mm512_permutex2var_epi8(lowerMask, maskIndex, upperMask);
		const __mmask8 result0 = _mm512_test_epi64_mask(masks, testMask0);
		const __mmask8 result1 = _mm512_test_epi64_mask(masks, testMask1);
		const __m512i load0 = _mm512_maskz_loadu_epi64(result0, (instBuffer + instIndex * 2));
		const __m512i load1 = _mm512_maskz_loadu_epi64(result1, (instBuffer + (instIndex * 2) + 1));

		const __m512i rotated0 = _mm512_rorv_epi64(load0, rotMask0);
		const __m512i rotated1 = _mm512_rorv_epi64(load1, rotMask1);

		acc0 = _mm512_add_epi64(acc0, rotated0);
		acc1 = _mm512_add_epi64(acc1, rotated1);

		rotMask0 = _mm512_add_epi64(rotMask0, rotMaskAdd);
		rotMask1 = _mm512_add_epi64(rotMask1, rotMaskAdd);
		maskIndex = _mm512_sub_epi8(maskIndex, negativeOnes);

		instIndex++;
	}

	const __m512i result = _mm512_add_epi64(acc0, acc1);
	return _mm512_reduce_add_epi64(result);
}
#endif

usz vertex_program_utils::get_vertex_program_ucode_hash(const RSXVertexProgram& program)
{
	// Checksum as hash with rotated data
	const void* instbuffer = program.data.data();
	u32 instIndex = 0;
	usz acc0 = 0;
	usz acc1 = 0;

	do
	{
		if (program.instruction_mask[instIndex])
		{
			const auto inst = v128::loadu(instbuffer, instIndex);
			const usz tmp0 = std::rotr(inst._u64[0], instIndex * 2);
			acc0 += tmp0;
			const usz tmp1 = std::rotr(inst._u64[1], (instIndex * 2) + 1);
			acc1 += tmp1;
		}

		instIndex++;
	} while (instIndex < (program.data.size() / 4));

	return acc0 + acc1;
}

vertex_program_utils::vertex_program_metadata vertex_program_utils::analyse_vertex_program(const u32* data, u32 entry, RSXVertexProgram& dst_prog)
{
	vertex_program_utils::vertex_program_metadata result{};
	//u32 last_instruction_address = 0;
	//u32 first_instruction_address = entry;

	std::bitset<rsx::max_vertex_program_instructions> instructions_to_patch;
	std::pair<u32, u32> instruction_range{ umax, 0 };
	bool has_branch_instruction = false;
	std::stack<u32> call_stack;

	D3 d3{};
	D2 d2{};
	D1 d1{};
	D0 d0{};
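
	// walk_function traverses the control flow starting at 'start', marking every reachable instruction
	// in result.instruction_mask and widening instruction_range as it goes. Conditional branch targets
	// are collected and walked afterwards with fast_exit set, so those walks stop at the first end marker.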
	std::function<void(u32, bool)> walk_function = [&](u32 start, bool fast_exit)
	{
		u32 current_instruction = start;
		std::set<u32> conditional_targets;

		while (true)
		{
			ensure(current_instruction < rsx::max_vertex_program_instructions);

			if (result.instruction_mask[current_instruction])
			{
				if (!fast_exit)
				{
					// This can be harmless if a dangling RET was encountered before.
					// This can also be legal in case of BRB...BRI loops since BRIs are conditional. Might just be a loop with exit cond.
					rsx_log.warning("vp_analyser: Possible infinite loop detected");
				}

				// There is never any reason to continue scanning after self-intersecting on the control-flow tree.
				break;
			}

			const auto instruction = v128::loadu(&data[current_instruction * 4]);
			d1.HEX = instruction._u32[1];
			d2.HEX = instruction._u32[2];
			d3.HEX = instruction._u32[3];

			// Touch current instruction
			result.instruction_mask[current_instruction] = true;
			instruction_range.first = std::min(current_instruction, instruction_range.first);
			instruction_range.second = std::max(current_instruction, instruction_range.second);

			// Check whether the current instruction references an input stream
			auto input_attribute_ref = [&]()
			{
				if (!d1.input_src)
				{
					// It is possible to reference ATTR0, but this is mandatory anyway. No need to explicitly test for it
					return;
				}

				const auto ref_mask = (1u << d1.input_src);
				if ((result.referenced_inputs_mask & ref_mask) == 0)
				{
					// Type is encoded in the first 2 bits of each block
					const auto src0 = d2.src0l & 0x3;
					const auto src1 = d2.src1 & 0x3;
					const auto src2 = d3.src2l & 0x3;

					if ((src0 == RSX_VP_REGISTER_TYPE_INPUT) ||
						(src1 == RSX_VP_REGISTER_TYPE_INPUT) ||
						(src2 == RSX_VP_REGISTER_TYPE_INPUT))
					{
						result.referenced_inputs_mask |= ref_mask;
					}
				}
			};

			auto branch_to = [&](const u32 target)
			{
				input_attribute_ref();
				current_instruction = target;
			};

			// Basic vec op analysis, must be done before flow analysis
			switch (d1.vec_opcode)
			{
			case RSX_VEC_OPCODE_NOP:
			{
				break;
			}
			case RSX_VEC_OPCODE_TXL:
			{
				result.referenced_textures_mask |= (1 << d2.tex_num);
				break;
			}
			default:
			{
				input_attribute_ref();
				break;
			}
			}

			bool static_jump = false;
			bool function_call = true;

			switch (d1.sca_opcode)
			{
			case RSX_SCA_OPCODE_NOP:
			{
				break;
			}
			case RSX_SCA_OPCODE_BRI:
			{
				d0.HEX = instruction._u32[0];
				static_jump = (d0.cond == 0x7);
				[[fallthrough]];
			}
			case RSX_SCA_OPCODE_BRB:
			{
				function_call = false;
				[[fallthrough]];
			}
			case RSX_SCA_OPCODE_CAL:
			case RSX_SCA_OPCODE_CLI:
			case RSX_SCA_OPCODE_CLB:
			{
				// Need to patch the jump address to be consistent wherever the program is located
				instructions_to_patch[current_instruction] = true;
				has_branch_instruction = true;
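
				// The 10-bit branch target is split across three instruction words:
				// d0.iaddrh2 holds bit 9, d2.iaddrh bits 3-8 and d3.iaddrl bits 0-2
				// (the same layout is re-encoded when the address is relocated below).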
				d0.HEX = instruction._u32[0];
				const u32 jump_address = (d0.iaddrh2 << 9) | (d2.iaddrh << 3) | d3.iaddrl;

				if (function_call)
				{
					call_stack.push(current_instruction + 1);
					branch_to(jump_address);
					continue;
				}
				else if (static_jump)
				{
					// NOTE: This will skip potential jump target blocks between current->target
					branch_to(jump_address);
					continue;
				}
				else
				{
					// Set possible end address and proceed as usual
					conditional_targets.emplace(jump_address);
					instruction_range.second = std::max(jump_address, instruction_range.second);
				}

				break;
			}
			case RSX_SCA_OPCODE_RET:
			{
				if (call_stack.empty())
				{
					rsx_log.error("vp_analyser: RET found outside subroutine call");
				}
				else
				{
					branch_to(call_stack.top());
					call_stack.pop();
					continue;
				}

				break;
			}
			default:
			{
				input_attribute_ref();
				break;
			}
			}

			// Check exit conditions...
			if (d3.end)
			{
				// We have seen an end of instructions marker.
				// Multiple exits may exist, usually skipped over by branching. Do not exit on end unless there is no branching.
				if (!has_branch_instruction || fast_exit || current_instruction >= instruction_range.second)
				{
					// Conditions:
					// 1. No branching so far. This will always be the exit.
					// 2. Fast exit flag is set. This happens when walking through subroutines.
					// 3. We've gone beyond the known instruction range. In this scenario, this is the furthest end marker seen so far. It has to be reached by some earlier branch.
					break;
				}
			}
			else if ((current_instruction + 1) == rsx::max_vertex_program_instructions)
			{
				// No more instructions to read.
				break;
			}

			current_instruction++;
		}

		for (const u32 target : conditional_targets)
		{
			if (!result.instruction_mask[target])
			{
				walk_function(target, true);
			}
		}
	};

	if (g_cfg.video.debug_program_analyser)
	{
		fs::file dump(fs::get_cache_dir() + "shaderlog/vp_analyser.bin", fs::rewrite);
		dump.write(&entry, 4);
		dump.write(data, rsx::max_vertex_program_instructions * 16);
		dump.close();
	}

	walk_function(entry, false);

	const u32 instruction_count = (instruction_range.second - instruction_range.first + 1);
	result.ucode_length = instruction_count * 16;

	dst_prog.base_address = instruction_range.first;
	dst_prog.entry = entry;
	dst_prog.data.resize(instruction_count * 4);
	dst_prog.instruction_mask = (result.instruction_mask >> instruction_range.first);

	if (!has_branch_instruction)
	{
		ensure(instruction_range.first == entry);
		std::memcpy(dst_prog.data.data(), data + (instruction_range.first * 4), result.ucode_length);
	}
	else
	{
		for (u32 i = instruction_range.first, count = 0; i <= instruction_range.second; ++i, ++count)
		{
			const u32* instruction = &data[i * 4];
			u32* dst = &dst_prog.data[count * 4];

			if (result.instruction_mask[i])
			{
				v128::storeu(v128::loadu(instruction), dst);

				if (instructions_to_patch[i])
				{
					d0.HEX = dst[0];
					d2.HEX = dst[2];
					d3.HEX = dst[3];

					u32 address = (d0.iaddrh2 << 9) | (d2.iaddrh << 3) | d3.iaddrl;
					address -= instruction_range.first;

					d0.iaddrh2 = (address >> 9) & 0x1;
					d2.iaddrh = (address >> 3) & 0x3F;
					d3.iaddrl = (address & 0x7);

					dst[0] = d0.HEX;
					dst[2] = d2.HEX;
					dst[3] = d3.HEX;

					dst_prog.jump_table.emplace(address);
				}
			}
			else
			{
				v128::storeu({}, dst);
			}
		}

		// Typical ubershaders have the dispatch at the top with subroutines following. However...
		// some games have the dispatch block at the end and the subroutines above them.
		// We need to simulate a jump-to-entry in this situation.
		// Normally this condition is handled by the conditional_targets walk, but sometimes this doesn't work due to cyclic branches.
		if (instruction_range.first < dst_prog.entry)
		{
			// Is there a subroutine that jumps into the entry? If not, add to jump table
			const auto target = dst_prog.entry - instruction_range.first;
			dst_prog.jump_table.insert(target);
		}

		// Verification
		for (const u32 target : dst_prog.jump_table)
		{
			if (!dst_prog.instruction_mask[target])
			{
				rsx_log.error("vp_analyser: Failed, branch target 0x%x was not resolved", target);
			}
		}
	}

	result.referenced_inputs_mask |= 1u; // VPOS is always enabled, else no rendering can happen

	return result;
}

usz vertex_program_storage_hash::operator()(const RSXVertexProgram& program) const
{
#ifdef ARCH_X64
	usz ucode_hash;
	if (utils::has_avx512_icl())
	{
		ucode_hash = get_vertex_program_ucode_hash_512(program);
	}
	else
	{
		ucode_hash = vertex_program_utils::get_vertex_program_ucode_hash(program);
	}
#else
	const usz ucode_hash = vertex_program_utils::get_vertex_program_ucode_hash(program);
#endif

	const u32 state_params[] =
	{
		program.ctrl,
		program.output_mask,
		program.texture_state.texture_dimensions,
		program.texture_state.multisampled_textures,
	};

	const usz metadata_hash = rpcs3::hash_array(state_params);
	return rpcs3::hash64(ucode_hash, metadata_hash);
}
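
// Masked AVX-512 comparison of two programs' ucode. Reuses the same bitset-to-lane-mask expansion as
// get_vertex_program_ucode_hash_512 above, so only instructions covered by binary1's instruction_mask
// are actually compared.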
#ifdef ARCH_X64
AVX512_ICL_FUNC bool vertex_program_compare_512(const RSXVertexProgram& binary1, const RSXVertexProgram& binary2)
{
	// Load all elements of the instruction_mask bitset
	const __m512i* instMask512 = reinterpret_cast<const __m512i*>(&binary1.instruction_mask);
	const __m128i* instMask128 = reinterpret_cast<const __m128i*>(&binary1.instruction_mask);

	const __m512i lowerMask = _mm512_loadu_si512(instMask512);
	const __m128i upper128 = _mm_loadu_si128(instMask128 + 4);
	const __m512i upperMask = _mm512_zextsi128_si512(upper128);
	__m512i maskIndex = _mm512_setzero_si512();
	const __m512i negativeOnes = _mm512_set1_epi64(-1);

	// Special masks to test against bitset
	const __m512i testMask0 = _mm512_set_epi64(
		0x0808080808080808,
		0x0808080808080808,
		0x0404040404040404,
		0x0404040404040404,
		0x0202020202020202,
		0x0202020202020202,
		0x0101010101010101,
		0x0101010101010101);

	const __m512i testMask1 = _mm512_set_epi64(
		0x8080808080808080,
		0x8080808080808080,
		0x4040404040404040,
		0x4040404040404040,
		0x2020202020202020,
		0x2020202020202020,
		0x1010101010101010,
		0x1010101010101010);

	const __m512i* instBuffer1 = reinterpret_cast<const __m512i*>(binary1.data.data());
	const __m512i* instBuffer2 = reinterpret_cast<const __m512i*>(binary2.data.data());

	// If there is a remainder, add an extra (masked) iteration
	const u32 extraIteration = (binary1.data.size() % 32 != 0) ? 1 : 0;
	const u32 length = static_cast<u32>(binary1.data.size() / 32) + extraIteration;
	u32 instIndex = 0;

	// The instruction mask will prevent us from reading out of bounds, so we do not need a separate masked loop
	// for the remainder, or a scalar loop.
	while (instIndex < length)
	{
		const __m512i masks = _mm512_permutex2var_epi8(lowerMask, maskIndex, upperMask);
		const __mmask8 result0 = _mm512_test_epi64_mask(masks, testMask0);
		const __mmask8 result1 = _mm512_test_epi64_mask(masks, testMask1);

		const __m512i load0 = _mm512_maskz_loadu_epi64(result0, (instBuffer1 + (instIndex * 2)));
		const __m512i load1 = _mm512_maskz_loadu_epi64(result0, (instBuffer2 + (instIndex * 2)));
		const __m512i load2 = _mm512_maskz_loadu_epi64(result1, (instBuffer1 + (instIndex * 2) + 1));
		const __m512i load3 = _mm512_maskz_loadu_epi64(result1, (instBuffer2 + (instIndex * 2) + 1));

		const __mmask8 res0 = _mm512_cmpneq_epi64_mask(load0, load1);
		const __mmask8 res1 = _mm512_cmpneq_epi64_mask(load2, load3);

		const u8 result = _kortestz_mask8_u8(res0, res1);
		// kortestz will set result to 1 if all bits are zero, so invert the check for result
		if (!result)
		{
			return false;
		}

		maskIndex = _mm512_sub_epi8(maskIndex, negativeOnes);
		instIndex++;
	}

	return true;
}
#endif

bool vertex_program_compare::operator()(const RSXVertexProgram& binary1, const RSXVertexProgram& binary2) const
{
	if (!compare_properties(binary1, binary2))
	{
		return false;
	}

	if (binary1.data.size() != binary2.data.size() ||
		binary1.jump_table != binary2.jump_table)
	{
		return false;
	}

#ifdef ARCH_X64
	if (utils::has_avx512_icl())
	{
		return vertex_program_compare_512(binary1, binary2);
	}
#endif

	const void* instBuffer1 = binary1.data.data();
	const void* instBuffer2 = binary2.data.data();

	usz instIndex = 0;
	while (instIndex < (binary1.data.size() / 4))
	{
		if (binary1.instruction_mask[instIndex])
		{
			const auto inst1 = v128::loadu(instBuffer1, instIndex);
			const auto inst2 = v128::loadu(instBuffer2, instIndex);

			if (inst1._u ^ inst2._u)
			{
				return false;
			}
		}

		instIndex++;
	}

	return true;
}

bool vertex_program_compare::compare_properties(const RSXVertexProgram& binary1, const RSXVertexProgram& binary2)
{
	return binary1.output_mask == binary2.output_mask &&
		binary1.ctrl == binary2.ctrl &&
		binary1.texture_state == binary2.texture_state;
}
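
// Each of the three source operands (raw instruction words 1-3) stores its register type in bits 8-9
// of the byte-swapped ucode. A value of 2 marks a constant source, which means the next 16-byte slot in
// the stream holds inline constant data instead of an instruction; callers skip a slot when this returns true.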
bool fragment_program_utils::is_any_src_constant(v128 sourceOperand)
{
	const u64 masked = sourceOperand._u64[1] & 0x30000000300;
	return (sourceOperand._u32[1] & 0x300) == 0x200 || (static_cast<u32>(masked) == 0x200 || static_cast<u32>(masked >> 32) == 0x200);
}
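
// Returns the size of the fragment program in bytes, including inline constant slots, by scanning for
// the end bit (bit 8 of the first raw word) and counting an extra 16-byte slot after every instruction
// that references a constant.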
usz fragment_program_utils::get_fragment_program_ucode_size(const void* ptr)
{
	const auto instBuffer = ptr;
	usz instIndex = 0;

	while (true)
	{
		const v128 inst = v128::loadu(instBuffer, instIndex);
		const bool end = (inst._u32[0] >> 8) & 0x1;

		if (is_any_src_constant(inst))
		{
			instIndex += 2;
			if (end)
				return instIndex * 4 * 4;

			continue;
		}

		instIndex++;
		if (end)
			return instIndex * 4 * 4;
	}
}
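
// Single pass over the fragment program ucode to collect metadata: the offset of the first real
// instruction (leading NOPs are skipped), the ucode and inline-constant lengths, the referenced texture
// mask, and whether branch or pack/unpack instructions are present.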
fragment_program_utils::fragment_program_metadata fragment_program_utils::analyse_fragment_program(const void* ptr)
{
	fragment_program_utils::fragment_program_metadata result{};
	result.program_start_offset = -1;

	const auto instBuffer = ptr;
	s32 index = 0;

	// Find the start of the program
	while (true)
	{
		const auto inst = v128::loadu(instBuffer, index);

		const u32 opcode = (inst._u32[0] >> 16) & 0x3F;
		if (opcode)
		{
			// We found the start of the program, don't advance the index
			result.program_start_offset = index * 16;
			break;
		}

		if ((inst._u32[0] >> 8) & 0x1)
		{
			result.program_start_offset = index * 16;
			result.program_ucode_length = 16;
			result.is_nop_shader = true;
			return result;
		}

		index++;
	}

	while (true)
	{
		const auto inst = v128::loadu(instBuffer, index);

		// Check for the opcode high bit which indicates a branch instruction (opcodes 0x40...0x45)
		if (inst._u32[2] & (1 << 23))
		{
			// NOTE: Jump instructions are not yet proven to work outside of loops and if/else blocks
			// Otherwise we would need to follow the execution chain
			result.has_branch_instructions = true;
		}
		else
		{
			const u32 opcode = (inst._u32[0] >> 16) & 0x3F;
			if (opcode)
			{
				switch (opcode)
				{
				case RSX_FP_OPCODE_TEX:
				case RSX_FP_OPCODE_TEXBEM:
				case RSX_FP_OPCODE_TXP:
				case RSX_FP_OPCODE_TXPBEM:
				case RSX_FP_OPCODE_TXD:
				case RSX_FP_OPCODE_TXB:
				case RSX_FP_OPCODE_TXL:
				{
					// Bits 17-20 of word 1, swapped within u16 sections
					// Bits 16-23 are swapped into the upper 8 bits (24-31)
					const u32 tex_num = (inst._u32[0] >> 25) & 15;
					result.referenced_textures_mask |= (1 << tex_num);
					break;
				}
				case RSX_FP_OPCODE_PK4:
				case RSX_FP_OPCODE_UP4:
				case RSX_FP_OPCODE_PK2:
				case RSX_FP_OPCODE_UP2:
				case RSX_FP_OPCODE_PKB:
				case RSX_FP_OPCODE_UPB:
				case RSX_FP_OPCODE_PK16:
				case RSX_FP_OPCODE_UP16:
				case RSX_FP_OPCODE_PKG:
				case RSX_FP_OPCODE_UPG:
				{
					result.has_pack_instructions = true;
					break;
				}
				}
			}

			if (is_any_src_constant(inst))
			{
				// Instruction references a constant, skip one slot occupied by the data
				index++;
				result.program_constants_buffer_length += 16;
			}
		}

		index++;

		if ((inst._u32[0] >> 8) & 0x1)
		{
			break;
		}
	}

	result.program_ucode_length = (index - (result.program_start_offset / 16)) * 16;
	return result;
}

usz fragment_program_utils::get_fragment_program_ucode_hash(const RSXFragmentProgram& program)
{
	// Checksum as hash with rotated data
	const void* instbuffer = program.get_data();
	usz acc0 = 0;
	usz acc1 = 0;

	for (int instIndex = 0; instIndex < static_cast<int>(program.ucode_length / 16); instIndex++)
	{
		const auto inst = v128::loadu(instbuffer, instIndex);
		const usz tmp0 = std::rotr(inst._u64[0], instIndex * 2);
		acc0 += tmp0;
		const usz tmp1 = std::rotr(inst._u64[1], (instIndex * 2) + 1);
		acc1 += tmp1;

		// Skip constants
		if (fragment_program_utils::is_any_src_constant(inst))
			instIndex++;
	}

	return acc0 + acc1;
}

usz fragment_program_storage_hash::operator()(const RSXFragmentProgram& program) const
{
	const usz ucode_hash = fragment_program_utils::get_fragment_program_ucode_hash(program);
	const u32 state_params[] =
	{
		program.ctrl,
		program.two_sided_lighting ? 1u : 0u,
		program.texture_state.texture_dimensions,
		program.texture_state.shadow_textures,
		program.texture_state.redirected_textures,
		program.texture_state.multisampled_textures,
		program.texcoord_control_mask,
		program.mrt_buffers_count
	};

	const usz metadata_hash = rpcs3::hash_array(state_params);
	return rpcs3::hash64(ucode_hash, metadata_hash);
}

bool fragment_program_compare::operator()(const RSXFragmentProgram& binary1, const RSXFragmentProgram& binary2) const
{
	if (!compare_properties(binary1, binary2))
	{
		return false;
	}

	const void* instBuffer1 = binary1.get_data();
	const void* instBuffer2 = binary2.get_data();

	for (usz instIndex = 0; instIndex < (binary1.ucode_length / 16); instIndex++)
	{
		const auto inst1 = v128::loadu(instBuffer1, instIndex);
		const auto inst2 = v128::loadu(instBuffer2, instIndex);

		if (inst1._u ^ inst2._u)
		{
			return false;
		}

		// Skip constants
		if (fragment_program_utils::is_any_src_constant(inst1))
			instIndex++;
	}

	return true;
}

bool fragment_program_compare::compare_properties(const RSXFragmentProgram& binary1, const RSXFragmentProgram& binary2)
{
	return binary1.ucode_length == binary2.ucode_length &&
		binary1.ctrl == binary2.ctrl &&
		binary1.texture_state == binary2.texture_state &&
		binary1.texcoord_control_mask == binary2.texcoord_control_mask &&
		binary1.two_sided_lighting == binary2.two_sided_lighting &&
		binary1.mrt_buffers_count == binary2.mrt_buffers_count;
}

namespace rsx
{
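	// Fragment program constants are stored in the ucode with the two bytes of each 16-bit half swapped.
	// Both writers below undo that swap while copying the four 32-bit components of each constant into the
	// destination buffer, optionally flushing NaN/Inf values to zero when sanitize is set.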
#if defined(ARCH_X64) || defined(ARCH_ARM64)
	static inline void write_fragment_constants_to_buffer_sse2(const std::span<f32>& buffer, const RSXFragmentProgram& rsx_prog, const std::vector<usz>& offsets_cache, bool sanitize)
	{
		f32* dst = buffer.data();
		for (usz offset_in_fragment_program : offsets_cache)
		{
			const char* data = static_cast<const char*>(rsx_prog.get_data()) + offset_in_fragment_program;

			const __m128i vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
			const __m128i shuffled_vector = _mm_or_si128(_mm_slli_epi16(vector, 8), _mm_srli_epi16(vector, 8));

			if (sanitize)
			{
				// Convert NaNs and Infs to 0
				const auto masked = _mm_and_si128(shuffled_vector, _mm_set1_epi32(0x7fffffff));
				const auto valid = _mm_cmplt_epi32(masked, _mm_set1_epi32(0x7f800000));
				const auto result = _mm_and_si128(shuffled_vector, valid);
				_mm_stream_si128(utils::bless<__m128i>(dst), result);
			}
			else
			{
				_mm_stream_si128(utils::bless<__m128i>(dst), shuffled_vector);
			}

			dst += 4;
		}
	}
#else
	static inline void write_fragment_constants_to_buffer_fallback(const std::span<f32>& buffer, const RSXFragmentProgram& rsx_prog, const std::vector<usz>& offsets_cache, bool sanitize)
	{
		f32* dst = buffer.data();
		for (usz offset_in_fragment_program : offsets_cache)
		{
			const char* data = static_cast<const char*>(rsx_prog.get_data()) + offset_in_fragment_program;

			for (u32 i = 0; i < 4; i++)
			{
				const u32 value = reinterpret_cast<const u32*>(data)[i];
				const u32 shuffled = ((value >> 8) & 0xff00ff) | ((value << 8) & 0xff00ff00);

				if (sanitize && (shuffled & 0x7fffffff) >= 0x7f800000)
				{
					dst[i] = 0.f;
				}
				else
				{
					dst[i] = std::bit_cast<f32>(shuffled);
				}
			}

			dst += 4;
		}
	}
#endif
2024-08-09 13:07:37 +02:00
void write_fragment_constants_to_buffer ( const std : : span < f32 > & buffer , const RSXFragmentProgram & rsx_prog , const std : : vector < usz > & offsets_cache , bool sanitize )
{
# if defined(ARCH_X64) || defined(ARCH_ARM64)
write_fragment_constants_to_buffer_sse2 ( buffer , rsx_prog , offsets_cache , sanitize ) ;
# else
write_fragment_constants_to_buffer_fallback ( buffer , rsx_prog , offsets_cache , sanitize ) ;
# endif
}
2025-03-08 17:03:05 +01:00
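
	// program_cache_hint_t remembers the most recently matched vertex/fragment programs along with the
	// state properties they were matched against, so callers can potentially skip a full cache lookup
	// while those properties still compare equal.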
	void program_cache_hint_t::invalidate(u32 flags)
	{
		if (flags & rsx::vertex_program_dirty)
		{
			m_cached_vertex_program = nullptr;
		}

		if (flags & rsx::fragment_program_dirty)
		{
			m_cached_fragment_program = nullptr;
		}
	}

	void program_cache_hint_t::invalidate_vertex_program(const RSXVertexProgram& p)
	{
		if (!m_cached_vertex_program)
		{
			return;
		}

		if (!vertex_program_compare::compare_properties(m_cached_vp_properties, p))
		{
			m_cached_vertex_program = nullptr;
		}
	}

	void program_cache_hint_t::invalidate_fragment_program(const RSXFragmentProgram& p)
	{
		if (!m_cached_fragment_program)
		{
			return;
		}

		if (!fragment_program_compare::compare_properties(m_cached_fp_properties, p))
		{
			m_cached_fragment_program = nullptr;
		}
	}

	void program_cache_hint_t::cache_vertex_program(program_cache_hint_t* cache, const RSXVertexProgram& ref, void* vertex_program)
	{
		if (!cache)
		{
			return;
		}

		cache->m_cached_vertex_program = vertex_program;
		cache->m_cached_vp_properties.output_mask = ref.output_mask;
		cache->m_cached_vp_properties.ctrl = ref.ctrl;
		cache->m_cached_vp_properties.texture_state = ref.texture_state;
	}

	void program_cache_hint_t::cache_fragment_program(program_cache_hint_t* cache, const RSXFragmentProgram& ref, void* fragment_program)
	{
		if (!cache)
		{
			return;
		}

		cache->m_cached_fragment_program = fragment_program;
		cache->m_cached_fp_properties.ucode_length = ref.ucode_length;
		cache->m_cached_fp_properties.ctrl = ref.ctrl;
		cache->m_cached_fp_properties.texture_state = ref.texture_state;
		cache->m_cached_fp_properties.texcoord_control_mask = ref.texcoord_control_mask;
		cache->m_cached_fp_properties.two_sided_lighting = ref.two_sided_lighting;
		cache->m_cached_fp_properties.mrt_buffers_count = ref.mrt_buffers_count;
	}
}