From 32a2c58441495871ea25ece3aea00bd0d85bb859 Mon Sep 17 00:00:00 2001 From: DH Date: Thu, 17 Oct 2024 04:42:08 +0300 Subject: [PATCH] gpu: shader: implement more instructions add v_mad_u64_u32, v_mad_i64_i32, image_sample_l stub s_ttracedata fix s_andn2_saveexec_b64, s_orn2_saveexec_b64, s_subb_u32 --- rpcsx/gpu/lib/gcn-shader/shaders/rdna.glsl | 106 +++++++++++++++++---- 1 file changed, 89 insertions(+), 17 deletions(-) diff --git a/rpcsx/gpu/lib/gcn-shader/shaders/rdna.glsl b/rpcsx/gpu/lib/gcn-shader/shaders/rdna.glsl index 808790290..0357208d4 100644 --- a/rpcsx/gpu/lib/gcn-shader/shaders/rdna.glsl +++ b/rpcsx/gpu/lib/gcn-shader/shaders/rdna.glsl @@ -497,7 +497,7 @@ uint32_t v_mbcnt_hi_u32_b32(uint32_t x, uint32_t y) { return (thread_id > 32 ? bitCount(x & ((1 << (thread_id - 32)) - 1)) : 0) + y; } uint32_t v_add_i32(inout uint64_t sdst, int32_t x, int32_t y) { - uint64_t result = uint64_t(x) + uint64_t(y); + uint64_t result = uint64_t(int64_t(x) + int64_t(y)); if (result > 0xffffffff) { sdst |= exec & (uint64_t(1) << thread_id); @@ -714,7 +714,7 @@ bool v_cmp_class_f64(float64_t x, uint vftypemask) { return CMP_CLASS(x, vftypem float32_t v_mad_legacy_f32(float32_t a, float32_t b, float32_t c) { return (a == 0 || b == 0) ? c : fma(a, b, c); } float32_t v_mad_f32(float32_t a, float32_t b, float32_t c) { return fma(a, b, c); } -uint32_t v_mad_i32_i24(int32_t a, int32_t b, int32_t c) { return mul24lo(a, b) + c; } +int32_t v_mad_i32_i24(int32_t a, int32_t b, int32_t c) { return mul24lo(a, b) + c; } uint32_t v_mad_u32_u24(uint32_t a, uint32_t b, uint32_t c) { return mul24lo(a, b) + c; } float32_t v_cubeid_f32(float32_t a, float32_t b, float32_t c) { if (abs(c) >= abs(a) && abs(c) >= abs(b)) { @@ -964,13 +964,38 @@ uint32_t v_msad_u8(uint32_t x, uint32_t y, uint32_t z) { // } // void v_mqsad_u32_u8() {} -// void v_mad_u64_u32() {} -// void v_mad_i64_i32() {} + +uint64_t v_mad_u64_u32(uint32_t a, uint32_t b, uint64_t c) { + uint32_t mulResult = a * b; + uint64_t result = mulResult + c; + + uint64_t thread_mask = uint64_t(1) << thread_id; + if (result < max(mulResult, c)) { + vcc |= thread_mask & exec; + } else { + vcc &= ~thread_mask; + } + return result; +} +int64_t v_mad_i64_i32(int32_t a, int32_t b, int64_t c) { + int32_t mulResult = a * b; + int64_t result = mulResult + c; + + uint64_t thread_mask = uint64_t(1) << thread_id; + if (sign(mulResult) == sign(c) && sign(mulResult) != result) { + vcc |= thread_mask & exec; + } else { + vcc &= ~thread_mask; + } + return result; +} // SOP bool scc; +void s_ttracedata(uint64_t) {} + void s_cmp_eq_i32(int32_t a, int32_t b) { scc = a == b; } void s_cmp_ge_i32(int32_t a, int32_t b) { scc = a >= b; } void s_cmp_gt_i32(int32_t a, int32_t b) { scc = a > b; } @@ -1128,13 +1153,13 @@ uint64_t s_xor_saveexec_b64(uint64_t x) { } uint64_t s_andn2_saveexec_b64(uint64_t x) { uint64_t result = exec; - exec = result & ~x; + exec = x & ~result; scc = result != 0; return result; } uint64_t s_orn2_saveexec_b64(uint64_t x) { uint64_t result = exec; - exec = result | ~x; + exec = x | ~result; scc = result != 0; return result; } @@ -1197,18 +1222,14 @@ int32_t s_sub_i32(int32_t x, int32_t y) { return result; } uint32_t s_addc_u32(uint32_t x, uint32_t y) { - uint32_t carry0; - uint32_t carry1 = 0; - uint32_t result = uaddCarry(x, y, carry0); - if (scc) { - result = uaddCarry(result, 1, carry1); - } - scc = (carry0 | carry1) != 0; - return result; + uint64_t result = uint64_t(x) + uint64_t(y) + (scc ? 1 : 0); + scc = result > 0xffffffff; + return uint32_t(result); } uint32_t s_subb_u32(uint32_t x, uint32_t y) { - uint32_t result = x - y - (scc ? 1 : 0); - scc = y + (scc ? 1 : 0) > x; + uint32_t sccValue = scc ? 1 : 0; + uint32_t result = x - y - sccValue; + scc = (y + sccValue) > x || sccValue > x; return result; } int32_t s_min_i32(int32_t x, int32_t y) { @@ -2779,7 +2800,58 @@ void image_sample(inout f32vec4 vdata, f32vec3 vaddr, int32_t textureIndexHint, // image_sample_cl // image_sample_d // image_sample_d_cl -// image_sample_l +void image_sample_l(inout f32vec4 vdata, f32vec4 vaddr, int32_t textureIndexHint, uint32_t tbuffer[8], int32_t samplerIndexHint, u32vec4 ssampler, uint32_t dmask) { + uint8_t textureType = tbuffer_type(tbuffer); + f32vec4 result; + switch (uint(textureType)) { + case kTextureType1D: + case kTextureTypeArray1D: + result = textureLod( + sampler1D( + textures1D[findTexture1DIndex(textureIndexHint, tbuffer)], + samplers[findSamplerIndex(samplerIndexHint, ssampler)] + ), vaddr.x, vaddr.y); + break; + + case kTextureType2D: + case kTextureTypeCube: + case kTextureTypeArray2D: + case kTextureTypeMsaa2D: + case kTextureTypeMsaaArray2D: + result = textureLod( + sampler2D( + textures2D[findTexture2DIndex(textureIndexHint, tbuffer)], + samplers[findSamplerIndex(samplerIndexHint, ssampler)] + ), vaddr.xy, vaddr.z); + break; + + case kTextureType3D: + result = textureLod( + sampler3D( + textures3D[findTexture3DIndex(textureIndexHint, tbuffer)], + samplers[findSamplerIndex(samplerIndexHint, ssampler)] + ), vaddr.xyz, vaddr.w); + break; + + default: + return; + } + + result = swizzle(result, + tbuffer_dst_sel_x(tbuffer), + tbuffer_dst_sel_y(tbuffer), + tbuffer_dst_sel_z(tbuffer), + tbuffer_dst_sel_w(tbuffer)); + + + int vdataIndex = 0; + for (int i = 0; i < 4; ++i) { + if ((dmask & (1 << i)) != 0) { + vdata[vdataIndex++] = result[i]; + } + } +} + // image_sample_b // image_sample_b_cl // image_sample_lz