rsx: Rework GPU deswizzle kernel to prevent hangs

2025-12-06 07:12:28 +01:00 · 2025-11-22 02:47:24 +03:00 · 2025-11-22 02:47:24 +03:00 · 7f6842705c
parent 9deb6cd4fa
commit 7f6842705c
3 changed files with 34 additions and 32 deletions
--- a/rpcs3/Emu/RSX/GL/GLCompute.h
+++ b/rpcs3/Emu/RSX/GL/GLCompute.h
@@ -338,10 +338,10 @@ namespace gl
 			params.logd = rsx::ceil_log2(depth);
 			set_parameters(cmd);
-			const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
+			const u32 word_count_per_invocation = std::max<u32>(sizeof(_BlockType) / 4u, 1u);
-			const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u);      // For block sizes less than 4 bytes wide
+			const u32 num_bytes_per_invocation = (word_count_per_invocation * 4u * optimal_group_size);
-			const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword;
+			const u32 workgroup_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
-			compute_task::run(cmd, linear_invocations);
+			compute_task::run(cmd, workgroup_invocations);
 		}
 	};
--- a/rpcs3/Emu/RSX/Program/GLSLSnippets/GPUDeswizzle.glsl
+++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/GPUDeswizzle.glsl
@@ -103,34 +103,48 @@ uint get_z_index(const in uint x_, const in uint y_, const in uint z_)
 #if USE_16BIT_ADDRESSING
-void write16(inout uint accumulator, const in uint subword, const in uint src_id, const in uint dst_id)
+void decode_16b(const in uint texel_id, in uint x, const in uint y, const in uint z)
 {
 	const uint masks[] = { 0x0000FFFF, 0xFFFF0000 };
-	accumulator |= data_in[src_id / 2] & masks[subword];
+	uint accumulator = 0;
-	if (subword == 1)
+	const uint subword_count = min(invocation.size.x, 2);
+	for (uint subword = 0; subword < subword_count; ++subword, ++x)
 	{
-		data_out[dst_id / 2] = %f(accumulator);
+		uint src_texel_id = get_z_index(x, y, z);
+		uint src_id = (src_texel_id + invocation.data_offset);
+		accumulator |= data_in[src_id / 2] & masks[subword];
 	}
+	data_out[texel_id / 2] = %f(accumulator);
 }
 #elif USE_8BIT_ADDRESSING
-void write8(inout uint accumulator, const in uint subword, const in uint src_id, const in uint dst_id)
+void decode_8b(const in uint texel_id, in uint x, const in uint y, const in uint z)
 {
 	const uint masks[] = { 0x000000FF, 0x0000FF00, 0x00FF0000, 0xFF000000 };
-	accumulator |= data_in[src_id / 4] & masks[subword];
+	uint accumulator = 0;
-	if (subword == 3)
+	const uint subword_count = min(invocation.size.x, 4);
+	for (uint subword = 0; subword < subword_count; ++subword, ++x)
 	{
-		data_out[dst_id / 4] = accumulator;
+		uint src_texel_id = get_z_index(x, y, z);
+		uint src_id = (src_texel_id + invocation.data_offset);
+		accumulator |= data_in[src_id / 4] & masks[subword];
 	}
+	data_out[texel_id / 4] = accumulator;
 }
 #else
-void write32(const in uint word_count, in uint src_id, in uint dst_id)
+void decode_32b(const in uint texel_id, const in uint word_count, const in uint x, const in uint y, const in uint z)
 {
+	uint src_texel_id = get_z_index(x, y, z);
+	uint dst_id = (texel_id * word_count);
+	uint src_id = (src_texel_id + invocation.data_offset) * word_count;
 	for (uint i = 0; i < word_count; ++i)
 	{
 		uint value = data_in[src_id++];
@@ -165,23 +179,11 @@ void main()
 	uint x = (slice_offset % row_length);
 #if USE_8BIT_ADDRESSING
-	for (uint subword = 0, accumulator = 0; subword < 4; ++subword, ++x) {
+	decode_8b(texel_id, x, y, z);
 #elif USE_16BIT_ADDRESSING
-	for (uint subword = 0, accumulator = 0; subword < 2; ++subword, ++x) {
+	decode_16b(texel_id, x, y, z);
-#endif
-		uint src_texel_id = get_z_index(x, y, z);
-		uint dst_id = (texel_id * word_count);
-		uint src_id = (src_texel_id + invocation.data_offset) * word_count;
-#if USE_8BIT_ADDRESSING
-		write8(accumulator, subword, src_id, dst_id);
-	}
-#elif USE_16BIT_ADDRESSING
-		write16(accumulator, subword, src_id, dst_id);
-	}
 #else
-	write32(word_count, src_id, dst_id);
+	decode_32b(texel_id, word_count, x, y, z);
 #endif
 }
--- a/rpcs3/Emu/RSX/VK/VKCompute.h
+++ b/rpcs3/Emu/RSX/VK/VKCompute.h
@@ -475,10 +475,10 @@ namespace vk
 			params.logh = rsx::ceil_log2(height);
 			params.logd = rsx::ceil_log2(depth);
-			const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
+			const u32 word_count_per_invocation = std::max<u32>(sizeof(_BlockType) / 4u, 1u);
-			const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u);      // For block sizes less than 4 bytes wide
+			const u32 num_bytes_per_invocation = (word_count_per_invocation * 4u * optimal_group_size);
-			const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword;
+			const u32 workgroup_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
-			compute_task::run(cmd, linear_invocations);
+			compute_task::run(cmd, workgroup_invocations);
 		}
 	};