rsx: Rework GPU deswizzle kernel to prevent hangs
Some checks are pending
Generate Translation Template / Generate Translation Template (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (/rpcs3/.ci/build-linux-aarch64.sh, gcc, rpcs3/rpcs3-ci-jammy-aarch64:1.7, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (/rpcs3/.ci/build-linux.sh, gcc, rpcs3/rpcs3-ci-jammy:1.7, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (a1d35836e8d45bfc6f63c26f0a3e5d46ef622fe1, rpcs3/rpcs3-binaries-linux-arm64, /rpcs3/.ci/build-linux-aarch64.sh, clang, rpcs3/rpcs3-ci-jammy-aarch64:1.7, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (d812f1254a1157c80fd402f94446310560f54e5f, rpcs3/rpcs3-binaries-linux, /rpcs3/.ci/build-linux.sh, clang, rpcs3/rpcs3-ci-jammy:1.7, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / RPCS3 Mac ${{ matrix.name }} (51ae32f468089a8169aaf1567de355ff4a3e0842, rpcs3/rpcs3-binaries-mac, .ci/build-mac.sh, Intel) (push) Waiting to run
Build RPCS3 / RPCS3 Mac ${{ matrix.name }} (8e21bdbc40711a3fccd18fbf17b742348b0f4281, rpcs3/rpcs3-binaries-mac-arm64, .ci/build-mac-arm64.sh, Apple Silicon) (push) Waiting to run
Build RPCS3 / RPCS3 Windows (push) Waiting to run
Build RPCS3 / RPCS3 Windows Clang (win64, clang, clang64) (push) Waiting to run
Build RPCS3 / RPCS3 FreeBSD (push) Waiting to run

This commit is contained in:
kd-11 2025-11-22 02:47:24 +03:00 committed by Ani
parent 9deb6cd4fa
commit 7f6842705c
3 changed files with 34 additions and 32 deletions

View file

@@ -338,10 +338,10 @@ namespace gl
params.logd = rsx::ceil_log2(depth); params.logd = rsx::ceil_log2(depth);
set_parameters(cmd); set_parameters(cmd);
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size); const u32 word_count_per_invocation = std::max<u32>(sizeof(_BlockType) / 4u, 1u);
const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide const u32 num_bytes_per_invocation = (word_count_per_invocation * 4u * optimal_group_size);
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword; const u32 workgroup_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
compute_task::run(cmd, linear_invocations); compute_task::run(cmd, workgroup_invocations);
} }
}; };

View file

@@ -103,34 +103,48 @@ uint get_z_index(const in uint x_, const in uint y_, const in uint z_)
#if USE_16BIT_ADDRESSING #if USE_16BIT_ADDRESSING
void write16(inout uint accumulator, const in uint subword, const in uint src_id, const in uint dst_id) void decode_16b(const in uint texel_id, in uint x, const in uint y, const in uint z)
{ {
const uint masks[] = { 0x0000FFFF, 0xFFFF0000 }; const uint masks[] = { 0x0000FFFF, 0xFFFF0000 };
accumulator |= data_in[src_id / 2] & masks[subword]; uint accumulator = 0;
if (subword == 1) const uint subword_count = min(invocation.size.x, 2);
for (uint subword = 0; subword < subword_count; ++subword, ++x)
{ {
data_out[dst_id / 2] = %f(accumulator); uint src_texel_id = get_z_index(x, y, z);
uint src_id = (src_texel_id + invocation.data_offset);
accumulator |= data_in[src_id / 2] & masks[subword];
} }
data_out[texel_id / 2] = %f(accumulator);
} }
#elif USE_8BIT_ADDRESSING #elif USE_8BIT_ADDRESSING
void write8(inout uint accumulator, const in uint subword, const in uint src_id, const in uint dst_id) void decode_8b(const in uint texel_id, in uint x, const in uint y, const in uint z)
{ {
const uint masks[] = { 0x000000FF, 0x0000FF00, 0x00FF0000, 0xFF000000 }; const uint masks[] = { 0x000000FF, 0x0000FF00, 0x00FF0000, 0xFF000000 };
accumulator |= data_in[src_id / 4] & masks[subword]; uint accumulator = 0;
if (subword == 3) const uint subword_count = min(invocation.size.x, 4);
for (uint subword = 0; subword < subword_count; ++subword, ++x)
{ {
data_out[dst_id / 4] = accumulator; uint src_texel_id = get_z_index(x, y, z);
uint src_id = (src_texel_id + invocation.data_offset);
accumulator |= data_in[src_id / 4] & masks[subword];
} }
data_out[texel_id / 4] = accumulator;
} }
#else #else
void write32(const in uint word_count, in uint src_id, in uint dst_id) void decode_32b(const in uint texel_id, const in uint word_count, const in uint x, const in uint y, const in uint z)
{ {
uint src_texel_id = get_z_index(x, y, z);
uint dst_id = (texel_id * word_count);
uint src_id = (src_texel_id + invocation.data_offset) * word_count;
for (uint i = 0; i < word_count; ++i) for (uint i = 0; i < word_count; ++i)
{ {
uint value = data_in[src_id++]; uint value = data_in[src_id++];
@@ -165,23 +179,11 @@ void main()
uint x = (slice_offset % row_length); uint x = (slice_offset % row_length);
#if USE_8BIT_ADDRESSING #if USE_8BIT_ADDRESSING
for (uint subword = 0, accumulator = 0; subword < 4; ++subword, ++x) { decode_8b(texel_id, x, y, z);
#elif USE_16BIT_ADDRESSING #elif USE_16BIT_ADDRESSING
for (uint subword = 0, accumulator = 0; subword < 2; ++subword, ++x) { decode_16b(texel_id, x, y, z);
#endif
uint src_texel_id = get_z_index(x, y, z);
uint dst_id = (texel_id * word_count);
uint src_id = (src_texel_id + invocation.data_offset) * word_count;
#if USE_8BIT_ADDRESSING
write8(accumulator, subword, src_id, dst_id);
}
#elif USE_16BIT_ADDRESSING
write16(accumulator, subword, src_id, dst_id);
}
#else #else
write32(word_count, src_id, dst_id); decode_32b(texel_id, word_count, x, y, z);
#endif #endif
} }

View file

@@ -475,10 +475,10 @@ namespace vk
params.logh = rsx::ceil_log2(height); params.logh = rsx::ceil_log2(height);
params.logd = rsx::ceil_log2(depth); params.logd = rsx::ceil_log2(depth);
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size); const u32 word_count_per_invocation = std::max<u32>(sizeof(_BlockType) / 4u, 1u);
const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide const u32 num_bytes_per_invocation = (word_count_per_invocation * 4u * optimal_group_size);
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword; const u32 workgroup_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
compute_task::run(cmd, linear_invocations); compute_task::run(cmd, workgroup_invocations);
} }
}; };