rsx: Rework GPU deswizzle kernel to prevent hangs
Some checks are pending
Generate Translation Template / Generate Translation Template (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (/rpcs3/.ci/build-linux-aarch64.sh, gcc, rpcs3/rpcs3-ci-jammy-aarch64:1.7, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (/rpcs3/.ci/build-linux.sh, gcc, rpcs3/rpcs3-ci-jammy:1.7, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (a1d35836e8d45bfc6f63c26f0a3e5d46ef622fe1, rpcs3/rpcs3-binaries-linux-arm64, /rpcs3/.ci/build-linux-aarch64.sh, clang, rpcs3/rpcs3-ci-jammy-aarch64:1.7, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (d812f1254a1157c80fd402f94446310560f54e5f, rpcs3/rpcs3-binaries-linux, /rpcs3/.ci/build-linux.sh, clang, rpcs3/rpcs3-ci-jammy:1.7, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / RPCS3 Mac ${{ matrix.name }} (51ae32f468089a8169aaf1567de355ff4a3e0842, rpcs3/rpcs3-binaries-mac, .ci/build-mac.sh, Intel) (push) Waiting to run
Build RPCS3 / RPCS3 Mac ${{ matrix.name }} (8e21bdbc40711a3fccd18fbf17b742348b0f4281, rpcs3/rpcs3-binaries-mac-arm64, .ci/build-mac-arm64.sh, Apple Silicon) (push) Waiting to run
Build RPCS3 / RPCS3 Windows (push) Waiting to run
Build RPCS3 / RPCS3 Windows Clang (win64, clang, clang64) (push) Waiting to run
Build RPCS3 / RPCS3 FreeBSD (push) Waiting to run

This commit is contained in:
kd-11 2025-11-22 02:47:24 +03:00 committed by Ani
parent 9deb6cd4fa
commit 7f6842705c
3 changed files with 34 additions and 32 deletions

View file

@@ -338,10 +338,10 @@ namespace gl
 params.logd = rsx::ceil_log2(depth);
 set_parameters(cmd);
-const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
-const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide
-const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword;
-compute_task::run(cmd, linear_invocations);
+const u32 word_count_per_invocation = std::max<u32>(sizeof(_BlockType) / 4u, 1u);
+const u32 num_bytes_per_invocation = (word_count_per_invocation * 4u * optimal_group_size);
+const u32 workgroup_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
+compute_task::run(cmd, workgroup_invocations);
 }
 };

View file

@@ -103,34 +103,48 @@ uint get_z_index(const in uint x_, const in uint y_, const in uint z_)
 #if USE_16BIT_ADDRESSING
-void write16(inout uint accumulator, const in uint subword, const in uint src_id, const in uint dst_id)
+void decode_16b(const in uint texel_id, in uint x, const in uint y, const in uint z)
 {
 const uint masks[] = { 0x0000FFFF, 0xFFFF0000 };
-accumulator |= data_in[src_id / 2] & masks[subword];
+uint accumulator = 0;
-if (subword == 1)
+const uint subword_count = min(invocation.size.x, 2);
+for (uint subword = 0; subword < subword_count; ++subword, ++x)
 {
-data_out[dst_id / 2] = %f(accumulator);
+uint src_texel_id = get_z_index(x, y, z);
+uint src_id = (src_texel_id + invocation.data_offset);
+accumulator |= data_in[src_id / 2] & masks[subword];
 }
+data_out[texel_id / 2] = %f(accumulator);
 }
 #elif USE_8BIT_ADDRESSING
-void write8(inout uint accumulator, const in uint subword, const in uint src_id, const in uint dst_id)
+void decode_8b(const in uint texel_id, in uint x, const in uint y, const in uint z)
 {
 const uint masks[] = { 0x000000FF, 0x0000FF00, 0x00FF0000, 0xFF000000 };
-accumulator |= data_in[src_id / 4] & masks[subword];
+uint accumulator = 0;
-if (subword == 3)
+const uint subword_count = min(invocation.size.x, 4);
+for (uint subword = 0; subword < subword_count; ++subword, ++x)
 {
-data_out[dst_id / 4] = accumulator;
+uint src_texel_id = get_z_index(x, y, z);
+uint src_id = (src_texel_id + invocation.data_offset);
+accumulator |= data_in[src_id / 4] & masks[subword];
 }
+data_out[texel_id / 4] = accumulator;
 }
 #else
-void write32(const in uint word_count, in uint src_id, in uint dst_id)
+void decode_32b(const in uint texel_id, const in uint word_count, const in uint x, const in uint y, const in uint z)
 {
+uint src_texel_id = get_z_index(x, y, z);
+uint dst_id = (texel_id * word_count);
+uint src_id = (src_texel_id + invocation.data_offset) * word_count;
 for (uint i = 0; i < word_count; ++i)
 {
 uint value = data_in[src_id++];
@@ -165,23 +179,11 @@ void main()
 uint x = (slice_offset % row_length);
 #if USE_8BIT_ADDRESSING
-for (uint subword = 0, accumulator = 0; subword < 4; ++subword, ++x) {
+decode_8b(texel_id, x, y, z);
 #elif USE_16BIT_ADDRESSING
-for (uint subword = 0, accumulator = 0; subword < 2; ++subword, ++x) {
-#endif
-uint src_texel_id = get_z_index(x, y, z);
-uint dst_id = (texel_id * word_count);
-uint src_id = (src_texel_id + invocation.data_offset) * word_count;
-#if USE_8BIT_ADDRESSING
-write8(accumulator, subword, src_id, dst_id);
-}
-#elif USE_16BIT_ADDRESSING
-write16(accumulator, subword, src_id, dst_id);
-}
+decode_16b(texel_id, x, y, z);
 #else
-write32(word_count, src_id, dst_id);
+decode_32b(texel_id, word_count, x, y, z);
 #endif
 }

View file

@@ -475,10 +475,10 @@ namespace vk
 params.logh = rsx::ceil_log2(height);
 params.logd = rsx::ceil_log2(depth);
-const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
-const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide
-const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword;
-compute_task::run(cmd, linear_invocations);
+const u32 word_count_per_invocation = std::max<u32>(sizeof(_BlockType) / 4u, 1u);
+const u32 num_bytes_per_invocation = (word_count_per_invocation * 4u * optimal_group_size);
+const u32 workgroup_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
+compute_task::run(cmd, workgroup_invocations);
 }
 };