Mirror of https://github.com/RPCS3/rpcs3.git (synced 2025-12-06 07:12:28 +01:00)
rsx: Rework GPU deswizzle kernel to prevent hangs
Some checks are pending
Generate Translation Template / Generate Translation Template (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (/rpcs3/.ci/build-linux-aarch64.sh, gcc, rpcs3/rpcs3-ci-jammy-aarch64:1.7, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (/rpcs3/.ci/build-linux.sh, gcc, rpcs3/rpcs3-ci-jammy:1.7, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (a1d35836e8d45bfc6f63c26f0a3e5d46ef622fe1, rpcs3/rpcs3-binaries-linux-arm64, /rpcs3/.ci/build-linux-aarch64.sh, clang, rpcs3/rpcs3-ci-jammy-aarch64:1.7, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (d812f1254a1157c80fd402f94446310560f54e5f, rpcs3/rpcs3-binaries-linux, /rpcs3/.ci/build-linux.sh, clang, rpcs3/rpcs3-ci-jammy:1.7, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / RPCS3 Mac ${{ matrix.name }} (51ae32f468089a8169aaf1567de355ff4a3e0842, rpcs3/rpcs3-binaries-mac, .ci/build-mac.sh, Intel) (push) Waiting to run
Build RPCS3 / RPCS3 Mac ${{ matrix.name }} (8e21bdbc40711a3fccd18fbf17b742348b0f4281, rpcs3/rpcs3-binaries-mac-arm64, .ci/build-mac-arm64.sh, Apple Silicon) (push) Waiting to run
Build RPCS3 / RPCS3 Windows (push) Waiting to run
Build RPCS3 / RPCS3 Windows Clang (win64, clang, clang64) (push) Waiting to run
Build RPCS3 / RPCS3 FreeBSD (push) Waiting to run
Some checks are pending
Generate Translation Template / Generate Translation Template (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (/rpcs3/.ci/build-linux-aarch64.sh, gcc, rpcs3/rpcs3-ci-jammy-aarch64:1.7, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (/rpcs3/.ci/build-linux.sh, gcc, rpcs3/rpcs3-ci-jammy:1.7, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (a1d35836e8d45bfc6f63c26f0a3e5d46ef622fe1, rpcs3/rpcs3-binaries-linux-arm64, /rpcs3/.ci/build-linux-aarch64.sh, clang, rpcs3/rpcs3-ci-jammy-aarch64:1.7, ubuntu-24.04-arm) (push) Waiting to run
Build RPCS3 / RPCS3 Linux ${{ matrix.os }} ${{ matrix.compiler }} (d812f1254a1157c80fd402f94446310560f54e5f, rpcs3/rpcs3-binaries-linux, /rpcs3/.ci/build-linux.sh, clang, rpcs3/rpcs3-ci-jammy:1.7, ubuntu-24.04) (push) Waiting to run
Build RPCS3 / RPCS3 Mac ${{ matrix.name }} (51ae32f468089a8169aaf1567de355ff4a3e0842, rpcs3/rpcs3-binaries-mac, .ci/build-mac.sh, Intel) (push) Waiting to run
Build RPCS3 / RPCS3 Mac ${{ matrix.name }} (8e21bdbc40711a3fccd18fbf17b742348b0f4281, rpcs3/rpcs3-binaries-mac-arm64, .ci/build-mac-arm64.sh, Apple Silicon) (push) Waiting to run
Build RPCS3 / RPCS3 Windows (push) Waiting to run
Build RPCS3 / RPCS3 Windows Clang (win64, clang, clang64) (push) Waiting to run
Build RPCS3 / RPCS3 FreeBSD (push) Waiting to run
This commit is contained in:
parent 9deb6cd4fa
commit 7f6842705c
|
|
@@ -338,10 +338,10 @@ namespace gl
|
||||||
params.logd = rsx::ceil_log2(depth);
|
params.logd = rsx::ceil_log2(depth);
|
||||||
set_parameters(cmd);
|
set_parameters(cmd);
|
||||||
|
|
||||||
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
|
const u32 word_count_per_invocation = std::max<u32>(sizeof(_BlockType) / 4u, 1u);
|
||||||
const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide
|
const u32 num_bytes_per_invocation = (word_count_per_invocation * 4u * optimal_group_size);
|
||||||
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword;
|
const u32 workgroup_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
|
||||||
compute_task::run(cmd, linear_invocations);
|
compute_task::run(cmd, workgroup_invocations);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -103,34 +103,48 @@ uint get_z_index(const in uint x_, const in uint y_, const in uint z_)
|
||||||
|
|
||||||
#if USE_16BIT_ADDRESSING
|
#if USE_16BIT_ADDRESSING
|
||||||
|
|
||||||
void write16(inout uint accumulator, const in uint subword, const in uint src_id, const in uint dst_id)
|
void decode_16b(const in uint texel_id, in uint x, const in uint y, const in uint z)
|
||||||
{
|
{
|
||||||
const uint masks[] = { 0x0000FFFF, 0xFFFF0000 };
|
const uint masks[] = { 0x0000FFFF, 0xFFFF0000 };
|
||||||
accumulator |= data_in[src_id / 2] & masks[subword];
|
uint accumulator = 0;
|
||||||
|
|
||||||
if (subword == 1)
|
const uint subword_count = min(invocation.size.x, 2);
|
||||||
|
for (uint subword = 0; subword < subword_count; ++subword, ++x)
|
||||||
{
|
{
|
||||||
data_out[dst_id / 2] = %f(accumulator);
|
uint src_texel_id = get_z_index(x, y, z);
|
||||||
|
uint src_id = (src_texel_id + invocation.data_offset);
|
||||||
|
accumulator |= data_in[src_id / 2] & masks[subword];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
data_out[texel_id / 2] = %f(accumulator);
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif USE_8BIT_ADDRESSING
|
#elif USE_8BIT_ADDRESSING
|
||||||
|
|
||||||
void write8(inout uint accumulator, const in uint subword, const in uint src_id, const in uint dst_id)
|
void decode_8b(const in uint texel_id, in uint x, const in uint y, const in uint z)
|
||||||
{
|
{
|
||||||
const uint masks[] = { 0x000000FF, 0x0000FF00, 0x00FF0000, 0xFF000000 };
|
const uint masks[] = { 0x000000FF, 0x0000FF00, 0x00FF0000, 0xFF000000 };
|
||||||
accumulator |= data_in[src_id / 4] & masks[subword];
|
uint accumulator = 0;
|
||||||
|
|
||||||
if (subword == 3)
|
const uint subword_count = min(invocation.size.x, 4);
|
||||||
|
for (uint subword = 0; subword < subword_count; ++subword, ++x)
|
||||||
{
|
{
|
||||||
data_out[dst_id / 4] = accumulator;
|
uint src_texel_id = get_z_index(x, y, z);
|
||||||
|
uint src_id = (src_texel_id + invocation.data_offset);
|
||||||
|
accumulator |= data_in[src_id / 4] & masks[subword];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
data_out[texel_id / 4] = accumulator;
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void write32(const in uint word_count, in uint src_id, in uint dst_id)
|
void decode_32b(const in uint texel_id, const in uint word_count, const in uint x, const in uint y, const in uint z)
|
||||||
{
|
{
|
||||||
|
uint src_texel_id = get_z_index(x, y, z);
|
||||||
|
uint dst_id = (texel_id * word_count);
|
||||||
|
uint src_id = (src_texel_id + invocation.data_offset) * word_count;
|
||||||
|
|
||||||
for (uint i = 0; i < word_count; ++i)
|
for (uint i = 0; i < word_count; ++i)
|
||||||
{
|
{
|
||||||
uint value = data_in[src_id++];
|
uint value = data_in[src_id++];
|
||||||
|
|
@@ -165,23 +179,11 @@ void main()
|
||||||
uint x = (slice_offset % row_length);
|
uint x = (slice_offset % row_length);
|
||||||
|
|
||||||
#if USE_8BIT_ADDRESSING
|
#if USE_8BIT_ADDRESSING
|
||||||
for (uint subword = 0, accumulator = 0; subword < 4; ++subword, ++x) {
|
decode_8b(texel_id, x, y, z);
|
||||||
#elif USE_16BIT_ADDRESSING
|
#elif USE_16BIT_ADDRESSING
|
||||||
for (uint subword = 0, accumulator = 0; subword < 2; ++subword, ++x) {
|
decode_16b(texel_id, x, y, z);
|
||||||
#endif
|
|
||||||
|
|
||||||
uint src_texel_id = get_z_index(x, y, z);
|
|
||||||
uint dst_id = (texel_id * word_count);
|
|
||||||
uint src_id = (src_texel_id + invocation.data_offset) * word_count;
|
|
||||||
|
|
||||||
#if USE_8BIT_ADDRESSING
|
|
||||||
write8(accumulator, subword, src_id, dst_id);
|
|
||||||
}
|
|
||||||
#elif USE_16BIT_ADDRESSING
|
|
||||||
write16(accumulator, subword, src_id, dst_id);
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
write32(word_count, src_id, dst_id);
|
decode_32b(texel_id, word_count, x, y, z);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -475,10 +475,10 @@ namespace vk
|
||||||
params.logh = rsx::ceil_log2(height);
|
params.logh = rsx::ceil_log2(height);
|
||||||
params.logd = rsx::ceil_log2(depth);
|
params.logd = rsx::ceil_log2(depth);
|
||||||
|
|
||||||
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
|
const u32 word_count_per_invocation = std::max<u32>(sizeof(_BlockType) / 4u, 1u);
|
||||||
const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide
|
const u32 num_bytes_per_invocation = (word_count_per_invocation * 4u * optimal_group_size);
|
||||||
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword;
|
const u32 workgroup_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
|
||||||
compute_task::run(cmd, linear_invocations);
|
compute_task::run(cmd, workgroup_invocations);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue