rsx/vk: Add support for 8 and 16-bit texel GPU-accelerated deswizzle

This commit is contained in:
kd-11 2025-11-20 01:45:50 +03:00 committed by kd-11
parent b067688c8e
commit ff72f944ba
5 changed files with 158 additions and 90 deletions

View file

@ -1096,80 +1096,65 @@ namespace rsx
fmt::throw_exception("Wrong format 0x%x", format); fmt::throw_exception("Wrong format 0x%x", format);
} }
if (word_size) if (!word_size)
{ {
if (word_size == 1) return result;
}
result.element_size = word_size;
result.block_length = words_per_block;
bool require_cpu_swizzle = !caps.supports_hw_deswizzle && is_swizzled;
bool require_cpu_byteswap = word_size > 1 && !caps.supports_byteswap;
if (is_swizzled && caps.supports_hw_deswizzle)
{
result.require_deswizzle = true;
}
if (!require_cpu_byteswap && !require_cpu_swizzle)
{
result.require_swap = (word_size > 1);
if (caps.supports_zero_copy)
{ {
if (is_swizzled) result.require_upload = true;
{ result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), word_size * words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
copy_unmodified_block_swizzled::copy_mipmap_level(dst_buffer.as_span<u8>(), src_layout.data.as_span<const u8>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
}
else if (caps.supports_zero_copy)
{
result.require_upload = true;
result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else
{
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u8>(), src_layout.data.as_span<const u8>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
} }
else if (word_size == 1)
{
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u8>(), src_layout.data.as_span<const u8>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 2)
{
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u16>(), src_layout.data.as_span<const u16>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 4)
{
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const u32>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
return result;
}
if (word_size == 1)
{
ensure(is_swizzled);
copy_unmodified_block_swizzled::copy_mipmap_level(dst_buffer.as_span<u8>(), src_layout.data.as_span<const u8>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
}
else if (word_size == 2)
{
if (is_swizzled)
copy_unmodified_block_swizzled::copy_mipmap_level(dst_buffer.as_span<u16>(), src_layout.data.as_span<const be_t<u16>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
else else
{ copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u16>(), src_layout.data.as_span<const be_t<u16>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
result.element_size = word_size; }
result.block_length = words_per_block; else if (word_size == 4)
{
bool require_cpu_swizzle = !caps.supports_hw_deswizzle && is_swizzled; if (is_swizzled)
bool require_cpu_byteswap = !caps.supports_byteswap; copy_unmodified_block_swizzled::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const be_t<u32>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
else
if (is_swizzled && caps.supports_hw_deswizzle) copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const be_t<u32>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
{
if (word_size == 4 || (((word_size * words_per_block) & 3) == 0))
{
result.require_deswizzle = true;
}
else
{
require_cpu_swizzle = true;
}
}
if (!require_cpu_byteswap && !require_cpu_swizzle)
{
result.require_swap = true;
if (caps.supports_zero_copy)
{
result.require_upload = true;
result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), word_size * words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 2)
{
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u16>(), src_layout.data.as_span<const u16>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 4)
{
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const u32>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
}
else
{
if (word_size == 2)
{
if (is_swizzled)
copy_unmodified_block_swizzled::copy_mipmap_level(dst_buffer.as_span<u16>(), src_layout.data.as_span<const be_t<u16>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
else
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u16>(), src_layout.data.as_span<const be_t<u16>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 4)
{
if (is_swizzled)
copy_unmodified_block_swizzled::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const be_t<u32>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
else
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const be_t<u32>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
}
}
} }
return result; return result;

View file

@ -1219,7 +1219,7 @@ namespace rsx
if (result.size() > 1) if (result.size() > 1)
{ {
std::sort(result.begin(), result.end(), [](const auto &a, const auto &b) result.sort([](const auto &a, const auto &b)
{ {
if (a.surface->last_use_tag == b.surface->last_use_tag) if (a.surface->last_use_tag == b.surface->last_use_tag)
{ {

View file

@ -3,6 +3,9 @@ R"(
#define SSBO_LOCATION(x) (x + %loc) #define SSBO_LOCATION(x) (x + %loc)
#define USE_8BIT_ADDRESSING %_8bit
#define USE_16BIT_ADDRESSING %_16bit
layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in; layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;
layout(%set, binding=SSBO_LOCATION(0), std430) buffer ssbo0{ uint data_in[]; }; layout(%set, binding=SSBO_LOCATION(0), std430) buffer ssbo0{ uint data_in[]; };
@ -98,12 +101,57 @@ uint get_z_index(const in uint x_, const in uint y_, const in uint z_)
return offset; return offset;
} }
#if USE_16BIT_ADDRESSING
// Packs one 16-bit texel into lane `subword` of `accumulator` and flushes the
// finished 32-bit word to data_out once the upper lane (subword == 1) is merged.
//   accumulator - word being assembled by the caller across two calls
//   subword     - destination lane inside the output word (0 = low half, 1 = high half)
//   src_id      - index of the source texel; two texels share one data_in word
//   dst_id      - index of the destination texel; two texels share one data_out word
// NOTE(review): unlike write32(), the %f transform is not applied on this path.
void write16(inout uint accumulator, const in uint subword, const in uint src_id, const in uint dst_id)
{
    // Take the lane from the source texel's own position instead of assuming it
    // matches the destination lane; the result is identical when both coincide.
    const uint src_shift = (src_id & 1u) * 16u;
    const uint texel = (data_in[src_id / 2] >> src_shift) & 0xFFFFu;
    accumulator |= texel << (subword * 16u);
    if (subword == 1)
    {
        data_out[dst_id / 2] = accumulator;
    }
}
#elif USE_8BIT_ADDRESSING
// Packs one 8-bit texel into lane `subword` of `accumulator` and flushes the
// finished 32-bit word to data_out once the last lane (subword == 3) is merged.
//   accumulator - word being assembled by the caller across four calls
//   subword     - destination byte lane inside the output word (0 = lowest byte)
//   src_id      - index of the source texel; four texels share one data_in word
//   dst_id      - index of the destination texel; four texels share one data_out word
void write8(inout uint accumulator, const in uint subword, const in uint src_id, const in uint dst_id)
{
    // Neighbouring x texels are not guaranteed to be neighbours in the swizzled
    // source, so the byte lane is derived from src_id itself rather than from the
    // destination lane. When both lanes coincide the result is unchanged.
    const uint src_shift = (src_id & 3u) * 8u;
    const uint texel = (data_in[src_id / 4] >> src_shift) & 0xFFu;
    accumulator |= texel << (subword * 8u);
    if (subword == 3)
    {
        data_out[dst_id / 4] = accumulator;
    }
}
#else
// Copies `word_count` consecutive 32-bit words from data_in (starting at src_id)
// to data_out (starting at dst_id), passing every word through the %f transform.
void write32(const in uint word_count, in uint src_id, in uint dst_id)
{
    for (uint offset = 0; offset < word_count; ++offset)
    {
        data_out[dst_id + offset] = %f(data_in[src_id + offset]);
    }
}
#endif
void main() void main()
{ {
uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x); uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);
uint texel_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x; uint texel_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;
uint word_count = %_wordcount; uint word_count = %_wordcount;
#if USE_8BIT_ADDRESSING
texel_id *= 4; // Each invocation consumes 4 texels
#elif USE_16BIT_ADDRESSING
texel_id *= 2; // Each invocation consumes 2 texels
#endif
if (!init_invocation_properties(texel_id)) if (!init_invocation_properties(texel_id))
return; return;
@ -116,14 +164,25 @@ void main()
uint y = (slice_offset / row_length); uint y = (slice_offset / row_length);
uint x = (slice_offset % row_length); uint x = (slice_offset % row_length);
uint src_texel_id = get_z_index(x, y, z); #if USE_8BIT_ADDRESSING
uint dst_id = (texel_id * word_count); for (uint subword = 0, accumulator = 0; subword < 4; ++subword, ++x) {
uint src_id = (src_texel_id + invocation.data_offset) * word_count; #elif USE_16BIT_ADDRESSING
// The loop-scoped accumulator must use the name handed to write16() in the loop body.
for (uint subword = 0, accumulator = 0; subword < 2; ++subword, ++x) {
#endif
for (uint i = 0; i < word_count; ++i) uint src_texel_id = get_z_index(x, y, z);
{ uint dst_id = (texel_id * word_count);
uint value = data_in[src_id++]; uint src_id = (src_texel_id + invocation.data_offset) * word_count;
data_out[dst_id++] = %f(value);
#if USE_8BIT_ADDRESSING
write8(accumulator, subword, src_id, dst_id);
} }
#elif USE_16BIT_ADDRESSING
write16(accumulator, subword, src_id, dst_id);
}
#else
write32(word_count, src_id, dst_id);
#endif
} }
)" )"

View file

@ -403,8 +403,6 @@ namespace vk
cs_deswizzle_3d() cs_deswizzle_3d()
{ {
ensure((sizeof(_BlockType) & 3) == 0); // "Unsupported block type"
ssbo_count = 2; ssbo_count = 2;
use_push_constants = true; use_push_constants = true;
push_constants_size = 28; push_constants_size = 28;
@ -438,8 +436,10 @@ namespace vk
{ "%set", "set = 0" }, { "%set", "set = 0" },
{ "%push_block", "push_constant" }, { "%push_block", "push_constant" },
{ "%ws", std::to_string(optimal_group_size) }, { "%ws", std::to_string(optimal_group_size) },
{ "%_wordcount", std::to_string(sizeof(_BlockType) / 4) }, { "%_wordcount", std::to_string(std::max<u32>(sizeof(_BlockType) / 4u, 1u)) },
{ "%f", transform } { "%f", transform },
{ "%_8bit", sizeof(_BlockType) == 1 ? "1" : "0" },
{ "%_16bit", sizeof(_BlockType) == 2 ? "1" : "0" },
}; };
m_src = fmt::replace_all(m_src, syntax_replace); m_src = fmt::replace_all(m_src, syntax_replace);
@ -476,7 +476,21 @@ namespace vk
params.logd = rsx::ceil_log2(depth); params.logd = rsx::ceil_log2(depth);
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size); const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation); u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
// Check if we need to do subaddressing and adjust invocation count accordingly.
// With 8-bit and 16-bit texels each shader invocation consumes 4 or 2 texels
// respectively, so proportionally fewer groups are required. The division rounds
// up so that a partially filled tail - or an upload smaller than one full
// group's worth of data - is still dispatched instead of being truncated away.
switch (sizeof(_BlockType))
{
case 1:
    linear_invocations = (linear_invocations + 3) / 4;
    break;
case 2:
    linear_invocations = (linear_invocations + 1) / 2;
    break;
default:
    break;
}
compute_task::run(cmd, linear_invocations); compute_task::run(cmd, linear_invocations);
} }
}; };

View file

@ -759,6 +759,10 @@ namespace vk
{ {
switch (block_size) switch (block_size)
{ {
case 1:
return vk::get_compute_task<cs_deswizzle_3d<u8, u8, false>>();
case 2:
return vk::get_compute_task<cs_deswizzle_3d<u16, WordType, SwapBytes>>();
case 4: case 4:
return vk::get_compute_task<cs_deswizzle_3d<u32, WordType, SwapBytes>>(); return vk::get_compute_task<cs_deswizzle_3d<u32, WordType, SwapBytes>>();
case 8: case 8:
@ -776,21 +780,27 @@ namespace vk
vk::cs_deswizzle_base* job = nullptr; vk::cs_deswizzle_base* job = nullptr;
const auto block_size = (word_size * word_count); const auto block_size = (word_size * word_count);
ensure(word_size == 4 || word_size == 2);
if (!swap_bytes) if (!swap_bytes)
{ {
if (word_size == 4) switch (word_size)
{
job = get_deswizzle_transformation<u32, false>(block_size);
}
else
{ {
case 1:
job = get_deswizzle_transformation<u8, false>(block_size);
break;
case 2:
job = get_deswizzle_transformation<u16, false>(block_size); job = get_deswizzle_transformation<u16, false>(block_size);
break;
case 4:
job = get_deswizzle_transformation<u32, false>(block_size);
break;
default:
fmt::throw_exception("Unimplemented deswizzle for format.");
} }
} }
else else
{ {
ensure(word_size == 2 || word_size == 4);
if (word_size == 4) if (word_size == 4)
{ {
job = get_deswizzle_transformation<u32, true>(block_size); job = get_deswizzle_transformation<u32, true>(block_size);