diff --git a/rpcs3/Emu/RSX/GL/GLCompute.h b/rpcs3/Emu/RSX/GL/GLCompute.h index 54458c1f1c..442d8e4a0d 100644 --- a/rpcs3/Emu/RSX/GL/GLCompute.h +++ b/rpcs3/Emu/RSX/GL/GLCompute.h @@ -263,8 +263,6 @@ namespace gl cs_deswizzle_3d() { - ensure((sizeof(_BlockType) & 3) == 0); // "Unsupported block type" - initialize(); m_src = @@ -294,8 +292,10 @@ namespace gl { "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0))}, { "%push_block", fmt::format("binding=%d, std140", GL_COMPUTE_BUFFER_SLOT(2)) }, { "%ws", std::to_string(optimal_group_size) }, - { "%_wordcount", std::to_string(sizeof(_BlockType) / 4) }, - { "%f", transform } + { "%_wordcount", std::to_string(std::max(sizeof(_BlockType) / 4u, 1u)) }, + { "%f", transform }, + { "%_8bit", sizeof(_BlockType) == 1 ? "1" : "0" }, + { "%_16bit", sizeof(_BlockType) == 2 ? "1" : "0" }, }; m_src = fmt::replace_all(m_src, syntax_replace); @@ -339,7 +339,8 @@ namespace gl set_parameters(cmd); const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size); - const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation); + const u32 texels_per_dword = std::max(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide + const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword; compute_task::run(cmd, linear_invocations); } }; diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index 0c34690bf4..3d4632b4e1 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -36,6 +36,16 @@ namespace gl { switch (block_size) { + case 1: + gl::get_compute_task>()->run( + cmd, dst, dst_offset, src, src_offset, + data_length, width, height, depth, 1); + break; + case 2: + gl::get_compute_task>()->run( + cmd, dst, dst_offset, src, src_offset, + data_length, width, height, depth, 1); + break; case 4: gl::get_compute_task>()->run( cmd, dst, dst_offset, src, src_offset, @@ -748,39 +758,54 @@ namespace gl g_upload_transfer_buffer.copy_to(&g_deswizzle_scratch_buffer.get(), upload_scratch_mem.second, deswizzle_data_offset, static_cast(image_linear_size)); // 2.2 Apply compute transform to deswizzle input and dump it in compute_scratch_mem - ensure(op.element_size == 2 || op.element_size == 4); const auto block_size = op.element_size * op.block_length; if (op.require_swap) { mem_layout.swap_bytes = false; - if (op.element_size == 4) [[ likely ]] + switch (op.element_size) { - do_deswizzle_transformation(cmd, block_size, + case 1: + do_deswizzle_transformation(cmd, block_size, &g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset, static_cast(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth); - } - else - { + break; + case 2: do_deswizzle_transformation(cmd, block_size, &g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset, static_cast(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth); + break; + case 4: + do_deswizzle_transformation(cmd, block_size, + &g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset, + static_cast(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth); + break; + default: + fmt::throw_exception("Unimplemented element size deswizzle"); } } else { - if (op.element_size == 4) [[ likely ]] + switch (op.element_size) { - do_deswizzle_transformation(cmd, block_size, + case 1: + do_deswizzle_transformation(cmd, block_size, &g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset, static_cast(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth); - } - else - { + break; + case 2: do_deswizzle_transformation(cmd, block_size, &g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset, static_cast(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth); + break; + case 4: + do_deswizzle_transformation(cmd, block_size, + &g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset, + static_cast(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth); + break; + default: + fmt::throw_exception("Unimplemented element size deswizzle"); } } diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h index ec5e8d32a6..a62d93ec74 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.h +++ b/rpcs3/Emu/RSX/VK/VKCompute.h @@ -476,21 +476,8 @@ namespace vk params.logd = rsx::ceil_log2(depth); const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size); - u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation); - - // Check if we need to do subaddressing and adjust invocation count accordingly - switch (sizeof(_BlockType)) - { - case 1: - linear_invocations /= 4; - break; - case 2: - linear_invocations /= 2; - break; - default: - break; - } - + const u32 texels_per_dword = std::max(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide + const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword; compute_task::run(cmd, linear_invocations); } };