Merge branch 'master' into nastys-patch-17

This commit is contained in:
nastys 2025-11-21 21:40:11 +01:00 committed by GitHub
commit 5b7a1d50d9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 221 additions and 131 deletions

View file

@ -827,6 +827,9 @@ extern void ppu_register_function_at(u32 addr, u32 size, ppu_intrp_func_t ptr =
return;
}
size = utils::align<u32>(size + addr % 4, 4);
addr &= -4;
if (g_cfg.core.ppu_decoder == ppu_decoder_type::llvm)
{
return;

View file

@ -1096,80 +1096,65 @@ namespace rsx
fmt::throw_exception("Wrong format 0x%x", format);
}
if (word_size)
if (!word_size)
{
if (word_size == 1)
return result;
}
result.element_size = word_size;
result.block_length = words_per_block;
bool require_cpu_swizzle = !caps.supports_hw_deswizzle && is_swizzled;
bool require_cpu_byteswap = word_size > 1 && !caps.supports_byteswap;
if (is_swizzled && caps.supports_hw_deswizzle)
{
result.require_deswizzle = true;
}
if (!require_cpu_byteswap && !require_cpu_swizzle)
{
result.require_swap = (word_size > 1);
if (caps.supports_zero_copy)
{
if (is_swizzled)
{
copy_unmodified_block_swizzled::copy_mipmap_level(dst_buffer.as_span<u8>(), src_layout.data.as_span<const u8>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
}
else if (caps.supports_zero_copy)
{
result.require_upload = true;
result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else
{
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u8>(), src_layout.data.as_span<const u8>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
result.require_upload = true;
result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), word_size * words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 1)
{
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u8>(), src_layout.data.as_span<const u8>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 2)
{
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u16>(), src_layout.data.as_span<const u16>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 4)
{
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const u32>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
return result;
}
if (word_size == 1)
{
ensure(is_swizzled);
copy_unmodified_block_swizzled::copy_mipmap_level(dst_buffer.as_span<u8>(), src_layout.data.as_span<const u8>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
}
else if (word_size == 2)
{
if (is_swizzled)
copy_unmodified_block_swizzled::copy_mipmap_level(dst_buffer.as_span<u16>(), src_layout.data.as_span<const be_t<u16>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
else
{
result.element_size = word_size;
result.block_length = words_per_block;
bool require_cpu_swizzle = !caps.supports_hw_deswizzle && is_swizzled;
bool require_cpu_byteswap = !caps.supports_byteswap;
if (is_swizzled && caps.supports_hw_deswizzle)
{
if (word_size == 4 || (((word_size * words_per_block) & 3) == 0))
{
result.require_deswizzle = true;
}
else
{
require_cpu_swizzle = true;
}
}
if (!require_cpu_byteswap && !require_cpu_swizzle)
{
result.require_swap = true;
if (caps.supports_zero_copy)
{
result.require_upload = true;
result.deferred_cmds = build_transfer_cmds(src_layout.data.data(), word_size * words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 2)
{
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u16>(), src_layout.data.as_span<const u16>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 4)
{
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const u32>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
}
else
{
if (word_size == 2)
{
if (is_swizzled)
copy_unmodified_block_swizzled::copy_mipmap_level(dst_buffer.as_span<u16>(), src_layout.data.as_span<const be_t<u16>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
else
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u16>(), src_layout.data.as_span<const be_t<u16>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 4)
{
if (is_swizzled)
copy_unmodified_block_swizzled::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const be_t<u32>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
else
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const be_t<u32>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
}
}
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u16>(), src_layout.data.as_span<const be_t<u16>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
else if (word_size == 4)
{
if (is_swizzled)
copy_unmodified_block_swizzled::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const be_t<u32>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block);
else
copy_unmodified_block::copy_mipmap_level(dst_buffer.as_span<u32>(), src_layout.data.as_span<const be_t<u32>>(), words_per_block, w, h, depth, src_layout.border, dst_pitch_in_block, src_layout.pitch_in_block);
}
return result;

View file

@ -1219,7 +1219,7 @@ namespace rsx
if (result.size() > 1)
{
std::sort(result.begin(), result.end(), [](const auto &a, const auto &b)
result.sort([](const auto &a, const auto &b)
{
if (a.surface->last_use_tag == b.surface->last_use_tag)
{

View file

@ -1496,7 +1496,7 @@ namespace rsx
// Cache-miss hook for this section: logs the section base address and then
// forwards this section (as the derived type) to the texture cache's on_miss().
void on_miss()
{
rsx_log.warning("Cache miss at address 0x%X. This is gonna hurt...", get_section_base());
// rsx_log.trace("Cache miss at address 0x%X. This is gonna hurt...", get_section_base());
// NOTE(review): the commented-out trace call above carries the same message as the
// warning; presumably only one of the two log levels is meant to be active - confirm.
m_tex_cache->on_miss(*derived());
}

View file

@ -263,8 +263,6 @@ namespace gl
cs_deswizzle_3d()
{
ensure((sizeof(_BlockType) & 3) == 0); // "Unsupported block type"
initialize();
m_src =
@ -294,8 +292,10 @@ namespace gl
{ "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0))},
{ "%push_block", fmt::format("binding=%d, std140", GL_COMPUTE_BUFFER_SLOT(2)) },
{ "%ws", std::to_string(optimal_group_size) },
{ "%_wordcount", std::to_string(sizeof(_BlockType) / 4) },
{ "%f", transform }
{ "%_wordcount", std::to_string(std::max<u32>(sizeof(_BlockType) / 4u, 1u)) },
{ "%f", transform },
{ "%_8bit", sizeof(_BlockType) == 1 ? "1" : "0" },
{ "%_16bit", sizeof(_BlockType) == 2 ? "1" : "0" },
};
m_src = fmt::replace_all(m_src, syntax_replace);
@ -339,7 +339,8 @@ namespace gl
set_parameters(cmd);
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword;
compute_task::run(cmd, linear_invocations);
}
};

View file

@ -36,6 +36,16 @@ namespace gl
{
switch (block_size)
{
case 1:
gl::get_compute_task<gl::cs_deswizzle_3d<u8, WordType, SwapBytes>>()->run(
cmd, dst, dst_offset, src, src_offset,
data_length, width, height, depth, 1);
break;
case 2:
gl::get_compute_task<gl::cs_deswizzle_3d<u16, WordType, SwapBytes>>()->run(
cmd, dst, dst_offset, src, src_offset,
data_length, width, height, depth, 1);
break;
case 4:
gl::get_compute_task<gl::cs_deswizzle_3d<u32, WordType, SwapBytes>>()->run(
cmd, dst, dst_offset, src, src_offset,
@ -707,7 +717,7 @@ namespace gl
}
rsx::io_buffer io_buf = dst_buffer;
caps.supports_hw_deswizzle = (is_swizzled && driver_caps.ARB_compute_shader_supported && image_linear_size > 4096);
caps.supports_hw_deswizzle = (is_swizzled && driver_caps.ARB_compute_shader_supported && image_linear_size > 1024);
auto op = upload_texture_subresource(io_buf, layout, format, is_swizzled, caps);
// Define upload region
@ -748,39 +758,54 @@ namespace gl
g_upload_transfer_buffer.copy_to(&g_deswizzle_scratch_buffer.get(), upload_scratch_mem.second, deswizzle_data_offset, static_cast<u32>(image_linear_size));
// 2.2 Apply compute transform to deswizzle input and dump it in compute_scratch_mem
ensure(op.element_size == 2 || op.element_size == 4);
const auto block_size = op.element_size * op.block_length;
if (op.require_swap)
{
mem_layout.swap_bytes = false;
if (op.element_size == 4) [[ likely ]]
switch (op.element_size)
{
do_deswizzle_transformation<u32, true>(cmd, block_size,
case 1:
do_deswizzle_transformation<u8, true>(cmd, block_size,
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
}
else
{
break;
case 2:
do_deswizzle_transformation<u16, true>(cmd, block_size,
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
break;
case 4:
do_deswizzle_transformation<u32, true>(cmd, block_size,
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
break;
default:
fmt::throw_exception("Unimplemented element size deswizzle");
}
}
else
{
if (op.element_size == 4) [[ likely ]]
switch (op.element_size)
{
do_deswizzle_transformation<u32, false>(cmd, block_size,
case 1:
do_deswizzle_transformation<u8, false>(cmd, block_size,
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
}
else
{
break;
case 2:
do_deswizzle_transformation<u16, false>(cmd, block_size,
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
break;
case 4:
do_deswizzle_transformation<u32, false>(cmd, block_size,
&g_compute_decode_buffer.get(), compute_scratch_mem.second, &g_deswizzle_scratch_buffer.get(), deswizzle_data_offset,
static_cast<u32>(image_linear_size), layout.width_in_texel, layout.height_in_texel, layout.depth);
break;
default:
fmt::throw_exception("Unimplemented element size deswizzle");
}
}

View file

@ -3,6 +3,9 @@ R"(
#define SSBO_LOCATION(x) (x + %loc)
#define USE_8BIT_ADDRESSING %_8bit
#define USE_16BIT_ADDRESSING %_16bit
layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;
layout(%set, binding=SSBO_LOCATION(0), std430) buffer ssbo0{ uint data_in[]; };
@ -98,12 +101,57 @@ uint get_z_index(const in uint x_, const in uint y_, const in uint z_)
return offset;
}
#if USE_16BIT_ADDRESSING
// Accumulates one 16-bit texel into a 32-bit word and flushes the word once
// both halves have been gathered.
//   accumulator - running output word, OR-ed into on every call
//   subword     - which half is being filled (0 = low 16 bits, 1 = high 16 bits)
//   src_id      - source index in 16-bit units (halved to address data_in)
//   dst_id      - destination index in 16-bit units (halved to address data_out)
void write16(inout uint accumulator, const in uint subword, const in uint src_id, const in uint dst_id)
{
const uint masks[] = { 0x0000FFFF, 0xFFFF0000 };
// The half kept from the source word is selected by subword, not by the parity of src_id.
// NOTE(review): this appears to assume the source texel sits in the same half of its
// word as the destination half being filled - confirm against get_z_index().
accumulator |= data_in[src_id / 2] & masks[subword];
// Only the call for the last half stores; the templated transform is applied to the full word.
if (subword == 1)
{
data_out[dst_id / 2] = %f(accumulator);
}
}
#elif USE_8BIT_ADDRESSING
// Accumulates one 8-bit texel into a 32-bit word and flushes the word once
// all four bytes have been gathered.
//   accumulator - running output word, OR-ed into on every call
//   subword     - which byte lane is being filled (0..3, lowest byte first)
//   src_id      - source index in bytes (divided by 4 to address data_in)
//   dst_id      - destination index in bytes (divided by 4 to address data_out)
void write8(inout uint accumulator, const in uint subword, const in uint src_id, const in uint dst_id)
{
const uint masks[] = { 0x000000FF, 0x0000FF00, 0x00FF0000, 0xFF000000 };
// The byte lane kept from the source word is selected by subword, not by src_id modulo 4.
// NOTE(review): this appears to assume the source texel occupies the same byte lane as
// the destination lane being filled - confirm against get_z_index().
accumulator |= data_in[src_id / 4] & masks[subword];
// Only the call for the last lane stores; unlike the 16-bit and 32-bit writers,
// no transform call is applied to the stored word here.
if (subword == 3)
{
data_out[dst_id / 4] = accumulator;
}
}
#else
// Copies word_count consecutive 32-bit words from data_in (starting at src_id)
// to data_out (starting at dst_id), passing each word through the templated
// transform call on the way.
//   word_count - number of 32-bit words to copy
//   src_id     - first source word index; advanced locally per word
//   dst_id     - first destination word index; advanced locally per word
void write32(const in uint word_count, in uint src_id, in uint dst_id)
{
for (uint i = 0; i < word_count; ++i)
{
uint value = data_in[src_id++];
data_out[dst_id++] = %f(value);
}
}
#endif
void main()
{
uint invocations_x = (gl_NumWorkGroups.x * gl_WorkGroupSize.x);
uint texel_id = (gl_GlobalInvocationID.y * invocations_x) + gl_GlobalInvocationID.x;
uint word_count = %_wordcount;
#if USE_8BIT_ADDRESSING
texel_id *= 4; // Each invocation consumes 4 texels
#elif USE_16BIT_ADDRESSING
texel_id *= 2; // Each invocation consumes 2 texels
#endif
if (!init_invocation_properties(texel_id))
return;
@ -116,14 +164,25 @@ void main()
uint y = (slice_offset / row_length);
uint x = (slice_offset % row_length);
uint src_texel_id = get_z_index(x, y, z);
uint dst_id = (texel_id * word_count);
uint src_id = (src_texel_id + invocation.data_offset) * word_count;
#if USE_8BIT_ADDRESSING
for (uint subword = 0, accumulator = 0; subword < 4; ++subword, ++x) {
#elif USE_16BIT_ADDRESSING
for (uint subword = 0, accumulator = 0; subword < 2; ++subword, ++x) {
#endif
for (uint i = 0; i < word_count; ++i)
{
uint value = data_in[src_id++];
data_out[dst_id++] = %f(value);
uint src_texel_id = get_z_index(x, y, z);
uint dst_id = (texel_id * word_count);
uint src_id = (src_texel_id + invocation.data_offset) * word_count;
#if USE_8BIT_ADDRESSING
write8(accumulator, subword, src_id, dst_id);
}
#elif USE_16BIT_ADDRESSING
write16(accumulator, subword, src_id, dst_id);
}
#else
write32(word_count, src_id, dst_id);
#endif
}
)"

View file

@ -403,8 +403,6 @@ namespace vk
cs_deswizzle_3d()
{
ensure((sizeof(_BlockType) & 3) == 0); // "Unsupported block type"
ssbo_count = 2;
use_push_constants = true;
push_constants_size = 28;
@ -438,8 +436,10 @@ namespace vk
{ "%set", "set = 0" },
{ "%push_block", "push_constant" },
{ "%ws", std::to_string(optimal_group_size) },
{ "%_wordcount", std::to_string(sizeof(_BlockType) / 4) },
{ "%f", transform }
{ "%_wordcount", std::to_string(std::max<u32>(sizeof(_BlockType) / 4u, 1u)) },
{ "%f", transform },
{ "%_8bit", sizeof(_BlockType) == 1 ? "1" : "0" },
{ "%_16bit", sizeof(_BlockType) == 2 ? "1" : "0" },
};
m_src = fmt::replace_all(m_src, syntax_replace);
@ -476,7 +476,8 @@ namespace vk
params.logd = rsx::ceil_log2(depth);
const u32 num_bytes_per_invocation = (sizeof(_BlockType) * optimal_group_size);
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation);
const u32 texels_per_dword = std::max<u32>(4u / sizeof(_BlockType), 1u); // For block sizes less than 4 bytes wide
const u32 linear_invocations = utils::aligned_div(data_length, num_bytes_per_invocation) / texels_per_dword;
compute_task::run(cmd, linear_invocations);
}
};

View file

@ -759,6 +759,10 @@ namespace vk
{
switch (block_size)
{
case 1:
return vk::get_compute_task<cs_deswizzle_3d<u8, u8, false>>();
case 2:
return vk::get_compute_task<cs_deswizzle_3d<u16, WordType, SwapBytes>>();
case 4:
return vk::get_compute_task<cs_deswizzle_3d<u32, WordType, SwapBytes>>();
case 8:
@ -776,21 +780,27 @@ namespace vk
vk::cs_deswizzle_base* job = nullptr;
const auto block_size = (word_size * word_count);
ensure(word_size == 4 || word_size == 2);
if (!swap_bytes)
{
if (word_size == 4)
{
job = get_deswizzle_transformation<u32, false>(block_size);
}
else
switch (word_size)
{
case 1:
job = get_deswizzle_transformation<u8, false>(block_size);
break;
case 2:
job = get_deswizzle_transformation<u16, false>(block_size);
break;
case 4:
job = get_deswizzle_transformation<u32, false>(block_size);
break;
default:
fmt::throw_exception("Unimplemented deswizzle for format.");
}
}
else
{
ensure(word_size == 2 || word_size == 4);
if (word_size == 4)
{
job = get_deswizzle_transformation<u32, true>(block_size);

View file

@ -35,37 +35,43 @@ void gl_gs_frame::reset()
draw_context_t gl_gs_frame::make_context()
{
// This whole function should run in the main GUI thread.
// This really matters on Windows where a lot of wgl internals are stashed in the TEB.
auto context = new GLContext();
context->handle = new QOpenGLContext();
bool success = true;
if (m_primary_context)
Emu.BlockingCallFromMainThread([&]()
{
QOffscreenSurface* surface = nullptr;
// Workaround for the Qt warning: "Attempting to create QWindow-based QOffscreenSurface outside the gui thread. Expect failures."
Emu.BlockingCallFromMainThread([&]()
if (m_primary_context)
{
surface = new QOffscreenSurface();
QOffscreenSurface* surface = new QOffscreenSurface();
surface->setFormat(m_format);
surface->create();
});
// Share resources with the first created context
context->handle->setShareContext(m_primary_context->handle);
context->surface = surface;
context->owner = true;
}
else
{
// This is the first created context, all others will share resources with this one
m_primary_context = context;
context->surface = this;
context->owner = false;
}
// Share resources with the first created context
context->handle->setShareContext(m_primary_context->handle);
context->surface = surface;
context->owner = true;
}
else
{
// This is the first created context, all others will share resources with this one
m_primary_context = context;
context->surface = this;
context->owner = false;
}
context->handle->setFormat(m_format);
context->handle->setFormat(m_format);
if (!context->handle->create())
if (!context->handle->create())
{
success = false;
}
});
if (!success)
{
fmt::throw_exception("Failed to create OpenGL context");
}
@ -110,8 +116,8 @@ void gl_gs_frame::delete_context(draw_context_t ctx)
gl_ctx->handle->doneCurrent();
#ifdef _MSC_VER
//AMD driver crashes when executing wglDeleteContext
//Catch with SEH
// AMD driver crashes when executing wglDeleteContext, probably because the current thread does not own the context.
// Catch with SEH
__try
{
delete gl_ctx->handle;