From ba673d1407851086f2bb960a8c4a4b494eabbcb9 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Fri, 26 Dec 2025 15:23:54 +0300 Subject: [PATCH] rsx/vk: Add post-cyclic-z barriers allowing us to keep early Z optimizations --- rpcs3/Emu/RSX/Core/RSXDriverState.h | 3 +++ rpcs3/Emu/RSX/RSXThread.cpp | 40 +++++++++++++++++++++++++---- rpcs3/Emu/RSX/VK/VKDraw.cpp | 10 ++++++++ 3 files changed, 48 insertions(+), 5 deletions(-) diff --git a/rpcs3/Emu/RSX/Core/RSXDriverState.h b/rpcs3/Emu/RSX/Core/RSXDriverState.h index 16dd08c78d..9e7bc2cd68 100644 --- a/rpcs3/Emu/RSX/Core/RSXDriverState.h +++ b/rpcs3/Emu/RSX/Core/RSXDriverState.h @@ -40,6 +40,9 @@ namespace rsx xform_instancing_state_dirty = (1 << 25), // Transform instancing state has changed + zeta_address_is_cyclic = (1 << 26), // The currently bound Z buffer is active for R/W in a cyclic manner + zeta_address_cyclic_barrier = (1 << 27), // A memory barrier is required to "end" the Z buffer cyclic state + // TODO - Should signal that we simply need to do a FP compare before the next draw call and invalidate the ucode if the content has changed. // Marking as dirty to invalidate hot cache also works, it's not like there's tons of barriers per frame anyway. 
fragment_program_needs_rehash = fragment_program_ucode_dirty, diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index a353c34d94..c6038e2f50 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -1681,10 +1681,24 @@ namespace rsx return; } + auto set_zeta_write_enabled = [&](bool state) + { + if (state == m_framebuffer_layout.zeta_write_enabled) + { + return; + } + + if (m_graphics_state & rsx::zeta_address_is_cyclic) + { + m_graphics_state |= rsx::fragment_program_state_dirty; + } + m_framebuffer_layout.zeta_write_enabled = state; + }; + auto evaluate_depth_buffer_state = [&]() { - m_framebuffer_layout.zeta_write_enabled = - (rsx::method_registers.depth_test_enabled() && rsx::method_registers.depth_write_enabled()); + const bool zeta_write_en = (rsx::method_registers.depth_test_enabled() && rsx::method_registers.depth_write_enabled()); + set_zeta_write_enabled(zeta_write_en); }; auto evaluate_stencil_buffer_state = [&]() @@ -1707,7 +1721,7 @@ namespace rsx rsx::method_registers.back_stencil_op_zfail() != rsx::stencil_op::keep); } - m_framebuffer_layout.zeta_write_enabled = (mask && active_write_op); + set_zeta_write_enabled(mask && active_write_op); } }; @@ -2110,6 +2124,9 @@ namespace rsx break; } + const bool zeta_was_cyclic = m_graphics_state & rsx::zeta_address_is_cyclic; + m_graphics_state.clear(rsx::zeta_address_is_cyclic); + for (u32 textures_ref = current_fp_metadata.referenced_textures_mask, i = 0; textures_ref; textures_ref >>= 1, ++i) { if (!(textures_ref & 1)) continue; @@ -2242,11 +2259,17 @@ namespace rsx } if (sampler_descriptors[i]->is_cyclic_reference && - !(current_fragment_program.ctrl & (CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT | RSX_SHADER_CONTROL_META_USES_DISCARD)) && + m_framebuffer_layout.zeta_address != 0 && !g_cfg.video.strict_rendering_mode && g_cfg.video.shader_precision != gpu_preset_level::low) { - current_fragment_program.ctrl |= RSX_SHADER_CONTROL_DISABLE_EARLY_Z; + m_graphics_state |= 
rsx::zeta_address_is_cyclic; + + if (!(current_fragment_program.ctrl & (CELL_GCM_SHADER_CONTROL_DEPTH_EXPORT | RSX_SHADER_CONTROL_META_USES_DISCARD)) && + m_framebuffer_layout.zeta_write_enabled) + { + current_fragment_program.ctrl |= RSX_SHADER_CONTROL_DISABLE_EARLY_Z; + } } } else if (!backend_config.supports_hw_renormalization /* && @@ -2340,6 +2363,13 @@ namespace rsx } m_program_cache_hint.invalidate_fragment_program(current_fragment_program); + + if (zeta_was_cyclic && zeta_was_cyclic != m_graphics_state.test(rsx::zeta_address_is_cyclic)) + { + // Forced "fall-out" barrier. This is a special case for Z buffers because they can be cyclic without writes. + // That condition can cause early-Z in a later call to introduce data hazard in previous cyclic draws. + m_graphics_state |= rsx::zeta_address_cyclic_barrier; + } } bool thread::invalidate_fragment_program(u32 dst_dma, u32 dst_offset, u32 size) diff --git a/rpcs3/Emu/RSX/VK/VKDraw.cpp b/rpcs3/Emu/RSX/VK/VKDraw.cpp index 6101aeb9de..a088a6ead6 100644 --- a/rpcs3/Emu/RSX/VK/VKDraw.cpp +++ b/rpcs3/Emu/RSX/VK/VKDraw.cpp @@ -1044,6 +1044,14 @@ void VKGSRender::end() if (auto ds = std::get<1>(m_rtts.m_bound_depth_stencil)) { ds->write_barrier(*m_current_command_buffer); + + if (m_graphics_state.test(rsx::zeta_address_cyclic_barrier) && + ds->current_layout != VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) + { + // We actually need to end the subpass as a minimum. Without this, early-Z optimizations in following draws will clobber reads from previous draws and cause flickering. + // Since we're ending the subpass, might as well restore DCC/HiZ for extra performance + ds->change_layout(*m_current_command_buffer, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL); + } } for (auto &rtt : m_rtts.m_bound_render_targets) @@ -1054,6 +1062,8 @@ void VKGSRender::end() } } + m_graphics_state.clear(rsx::zeta_address_cyclic_barrier); + m_frame_stats.setup_time += m_profiler.duration(); // Now bind the shader resources. 
It is important that this takes place after the barriers so that we don't end up with stale descriptors