rpcsx/rpcs3/Emu/RSX/VK/VKGSRender.cpp

#include "stdafx.h"
#include "../Overlays/overlay_compile_notification.h"
#include "../Overlays/Shaders/shader_loading_dialog_native.h"
#include "VKAsyncScheduler.h"
#include "VKCommandStream.h"
#include "VKCommonDecompiler.h"
#include "VKCompute.h"
#include "VKGSRender.h"
#include "VKHelpers.h"
#include "VKRenderPass.h"
#include "VKResourceManager.h"
#include "vkutils/buffer_object.h"
#include "vkutils/scratch.h"
#include "Emu/RSX/rsx_methods.h"
#include "Emu/Memory/vm_locking.h"
#include "../Program/program_state_cache2.hpp"
#include "util/asm.hpp"
namespace vk
{
VkCompareOp get_compare_func(rsx::comparison_function op, bool reverse_direction = false);
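// Maps an RSX surface color format to a renderable Vulkan format plus the component swizzle needed to reproduce the PS3 channel ordering.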
std::pair<VkFormat, VkComponentMapping> get_compatible_surface_format(rsx::surface_color_format color_format)
{
const VkComponentMapping o_rgb = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_ONE };
const VkComponentMapping z_rgb = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_ZERO };
switch (color_format)
{
#ifndef __APPLE__
case rsx::surface_color_format::r5g6b5:
return std::make_pair(VK_FORMAT_R5G6B5_UNORM_PACK16, vk::default_component_map);
case rsx::surface_color_format::x1r5g5b5_o1r5g5b5:
return std::make_pair(VK_FORMAT_A1R5G5B5_UNORM_PACK16, o_rgb);
case rsx::surface_color_format::x1r5g5b5_z1r5g5b5:
return std::make_pair(VK_FORMAT_A1R5G5B5_UNORM_PACK16, z_rgb);
#else
// assign B8G8R8A8_UNORM to formats that are not supported by Metal
case rsx::surface_color_format::r5g6b5:
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, vk::default_component_map);
case rsx::surface_color_format::x1r5g5b5_o1r5g5b5:
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, o_rgb);
case rsx::surface_color_format::x1r5g5b5_z1r5g5b5:
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, z_rgb);
#endif
case rsx::surface_color_format::a8r8g8b8:
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, vk::default_component_map);
case rsx::surface_color_format::a8b8g8r8:
return std::make_pair(VK_FORMAT_R8G8B8A8_UNORM, vk::default_component_map);
case rsx::surface_color_format::x8b8g8r8_o8b8g8r8:
return std::make_pair(VK_FORMAT_R8G8B8A8_UNORM, o_rgb);
case rsx::surface_color_format::x8b8g8r8_z8b8g8r8:
return std::make_pair(VK_FORMAT_R8G8B8A8_UNORM, z_rgb);
case rsx::surface_color_format::x8r8g8b8_z8r8g8b8:
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, z_rgb);
case rsx::surface_color_format::x8r8g8b8_o8r8g8b8:
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, o_rgb);
case rsx::surface_color_format::w16z16y16x16:
return std::make_pair(VK_FORMAT_R16G16B16A16_SFLOAT, vk::default_component_map);
case rsx::surface_color_format::w32z32y32x32:
return std::make_pair(VK_FORMAT_R32G32B32A32_SFLOAT, vk::default_component_map);
case rsx::surface_color_format::b8:
{
const VkComponentMapping no_alpha = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ONE };
return std::make_pair(VK_FORMAT_R8_UNORM, no_alpha);
}
case rsx::surface_color_format::g8b8:
{
const VkComponentMapping gb_rg = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G };
return std::make_pair(VK_FORMAT_R8G8_UNORM, gb_rg);
}
case rsx::surface_color_format::x32:
{
const VkComponentMapping rrrr = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R };
return std::make_pair(VK_FORMAT_R32_SFLOAT, rrrr);
}
default:
rsx_log.error("Surface color buffer: Unsupported surface color format (0x%x)", static_cast<u32>(color_format));
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, vk::default_component_map);
}
}
VkLogicOp get_logic_op(rsx::logic_op op)
{
switch (op)
{
case rsx::logic_op::logic_clear: return VK_LOGIC_OP_CLEAR;
case rsx::logic_op::logic_and: return VK_LOGIC_OP_AND;
case rsx::logic_op::logic_and_reverse: return VK_LOGIC_OP_AND_REVERSE;
case rsx::logic_op::logic_copy: return VK_LOGIC_OP_COPY;
case rsx::logic_op::logic_and_inverted: return VK_LOGIC_OP_AND_INVERTED;
case rsx::logic_op::logic_noop: return VK_LOGIC_OP_NO_OP;
case rsx::logic_op::logic_xor: return VK_LOGIC_OP_XOR;
case rsx::logic_op::logic_or: return VK_LOGIC_OP_OR;
case rsx::logic_op::logic_nor: return VK_LOGIC_OP_NOR;
case rsx::logic_op::logic_equiv: return VK_LOGIC_OP_EQUIVALENT;
case rsx::logic_op::logic_invert: return VK_LOGIC_OP_INVERT;
case rsx::logic_op::logic_or_reverse: return VK_LOGIC_OP_OR_REVERSE;
case rsx::logic_op::logic_copy_inverted: return VK_LOGIC_OP_COPY_INVERTED;
case rsx::logic_op::logic_or_inverted: return VK_LOGIC_OP_OR_INVERTED;
case rsx::logic_op::logic_nand: return VK_LOGIC_OP_NAND;
case rsx::logic_op::logic_set: return VK_LOGIC_OP_SET;
default:
fmt::throw_exception("Unknown logic op 0x%x", static_cast<u32>(op));
}
}
VkBlendFactor get_blend_factor(rsx::blend_factor factor)
{
switch (factor)
{
case rsx::blend_factor::one: return VK_BLEND_FACTOR_ONE;
case rsx::blend_factor::zero: return VK_BLEND_FACTOR_ZERO;
case rsx::blend_factor::src_alpha: return VK_BLEND_FACTOR_SRC_ALPHA;
case rsx::blend_factor::dst_alpha: return VK_BLEND_FACTOR_DST_ALPHA;
case rsx::blend_factor::src_color: return VK_BLEND_FACTOR_SRC_COLOR;
case rsx::blend_factor::dst_color: return VK_BLEND_FACTOR_DST_COLOR;
case rsx::blend_factor::constant_color: return VK_BLEND_FACTOR_CONSTANT_COLOR;
case rsx::blend_factor::constant_alpha: return VK_BLEND_FACTOR_CONSTANT_ALPHA;
case rsx::blend_factor::one_minus_src_color: return VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR;
case rsx::blend_factor::one_minus_dst_color: return VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR;
case rsx::blend_factor::one_minus_src_alpha: return VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA;
case rsx::blend_factor::one_minus_dst_alpha: return VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA;
case rsx::blend_factor::one_minus_constant_alpha: return VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA;
case rsx::blend_factor::one_minus_constant_color: return VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR;
case rsx::blend_factor::src_alpha_saturate: return VK_BLEND_FACTOR_SRC_ALPHA_SATURATE;
default:
fmt::throw_exception("Unknown blend factor 0x%x", static_cast<u32>(factor));
}
}
VkBlendOp get_blend_op(rsx::blend_equation op)
{
switch (op)
{
case rsx::blend_equation::add_signed:
rsx_log.trace("blend equation add_signed used. Emulating using FUNC_ADD");
[[fallthrough]];
case rsx::blend_equation::add:
return VK_BLEND_OP_ADD;
case rsx::blend_equation::subtract: return VK_BLEND_OP_SUBTRACT;
case rsx::blend_equation::reverse_subtract_signed:
rsx_log.trace("blend equation reverse_subtract_signed used. Emulating using FUNC_REVERSE_SUBTRACT");
[[fallthrough]];
case rsx::blend_equation::reverse_subtract: return VK_BLEND_OP_REVERSE_SUBTRACT;
case rsx::blend_equation::min: return VK_BLEND_OP_MIN;
case rsx::blend_equation::max: return VK_BLEND_OP_MAX;
default:
fmt::throw_exception("Unknown blend op: 0x%x", static_cast<u32>(op));
}
}
VkStencilOp get_stencil_op(rsx::stencil_op op)
{
switch (op)
{
case rsx::stencil_op::keep: return VK_STENCIL_OP_KEEP;
case rsx::stencil_op::zero: return VK_STENCIL_OP_ZERO;
case rsx::stencil_op::replace: return VK_STENCIL_OP_REPLACE;
case rsx::stencil_op::incr: return VK_STENCIL_OP_INCREMENT_AND_CLAMP;
case rsx::stencil_op::decr: return VK_STENCIL_OP_DECREMENT_AND_CLAMP;
case rsx::stencil_op::invert: return VK_STENCIL_OP_INVERT;
case rsx::stencil_op::incr_wrap: return VK_STENCIL_OP_INCREMENT_AND_WRAP;
case rsx::stencil_op::decr_wrap: return VK_STENCIL_OP_DECREMENT_AND_WRAP;
default:
fmt::throw_exception("Unknown stencil op: 0x%x", static_cast<u32>(op));
}
}
VkFrontFace get_front_face(rsx::front_face ffv)
{
switch (ffv)
{
case rsx::front_face::cw: return VK_FRONT_FACE_CLOCKWISE;
case rsx::front_face::ccw: return VK_FRONT_FACE_COUNTER_CLOCKWISE;
default:
fmt::throw_exception("Unknown front face value: 0x%x", static_cast<u32>(ffv));
}
}
VkCullModeFlags get_cull_face(rsx::cull_face cfv)
{
switch (cfv)
{
case rsx::cull_face::back: return VK_CULL_MODE_BACK_BIT;
case rsx::cull_face::front: return VK_CULL_MODE_FRONT_BIT;
case rsx::cull_face::front_and_back: return VK_CULL_MODE_FRONT_AND_BACK;
default:
fmt::throw_exception("Unknown cull face value: 0x%x", static_cast<u32>(cfv));
}
}
struct vertex_input_assembly_state
{
VkPrimitiveTopology primitive;
VkBool32 restart_index_enabled;
};
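// Derives the Vulkan primitive topology from the current draw clause and decides whether hardware primitive restart can be used (indexed, non-disjoint draws whose topology is supported natively).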
vertex_input_assembly_state decode_vertex_input_assembly_state()
{
vertex_input_assembly_state state{};
const auto& current_draw = rsx::method_registers.current_draw_clause;
const auto [primitive, emulated_primitive] = vk::get_appropriate_topology(current_draw.primitive);
if (rsx::method_registers.restart_index_enabled() &&
!current_draw.is_disjoint_primitive &&
current_draw.command == rsx::draw_command::indexed &&
!emulated_primitive &&
!vk::emulate_primitive_restart(current_draw.primitive))
{
state.restart_index_enabled = VK_TRUE;
}
state.primitive = primitive;
return state;
}
// TODO: This should be deprecated soon (kd)
vk::pipeline_props decode_rsx_state(
const vertex_input_assembly_state& vertex_input,
vk::render_target* ds,
const rsx::backend_configuration& backend_config,
u8 num_draw_buffers,
u8 num_rasterization_samples,
bool depth_bounds_support)
{
vk::pipeline_props properties{};
// Input assembly
properties.state.set_primitive_type(vertex_input.primitive);
properties.state.enable_primitive_restart(vertex_input.restart_index_enabled);
// Rasterizer state
properties.state.set_attachment_count(num_draw_buffers);
properties.state.set_front_face(vk::get_front_face(rsx::method_registers.front_face_mode()));
properties.state.enable_depth_clamp(rsx::method_registers.depth_clamp_enabled() || !rsx::method_registers.depth_clip_enabled());
properties.state.enable_depth_bias(true);
properties.state.enable_depth_bounds_test(depth_bounds_support);
if (rsx::method_registers.depth_test_enabled())
{
// NOTE: Like stencil, depth write is meaningless without depth test
properties.state.set_depth_mask(rsx::method_registers.depth_write_enabled());
properties.state.enable_depth_test(vk::get_compare_func(rsx::method_registers.depth_func()));
}
if (rsx::method_registers.cull_face_enabled())
properties.state.enable_cull_face(vk::get_cull_face(rsx::method_registers.cull_face_mode()));
for (uint index = 0; index < num_draw_buffers; ++index)
{
bool color_mask_b = rsx::method_registers.color_mask_b(index);
bool color_mask_g = rsx::method_registers.color_mask_g(index);
bool color_mask_r = rsx::method_registers.color_mask_r(index);
bool color_mask_a = rsx::method_registers.color_mask_a(index);
switch (rsx::method_registers.surface_color())
{
case rsx::surface_color_format::b8:
rsx::get_b8_colormask(color_mask_r, color_mask_g, color_mask_b, color_mask_a);
break;
case rsx::surface_color_format::g8b8:
rsx::get_g8b8_r8g8_colormask(color_mask_r, color_mask_g, color_mask_b, color_mask_a);
break;
default:
break;
}
properties.state.set_color_mask(index, color_mask_r, color_mask_g, color_mask_b, color_mask_a);
}
// LogicOp and Blend are mutually exclusive. If both are enabled, LogicOp takes precedence.
if (rsx::method_registers.logic_op_enabled())
{
properties.state.enable_logic_op(vk::get_logic_op(rsx::method_registers.logic_operation()));
}
else
{
bool mrt_blend_enabled[] =
{
rsx::method_registers.blend_enabled(),
rsx::method_registers.blend_enabled_surface_1(),
rsx::method_registers.blend_enabled_surface_2(),
rsx::method_registers.blend_enabled_surface_3()
};
VkBlendFactor sfactor_rgb, sfactor_a, dfactor_rgb, dfactor_a;
VkBlendOp equation_rgb, equation_a;
if (mrt_blend_enabled[0] || mrt_blend_enabled[1] || mrt_blend_enabled[2] || mrt_blend_enabled[3])
{
sfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_rgb());
sfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_a());
dfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_rgb());
dfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_a());
equation_rgb = vk::get_blend_op(rsx::method_registers.blend_equation_rgb());
equation_a = vk::get_blend_op(rsx::method_registers.blend_equation_a());
for (u8 idx = 0; idx < num_draw_buffers; ++idx)
{
if (mrt_blend_enabled[idx])
{
properties.state.enable_blend(idx, sfactor_rgb, sfactor_a, dfactor_rgb, dfactor_a, equation_rgb, equation_a);
}
}
}
}
if (rsx::method_registers.stencil_test_enabled())
{
if (!rsx::method_registers.two_sided_stencil_test_enabled())
{
properties.state.enable_stencil_test(
vk::get_stencil_op(rsx::method_registers.stencil_op_fail()),
vk::get_stencil_op(rsx::method_registers.stencil_op_zfail()),
vk::get_stencil_op(rsx::method_registers.stencil_op_zpass()),
vk::get_compare_func(rsx::method_registers.stencil_func()),
0xFF, 0xFF); // write mask, func_mask and ref are dynamic
}
else
{
properties.state.enable_stencil_test_separate(0,
vk::get_stencil_op(rsx::method_registers.stencil_op_fail()),
vk::get_stencil_op(rsx::method_registers.stencil_op_zfail()),
vk::get_stencil_op(rsx::method_registers.stencil_op_zpass()),
vk::get_compare_func(rsx::method_registers.stencil_func()),
0xFF, 0xFF); // write mask, func_mask and ref are dynamic
properties.state.enable_stencil_test_separate(1,
vk::get_stencil_op(rsx::method_registers.back_stencil_op_fail()),
vk::get_stencil_op(rsx::method_registers.back_stencil_op_zfail()),
vk::get_stencil_op(rsx::method_registers.back_stencil_op_zpass()),
vk::get_compare_func(rsx::method_registers.back_stencil_func()),
0xFF, 0xFF); // write mask, func_mask and ref are dynamic
}
if (ds && ds->samples() > 1 && !(ds->stencil_init_flags & 0xFF00))
{
if (properties.state.ds.front.failOp != VK_STENCIL_OP_KEEP ||
properties.state.ds.front.depthFailOp != VK_STENCIL_OP_KEEP ||
properties.state.ds.front.passOp != VK_STENCIL_OP_KEEP ||
properties.state.ds.back.failOp != VK_STENCIL_OP_KEEP ||
properties.state.ds.back.depthFailOp != VK_STENCIL_OP_KEEP ||
properties.state.ds.back.passOp != VK_STENCIL_OP_KEEP)
{
// Set bit 9 to signal that a full bit-wise transfer is required
ds->stencil_init_flags |= (1 << 8);
}
}
}
if (backend_config.supports_hw_a2c || num_rasterization_samples > 1)
{
const bool alpha_to_one_enable = rsx::method_registers.msaa_alpha_to_one_enabled() && backend_config.supports_hw_a2one;
properties.state.set_multisample_state(
num_rasterization_samples,
rsx::method_registers.msaa_sample_mask(),
rsx::method_registers.msaa_enabled(),
rsx::method_registers.msaa_alpha_to_coverage_enabled(),
alpha_to_one_enable);
// A problem observed on multiple GPUs is that interior geometry edges can resolve 0 samples unless we force a shading rate of 1.
// For whatever reason, the way MSAA images are 'resolved' on PS3 bypasses this issue.
// NOTE: We do not do an image resolve at all; the output is merely 'exploded', and the guest application is responsible for doing the resolve in software, as it is on real hardware.
properties.state.set_multisample_shading_rate(1.f);
}
return properties;
}
}
namespace
{
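// Builds the descriptor set layout (vertex streams, uniform/storage buffers, texture samplers) and the pipeline layout shared by the graphics programs.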
std::tuple<VkPipelineLayout, VkDescriptorSetLayout> get_shared_pipeline_layout(VkDevice dev)
{
const auto& binding_table = vk::get_current_renderer()->get_pipeline_binding_table();
rsx::simple_array<VkDescriptorSetLayoutBinding> bindings(binding_table.total_descriptor_bindings);
u32 idx = 0;
// Vertex input streams: three texel buffers (persistent attribute data, volatile attribute data and the vertex layout)
for (int i = 0; i < 3; i++)
{
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
bindings[idx].binding = binding_table.vertex_buffers_first_bind_slot + i;
bindings[idx].pImmutableSamplers = nullptr;
idx++;
}
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[idx].binding = binding_table.fragment_constant_buffers_bind_slot;
bindings[idx].pImmutableSamplers = nullptr;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[idx].binding = binding_table.fragment_state_bind_slot;
bindings[idx].pImmutableSamplers = nullptr;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[idx].binding = binding_table.fragment_texture_params_bind_slot;
bindings[idx].pImmutableSamplers = nullptr;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
bindings[idx].binding = binding_table.vertex_constant_buffers_bind_slot;
bindings[idx].pImmutableSamplers = nullptr;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_ALL_GRAPHICS;
bindings[idx].binding = binding_table.vertex_params_bind_slot;
bindings[idx].pImmutableSamplers = nullptr;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
bindings[idx].binding = binding_table.conditional_render_predicate_slot;
bindings[idx].pImmutableSamplers = nullptr;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[idx].binding = binding_table.rasterizer_env_bind_slot;
bindings[idx].pImmutableSamplers = nullptr;
idx++;
for (auto binding = binding_table.textures_first_bind_slot;
binding < binding_table.vertex_textures_first_bind_slot;
binding++)
{
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[idx].binding = binding;
bindings[idx].pImmutableSamplers = nullptr;
idx++;
}
for (int i = 0; i < rsx::limits::vertex_textures_count; i++)
{
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
bindings[idx].binding = binding_table.vertex_textures_first_bind_slot + i;
bindings[idx].pImmutableSamplers = nullptr;
idx++;
}
ensure(idx == binding_table.total_descriptor_bindings);
std::array<VkPushConstantRange, 1> push_constants;
push_constants[0].offset = 0;
push_constants[0].size = 16;
push_constants[0].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
if (vk::emulate_conditional_rendering())
{
// Conditional render toggle
push_constants[0].size = 20;
}
const auto set_layout = vk::descriptors::create_layout(bindings);
VkPipelineLayoutCreateInfo layout_info = {};
layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
layout_info.setLayoutCount = 1;
layout_info.pSetLayouts = &set_layout;
layout_info.pushConstantRangeCount = 1;
layout_info.pPushConstantRanges = push_constants.data();
VkPipelineLayout result;
CHECK_RESULT(vkCreatePipelineLayout(dev, &layout_info, nullptr, &result));
return std::make_tuple(result, set_layout);
}
}
u64 VKGSRender::get_cycles()
{
return thread_ctrl::get_cycles(static_cast<named_thread<VKGSRender>&>(*this));
}
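// Backend initialization: creates the Vulkan instance, selects the GPU and swapchain, then sets up command buffers, descriptor pools, ring buffer heaps, shader/texture caches and the backend capability flags.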
VKGSRender::VKGSRender(utils::serial* ar) noexcept : GSRender(ar)
{
if (m_instance.create("RPCS3"))
{
m_instance.bind();
}
else
{
rsx_log.fatal("Could not find a Vulkan compatible GPU driver. Your GPU(s) may not support Vulkan, or you need to install the Vulkan runtime and drivers");
m_device = VK_NULL_HANDLE;
return;
}
std::vector<vk::physical_device>& gpus = m_instance.enumerate_devices();
// Actually confirm that the loader found at least one compatible device
// This should not happen unless something is wrong with the driver setup on the target system
if (gpus.empty())
{
// We can't throw in Emulator::Load, so we show an error and return
rsx_log.fatal("No compatible GPU devices found");
m_device = VK_NULL_HANDLE;
return;
}
bool gpu_found = false;
std::string adapter_name = g_cfg.video.vk.adapter;
display_handle_t display = m_frame->handle();
#ifdef HAVE_X11
std::visit([this](auto&& p) {
using T = std::decay_t<decltype(p)>;
if constexpr (std::is_same_v<T, std::pair<Display*, Window>>)
{
m_display_handle = p.first; XFlush(m_display_handle);
}
}, display);
#endif
for (auto &gpu : gpus)
{
if (gpu.get_name() == adapter_name)
{
m_swapchain.reset(m_instance.create_swapchain(display, gpu));
gpu_found = true;
break;
}
}
if (!gpu_found || adapter_name.empty())
{
m_swapchain.reset(m_instance.create_swapchain(display, gpus[0]));
}
if (!m_swapchain)
{
m_device = VK_NULL_HANDLE;
rsx_log.fatal("Could not successfully initialize a swapchain");
return;
}
m_device = const_cast<vk::render_device*>(&m_swapchain->get_device());
vk::set_current_renderer(m_swapchain->get_device());
m_swapchain_dims.width = m_frame->client_width();
m_swapchain_dims.height = m_frame->client_height();
if (!m_swapchain->init(m_swapchain_dims.width, m_swapchain_dims.height))
{
swapchain_unavailable = true;
}
// Create command buffer...
m_command_buffer_pool.create((*m_device), m_device->get_graphics_queue_family());
m_primary_cb_list.create(m_command_buffer_pool, vk::command_buffer::access_type_hint::flush_only);
m_current_command_buffer = m_primary_cb_list.get();
m_current_command_buffer->begin();
// Create secondary command_buffer for parallel operations
m_secondary_command_buffer_pool.create((*m_device), m_device->get_graphics_queue_family());
m_secondary_cb_list.create(m_secondary_command_buffer_pool, vk::command_buffer::access_type_hint::all);
// Precalculated stuff
std::tie(m_pipeline_layout, m_descriptor_layouts) = get_shared_pipeline_layout(*m_device);
// Occlusion
m_occlusion_query_manager = std::make_unique<vk::query_pool_manager>(*m_device, VK_QUERY_TYPE_OCCLUSION, OCCLUSION_MAX_POOL_SIZE);
m_occlusion_map.resize(rsx::reports::occlusion_query_count);
for (u32 n = 0; n < rsx::reports::occlusion_query_count; ++n)
m_occlusion_query_data[n].driver_handle = n;
if (g_cfg.video.precise_zpass_count)
{
m_occlusion_query_manager->set_control_flags(VK_QUERY_CONTROL_PRECISE_BIT, 0);
}
// Generate frame contexts
const u32 max_draw_calls = m_device->get_descriptor_max_draw_calls();
const auto& binding_table = m_device->get_pipeline_binding_table();
const u32 num_fs_samplers = binding_table.vertex_textures_first_bind_slot - binding_table.textures_first_bind_slot;
rsx::simple_array<VkDescriptorPoolSize> descriptor_type_sizes =
{
{ VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER , 6 },
{ VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER , 3 },
{ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER , (num_fs_samplers + 4) },
// Conditional rendering predicate slot; refactor to allow skipping this when not needed
{ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1 }
};
m_descriptor_pool.create(*m_device, descriptor_type_sizes, max_draw_calls);
VkSemaphoreCreateInfo semaphore_info = {};
semaphore_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
// VRAM allocation
m_attrib_ring_info.create(VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000, "attrib buffer", 0x400000, VK_TRUE);
m_fragment_env_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "fragment env buffer");
m_vertex_env_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "vertex env buffer");
m_fragment_texture_params_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "fragment texture params buffer");
m_vertex_layout_ring_info.create(VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "vertex layout buffer", 0x10000, VK_TRUE);
m_fragment_constants_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "fragment constants buffer");
m_transform_constants_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_TRANSFORM_CONSTANTS_BUFFER_SIZE_M * 0x100000, "transform constants buffer");
m_index_buffer_ring_info.create(VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, "index buffer");
m_texture_upload_buffer_ring_info.create(VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, "texture upload buffer", 32 * 0x100000);
m_raster_env_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "raster env buffer");
const auto shadermode = g_cfg.video.shadermode.get();
if (shadermode == shader_mode::async_with_interpreter || shadermode == shader_mode::interpreter_only)
{
m_vertex_instructions_buffer.create(VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, 64 * 0x100000, "vertex instructions buffer", 512 * 16);
m_fragment_instructions_buffer.create(VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, 64 * 0x100000, "fragment instructions buffer", 2048);
}
// Initialize optional allocation information with placeholders
m_vertex_env_buffer_info = { m_vertex_env_ring_info.heap->value, 0, 32 };
m_vertex_constants_buffer_info = { m_transform_constants_ring_info.heap->value, 0, 32 };
m_fragment_env_buffer_info = { m_fragment_env_ring_info.heap->value, 0, 32 };
m_fragment_texture_params_buffer_info = { m_fragment_texture_params_ring_info.heap->value, 0, 32 };
m_raster_env_buffer_info = { m_raster_env_ring_info.heap->value, 0, 128 };
const auto limits = m_device->gpu().get_limits();
m_texbuffer_view_size = std::min(limits.maxTexelBufferElements, VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000u);
if (m_texbuffer_view_size < 0x800000)
{
// Warn, only possibly expected on macOS
rsx_log.warning("Current driver may crash due to memory limitations (%uk)", m_texbuffer_view_size / 1024);
}
for (auto &ctx : frame_context_storage)
{
vkCreateSemaphore((*m_device), &semaphore_info, nullptr, &ctx.present_wait_semaphore);
vkCreateSemaphore((*m_device), &semaphore_info, nullptr, &ctx.acquire_signal_semaphore);
}
const auto& memory_map = m_device->get_memory_mapping();
null_buffer = std::make_unique<vk::buffer>(*m_device, 32, memory_map.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, 0, VMM_ALLOCATION_POOL_UNDEFINED);
null_buffer_view = std::make_unique<vk::buffer_view>(*m_device, null_buffer->value, VK_FORMAT_R8_UINT, 0, 32);
vk::initialize_compiler_context();
vk::initialize_pipe_compiler(g_cfg.video.shader_compiler_threads_count);
m_prog_buffer = std::make_unique<vk::program_cache>
(
[this](const vk::pipeline_props& props, const RSXVertexProgram& vp, const RSXFragmentProgram& fp)
{
// Program was linked or queued for linking
m_shaders_cache->store(props, vp, fp);
}
);
if (g_cfg.video.disable_vertex_cache)
m_vertex_cache = std::make_unique<vk::null_vertex_cache>();
else
m_vertex_cache = std::make_unique<vk::weak_vertex_cache>();
m_shaders_cache = std::make_unique<vk::shader_cache>(*m_prog_buffer, "vulkan", "v1.94");
for (u32 i = 0; i < m_swapchain->get_swap_image_count(); ++i)
{
const auto target_layout = m_swapchain->get_optimal_present_layout();
const auto target_image = m_swapchain->get_image(i);
VkClearColorValue clear_color{};
VkImageSubresourceRange range = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 };
vk::change_image_layout(*m_current_command_buffer, target_image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, range);
vkCmdClearColorImage(*m_current_command_buffer, target_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, &clear_color, 1, &range);
vk::change_image_layout(*m_current_command_buffer, target_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, target_layout, range);
}
m_current_frame = &frame_context_storage[0];
m_texture_cache.initialize((*m_device), m_device->get_graphics_queue(),
m_texture_upload_buffer_ring_info);
vk::get_overlay_pass<vk::ui_overlay_renderer>()->init(*m_current_command_buffer, m_texture_upload_buffer_ring_info);
if (shadermode == shader_mode::async_with_interpreter || shadermode == shader_mode::interpreter_only)
{
m_shader_interpreter.init(*m_device);
}
backend_config.supports_multidraw = true;
// NVIDIA has broken attribute interpolation
backend_config.supports_normalized_barycentrics = (
vk::get_driver_vendor() != vk::driver_vendor::NVIDIA ||
!m_device->get_barycoords_support() ||
g_cfg.video.shader_precision == gpu_preset_level::low);
// NOTE: We do not actually need multiple sample support for A2C to work
// This is here for visual consistency - will be removed when AA problems due to mipmaps are fixed
if (g_cfg.video.antialiasing_level != msaa_level::none)
{
backend_config.supports_hw_msaa = true;
backend_config.supports_hw_a2c = true;
backend_config.supports_hw_a2one = m_device->get_alpha_to_one_support();
}
// NOTE: On NVIDIA cards going back decades (including the PS3) there is a slight normalization inaccuracy in compressed formats.
// Confirmed in BLES01916 (The Evil Within) which uses RGB565 for some virtual texturing data.
backend_config.supports_hw_renormalization = (vk::get_driver_vendor() == vk::driver_vendor::NVIDIA);
// Conditional rendering support
// Do not use on MVK due to a speedhack we rely on (streaming results without stopping the current renderpass)
// If we break the renderpasses, MVK loses around 75% of its performance in troublesome spots compared to just doing a CPU sync
backend_config.supports_hw_conditional_render = (vk::get_driver_vendor() != vk::driver_vendor::MVK);
// Passthrough DMA
backend_config.supports_passthrough_dma = m_device->get_external_memory_host_support();
// Host sync
backend_config.supports_host_gpu_labels = !!g_cfg.video.host_label_synchronization;
// Async compute and related operations
if (g_cfg.video.vk.asynchronous_texture_streaming)
{
// Optimistic, enable async compute
backend_config.supports_asynchronous_compute = true;
if (m_device->get_graphics_queue() == m_device->get_transfer_queue())
{
rsx_log.error("Cannot run graphics and async transfer in the same queue. Async uploads are disabled. This is a limitation of your GPU");
backend_config.supports_asynchronous_compute = false;
}
}
// Sanity checks
switch (vk::get_driver_vendor())
{
case vk::driver_vendor::NVIDIA:
if (backend_config.supports_asynchronous_compute)
{
if (auto chip_family = vk::get_chip_family();
chip_family == vk::chip_class::NV_kepler || chip_family == vk::chip_class::NV_maxwell)
{
rsx_log.warning("Older NVIDIA cards do not meet requirements for true asynchronous compute due to some driver fakery.");
}
rsx_log.notice("Forcing safe async compute for NVIDIA device to avoid crashing.");
g_cfg.video.vk.asynchronous_scheduler.set(vk_gpu_scheduler_mode::safe);
}
break;
#if !defined(_WIN32)
// Anything running on the AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
case vk::driver_vendor::RADV:
case vk::driver_vendor::AMD:
#if !defined(__linux__)
// Intel chipsets would fail on BSD in most cases, and DRM_IOCTL_i915_GEM_USERPTR is unimplemented
case vk::driver_vendor::ANV:
#endif
if (backend_config.supports_passthrough_dma)
{
rsx_log.error("AMDGPU kernel driver on Linux and INTEL driver on some platforms cannot support passthrough DMA buffers.");
backend_config.supports_passthrough_dma = false;
}
break;
#endif
case vk::driver_vendor::MVK:
// Async compute crashes immediately on Apple GPUs
rsx_log.error("Apple GPUs are incompatible with the current implementation of asynchronous texture decoding.");
backend_config.supports_asynchronous_compute = false;
break;
case vk::driver_vendor::INTEL:
// As expected, host allocations won't work on INTEL despite the extension being present
if (backend_config.supports_passthrough_dma)
{
rsx_log.error("INTEL driver does not support passthrough DMA buffers");
backend_config.supports_passthrough_dma = false;
}
break;
default: break;
}
if (backend_config.supports_asynchronous_compute)
{
m_async_compute_memory_barrier =
{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2_KHR,
.pNext = nullptr,
.srcStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT_KHR | VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT_KHR,
.srcAccessMask = VK_ACCESS_2_MEMORY_WRITE_BIT_KHR,
.dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT_KHR | VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT_KHR,
.dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT_KHR
};
m_async_compute_dependency_info =
{
.sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO_KHR,
.memoryBarrierCount = 1,
.pMemoryBarriers = &m_async_compute_memory_barrier
};
// Run only if async compute can be used.
g_fxo->init<vk::AsyncTaskScheduler>(g_cfg.video.vk.asynchronous_scheduler, m_async_compute_dependency_info);
}
if (backend_config.supports_host_gpu_labels)
{
if (backend_config.supports_passthrough_dma)
{
m_host_object_data = std::make_unique<vk::buffer>(*m_device,
0x100000,
memory_map.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0,
VMM_ALLOCATION_POOL_SYSTEM);
m_host_data_ptr = new (m_host_object_data->map(0, 0x100000)) vk::host_data_t();
ensure(m_host_data_ptr->magic == 0xCAFEBABE);
}
else
{
rsx_log.error("Your GPU/driver does not support extensions required to enable passthrough DMA emulation. Host GPU labels will be disabled.");
backend_config.supports_host_gpu_labels = false;
}
}
if (!backend_config.supports_host_gpu_labels &&
!backend_config.supports_asynchronous_compute)
{
// Disable passthrough DMA unless we enable a feature that requires it.
// I'm avoiding an explicit checkbox for this until I figure out why host labels don't fix all problems with passthrough.
backend_config.supports_passthrough_dma = false;
}
}
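// Backend teardown: waits for the DMA queue and the device to go idle, then releases heaps, caches, descriptors, command buffers and the swapchain.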
VKGSRender::~VKGSRender()
{
if (m_device == VK_NULL_HANDLE)
{
// Initialization failed
return;
}
// Flush DMA queue
while (!g_fxo->get<rsx::dma_manager>().sync())
{
do_local_task(rsx::FIFO::state::lock_wait);
}
// Wait for device to finish up with resources
vkDeviceWaitIdle(*m_device);
// Globals. TODO: Refactor lifetime management
if (backend_config.supports_asynchronous_compute)
{
g_fxo->get<vk::AsyncTaskScheduler>().destroy();
}
// GC cleanup
vk::get_resource_manager()->flush();
// Host data
if (m_host_object_data)
{
m_host_object_data->unmap();
m_host_object_data.reset();
}
// Clear flush requests
m_flush_requests.clear_pending_flag();
// Shaders
vk::destroy_pipe_compiler(); // Ensure no pending shaders being compiled
vk::finalize_compiler_context(); // Shut down the glslang compiler
m_prog_buffer->clear(); // Delete shader objects
m_shader_interpreter.destroy();
m_persistent_attribute_storage.reset();
m_volatile_attribute_storage.reset();
m_vertex_layout_storage.reset();
// Upscaler (references some global resources)
m_upscaler.reset();
// Heaps
m_attrib_ring_info.destroy();
m_fragment_env_ring_info.destroy();
m_vertex_env_ring_info.destroy();
m_fragment_texture_params_ring_info.destroy();
m_vertex_layout_ring_info.destroy();
m_fragment_constants_ring_info.destroy();
m_transform_constants_ring_info.destroy();
m_index_buffer_ring_info.destroy();
m_texture_upload_buffer_ring_info.destroy();
m_vertex_instructions_buffer.destroy();
m_fragment_instructions_buffer.destroy();
m_raster_env_ring_info.destroy();
// Fallback bindables
null_buffer.reset();
null_buffer_view.reset();
if (m_current_frame == &m_aux_frame_context)
{
// Return resources back to the owner
m_current_frame = &frame_context_storage[m_current_queue_index];
m_current_frame->swap_storage(m_aux_frame_context);
m_current_frame->grab_resources(m_aux_frame_context);
}
m_aux_frame_context.buffer_views_to_clean.clear();
// NOTE: aux_context uses descriptor pools borrowed from the main queues and any allocations will be automatically freed when pool is destroyed
for (auto &ctx : frame_context_storage)
{
vkDestroySemaphore((*m_device), ctx.present_wait_semaphore, nullptr);
vkDestroySemaphore((*m_device), ctx.acquire_signal_semaphore, nullptr);
ctx.buffer_views_to_clean.clear();
}
// Textures
m_rtts.destroy();
m_texture_cache.destroy();
m_stencil_mirror_sampler.reset();
// Overlay text handler
m_text_writer.reset();
// Pipeline descriptors
m_descriptor_pool.destroy();
vkDestroyPipelineLayout(*m_device, m_pipeline_layout, nullptr);
vkDestroyDescriptorSetLayout(*m_device, m_descriptor_layouts, nullptr);
// Queries
m_occlusion_query_manager.reset();
m_cond_render_buffer.reset();
// Command buffer
m_primary_cb_list.destroy();
m_secondary_cb_list.destroy();
m_command_buffer_pool.destroy();
m_secondary_command_buffer_pool.destroy();
// Global resources
vk::destroy_global_resources();
// Device handles/contexts
m_swapchain->destroy();
m_instance.destroy();
#if defined(HAVE_X11) && defined(HAVE_VULKAN)
if (m_display_handle)
XCloseDisplay(m_display_handle);
#endif
}
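// Handles access violations on memory tracked by the texture cache. Flushable sections are written back to guest memory, with special handling to avoid deadlocking the DMA offloader thread.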
bool VKGSRender::on_access_violation(u32 address, bool is_writing)
{
vk::texture_cache::thrashed_set result;
{
const rsx::invalidation_cause cause = is_writing ? rsx::invalidation_cause::deferred_write : rsx::invalidation_cause::deferred_read;
result = m_texture_cache.invalidate_address(*m_secondary_cb_list.get(), address, cause);
}
if (result.invalidate_samplers)
{
std::lock_guard lock(m_sampler_mutex);
m_samplers_dirty.store(true);
}
if (!result.violation_handled)
{
return zcull_ctrl->on_access_violation(address);
}
if (result.num_flushable > 0)
{
if (g_fxo->get<rsx::dma_manager>().is_current_thread())
{
// The offloader thread cannot handle flush requests
ensure(!(m_queue_status & flush_queue_state::deadlock));
m_offloader_fault_range = g_fxo->get<rsx::dma_manager>().get_fault_range(is_writing);
m_offloader_fault_cause = (is_writing) ? rsx::invalidation_cause::write : rsx::invalidation_cause::read;
g_fxo->get<rsx::dma_manager>().set_mem_fault_flag();
m_queue_status |= flush_queue_state::deadlock;
m_eng_interrupt_mask |= rsx::backend_interrupt;
// Wait for deadlock to clear
while (m_queue_status & flush_queue_state::deadlock)
{
utils::pause();
}
g_fxo->get<rsx::dma_manager>().clear_mem_fault_flag();
return true;
}
bool has_queue_ref = false;
if (!is_current_thread()) [[likely]]
{
// Always submit primary cb to ensure state consistency (flush pending changes such as image transitions)
vm::temporary_unlock();
std::lock_guard lock(m_flush_queue_mutex);
m_flush_requests.post(false);
m_eng_interrupt_mask |= rsx::backend_interrupt;
has_queue_ref = true;
}
else
{
if (vk::is_uninterruptible())
{
rsx_log.error("Fault in uninterruptible code!");
}
// Flush primary cb queue to sync pending changes (e.g. image transitions!)
flush_command_queue();
}
if (has_queue_ref)
{
// Wait for the RSX thread to process request if it hasn't already
m_flush_requests.producer_wait();
}
m_texture_cache.flush_all(*m_secondary_cb_list.next(), result);
if (has_queue_ref)
{
// Release RSX thread
m_flush_requests.remove_one();
}
}
return true;
}
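// Invalidates cached texture sections overlapping the given range; an unmap additionally purges unreleased sections and releases the DMA mapping.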
void VKGSRender::on_invalidate_memory_range(const utils::address_range &range, rsx::invalidation_cause cause)
{
std::lock_guard lock(m_secondary_cb_guard);
auto data = m_texture_cache.invalidate_range(*m_secondary_cb_list.next(), range, cause);
AUDIT(data.empty());
if (cause == rsx::invalidation_cause::unmap)
{
if (data.violation_handled)
{
m_texture_cache.purge_unreleased_sections();
{
std::lock_guard lock(m_sampler_mutex);
m_samplers_dirty.store(true);
}
}
vk::unmap_dma(range.start, range.length());
}
}
void VKGSRender::on_semaphore_acquire_wait()
{
if (m_flush_requests.pending() ||
(async_flip_requested & flip_request::emu_requested) ||
(m_queue_status & flush_queue_state::deadlock))
{
do_local_task(rsx::FIFO::state::lock_wait);
}
}
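// Attempts to relieve VRAM pressure by evicting texture cache entries and trimming the surface cache. Higher severity levels allow progressively more destructive cleanup, up to a full GPU sync.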
bool VKGSRender::on_vram_exhausted(rsx::problem_severity severity)
{
ensure(!vk::is_uninterruptible() && rsx::get_current_renderer()->is_current_thread());
bool texture_cache_relieved = false;
if (severity >= rsx::problem_severity::fatal)
{
// Hard sync before trying to evict anything. This guarantees no UAF crashes in the driver.
// As a bonus, we also get a free GC pass
flush_command_queue(true, true);
if (m_texture_cache.is_overallocated())
{
// Evict some unused textures. Do not evict any active references
std::set<u32> exclusion_list;
auto scan_array = [&](const auto& texture_array)
{
for (auto i = 0ull; i < texture_array.size(); ++i)
{
const auto& tex = texture_array[i];
const auto addr = rsx::get_address(tex.offset(), tex.location());
exclusion_list.insert(addr);
}
};
scan_array(rsx::method_registers.fragment_textures);
scan_array(rsx::method_registers.vertex_textures);
// Hold the secondary lock guard to prevent threads from trying to touch access violation handler stuff
std::lock_guard lock(m_secondary_cb_guard);
rsx_log.warning("Texture cache is overallocated. Will evict unnecessary textures.");
texture_cache_relieved = m_texture_cache.evict_unused(exclusion_list);
}
}
texture_cache_relieved |= m_texture_cache.handle_memory_pressure(severity);
if (severity == rsx::problem_severity::low)
{
// Low severity only handles invalidating unused textures
return texture_cache_relieved;
}
bool surface_cache_relieved = false;
const auto mem_info = m_device->get_memory_mapping();
// Check if we need to spill
if (severity >= rsx::problem_severity::fatal && // Only spill for fatal errors
mem_info.device_local != mem_info.host_visible_coherent && // Do not spill if it is an IGP, there is nowhere to spill to
m_rtts.is_overallocated()) // Surface cache must be over-allocated by the design quota
{
// Queue a VRAM spill operation.
m_rtts.spill_unused_memory();
}
// Moderate severity and higher also starts removing stale render target objects
if (m_rtts.handle_memory_pressure(*m_current_command_buffer, severity))
{
surface_cache_relieved = true;
m_rtts.trim(*m_current_command_buffer, severity);
}
const bool any_cache_relieved = (texture_cache_relieved || surface_cache_relieved);
if (severity < rsx::problem_severity::fatal)
{
return any_cache_relieved;
}
if (surface_cache_relieved && !m_samplers_dirty)
{
// If surface cache was modified destructively, then we must reload samplers touching the surface cache.
bool invalidate_samplers = false;
auto scan_array = [&](const auto& texture_array, const auto& sampler_states)
{
if (invalidate_samplers)
{
return;
}
for (auto i = 0ull; i < texture_array.size(); ++i)
{
if (texture_array[i].enabled() &&
sampler_states[i] &&
sampler_states[i]->upload_context == rsx::texture_upload_context::framebuffer_storage)
{
invalidate_samplers = true;
break;
}
}
};
scan_array(rsx::method_registers.fragment_textures, fs_sampler_state);
scan_array(rsx::method_registers.vertex_textures, vs_sampler_state);
if (invalidate_samplers)
{
m_samplers_dirty.store(true);
}
}
// Imminent crash, full GPU sync is the least of our problems
flush_command_queue(true, true);
return any_cache_relieved;
}
void VKGSRender::on_descriptor_pool_fragmentation(bool is_fatal)
{
if (!is_fatal)
{
// It is very likely that the release is simply in progress (enqueued)
m_primary_cb_list.wait_all();
return;
}
// Just flush everything. Unless the hardware is very deficient, this should happen very rarely.
flush_command_queue(true, true);
}
void VKGSRender::notify_tile_unbound(u32 tile)
{
// TODO: Handle texture writeback
if (false)
{
u32 addr = rsx::get_address(tiles[tile].offset, tiles[tile].location);
on_notify_memory_unmapped(addr, tiles[tile].size);
m_rtts.invalidate_surface_address(addr, false);
}
{
std::lock_guard lock(m_sampler_mutex);
m_samplers_dirty.store(true);
}
}
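// Checks the requested ring buffer heaps for exhaustion. If any heap is critical, either recycles the oldest queued frame or performs a full flush and resets the allocation statistics.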
void VKGSRender::check_heap_status(u32 flags)
{
ensure(flags);
bool heap_critical;
if (flags == VK_HEAP_CHECK_ALL)
{
heap_critical = m_attrib_ring_info.is_critical() ||
m_texture_upload_buffer_ring_info.is_critical() ||
m_fragment_env_ring_info.is_critical() ||
m_vertex_env_ring_info.is_critical() ||
m_fragment_texture_params_ring_info.is_critical() ||
m_vertex_layout_ring_info.is_critical() ||
m_fragment_constants_ring_info.is_critical() ||
m_transform_constants_ring_info.is_critical() ||
m_index_buffer_ring_info.is_critical() ||
m_raster_env_ring_info.is_critical();
}
else
{
heap_critical = false;
u32 test = 1u << std::countr_zero(flags);
do
{
switch (flags & test)
{
case 0:
break;
case VK_HEAP_CHECK_TEXTURE_UPLOAD_STORAGE:
heap_critical = m_texture_upload_buffer_ring_info.is_critical();
break;
case VK_HEAP_CHECK_VERTEX_STORAGE:
heap_critical = m_attrib_ring_info.is_critical() || m_index_buffer_ring_info.is_critical();
break;
case VK_HEAP_CHECK_VERTEX_ENV_STORAGE:
heap_critical = m_vertex_env_ring_info.is_critical();
break;
case VK_HEAP_CHECK_FRAGMENT_ENV_STORAGE:
heap_critical = m_fragment_env_ring_info.is_critical() || m_raster_env_ring_info.is_critical();
break;
case VK_HEAP_CHECK_TEXTURE_ENV_STORAGE:
heap_critical = m_fragment_texture_params_ring_info.is_critical();
break;
case VK_HEAP_CHECK_VERTEX_LAYOUT_STORAGE:
heap_critical = m_vertex_layout_ring_info.is_critical();
break;
case VK_HEAP_CHECK_TRANSFORM_CONSTANTS_STORAGE:
heap_critical = m_transform_constants_ring_info.is_critical();
break;
case VK_HEAP_CHECK_FRAGMENT_CONSTANTS_STORAGE:
heap_critical = m_fragment_constants_ring_info.is_critical();
break;
default:
fmt::throw_exception("Unexpected heap flag set! (0x%X)", test);
}
flags &= ~test;
test <<= 1;
}
while (flags && !heap_critical);
}
if (heap_critical)
{
m_profiler.start();
vk::frame_context_t *target_frame = nullptr;
if (!m_queued_frames.empty())
{
if (m_current_frame != &m_aux_frame_context)
{
target_frame = m_queued_frames.front();
}
}
if (target_frame == nullptr)
{
flush_command_queue(true);
m_vertex_cache->purge();
m_index_buffer_ring_info.reset_allocation_stats();
m_fragment_env_ring_info.reset_allocation_stats();
m_vertex_env_ring_info.reset_allocation_stats();
m_fragment_texture_params_ring_info.reset_allocation_stats();
m_vertex_layout_ring_info.reset_allocation_stats();
m_fragment_constants_ring_info.reset_allocation_stats();
m_transform_constants_ring_info.reset_allocation_stats();
m_attrib_ring_info.reset_allocation_stats();
m_texture_upload_buffer_ring_info.reset_allocation_stats();
m_raster_env_ring_info.reset_allocation_stats();
m_current_frame->reset_heap_ptrs();
m_last_heap_sync_time = rsx::get_shared_tag();
}
else
{
// Flush the frame context
frame_context_cleanup(target_frame);
}
m_frame_stats.flip_time += m_profiler.duration();
}
}
void VKGSRender::check_present_status()
{
while (!m_queued_frames.empty())
{
auto ctx = m_queued_frames.front();
if (!ctx->swap_command_buffer->poke())
{
return;
}
frame_context_cleanup(ctx);
}
}
VkDescriptorSet VKGSRender::allocate_descriptor_set()
{
if (!m_shader_interpreter.is_interpreter(m_program)) [[likely]]
{
return m_descriptor_pool.allocate(m_descriptor_layouts, VK_TRUE);
}
else
{
return m_shader_interpreter.allocate_descriptor_set();
}
}
void VKGSRender::set_viewport()
{
const auto [clip_width, clip_height] = rsx::apply_resolution_scale<true>(
rsx::method_registers.surface_clip_width(), rsx::method_registers.surface_clip_height());
const auto zclip_near = rsx::method_registers.clip_min();
const auto zclip_far = rsx::method_registers.clip_max();
// NOTE: The scale_offset matrix already has the viewport matrix factored in
m_viewport.x = 0;
m_viewport.y = 0;
m_viewport.width = clip_width;
m_viewport.height = clip_height;
if (m_device->get_unrestricted_depth_range_support())
{
m_viewport.minDepth = zclip_near;
m_viewport.maxDepth = zclip_far;
}
else
{
m_viewport.minDepth = 0.f;
m_viewport.maxDepth = 1.f;
}
m_current_command_buffer->flags |= vk::command_buffer::cb_reload_dynamic_state;
m_graphics_state.clear(rsx::pipeline_state::zclip_config_state_dirty);
}
void VKGSRender::set_scissor(bool clip_viewport)
{
areau scissor;
if (get_scissor(scissor, clip_viewport))
{
m_scissor.extent.height = scissor.height();
m_scissor.extent.width = scissor.width();
m_scissor.offset.x = scissor.x1;
m_scissor.offset.y = scissor.y1;
m_current_command_buffer->flags |= vk::command_buffer::cb_reload_dynamic_state;
}
}
void VKGSRender::bind_viewport()
{
if (m_graphics_state & rsx::pipeline_state::zclip_config_state_dirty)
{
if (m_device->get_unrestricted_depth_range_support())
{
m_viewport.minDepth = rsx::method_registers.clip_min();
m_viewport.maxDepth = rsx::method_registers.clip_max();
}
m_graphics_state.clear(rsx::pipeline_state::zclip_config_state_dirty);
}
vkCmdSetViewport(*m_current_command_buffer, 0, 1, &m_viewport);
vkCmdSetScissor(*m_current_command_buffer, 0, 1, &m_scissor);
}
void VKGSRender::on_init_thread()
{
if (m_device == VK_NULL_HANDLE)
{
fmt::throw_exception("No Vulkan device was created");
}
GSRender::on_init_thread();
zcull_ctrl.reset(static_cast<::rsx::reports::ZCULL_control*>(this));
if (!m_overlay_manager)
{
m_frame->hide();
m_shaders_cache->load(nullptr, m_pipeline_layout);
m_frame->show();
}
else
{
rsx::shader_loading_dialog_native dlg(this);
// TODO: Handle window resize messages during loading on GPUs without OUT_OF_DATE_KHR support
m_shaders_cache->load(&dlg, m_pipeline_layout);
}
}
void VKGSRender::on_exit()
{
GSRender::on_exit();
zcull_ctrl.release();
}
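// Handles a surface clear request: translates the RSX clear mask into Vulkan clear attachments, applying per-format color masks and initializing any depth/stencil aspect not covered by the clear.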
void VKGSRender::clear_surface(u32 mask)
{
if (skip_current_frame || swapchain_unavailable) return;
// If stencil write mask is disabled, remove clear_stencil bit
if (!rsx::method_registers.stencil_mask()) mask &= ~RSX_GCM_CLEAR_STENCIL_BIT;
// Ignore invalid clear flags
if (!(mask & RSX_GCM_CLEAR_ANY_MASK)) return;
u8 ctx = rsx::framebuffer_creation_context::context_draw;
if (mask & RSX_GCM_CLEAR_COLOR_RGBA_MASK) ctx |= rsx::framebuffer_creation_context::context_clear_color;
if (mask & RSX_GCM_CLEAR_DEPTH_STENCIL_MASK) ctx |= rsx::framebuffer_creation_context::context_clear_depth;
init_buffers(rsx::framebuffer_creation_context{ctx});
if (!m_graphics_state.test(rsx::rtt_config_valid)) return;
//float depth_clear = 1.f;
u32 stencil_clear = 0;
u32 depth_stencil_mask = 0;
std::vector<VkClearAttachment> clear_descriptors;
VkClearValue depth_stencil_clear_values = {}, color_clear_values = {};
u16 scissor_x = static_cast<u16>(m_scissor.offset.x);
u16 scissor_w = static_cast<u16>(m_scissor.extent.width);
u16 scissor_y = static_cast<u16>(m_scissor.offset.y);
u16 scissor_h = static_cast<u16>(m_scissor.extent.height);
const u16 fb_width = m_draw_fbo->width();
const u16 fb_height = m_draw_fbo->height();
// Clip region
std::tie(scissor_x, scissor_y, scissor_w, scissor_h) = rsx::clip_region<u16>(fb_width, fb_height, scissor_x, scissor_y, scissor_w, scissor_h, true);
VkClearRect region = { { { scissor_x, scissor_y }, { scissor_w, scissor_h } }, 0, 1 };
const bool full_frame = (scissor_w == fb_width && scissor_h == fb_height);
bool update_color = false, update_z = false;
auto surface_depth_format = rsx::method_registers.surface_depth_fmt();
if (auto ds = std::get<1>(m_rtts.m_bound_depth_stencil); mask & RSX_GCM_CLEAR_DEPTH_STENCIL_MASK)
{
if (mask & RSX_GCM_CLEAR_DEPTH_BIT)
{
u32 max_depth_value = get_max_depth_value(surface_depth_format);
u32 clear_depth = rsx::method_registers.z_clear_value(is_depth_stencil_format(surface_depth_format));
float depth_clear = static_cast<float>(clear_depth) / max_depth_value;
depth_stencil_clear_values.depthStencil.depth = depth_clear;
depth_stencil_clear_values.depthStencil.stencil = stencil_clear;
depth_stencil_mask |= VK_IMAGE_ASPECT_DEPTH_BIT;
}
if (is_depth_stencil_format(surface_depth_format))
{
if (mask & RSX_GCM_CLEAR_STENCIL_BIT)
{
u8 clear_stencil = rsx::method_registers.stencil_clear_value();
depth_stencil_clear_values.depthStencil.stencil = clear_stencil;
depth_stencil_mask |= VK_IMAGE_ASPECT_STENCIL_BIT;
if (ds->samples() > 1)
{
if (full_frame) ds->stencil_init_flags &= 0xFF;
ds->stencil_init_flags |= clear_stencil;
}
}
}
if ((depth_stencil_mask && depth_stencil_mask != ds->aspect()) || !full_frame)
{
// At least one aspect is not being cleared or the clear does not cover the full frame
// Steps to initialize memory are required
if (ds->state_flags & rsx::surface_state_flags::erase_bkgnd && // Needs initialization
ds->old_contents.empty() && !g_cfg.video.read_depth_buffer) // No way to load data from memory, so no initialization given
{
// Only one aspect was cleared. Make sure to memory-initialize the other before removing the dirty flag
const auto ds_mask = (mask & RSX_GCM_CLEAR_DEPTH_STENCIL_MASK);
if (ds_mask == RSX_GCM_CLEAR_DEPTH_BIT && (ds->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT))
{
// Depth was cleared, initialize stencil
depth_stencil_clear_values.depthStencil.stencil = 0xFF;
depth_stencil_mask |= VK_IMAGE_ASPECT_STENCIL_BIT;
}
else if (ds_mask == RSX_GCM_CLEAR_STENCIL_BIT)
{
// Stencil was cleared, initialize depth
depth_stencil_clear_values.depthStencil.depth = 1.f;
depth_stencil_mask |= VK_IMAGE_ASPECT_DEPTH_BIT;
}
}
else
{
// Barrier required before any writes
ds->write_barrier(*m_current_command_buffer);
}
}
}
if (auto colormask = (mask & RSX_GCM_CLEAR_COLOR_RGBA_MASK))
{
if (!m_draw_buffers.empty())
{
bool use_fast_clear = (colormask == RSX_GCM_CLEAR_COLOR_RGBA_MASK);
u8 clear_a = rsx::method_registers.clear_color_a();
u8 clear_r = rsx::method_registers.clear_color_r();
u8 clear_g = rsx::method_registers.clear_color_g();
u8 clear_b = rsx::method_registers.clear_color_b();
switch (rsx::method_registers.surface_color())
{
case rsx::surface_color_format::x32:
case rsx::surface_color_format::w16z16y16x16:
case rsx::surface_color_format::w32z32y32x32:
{
//NOP
colormask = 0;
break;
}
case rsx::surface_color_format::b8:
{
rsx::get_b8_clear_color(clear_r, clear_g, clear_b, clear_a);
colormask = rsx::get_b8_clearmask(colormask);
use_fast_clear = (colormask & RSX_GCM_CLEAR_RED_BIT);
break;
}
case rsx::surface_color_format::g8b8:
{
rsx::get_g8b8_clear_color(clear_r, clear_g, clear_b, clear_a);
colormask = rsx::get_g8b8_r8g8_clearmask(colormask);
use_fast_clear = ((colormask & RSX_GCM_CLEAR_COLOR_RG_MASK) == RSX_GCM_CLEAR_COLOR_RG_MASK);
break;
}
case rsx::surface_color_format::r5g6b5:
{
rsx::get_rgb565_clear_color(clear_r, clear_g, clear_b, clear_a);
use_fast_clear = ((colormask & RSX_GCM_CLEAR_COLOR_RGB_MASK) == RSX_GCM_CLEAR_COLOR_RGB_MASK);
break;
}
case rsx::surface_color_format::x1r5g5b5_o1r5g5b5:
{
rsx::get_a1rgb555_clear_color(clear_r, clear_g, clear_b, clear_a, 255);
break;
}
case rsx::surface_color_format::x1r5g5b5_z1r5g5b5:
{
rsx::get_a1rgb555_clear_color(clear_r, clear_g, clear_b, clear_a, 0);
break;
}
case rsx::surface_color_format::a8b8g8r8:
case rsx::surface_color_format::x8b8g8r8_o8b8g8r8:
case rsx::surface_color_format::x8b8g8r8_z8b8g8r8:
{
rsx::get_abgr8_clear_color(clear_r, clear_g, clear_b, clear_a);
colormask = rsx::get_abgr8_clearmask(colormask);
break;
}
default:
{
break;
}
}
if (colormask)
{
if (!use_fast_clear || !full_frame)
{
// If we're not clobbering all of the memory, a barrier is required
for (const auto& index : m_rtts.m_bound_render_target_ids)
{
m_rtts.m_bound_render_targets[index].second->write_barrier(*m_current_command_buffer);
}
}
color_clear_values.color.float32[0] = static_cast<float>(clear_r) / 255;
color_clear_values.color.float32[1] = static_cast<float>(clear_g) / 255;
color_clear_values.color.float32[2] = static_cast<float>(clear_b) / 255;
color_clear_values.color.float32[3] = static_cast<float>(clear_a) / 255;
if (use_fast_clear)
{
for (u32 index = 0; index < m_draw_buffers.size(); ++index)
{
clear_descriptors.push_back({ VK_IMAGE_ASPECT_COLOR_BIT, index, color_clear_values });
}
}
else
{
color4f clear_color =
{
color_clear_values.color.float32[0],
color_clear_values.color.float32[1],
color_clear_values.color.float32[2],
color_clear_values.color.float32[3]
};
auto attachment_clear_pass = vk::get_overlay_pass<vk::attachment_clear_pass>();
attachment_clear_pass->run(*m_current_command_buffer, m_draw_fbo, region.rect, colormask, clear_color, get_render_pass());
}
update_color = true;
}
}
}
if (depth_stencil_mask)
{
if ((depth_stencil_mask & VK_IMAGE_ASPECT_STENCIL_BIT) &&
rsx::method_registers.stencil_mask() != 0xff)
{
// Partial stencil clear. Disables fast stencil clear
auto ds = std::get<1>(m_rtts.m_bound_depth_stencil);
auto key = vk::get_renderpass_key({ ds });
auto renderpass = vk::get_renderpass(*m_device, key);
vk::get_overlay_pass<vk::stencil_clear_pass>()->run(
*m_current_command_buffer, ds, region.rect,
depth_stencil_clear_values.depthStencil.stencil,
rsx::method_registers.stencil_mask(), renderpass);
depth_stencil_mask &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
}
if (depth_stencil_mask)
{
clear_descriptors.push_back({ static_cast<VkImageAspectFlags>(depth_stencil_mask), 0, depth_stencil_clear_values });
}
update_z = true;
}
if (update_color || update_z)
{
m_rtts.on_write({ update_color, update_color, update_color, update_color }, update_z);
}
if (!clear_descriptors.empty())
{
begin_render_pass();
vkCmdClearAttachments(*m_current_command_buffer, ::size32(clear_descriptors), clear_descriptors.data(), 1, &region);
}
}
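// Submits the current command buffer and switches to the next one in the list; a hard sync additionally waits for the submitted work to complete and drains the present queue.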
void VKGSRender::flush_command_queue(bool hard_sync, bool do_not_switch)
{
close_and_submit_command_buffer();
if (hard_sync)
{
// wait for the latest instruction to execute
m_current_command_buffer->reset();
// Clear all command buffer statuses
m_primary_cb_list.poke_all();
// Drain present queue
while (!m_queued_frames.empty())
{
check_present_status();
}
m_flush_requests.clear_pending_flag();
}
if (!do_not_switch)
{
// Grab next cb in line and make it usable
// NOTE: Even in the case of a hard sync, this is required to free any waiters on the CB (ZCULL)
m_current_command_buffer = m_primary_cb_list.next();
m_current_command_buffer->reset();
}
else
{
// Special hard-sync where we must preserve the CB. This can happen when an emergency event handler is invoked and needs to flush to hw.
ensure(hard_sync);
}
// Just in case a queued frame holds a ref to this cb, drain the present queue
check_present_status();
if (m_occlusion_query_active)
{
m_current_command_buffer->flags |= vk::command_buffer::cb_load_occluson_task;
}
m_current_command_buffer->begin();
}
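// Releases a GCM label directly from the host GPU when host GPU label support is enabled.
// Returns false if the write could not be serviced here, leaving the caller to use the regular fallback.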
bool VKGSRender::release_GCM_label(u32 address, u32 args)
{
if (!backend_config.supports_host_gpu_labels)
{
return false;
}
auto drain_label_queue = [this]()
{
while (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
{
utils::pause();
if (thread_ctrl::state() == thread_state::aborting)
{
break;
}
}
};
ensure(m_host_data_ptr);
if (m_host_data_ptr->texture_load_complete_event == m_host_data_ptr->texture_load_request_event)
{
// All texture loads already seen by the host GPU
// Wait for all previously submitted labels to be flushed
drain_label_queue();
return false;
}
const auto mapping = vk::map_dma(address, 4);
const auto write_data = std::bit_cast<u32, be_t<u32>>(args);
if (!dynamic_cast<vk::memory_block_host*>(mapping.second->memory.get()))
{
// NVIDIA GPUs can disappoint when DMA blocks straddle VirtualAlloc boundaries.
// Take the L and try the fallback.
rsx_log.warning("Host label update at 0x%x was not possible.", address);
drain_label_queue();
return false;
}
m_host_data_ptr->last_label_release_event = m_host_data_ptr->inc_counter();
if (m_host_data_ptr->texture_load_request_event > m_host_data_ptr->last_label_submit_event)
{
if (vk::is_renderpass_open(*m_current_command_buffer))
{
vk::end_renderpass(*m_current_command_buffer);
}
vkCmdUpdateBuffer(*m_current_command_buffer, mapping.second->value, mapping.first, 4, &write_data);
flush_command_queue();
}
else
{
auto cmd = m_secondary_cb_list.next();
cmd->begin();
vkCmdUpdateBuffer(*cmd, mapping.second->value, mapping.first, 4, &write_data);
vkCmdUpdateBuffer(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
cmd->end();
vk::queue_submit_t submit_info = { m_device->get_graphics_queue(), nullptr };
cmd->submit(submit_info);
m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
}
return true;
}
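// Handles FIFO interrupt hints by scheduling or forcing a command buffer flush so that
// pending occlusion query results become visible to CELL in time.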
void VKGSRender::sync_hint(rsx::FIFO::interrupt_hint hint, rsx::reports::sync_hint_payload_t payload)
{
rsx::thread::sync_hint(hint, payload);
if (!(m_current_command_buffer->flags & vk::command_buffer::cb_has_occlusion_task))
{
// Occlusion queries not enabled, do nothing
return;
}
// Occlusion test result evaluation is coming up, avoid a hard sync
switch (hint)
{
case rsx::FIFO::interrupt_hint::conditional_render_eval:
{
// If a flush request is already enqueued, do nothing
if (m_flush_requests.pending())
{
return;
}
// If the result is not going to be read by CELL, do nothing
const auto ref_addr = static_cast<u32>(payload.address);
if (!zcull_ctrl->is_query_result_urgent(ref_addr))
{
// No effect on CELL behaviour; it will be faster to handle this in RSX code
return;
}
// OK, CELL will probably be accessing the results.
// Try to avoid flush spam; it is more costly to flush the CB than it is to just upload the vertex data
// This is supposed to be an optimization after all.
const auto now = rsx::uclock();
if ((now - m_last_cond_render_eval_hint) > 50)
{
// Schedule a sync on the next loop iteration
m_flush_requests.post(false);
m_flush_requests.remove_one();
}
m_last_cond_render_eval_hint = now;
break;
}
case rsx::FIFO::interrupt_hint::zcull_sync:
{
// Check if the required report is synced to this CB
auto& data = m_occlusion_map[payload.query->driver_handle];
// NOTE: Currently, a special condition exists where the indices can be empty even with an active draw count.
// This is caused by the async compiler and should be removed when ubershaders are added in
if (!data.is_current(m_current_command_buffer) || data.indices.empty())
{
return;
}
// Unavoidable hard sync coming up, flush immediately
// This heavyweight hint should be used with caution
std::lock_guard lock(m_flush_queue_mutex);
flush_command_queue();
if (m_flush_requests.pending())
{
// Clear without wait
m_flush_requests.clear_pending_flag();
}
break;
}
}
}
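// Periodic housekeeping run on the RSX thread: clears offloader deadlocks, services pending
// flush requests, refreshes framebuffer reads and handles native UI flip requests.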
void VKGSRender::do_local_task(rsx::FIFO::state state)
{
if (m_queue_status & flush_queue_state::deadlock)
{
// Clear offloader deadlock
// NOTE: It is not possible to handle regular flush requests before this is cleared
// NOTE: This may cause graphics corruption due to unsynchronized modification
on_invalidate_memory_range(m_offloader_fault_range, m_offloader_fault_cause);
m_queue_status.clear(flush_queue_state::deadlock);
}
if (m_queue_status & flush_queue_state::flushing)
{
// Abort recursive CB submit requests.
// When flushing flag is already set, only deadlock events may be processed.
return;
}
else if (m_flush_requests.pending())
{
if (m_flush_queue_mutex.try_lock())
{
// TODO: Determine if a hard sync is necessary
// Pipeline barriers later may do a better job synchronizing than wholly stalling the pipeline
flush_command_queue();
m_flush_requests.clear_pending_flag();
m_flush_requests.consumer_wait();
m_flush_queue_mutex.unlock();
}
}
else if (!in_begin_end && state != rsx::FIFO::state::lock_wait)
{
if (m_graphics_state & rsx::pipeline_state::framebuffer_reads_dirty)
{
// This will re-engage locks and break the texture cache if another thread is waiting in the access violation handler!
// Only call when there are no waiters
m_texture_cache.do_update();
m_graphics_state.clear(rsx::pipeline_state::framebuffer_reads_dirty);
}
}
rsx::thread::do_local_task(state);
switch (state)
{
case rsx::FIFO::state::lock_wait:
// Critical check finished
return;
//case rsx::FIFO::state::spinning:
//case rsx::FIFO::state::empty:
// We have some time, check the present queue
//check_present_status();
//break;
default:
break;
}
if (m_overlay_manager)
{
if (!in_begin_end && async_flip_requested & flip_request::native_ui && !is_stopped())
{
flush_command_queue(true);
rsx::display_flip_info_t info{};
info.buffer = current_display_buffer;
flip(info);
}
}
}
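// Resolves the graphics pipeline for the current RSX state, either from the program cache
// or the shader interpreter, and keeps the cached pipeline properties up to date.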
bool VKGSRender::load_program()
{
const auto shadermode = g_cfg.video.shadermode.get();
// TODO: EXT_dynamic_state should get rid of this silliness soon (kd)
const auto vertex_state = vk::decode_vertex_input_assembly_state();
if (m_graphics_state & rsx::pipeline_state::invalidate_pipeline_bits)
{
get_current_fragment_program(fs_sampler_state);
ensure(current_fragment_program.valid);
get_current_vertex_program(vs_sampler_state);
m_graphics_state.clear(rsx::pipeline_state::invalidate_pipeline_bits);
}
else if (!(m_graphics_state & rsx::pipeline_state::pipeline_config_dirty) &&
m_program &&
m_pipeline_properties.state.ia.topology == vertex_state.primitive &&
m_pipeline_properties.state.ia.primitiveRestartEnable == vertex_state.restart_index_enabled)
{
if (!m_shader_interpreter.is_interpreter(m_program)) [[ likely ]]
{
return true;
}
if (shadermode == shader_mode::interpreter_only)
{
m_program = m_shader_interpreter.get(m_pipeline_properties, current_fp_metadata);
return true;
}
}
auto &vertex_program = current_vertex_program;
auto &fragment_program = current_fragment_program;
if (m_graphics_state & rsx::pipeline_state::pipeline_config_dirty)
{
vk::pipeline_props properties = vk::decode_rsx_state(
vertex_state,
m_rtts.m_bound_depth_stencil.second,
backend_config,
static_cast<u8>(m_draw_buffers.size()),
u8((m_current_renderpass_key >> 16) & 0xF),
m_device->get_depth_bounds_support()
);
properties.renderpass_key = m_current_renderpass_key;
if (m_program &&
!m_shader_interpreter.is_interpreter(m_program) &&
m_pipeline_properties == properties)
{
// Nothing changed
return true;
}
// Fallthrough
m_pipeline_properties = properties;
m_graphics_state.clear(rsx::pipeline_state::pipeline_config_dirty);
}
else
{
// Update primitive type and restart index. Note that this is not needed with EXT_dynamic_state
m_pipeline_properties.state.set_primitive_type(vertex_state.primitive);
m_pipeline_properties.state.enable_primitive_restart(vertex_state.restart_index_enabled);
m_pipeline_properties.renderpass_key = m_current_renderpass_key;
}
m_vertex_prog = nullptr;
m_fragment_prog = nullptr;
if (shadermode != shader_mode::interpreter_only) [[likely]]
{
vk::enter_uninterruptible();
// Load current program from cache
std::tie(m_program, m_vertex_prog, m_fragment_prog) = m_prog_buffer->get_graphics_pipeline(vertex_program, fragment_program, m_pipeline_properties,
shadermode != shader_mode::recompiler, true, m_pipeline_layout);
vk::leave_uninterruptible();
if (m_prog_buffer->check_cache_missed())
{
// Notify the user with a HUD notification
if (g_cfg.misc.show_shader_compilation_hint)
{
if (m_overlay_manager)
{
rsx::overlays::show_shader_compile_notification();
}
}
}
}
else
{
m_program = nullptr;
}
if (!m_program && (shadermode == shader_mode::async_with_interpreter || shadermode == shader_mode::interpreter_only))
{
if (!m_shader_interpreter.is_interpreter(m_prev_program))
{
m_interpreter_state = rsx::invalidate_pipeline_bits;
}
m_program = m_shader_interpreter.get(m_pipeline_properties, current_fp_metadata);
}
return m_program != nullptr;
}
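// Uploads the uniform data required by the active program (vertex/fragment state, transform and
// fragment constants, texture parameters, raster state) and binds it to the current descriptor set.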
void VKGSRender::load_program_env()
{
if (!m_program)
{
fmt::throw_exception("Unreachable right now");
}
const u32 fragment_constants_size = current_fp_metadata.program_constants_buffer_length;
const bool update_transform_constants = !!(m_graphics_state & rsx::pipeline_state::transform_constants_dirty);
const bool update_fragment_constants = !!(m_graphics_state & rsx::pipeline_state::fragment_constants_dirty);
const bool update_vertex_env = !!(m_graphics_state & rsx::pipeline_state::vertex_state_dirty);
const bool update_fragment_env = !!(m_graphics_state & rsx::pipeline_state::fragment_state_dirty);
const bool update_fragment_texture_env = !!(m_graphics_state & rsx::pipeline_state::fragment_texture_state_dirty);
const bool update_instruction_buffers = (!!m_interpreter_state && m_shader_interpreter.is_interpreter(m_program));
const bool update_raster_env = (rsx::method_registers.polygon_stipple_enabled() && !!(m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty));
if (update_vertex_env)
{
check_heap_status(VK_HEAP_CHECK_VERTEX_ENV_STORAGE);
// Vertex state
const auto mem = m_vertex_env_ring_info.alloc<256>(256);
auto buf = static_cast<u8*>(m_vertex_env_ring_info.map(mem, 148));
fill_scale_offset_data(buf, false);
fill_user_clip_data(buf + 64);
*(reinterpret_cast<u32*>(buf + 128)) = rsx::method_registers.transform_branch_bits();
*(reinterpret_cast<f32*>(buf + 132)) = rsx::method_registers.point_size() * rsx::get_resolution_scale();
*(reinterpret_cast<f32*>(buf + 136)) = rsx::method_registers.clip_min();
*(reinterpret_cast<f32*>(buf + 140)) = rsx::method_registers.clip_max();
m_vertex_env_ring_info.unmap();
m_vertex_env_buffer_info = { m_vertex_env_ring_info.heap->value, mem, 144 };
}
if (update_transform_constants)
{
// Transform constants
const usz transform_constants_size = (!m_vertex_prog || m_vertex_prog->has_indexed_constants) ? 8192 : m_vertex_prog->constant_ids.size() * 16;
if (transform_constants_size)
{
check_heap_status(VK_HEAP_CHECK_TRANSFORM_CONSTANTS_STORAGE);
const auto alignment = m_device->gpu().get_limits().minUniformBufferOffsetAlignment;
auto mem = m_transform_constants_ring_info.alloc<1>(utils::align(transform_constants_size, alignment));
auto buf = m_transform_constants_ring_info.map(mem, transform_constants_size);
const auto constant_ids = (transform_constants_size == 8192)
? std::span<const u16>{}
: std::span<const u16>(m_vertex_prog->constant_ids);
fill_vertex_program_constants_data(buf, constant_ids);
m_transform_constants_ring_info.unmap();
m_vertex_constants_buffer_info = { m_transform_constants_ring_info.heap->value, mem, transform_constants_size };
}
}
if (update_fragment_constants && !update_instruction_buffers)
{
check_heap_status(VK_HEAP_CHECK_FRAGMENT_CONSTANTS_STORAGE);
// Fragment constants
if (fragment_constants_size)
{
auto mem = m_fragment_constants_ring_info.alloc<256>(fragment_constants_size);
auto buf = m_fragment_constants_ring_info.map(mem, fragment_constants_size);
m_prog_buffer->fill_fragment_constants_buffer({ reinterpret_cast<float*>(buf), fragment_constants_size },
*ensure(m_fragment_prog), current_fragment_program, true);
m_fragment_constants_ring_info.unmap();
m_fragment_constants_buffer_info = { m_fragment_constants_ring_info.heap->value, mem, fragment_constants_size };
}
else
{
m_fragment_constants_buffer_info = { m_fragment_constants_ring_info.heap->value, 0, 32 };
}
}
if (update_fragment_env)
{
check_heap_status(VK_HEAP_CHECK_FRAGMENT_ENV_STORAGE);
auto mem = m_fragment_env_ring_info.alloc<256>(256);
auto buf = m_fragment_env_ring_info.map(mem, 32);
fill_fragment_state_buffer(buf, current_fragment_program);
m_fragment_env_ring_info.unmap();
m_fragment_env_buffer_info = { m_fragment_env_ring_info.heap->value, mem, 32 };
}
if (update_fragment_texture_env)
{
check_heap_status(VK_HEAP_CHECK_TEXTURE_ENV_STORAGE);
auto mem = m_fragment_texture_params_ring_info.alloc<256>(768);
auto buf = m_fragment_texture_params_ring_info.map(mem, 768);
current_fragment_program.texture_params.write_to(buf, current_fp_metadata.referenced_textures_mask);
m_fragment_texture_params_ring_info.unmap();
m_fragment_texture_params_buffer_info = { m_fragment_texture_params_ring_info.heap->value, mem, 768 };
}
if (update_raster_env)
{
check_heap_status(VK_HEAP_CHECK_FRAGMENT_ENV_STORAGE);
auto mem = m_raster_env_ring_info.alloc<256>(256);
auto buf = m_raster_env_ring_info.map(mem, 128);
std::memcpy(buf, rsx::method_registers.polygon_stipple_pattern(), 128);
m_raster_env_ring_info.unmap();
m_raster_env_buffer_info = { m_raster_env_ring_info.heap->value, mem, 128 };
m_graphics_state.clear(rsx::pipeline_state::polygon_stipple_pattern_dirty);
}
if (update_instruction_buffers)
{
if (m_interpreter_state & rsx::vertex_program_dirty)
{
// Attach vertex buffer data
const auto vp_block_length = current_vp_metadata.ucode_length + 16;
auto vp_mapping = m_vertex_instructions_buffer.alloc<256>(vp_block_length);
auto vp_buf = static_cast<u8*>(m_vertex_instructions_buffer.map(vp_mapping, vp_block_length));
auto vp_config = reinterpret_cast<u32*>(vp_buf);
vp_config[0] = current_vertex_program.base_address;
vp_config[1] = current_vertex_program.entry;
vp_config[2] = current_vertex_program.output_mask;
vp_config[3] = rsx::method_registers.two_side_light_en() ? 1u : 0u;
std::memcpy(vp_buf + 16, current_vertex_program.data.data(), current_vp_metadata.ucode_length);
m_vertex_instructions_buffer.unmap();
m_vertex_instructions_buffer_info = { m_vertex_instructions_buffer.heap->value, vp_mapping, vp_block_length };
}
if (m_interpreter_state & rsx::fragment_program_dirty)
{
// Attach fragment buffer data
const auto fp_block_length = current_fp_metadata.program_ucode_length + 16;
auto fp_mapping = m_fragment_instructions_buffer.alloc<256>(fp_block_length);
auto fp_buf = static_cast<u8*>(m_fragment_instructions_buffer.map(fp_mapping, fp_block_length));
// Control mask
const auto control_masks = reinterpret_cast<u32*>(fp_buf);
control_masks[0] = rsx::method_registers.shader_control();
control_masks[1] = current_fragment_program.texture_state.texture_dimensions;
std::memcpy(fp_buf + 16, current_fragment_program.get_data(), current_fragment_program.ucode_length);
m_fragment_instructions_buffer.unmap();
m_fragment_instructions_buffer_info = { m_fragment_instructions_buffer.heap->value, fp_mapping, fp_block_length };
}
}
const auto& binding_table = m_device->get_pipeline_binding_table();
m_program->bind_uniform(m_vertex_env_buffer_info, binding_table.vertex_params_bind_slot, m_current_frame->descriptor_set);
m_program->bind_uniform(m_vertex_constants_buffer_info, binding_table.vertex_constant_buffers_bind_slot, m_current_frame->descriptor_set);
m_program->bind_uniform(m_fragment_env_buffer_info, binding_table.fragment_state_bind_slot, m_current_frame->descriptor_set);
m_program->bind_uniform(m_fragment_texture_params_buffer_info, binding_table.fragment_texture_params_bind_slot, m_current_frame->descriptor_set);
m_program->bind_uniform(m_raster_env_buffer_info, binding_table.rasterizer_env_bind_slot, m_current_frame->descriptor_set);
if (!m_shader_interpreter.is_interpreter(m_program))
{
m_program->bind_uniform(m_fragment_constants_buffer_info, binding_table.fragment_constant_buffers_bind_slot, m_current_frame->descriptor_set);
}
else
{
m_program->bind_buffer(m_vertex_instructions_buffer_info, m_shader_interpreter.get_vertex_instruction_location(), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set);
m_program->bind_buffer(m_fragment_instructions_buffer_info, m_shader_interpreter.get_fragment_instruction_location(), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set);
}
if (vk::emulate_conditional_rendering())
{
auto predicate = m_cond_render_buffer ? m_cond_render_buffer->value : vk::get_scratch_buffer(*m_current_command_buffer, 4)->value;
m_program->bind_buffer({ predicate, 0, 4 }, binding_table.conditional_render_predicate_slot, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set);
}
// Clear flags
m_graphics_state.clear(
rsx::pipeline_state::fragment_state_dirty |
rsx::pipeline_state::vertex_state_dirty |
rsx::pipeline_state::transform_constants_dirty |
rsx::pipeline_state::fragment_constants_dirty |
rsx::pipeline_state::fragment_texture_state_dirty);
}
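// Pushes per-draw vertex state (index base/offset, layout offsets, conditional render flag)
// via push constants and writes the vertex layout block into the layout ring buffer.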
void VKGSRender::update_vertex_env(u32 id, const vk::vertex_upload_info& vertex_info)
{
// Actual allocation must have been done previously
u32 base_offset;
const u32 offset32 = static_cast<u32>(m_vertex_layout_stream_info.offset);
const u32 range32 = static_cast<u32>(m_vertex_layout_stream_info.range);
if (!m_vertex_layout_storage || !m_vertex_layout_storage->in_range(offset32, range32, base_offset))
{
ensure(m_texbuffer_view_size >= m_vertex_layout_stream_info.range);
if (m_vertex_layout_storage)
m_current_frame->buffer_views_to_clean.push_back(std::move(m_vertex_layout_storage));
const usz alloc_addr = m_vertex_layout_stream_info.offset;
const usz view_size = (alloc_addr + m_texbuffer_view_size) > m_vertex_layout_ring_info.size() ? m_vertex_layout_ring_info.size() - alloc_addr : m_texbuffer_view_size;
m_vertex_layout_storage = std::make_unique<vk::buffer_view>(*m_device, m_vertex_layout_ring_info.heap->value, VK_FORMAT_R32G32_UINT, alloc_addr, view_size);
base_offset = 0;
}
u8 data_size = 16;
u32 draw_info[5];
draw_info[0] = vertex_info.vertex_index_base;
draw_info[1] = vertex_info.vertex_index_offset;
draw_info[2] = id;
draw_info[3] = (id * 16) + (base_offset / 8);
if (vk::emulate_conditional_rendering())
{
draw_info[4] = cond_render_ctrl.hw_cond_active ? 1 : 0;
data_size = 20;
}
vkCmdPushConstants(*m_current_command_buffer, m_pipeline_layout, VK_SHADER_STAGE_VERTEX_BIT, 0, data_size, draw_info);
const usz data_offset = (id * 128) + m_vertex_layout_stream_info.offset;
auto dst = m_vertex_layout_ring_info.map(data_offset, 128);
fill_vertex_layout_state(m_vertex_layout, vertex_info.first_vertex, vertex_info.allocated_vertex_count, static_cast<s32*>(dst),
vertex_info.persistent_window_offset, vertex_info.volatile_window_offset);
m_vertex_layout_ring_info.unmap();
}
void VKGSRender::init_buffers(rsx::framebuffer_creation_context context, bool)
{
prepare_rtts(context);
}
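// Finalizes the current command buffer: synchronizes dirty heaps, closes any open render pass
// or occlusion query, updates host labels and submits the work to the graphics queue.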
void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore wait_semaphore, VkSemaphore signal_semaphore, VkPipelineStageFlags pipeline_stage_flags)
{
ensure(!m_queue_status.test_and_set(flush_queue_state::flushing));
// Workaround for deadlock occurring during RSX offloader fault
// TODO: Restructure command submission infrastructure to avoid this condition
const bool sync_success = g_fxo->get<rsx::dma_manager>().sync();
const VkBool32 force_flush = !sync_success;
if (vk::test_status_interrupt(vk::heap_dirty))
{
if (m_attrib_ring_info.is_dirty() ||
m_fragment_env_ring_info.is_dirty() ||
m_vertex_env_ring_info.is_dirty() ||
m_fragment_texture_params_ring_info.is_dirty() ||
m_vertex_layout_ring_info.is_dirty() ||
m_fragment_constants_ring_info.is_dirty() ||
m_index_buffer_ring_info.is_dirty() ||
m_transform_constants_ring_info.is_dirty() ||
m_texture_upload_buffer_ring_info.is_dirty() ||
m_raster_env_ring_info.is_dirty())
{
auto secondary_command_buffer = m_secondary_cb_list.next();
secondary_command_buffer->begin();
m_attrib_ring_info.sync(*secondary_command_buffer);
m_fragment_env_ring_info.sync(*secondary_command_buffer);
m_vertex_env_ring_info.sync(*secondary_command_buffer);
m_fragment_texture_params_ring_info.sync(*secondary_command_buffer);
m_vertex_layout_ring_info.sync(*secondary_command_buffer);
m_fragment_constants_ring_info.sync(*secondary_command_buffer);
m_index_buffer_ring_info.sync(*secondary_command_buffer);
m_transform_constants_ring_info.sync(*secondary_command_buffer);
m_texture_upload_buffer_ring_info.sync(*secondary_command_buffer);
m_raster_env_ring_info.sync(*secondary_command_buffer);
secondary_command_buffer->end();
vk::queue_submit_t submit_info{ m_device->get_graphics_queue(), nullptr };
secondary_command_buffer->submit(submit_info, force_flush);
}
vk::clear_status_interrupt(vk::heap_dirty);
}
#if 0 // Currently unreachable
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_conditional_render)
{
ensure(m_render_pass_open);
m_device->_vkCmdEndConditionalRenderingEXT(*m_current_command_buffer);
}
#endif
// End any active renderpasses; the caller should handle reopening
if (vk::is_renderpass_open(*m_current_command_buffer))
{
close_render_pass();
}
// End open queries. Flags will be automatically reset by the submit routine
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_open_query)
{
auto open_query = m_occlusion_map[m_active_query_info->driver_handle].indices.back();
m_occlusion_query_manager->end_query(*m_current_command_buffer, open_query);
m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
}
if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->last_label_submit_event)
{
vkCmdUpdateBuffer(*m_current_command_buffer,
m_host_object_data->value,
::offset32(&vk::host_data_t::commands_complete_event),
sizeof(u64),
const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
}
m_current_command_buffer->end();
m_current_command_buffer->tag();
// Supporting concurrent access vastly simplifies this logic.
// Instead of doing CB slice injection, we can just chain these together logically with the async stream going first
vk::queue_submit_t primary_submit_info{ m_device->get_graphics_queue(), pFence };
vk::queue_submit_t secondary_submit_info{};
if (wait_semaphore)
{
primary_submit_info.wait_on(wait_semaphore, pipeline_stage_flags);
}
auto& async_scheduler = g_fxo->get<vk::AsyncTaskScheduler>();
if (async_scheduler.is_recording())
{
if (async_scheduler.is_host_mode())
{
const VkSemaphore async_sema = *async_scheduler.get_sema();
secondary_submit_info.queue_signal(async_sema);
primary_submit_info.wait_on(async_sema, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
// Delay object destruction by one cycle
vk::get_resource_manager()->push_down_current_scope();
}
async_scheduler.flush(secondary_submit_info, force_flush);
}
if (signal_semaphore)
{
primary_submit_info.queue_signal(signal_semaphore);
}
m_current_command_buffer->submit(primary_submit_info, force_flush);
m_queue_status.clear(flush_queue_state::flushing);
}
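// Rebuilds the render target state when the framebuffer configuration changes: binds the new
// surfaces, updates texture cache locking and recreates the cached render pass and framebuffer.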
void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
{
const bool clipped_scissor = (context == rsx::framebuffer_creation_context::context_draw);
if (m_current_framebuffer_context == context && !m_graphics_state.test(rsx::rtt_config_dirty) && m_draw_fbo)
{
// Fast path
// Framebuffer usage has not changed, framebuffer exists and config regs have not changed
set_scissor(clipped_scissor);
return;
}
m_graphics_state.clear(
rsx::rtt_config_dirty |
rsx::rtt_config_contested |
rsx::rtt_config_valid |
rsx::rtt_cache_state_dirty);
get_framebuffer_layout(context, m_framebuffer_layout);
if (!m_graphics_state.test(rsx::rtt_config_valid))
{
return;
}
if (m_draw_fbo && m_framebuffer_layout.ignore_change)
{
// Nothing has changed, we're still using the same framebuffer
// Update flags to match current
set_scissor(clipped_scissor);
return;
}
m_rtts.prepare_render_target(*m_current_command_buffer,
m_framebuffer_layout.color_format, m_framebuffer_layout.depth_format,
m_framebuffer_layout.width, m_framebuffer_layout.height,
m_framebuffer_layout.target, m_framebuffer_layout.aa_mode, m_framebuffer_layout.raster_type,
m_framebuffer_layout.color_addresses, m_framebuffer_layout.zeta_address,
m_framebuffer_layout.actual_color_pitch, m_framebuffer_layout.actual_zeta_pitch,
(*m_device), *m_current_command_buffer);
// Reset framebuffer information
const auto color_bpp = get_format_block_size_in_bytes(m_framebuffer_layout.color_format);
const auto samples = get_format_sample_count(m_framebuffer_layout.aa_mode);
for (u8 i = 0; i < rsx::limits::color_buffers_count; ++i)
{
// Flush old address if we keep missing it
if (m_surface_info[i].pitch && g_cfg.video.write_color_buffers)
{
const utils::address_range rsx_range = m_surface_info[i].get_memory_range();
m_texture_cache.set_memory_read_flags(rsx_range, rsx::memory_read_flags::flush_once);
m_texture_cache.flush_if_cache_miss_likely(*m_current_command_buffer, rsx_range);
}
m_surface_info[i].address = m_surface_info[i].pitch = 0;
m_surface_info[i].width = m_framebuffer_layout.width;
m_surface_info[i].height = m_framebuffer_layout.height;
m_surface_info[i].color_format = m_framebuffer_layout.color_format;
m_surface_info[i].bpp = color_bpp;
m_surface_info[i].samples = samples;
}
// Process depth surface as well
{
if (m_depth_surface_info.pitch && g_cfg.video.write_depth_buffer)
{
const utils::address_range surface_range = m_depth_surface_info.get_memory_range();
m_texture_cache.set_memory_read_flags(surface_range, rsx::memory_read_flags::flush_once);
m_texture_cache.flush_if_cache_miss_likely(*m_current_command_buffer, surface_range);
}
m_depth_surface_info.address = m_depth_surface_info.pitch = 0;
m_depth_surface_info.width = m_framebuffer_layout.width;
m_depth_surface_info.height = m_framebuffer_layout.height;
m_depth_surface_info.depth_format = m_framebuffer_layout.depth_format;
m_depth_surface_info.bpp = get_format_block_size_in_bytes(m_framebuffer_layout.depth_format);
m_depth_surface_info.samples = samples;
}
// Bind created rtts as current fbo...
const auto draw_buffers = rsx::utility::get_rtt_indexes(m_framebuffer_layout.target);
m_draw_buffers.clear();
m_fbo_images.clear();
for (u8 index : draw_buffers)
{
if (auto surface = std::get<1>(m_rtts.m_bound_render_targets[index]))
{
m_fbo_images.push_back(surface);
m_surface_info[index].address = m_framebuffer_layout.color_addresses[index];
m_surface_info[index].pitch = m_framebuffer_layout.actual_color_pitch[index];
ensure(surface->rsx_pitch == m_framebuffer_layout.actual_color_pitch[index]);
m_texture_cache.notify_surface_changed(m_surface_info[index].get_memory_range(m_framebuffer_layout.aa_factors));
m_draw_buffers.push_back(index);
}
}
if (std::get<0>(m_rtts.m_bound_depth_stencil) != 0)
{
auto ds = std::get<1>(m_rtts.m_bound_depth_stencil);
m_fbo_images.push_back(ds);
m_depth_surface_info.address = m_framebuffer_layout.zeta_address;
m_depth_surface_info.pitch = m_framebuffer_layout.actual_zeta_pitch;
ensure(ds->rsx_pitch == m_framebuffer_layout.actual_zeta_pitch);
m_texture_cache.notify_surface_changed(m_depth_surface_info.get_memory_range(m_framebuffer_layout.aa_factors));
}
// Before messing with memory properties, flush command queue if there are dma transfers queued up
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_dma_transfer)
{
flush_command_queue();
}
if (!m_rtts.superseded_surfaces.empty())
{
for (auto& surface : m_rtts.superseded_surfaces)
{
m_texture_cache.discard_framebuffer_memory_region(*m_current_command_buffer, surface->get_memory_range());
}
m_rtts.superseded_surfaces.clear();
}
if (!m_rtts.orphaned_surfaces.empty())
{
u32 gcm_format;
bool swap_bytes;
for (auto& [base_addr, surface] : m_rtts.orphaned_surfaces)
{
bool lock = surface->is_depth_surface() ? !!g_cfg.video.write_depth_buffer :
!!g_cfg.video.write_color_buffers;
if (lock &&
#ifdef TEXTURE_CACHE_DEBUG
!m_texture_cache.is_protected(
base_addr,
surface->get_memory_range(),
rsx::texture_upload_context::framebuffer_storage)
#else
!surface->is_locked()
#endif
)
{
lock = false;
}
if (!lock) [[likely]]
{
m_texture_cache.commit_framebuffer_memory_region(*m_current_command_buffer, surface->get_memory_range());
continue;
}
if (surface->is_depth_surface())
{
gcm_format = (surface->get_surface_depth_format() != rsx::surface_depth_format::z16) ? CELL_GCM_TEXTURE_DEPTH16 : CELL_GCM_TEXTURE_DEPTH24_D8;
swap_bytes = true;
}
else
{
auto info = get_compatible_gcm_format(surface->get_surface_color_format());
gcm_format = info.first;
swap_bytes = info.second;
}
m_texture_cache.lock_memory_region(
*m_current_command_buffer, surface, surface->get_memory_range(), false,
surface->get_surface_width<rsx::surface_metrics::pixels>(), surface->get_surface_height<rsx::surface_metrics::pixels>(), surface->get_rsx_pitch(),
gcm_format, swap_bytes);
}
m_rtts.orphaned_surfaces.clear();
}
const auto color_fmt_info = get_compatible_gcm_format(m_framebuffer_layout.color_format);
for (u8 index : m_draw_buffers)
{
if (!m_surface_info[index].address || !m_surface_info[index].pitch) continue;
const utils::address_range surface_range = m_surface_info[index].get_memory_range();
if (g_cfg.video.write_color_buffers)
{
m_texture_cache.lock_memory_region(
*m_current_command_buffer, m_rtts.m_bound_render_targets[index].second, surface_range, true,
m_surface_info[index].width, m_surface_info[index].height, m_framebuffer_layout.actual_color_pitch[index],
color_fmt_info.first, color_fmt_info.second);
}
else
{
m_texture_cache.commit_framebuffer_memory_region(*m_current_command_buffer, surface_range);
}
}
if (m_depth_surface_info.address && m_depth_surface_info.pitch)
{
const utils::address_range surface_range = m_depth_surface_info.get_memory_range();
if (g_cfg.video.write_depth_buffer)
{
const u32 gcm_format = (m_depth_surface_info.depth_format == rsx::surface_depth_format::z16) ? CELL_GCM_TEXTURE_DEPTH16 : CELL_GCM_TEXTURE_DEPTH24_D8;
m_texture_cache.lock_memory_region(
*m_current_command_buffer, m_rtts.m_bound_depth_stencil.second, surface_range, true,
m_depth_surface_info.width, m_depth_surface_info.height, m_framebuffer_layout.actual_zeta_pitch, gcm_format, true);
}
else
{
m_texture_cache.commit_framebuffer_memory_region(*m_current_command_buffer, surface_range);
}
}
m_current_renderpass_key = vk::get_renderpass_key(m_fbo_images);
m_cached_renderpass = vk::get_renderpass(*m_device, m_current_renderpass_key);
// Search old framebuffers for this same configuration
const auto [fbo_width, fbo_height] = rsx::apply_resolution_scale<true>(m_framebuffer_layout.width, m_framebuffer_layout.height);
if (m_draw_fbo)
{
// Release old ref
m_draw_fbo->release();
}
m_draw_fbo = vk::get_framebuffer(*m_device, fbo_width, fbo_height, VK_FALSE, m_cached_renderpass, m_fbo_images);
m_draw_fbo->add_ref();
set_viewport();
set_scissor(clipped_scissor);
check_zcull_status(true);
}
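// Dispatches backend control requests issued from other subsystems (external queue submits, GC events).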
void VKGSRender::renderctl(u32 request_code, void* args)
{
switch (request_code)
{
case vk::rctrl_queue_submit:
{
const auto packet = reinterpret_cast<vk::queue_submit_t*>(args);
vk::queue_submit(packet);
free(packet);
break;
}
case vk::rctrl_run_gc:
{
auto eid = reinterpret_cast<u64>(args);
vk::on_event_completed(eid, true);
break;
}
default:
fmt::throw_exception("Unhandled request code 0x%x", request_code);
}
}
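// Attempts a hardware blit through the texture cache; returns false if the swapchain is
// unavailable or the transfer could not be handled by the cache.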
bool VKGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate)
{
if (swapchain_unavailable)
return false;
// Verify enough memory exists before attempting to handle data transfer
check_heap_status(VK_HEAP_CHECK_TEXTURE_UPLOAD_STORAGE);
if (m_texture_cache.blit(src, dst, interpolate, m_rtts, *m_current_command_buffer))
{
m_samplers_dirty.store(true);
m_current_command_buffer->set_flag(vk::command_buffer::cb_has_blit_transfer);
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_dma_transfer)
{
// A dma transfer has been queued onto this cb
// This likely means that we're done with the transfers to the target (writes_likely_completed=1)
flush_command_queue();
}
return true;
}
return false;
}
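// Marks the given query as active and flags the command buffer to open the hardware query (cb_load_occluson_task).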
void VKGSRender::begin_occlusion_query(rsx::reports::occlusion_query_info* query)
{
ensure(!m_occlusion_query_active);
query->result = 0;
//query->sync_timestamp = get_system_time();
m_active_query_info = query;
m_occlusion_query_active = true;
m_current_command_buffer->flags |= vk::command_buffer::cb_load_occluson_task;
}
void VKGSRender::end_occlusion_query(rsx::reports::occlusion_query_info* query)
{
ensure(query == m_active_query_info);
// NOTE: flushing the queue is very expensive, do not flush just because query stopped
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_open_query)
{
// End query
auto open_query = m_occlusion_map[m_active_query_info->driver_handle].indices.back();
m_occlusion_query_manager->end_query(*m_current_command_buffer, open_query);
m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
}
// Clear occlusion load flag
m_current_command_buffer->flags &= ~vk::command_buffer::cb_load_occluson_task;
m_occlusion_query_active = false;
m_active_query_info = nullptr;
}
bool VKGSRender::check_occlusion_query_status(rsx::reports::occlusion_query_info* query)
{
if (!query->num_draws)
return true;
auto &data = m_occlusion_map[query->driver_handle];
if (data.indices.empty())
return true;
if (data.is_current(m_current_command_buffer))
return false;
const u32 oldest = data.indices.front();
return m_occlusion_query_manager->check_query_status(oldest);
}
void VKGSRender::get_occlusion_query_result(rsx::reports::occlusion_query_info* query)
{
auto &data = m_occlusion_map[query->driver_handle];
if (data.indices.empty())
return;
if (query->num_draws)
{
if (data.is_current(m_current_command_buffer))
{
std::lock_guard lock(m_flush_queue_mutex);
flush_command_queue();
if (m_flush_requests.pending())
{
m_flush_requests.clear_pending_flag();
}
rsx_log.warning("[Performance warning] Unexpected ZCULL read caused a hard sync");
busy_wait();
}
data.sync();
// Gather data
for (const auto occlusion_id : data.indices)
{
query->result += m_occlusion_query_manager->get_query_result(occlusion_id);
if (query->result && !g_cfg.video.precise_zpass_count)
{
// We only need one hit unless precise zcull is requested
break;
}
}
}
m_occlusion_query_manager->free_queries(*m_current_command_buffer, data.indices);
data.indices.clear();
}
void VKGSRender::discard_occlusion_query(rsx::reports::occlusion_query_info* query)
{
if (m_active_query_info == query)
{
end_occlusion_query(query);
}
auto &data = m_occlusion_map[query->driver_handle];
if (data.indices.empty())
return;
m_occlusion_query_manager->free_queries(*m_current_command_buffer, data.indices);
data.indices.clear();
}
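// Ends any hardware query left open on the current command buffer before an emergency flush.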
void VKGSRender::emergency_query_cleanup(vk::command_buffer* commands)
{
ensure(commands == static_cast<vk::command_buffer*>(m_current_command_buffer));
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_open_query)
{
auto open_query = m_occlusion_map[m_active_query_info->driver_handle].indices.back();
m_occlusion_query_manager->end_query(*m_current_command_buffer, open_query);
m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
}
}
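// Aggregates the occlusion query results backing a conditional render into m_cond_render_buffer,
// either by copying a single result directly or by reducing multiple results with a compute shader.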
void VKGSRender::begin_conditional_rendering(const std::vector<rsx::reports::occlusion_query_info*>& sources)
{
ensure(!sources.empty());
// Flag indicating whether to evaluate all entries or only the newest one
bool partial_eval;
// Try to avoid regenerating the data if it's a repeat/spam
// NOTE: The incoming list is reversed with the first entry being the newest
if (m_cond_render_sync_tag == sources.front()->sync_tag)
{
// Already synced; check for a subdraw, which is possible if the last sync happened while the query was active
if (!m_active_query_info || m_active_query_info != sources.front())
{
rsx::thread::begin_conditional_rendering(sources);
return;
}
// Partial evaluation only
partial_eval = true;
}
else
{
m_cond_render_sync_tag = sources.front()->sync_tag;
partial_eval = false;
}
// Time to aggregate
if (!m_cond_render_buffer)
{
auto& memory_props = m_device->get_memory_mapping();
auto usage_flags = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
if (m_device->get_conditional_render_support())
{
usage_flags |= VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT;
}
m_cond_render_buffer = std::make_unique<vk::buffer>(
*m_device, 4,
memory_props.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
usage_flags, 0, VMM_ALLOCATION_POOL_UNDEFINED);
}
VkPipelineStageFlags dst_stage;
VkAccessFlags dst_access;
u32 dst_offset = 0;
u32 num_hw_queries = 0;
usz first = 0;
usz last = (!partial_eval) ? sources.size() : 1;
// Count the number of queries available. This is an "opening" evaluation; if there is only one source, read it as-is.
// The idea is to avoid scheduling a compute task unless we have to.
for (usz i = first; i < last; ++i)
{
auto& query_info = m_occlusion_map[sources[i]->driver_handle];
num_hw_queries += ::size32(query_info.indices);
}
if (m_device->get_conditional_render_support())
{
dst_stage = VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT;
dst_access = VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT;
}
else
{
dst_stage = VK_PIPELINE_STAGE_VERTEX_SHADER_BIT;
dst_access = VK_ACCESS_SHADER_READ_BIT;
}
if (num_hw_queries == 1 && !partial_eval) [[ likely ]]
{
// Accept the first available query handle as the source of truth. No aggregation is required.
for (usz i = first; i < last; ++i)
{
auto& query_info = m_occlusion_map[sources[i]->driver_handle];
if (!query_info.indices.empty())
{
const auto& index = query_info.indices.front();
m_occlusion_query_manager->get_query_result_indirect(*m_current_command_buffer, index, 1, m_cond_render_buffer->value, 0);
vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4,
VK_PIPELINE_STAGE_TRANSFER_BIT, dst_stage,
VK_ACCESS_TRANSFER_WRITE_BIT, dst_access);
rsx::thread::begin_conditional_rendering(sources);
return;
}
}
// This is unreachable unless something went horribly wrong
fmt::throw_exception("Unreachable");
}
else if (num_hw_queries > 0)
{
// We'll need to do some result aggregation using a compute shader.
auto scratch = vk::get_scratch_buffer(*m_current_command_buffer, num_hw_queries * 4);
// Range latching. Because of how the query pool manages allocations using a stack, we get an inverse sequential set of handles/indices that we can easily group together.
// This drastically boosts performance on some drivers like the NVIDIA proprietary one, which seems to have a rather high cost for every individual query transfer command.
struct { u32 first, last; } query_range = { umax, 0 };
auto copy_query_range_impl = [&]()
{
const auto count = (query_range.last - query_range.first + 1);
m_occlusion_query_manager->get_query_result_indirect(*m_current_command_buffer, query_range.first, count, scratch->value, dst_offset);
dst_offset += count * 4;
};
for (usz i = first; i < last; ++i)
{
auto& query_info = m_occlusion_map[sources[i]->driver_handle];
for (const auto& index : query_info.indices)
{
// First iteration?
if (query_range.first == umax)
{
query_range = { index, index };
continue;
}
// Head?
if ((query_range.first - 1) == index)
{
query_range.first = index;
continue;
}
// Tail?
if ((query_range.last + 1) == index)
{
query_range.last = index;
continue;
}
// Flush pending queue. In practice, this is never reached and we fall out to the spill block outside the loops
copy_query_range_impl();
// Start a new range for the current index
query_range = { index, index };
}
}
if (query_range.first != umax)
{
// Dangling queries, flush
copy_query_range_impl();
}
// Sanity check
ensure(dst_offset <= scratch->size());
if (!partial_eval)
{
// Fast path should have been caught above
ensure(dst_offset > 4);
// Clear result to zero
vkCmdFillBuffer(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4, 0);
vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT);
}
vk::insert_buffer_memory_barrier(*m_current_command_buffer, scratch->value, 0, dst_offset,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
vk::get_compute_task<vk::cs_aggregator>()->run(*m_current_command_buffer, m_cond_render_buffer.get(), scratch, dst_offset / 4);
vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, dst_stage,
VK_ACCESS_SHADER_WRITE_BIT, dst_access);
}
else if (m_program)
{
// This can sometimes happen when shaders are compiling; only log if there is a program hit
rsx_log.warning("Dubious query data pushed to cond render! Please report to developers (q.pending=%d)", sources.front()->pending);
}
rsx::thread::begin_conditional_rendering(sources);
}
void VKGSRender::end_conditional_rendering()
{
thread::end_conditional_rendering();
}