rpcsx/rpcs3/Emu/RSX/VK/VKGSRender.cpp
kd-11 d846142f0c vk: Reimplement compliant async texture streaming
- Use CONCURRENT queue access instead of fighting with queue acquire/release via submit chains.
  The minor benefits of forcing EXCLUSIVE mode are buried under the huge penalty of multiple vkQueueSubmit.
  Batching submits does not help alleviate this situation. We simply must avoid interrupting execution.
2022-07-25 21:05:31 +03:00
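A rough sketch of what the commit message describes (not code from this file): resources shared between the graphics and transfer queues can be created with VK_SHARING_MODE_CONCURRENT, so no queue-family ownership transfer, and therefore no extra vkQueueSubmit, is required. The names device, size and the queue family indices below are placeholders.

// Hypothetical illustration of CONCURRENT sharing between two queue families.
const uint32_t queue_families[] = { graphics_queue_family, transfer_queue_family };
VkBufferCreateInfo info = {};
info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
info.size = size;
info.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
info.sharingMode = VK_SHARING_MODE_CONCURRENT; // no acquire/release barriers or submit chains needed
info.queueFamilyIndexCount = 2;
info.pQueueFamilyIndices = queue_families;
VkBuffer buffer = VK_NULL_HANDLE;
vkCreateBuffer(device, &info, nullptr, &buffer);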

#include "stdafx.h"
#include "../Overlays/overlay_shader_compile_notification.h"
#include "../Overlays/Shaders/shader_loading_dialog_native.h"
#include "VKAsyncScheduler.h"
#include "VKCommandStream.h"
#include "VKCommonDecompiler.h"
#include "VKCompute.h"
#include "VKGSRender.h"
#include "VKHelpers.h"
#include "VKRenderPass.h"
#include "VKResourceManager.h"
#include "vkutils/buffer_object.h"
#include "vkutils/scratch.h"
#include "Emu/RSX/rsx_methods.h"
#include "Emu/Memory/vm_locking.h"
#include "../Program/program_state_cache2.hpp"
#include "util/asm.hpp"
namespace vk
{
VkCompareOp get_compare_func(rsx::comparison_function op, bool reverse_direction = false);
std::pair<VkFormat, VkComponentMapping> get_compatible_surface_format(rsx::surface_color_format color_format)
{
const VkComponentMapping o_rgb = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_ONE };
const VkComponentMapping z_rgb = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_ZERO };
switch (color_format)
{
#ifndef __APPLE__
case rsx::surface_color_format::r5g6b5:
return std::make_pair(VK_FORMAT_R5G6B5_UNORM_PACK16, vk::default_component_map);
case rsx::surface_color_format::x1r5g5b5_o1r5g5b5:
return std::make_pair(VK_FORMAT_A1R5G5B5_UNORM_PACK16, o_rgb);
case rsx::surface_color_format::x1r5g5b5_z1r5g5b5:
return std::make_pair(VK_FORMAT_A1R5G5B5_UNORM_PACK16, z_rgb);
#else
// assign B8G8R8A8_UNORM to formats that are not supported by Metal
case rsx::surface_color_format::r5g6b5:
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, vk::default_component_map);
case rsx::surface_color_format::x1r5g5b5_o1r5g5b5:
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, o_rgb);
case rsx::surface_color_format::x1r5g5b5_z1r5g5b5:
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, z_rgb);
#endif
case rsx::surface_color_format::a8r8g8b8:
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, vk::default_component_map);
case rsx::surface_color_format::a8b8g8r8:
return std::make_pair(VK_FORMAT_R8G8B8A8_UNORM, vk::default_component_map);
case rsx::surface_color_format::x8b8g8r8_o8b8g8r8:
return std::make_pair(VK_FORMAT_R8G8B8A8_UNORM, o_rgb);
case rsx::surface_color_format::x8b8g8r8_z8b8g8r8:
return std::make_pair(VK_FORMAT_R8G8B8A8_UNORM, z_rgb);
case rsx::surface_color_format::x8r8g8b8_z8r8g8b8:
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, z_rgb);
case rsx::surface_color_format::x8r8g8b8_o8r8g8b8:
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, o_rgb);
case rsx::surface_color_format::w16z16y16x16:
return std::make_pair(VK_FORMAT_R16G16B16A16_SFLOAT, vk::default_component_map);
case rsx::surface_color_format::w32z32y32x32:
return std::make_pair(VK_FORMAT_R32G32B32A32_SFLOAT, vk::default_component_map);
case rsx::surface_color_format::b8:
{
const VkComponentMapping no_alpha = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ONE };
return std::make_pair(VK_FORMAT_R8_UNORM, no_alpha);
}
case rsx::surface_color_format::g8b8:
{
const VkComponentMapping gb_rg = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G };
return std::make_pair(VK_FORMAT_R8G8_UNORM, gb_rg);
}
case rsx::surface_color_format::x32:
{
const VkComponentMapping rrrr = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R };
return std::make_pair(VK_FORMAT_R32_SFLOAT, rrrr);
}
default:
rsx_log.error("Surface color buffer: Unsupported surface color format (0x%x)", static_cast<u32>(color_format));
return std::make_pair(VK_FORMAT_B8G8R8A8_UNORM, vk::default_component_map);
}
}
VkLogicOp get_logic_op(rsx::logic_op op)
{
switch (op)
{
case rsx::logic_op::logic_clear: return VK_LOGIC_OP_CLEAR;
case rsx::logic_op::logic_and: return VK_LOGIC_OP_AND;
case rsx::logic_op::logic_and_reverse: return VK_LOGIC_OP_AND_REVERSE;
case rsx::logic_op::logic_copy: return VK_LOGIC_OP_COPY;
case rsx::logic_op::logic_and_inverted: return VK_LOGIC_OP_AND_INVERTED;
case rsx::logic_op::logic_noop: return VK_LOGIC_OP_NO_OP;
case rsx::logic_op::logic_xor: return VK_LOGIC_OP_XOR;
case rsx::logic_op::logic_or : return VK_LOGIC_OP_OR;
case rsx::logic_op::logic_nor: return VK_LOGIC_OP_NOR;
case rsx::logic_op::logic_equiv: return VK_LOGIC_OP_EQUIVALENT;
case rsx::logic_op::logic_invert: return VK_LOGIC_OP_INVERT;
case rsx::logic_op::logic_or_reverse: return VK_LOGIC_OP_OR_REVERSE;
case rsx::logic_op::logic_copy_inverted: return VK_LOGIC_OP_COPY_INVERTED;
case rsx::logic_op::logic_or_inverted: return VK_LOGIC_OP_OR_INVERTED;
case rsx::logic_op::logic_nand: return VK_LOGIC_OP_NAND;
case rsx::logic_op::logic_set: return VK_LOGIC_OP_SET;
default:
fmt::throw_exception("Unknown logic op 0x%x", static_cast<u32>(op));
}
}
VkBlendFactor get_blend_factor(rsx::blend_factor factor)
{
switch (factor)
{
case rsx::blend_factor::one: return VK_BLEND_FACTOR_ONE;
case rsx::blend_factor::zero: return VK_BLEND_FACTOR_ZERO;
case rsx::blend_factor::src_alpha: return VK_BLEND_FACTOR_SRC_ALPHA;
case rsx::blend_factor::dst_alpha: return VK_BLEND_FACTOR_DST_ALPHA;
case rsx::blend_factor::src_color: return VK_BLEND_FACTOR_SRC_COLOR;
case rsx::blend_factor::dst_color: return VK_BLEND_FACTOR_DST_COLOR;
case rsx::blend_factor::constant_color: return VK_BLEND_FACTOR_CONSTANT_COLOR;
case rsx::blend_factor::constant_alpha: return VK_BLEND_FACTOR_CONSTANT_ALPHA;
case rsx::blend_factor::one_minus_src_color: return VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR;
case rsx::blend_factor::one_minus_dst_color: return VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR;
case rsx::blend_factor::one_minus_src_alpha: return VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA;
case rsx::blend_factor::one_minus_dst_alpha: return VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA;
case rsx::blend_factor::one_minus_constant_alpha: return VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA;
case rsx::blend_factor::one_minus_constant_color: return VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR;
case rsx::blend_factor::src_alpha_saturate: return VK_BLEND_FACTOR_SRC_ALPHA_SATURATE;
default:
fmt::throw_exception("Unknown blend factor 0x%x", static_cast<u32>(factor));
}
}
VkBlendOp get_blend_op(rsx::blend_equation op)
{
switch (op)
{
case rsx::blend_equation::add_signed:
rsx_log.trace("blend equation add_signed used. Emulating using FUNC_ADD");
[[fallthrough]];
case rsx::blend_equation::add:
return VK_BLEND_OP_ADD;
case rsx::blend_equation::substract: return VK_BLEND_OP_SUBTRACT;
case rsx::blend_equation::reverse_substract_signed:
rsx_log.trace("blend equation reverse_subtract_signed used. Emulating using FUNC_REVERSE_SUBTRACT");
[[fallthrough]];
case rsx::blend_equation::reverse_substract: return VK_BLEND_OP_REVERSE_SUBTRACT;
case rsx::blend_equation::min: return VK_BLEND_OP_MIN;
case rsx::blend_equation::max: return VK_BLEND_OP_MAX;
default:
fmt::throw_exception("Unknown blend op: 0x%x", static_cast<u32>(op));
}
}
VkStencilOp get_stencil_op(rsx::stencil_op op)
{
switch (op)
{
case rsx::stencil_op::keep: return VK_STENCIL_OP_KEEP;
case rsx::stencil_op::zero: return VK_STENCIL_OP_ZERO;
case rsx::stencil_op::replace: return VK_STENCIL_OP_REPLACE;
case rsx::stencil_op::incr: return VK_STENCIL_OP_INCREMENT_AND_CLAMP;
case rsx::stencil_op::decr: return VK_STENCIL_OP_DECREMENT_AND_CLAMP;
case rsx::stencil_op::invert: return VK_STENCIL_OP_INVERT;
case rsx::stencil_op::incr_wrap: return VK_STENCIL_OP_INCREMENT_AND_WRAP;
case rsx::stencil_op::decr_wrap: return VK_STENCIL_OP_DECREMENT_AND_WRAP;
default:
fmt::throw_exception("Unknown stencil op: 0x%x", static_cast<u32>(op));
}
}
VkFrontFace get_front_face(rsx::front_face ffv)
{
switch (ffv)
{
case rsx::front_face::cw: return VK_FRONT_FACE_CLOCKWISE;
case rsx::front_face::ccw: return VK_FRONT_FACE_COUNTER_CLOCKWISE;
default:
fmt::throw_exception("Unknown front face value: 0x%x", static_cast<u32>(ffv));
}
}
VkCullModeFlags get_cull_face(rsx::cull_face cfv)
{
switch (cfv)
{
case rsx::cull_face::back: return VK_CULL_MODE_BACK_BIT;
case rsx::cull_face::front: return VK_CULL_MODE_FRONT_BIT;
case rsx::cull_face::front_and_back: return VK_CULL_MODE_FRONT_AND_BACK;
default:
fmt::throw_exception("Unknown cull face value: 0x%x", static_cast<u32>(cfv));
}
}
}
namespace
{
std::tuple<VkPipelineLayout, VkDescriptorSetLayout> get_shared_pipeline_layout(VkDevice dev)
{
const auto& binding_table = vk::get_current_renderer()->get_pipeline_binding_table();
std::vector<VkDescriptorSetLayoutBinding> bindings(binding_table.total_descriptor_bindings);
usz idx = 0;
// Vertex streams: one for cacheable (persistent) data, one for transient (volatile) data, plus the vertex layout stream
for (int i = 0; i < 3; i++)
{
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
bindings[idx].binding = binding_table.vertex_buffers_first_bind_slot + i;
idx++;
}
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[idx].binding = binding_table.fragment_constant_buffers_bind_slot;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[idx].binding = binding_table.fragment_state_bind_slot;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[idx].binding = binding_table.fragment_texture_params_bind_slot;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
bindings[idx].binding = binding_table.vertex_constant_buffers_bind_slot;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_ALL_GRAPHICS;
bindings[idx].binding = binding_table.vertex_params_bind_slot;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
bindings[idx].binding = binding_table.conditional_render_predicate_slot;
idx++;
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[idx].binding = binding_table.rasterizer_env_bind_slot;
idx++;
for (auto binding = binding_table.textures_first_bind_slot;
binding < binding_table.vertex_textures_first_bind_slot;
binding++)
{
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT;
bindings[idx].binding = binding;
idx++;
}
for (int i = 0; i < rsx::limits::vertex_textures_count; i++)
{
bindings[idx].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
bindings[idx].descriptorCount = 1;
bindings[idx].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
bindings[idx].binding = binding_table.vertex_textures_first_bind_slot + i;
idx++;
}
ensure(idx == binding_table.total_descriptor_bindings);
std::array<VkPushConstantRange, 1> push_constants;
push_constants[0].offset = 0;
push_constants[0].size = 16;
push_constants[0].stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
if (vk::emulate_conditional_rendering())
{
// Conditional render toggle
push_constants[0].size = 20;
}
const auto set_layout = vk::descriptors::create_layout(bindings);
VkPipelineLayoutCreateInfo layout_info = {};
layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
layout_info.setLayoutCount = 1;
layout_info.pSetLayouts = &set_layout;
layout_info.pushConstantRangeCount = 1;
layout_info.pPushConstantRanges = push_constants.data();
VkPipelineLayout result;
CHECK_RESULT(vkCreatePipelineLayout(dev, &layout_info, nullptr, &result));
return std::make_tuple(result, set_layout);
}
}
u64 VKGSRender::get_cycles()
{
return thread_ctrl::get_cycles(static_cast<named_thread<VKGSRender>&>(*this));
}
VKGSRender::VKGSRender(utils::serial* ar) noexcept : GSRender(ar)
{
if (m_instance.create("RPCS3"))
{
m_instance.bind();
}
else
{
rsx_log.fatal("Could not find a vulkan compatible GPU driver. Your GPU(s) may not support Vulkan, or you need to install the vulkan runtime and drivers");
m_device = VK_NULL_HANDLE;
return;
}
std::vector<vk::physical_device>& gpus = m_instance.enumerate_devices();
//Actually confirm that the loader found at least one compatible device
//This should not happen unless something is wrong with the driver setup on the target system
if (gpus.empty())
{
//We can't throw in Emulator::Load, so we show error and return
rsx_log.fatal("No compatible GPU devices found");
m_device = VK_NULL_HANDLE;
return;
}
bool gpu_found = false;
std::string adapter_name = g_cfg.video.vk.adapter;
display_handle_t display = m_frame->handle();
#ifdef HAVE_X11
std::visit([this](auto&& p) {
using T = std::decay_t<decltype(p)>;
if constexpr (std::is_same_v<T, std::pair<Display*, Window>>)
{
m_display_handle = p.first; XFlush(m_display_handle);
}
}, display);
#endif
for (auto &gpu : gpus)
{
if (gpu.get_name() == adapter_name)
{
m_swapchain.reset(m_instance.create_swapchain(display, gpu));
gpu_found = true;
break;
}
}
if (!gpu_found || adapter_name.empty())
{
m_swapchain.reset(m_instance.create_swapchain(display, gpus[0]));
}
if (!m_swapchain)
{
m_device = VK_NULL_HANDLE;
rsx_log.fatal("Could not successfully initialize a swapchain");
return;
}
m_device = const_cast<vk::render_device*>(&m_swapchain->get_device());
vk::set_current_renderer(m_swapchain->get_device());
m_swapchain_dims.width = m_frame->client_width();
m_swapchain_dims.height = m_frame->client_height();
if (!m_swapchain->init(m_swapchain_dims.width, m_swapchain_dims.height))
{
swapchain_unavailable = true;
}
//create command buffer...
m_command_buffer_pool.create((*m_device), m_device->get_graphics_queue_family());
m_primary_cb_list.create(m_command_buffer_pool, vk::command_buffer::access_type_hint::flush_only);
m_current_command_buffer = m_primary_cb_list.get();
m_current_command_buffer->begin();
//Create secondary command_buffer for parallel operations
m_secondary_command_buffer_pool.create((*m_device), m_device->get_graphics_queue_family());
m_secondary_cb_list.create(m_secondary_command_buffer_pool, vk::command_buffer::access_type_hint::all);
//Precalculated stuff
std::tie(pipeline_layout, descriptor_layouts) = get_shared_pipeline_layout(*m_device);
//Occlusion
m_occlusion_query_manager = std::make_unique<vk::query_pool_manager>(*m_device, VK_QUERY_TYPE_OCCLUSION, OCCLUSION_MAX_POOL_SIZE);
m_occlusion_map.resize(occlusion_query_count);
for (u32 n = 0; n < occlusion_query_count; ++n)
m_occlusion_query_data[n].driver_handle = n;
if (g_cfg.video.precise_zpass_count)
{
m_occlusion_query_manager->set_control_flags(VK_QUERY_CONTROL_PRECISE_BIT, 0);
}
// Generate frame contexts
const u32 max_draw_calls = m_device->get_descriptor_max_draw_calls();
const auto& binding_table = m_device->get_pipeline_binding_table();
const u32 num_fs_samplers = binding_table.vertex_textures_first_bind_slot - binding_table.textures_first_bind_slot;
std::vector<VkDescriptorPoolSize> sizes;
sizes.push_back({ VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER , 6 * max_draw_calls });
sizes.push_back({ VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER , 3 * max_draw_calls });
sizes.push_back({ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER , (num_fs_samplers + 4) * max_draw_calls });
// Conditional rendering predicate slot; refactor to allow skipping this when not needed
sizes.push_back({ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1 * max_draw_calls });
VkSemaphoreCreateInfo semaphore_info = {};
semaphore_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
// VRAM allocation
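// Heap sizes below are configured in MiB, hence the 0x100000 multiplier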
m_attrib_ring_info.create(VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000, "attrib buffer", 0x400000, VK_TRUE);
m_fragment_env_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "fragment env buffer");
m_vertex_env_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "vertex env buffer");
m_fragment_texture_params_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "fragment texture params buffer");
m_vertex_layout_ring_info.create(VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "vertex layout buffer", 0x10000, VK_TRUE);
m_fragment_constants_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "fragment constants buffer");
m_transform_constants_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_TRANSFORM_CONSTANTS_BUFFER_SIZE_M * 0x100000, "transform constants buffer");
m_index_buffer_ring_info.create(VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_INDEX_RING_BUFFER_SIZE_M * 0x100000, "index buffer");
m_texture_upload_buffer_ring_info.create(VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_TEXTURE_UPLOAD_RING_BUFFER_SIZE_M * 0x100000, "texture upload buffer", 32 * 0x100000);
m_raster_env_ring_info.create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_UBO_RING_BUFFER_SIZE_M * 0x100000, "raster env buffer");
const auto shadermode = g_cfg.video.shadermode.get();
if (shadermode == shader_mode::async_with_interpreter || shadermode == shader_mode::interpreter_only)
{
m_vertex_instructions_buffer.create(VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, 64 * 0x100000, "vertex instructions buffer", 512 * 16);
m_fragment_instructions_buffer.create(VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, 64 * 0x100000, "fragment instructions buffer", 2048);
}
// Initialize optional allocation information with placeholders
m_raster_env_buffer_info = { m_raster_env_ring_info.heap->value, 0, 128 };
const auto limits = m_device->gpu().get_limits();
m_texbuffer_view_size = std::min(limits.maxTexelBufferElements, VK_ATTRIB_RING_BUFFER_SIZE_M * 0x100000u);
if (m_texbuffer_view_size < 0x800000)
{
// Warn; a texel buffer limit this small is really only expected on macOS
rsx_log.warning("Current driver may crash due to memory limitations (%uk)", m_texbuffer_view_size / 1024);
}
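// Create per-frame presentation semaphores and descriptor pools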
for (auto &ctx : frame_context_storage)
{
vkCreateSemaphore((*m_device), &semaphore_info, nullptr, &ctx.present_wait_semaphore);
vkCreateSemaphore((*m_device), &semaphore_info, nullptr, &ctx.acquire_signal_semaphore);
ctx.descriptor_pool.create(*m_device, sizes.data(), static_cast<u32>(sizes.size()), max_draw_calls, 1);
}
const auto& memory_map = m_device->get_memory_mapping();
null_buffer = std::make_unique<vk::buffer>(*m_device, 32, memory_map.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, 0, VMM_ALLOCATION_POOL_UNDEFINED);
null_buffer_view = std::make_unique<vk::buffer_view>(*m_device, null_buffer->value, VK_FORMAT_R8_UINT, 0, 32);
vk::initialize_compiler_context();
vk::initialize_pipe_compiler(g_cfg.video.shader_compiler_threads_count);
m_prog_buffer = std::make_unique<vk::program_cache>
(
[this](const vk::pipeline_props& props, const RSXVertexProgram& vp, const RSXFragmentProgram& fp)
{
// Program was linked or queued for linking
m_shaders_cache->store(props, vp, fp);
}
);
if (g_cfg.video.disable_vertex_cache || g_cfg.video.multithreaded_rsx)
m_vertex_cache = std::make_unique<vk::null_vertex_cache>();
else
m_vertex_cache = std::make_unique<vk::weak_vertex_cache>();
m_shaders_cache = std::make_unique<vk::shader_cache>(*m_prog_buffer, "vulkan", "v1.93");
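// Clear the swapchain images to black and transition them into the optimal present layout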
for (u32 i = 0; i < m_swapchain->get_swap_image_count(); ++i)
{
const auto target_layout = m_swapchain->get_optimal_present_layout();
const auto target_image = m_swapchain->get_image(i);
VkClearColorValue clear_color{};
VkImageSubresourceRange range = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 };
vk::change_image_layout(*m_current_command_buffer, target_image, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, range);
vkCmdClearColorImage(*m_current_command_buffer, target_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, &clear_color, 1, &range);
vk::change_image_layout(*m_current_command_buffer, target_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, target_layout, range);
}
m_current_frame = &frame_context_storage[0];
m_texture_cache.initialize((*m_device), m_device->get_graphics_queue(),
m_texture_upload_buffer_ring_info);
vk::get_overlay_pass<vk::ui_overlay_renderer>()->init(*m_current_command_buffer, m_texture_upload_buffer_ring_info);
if (shadermode == shader_mode::async_with_interpreter || shadermode == shader_mode::interpreter_only)
{
m_shader_interpreter.init(*m_device);
}
backend_config.supports_multidraw = true;
// NOTE: We do not actually need multiple sample support for A2C to work
// This is here for visual consistency - will be removed when AA problems due to mipmaps are fixed
if (g_cfg.video.antialiasing_level != msaa_level::none)
{
backend_config.supports_hw_msaa = true;
backend_config.supports_hw_a2c = true;
backend_config.supports_hw_a2one = m_device->get_alpha_to_one_support();
}
// NOTE: On NVIDIA cards going back decades (including the PS3) there is a slight normalization inaccuracy in compressed formats.
// Confirmed in BLES01916 (The Evil Within) which uses RGB565 for some virtual texturing data.
backend_config.supports_hw_renormalization = (vk::get_driver_vendor() == vk::driver_vendor::NVIDIA);
// Conditional rendering support
backend_config.supports_hw_conditional_render = true;
// Passthrough DMA
backend_config.supports_passthrough_dma = m_device->get_external_memory_host_support();
// Host sync
backend_config.supports_host_gpu_labels = !!g_cfg.video.host_label_synchronization;
// Async compute and related operations
if (g_cfg.video.vk.asynchronous_texture_streaming)
{
// Optimistic, enable async compute
backend_config.supports_asynchronous_compute = true;
if (m_device->get_graphics_queue() == m_device->get_transfer_queue())
{
rsx_log.error("Cannot run graphics and async transfer in the same queue. Async uploads are disabled. This is a limitation of your GPU");
backend_config.supports_asynchronous_compute = false;
}
}
// Sanity checks
switch (vk::get_driver_vendor())
{
case vk::driver_vendor::NVIDIA:
if (backend_config.supports_asynchronous_compute)
{
if (auto chip_family = vk::get_chip_family();
chip_family == vk::chip_class::NV_kepler || chip_family == vk::chip_class::NV_maxwell)
{
rsx_log.warning("Older NVIDIA cards do not meet requirements for true asynchronous compute due to some driver fakery.");
}
rsx_log.notice("Forcing safe async compute for NVIDIA device to avoid crashing.");
g_cfg.video.vk.asynchronous_scheduler.set(vk_gpu_scheduler_mode::safe);
}
break;
#if !defined(_WIN32)
// Anything running on AMDGPU kernel driver will not work due to the check for fd-backed memory allocations
case vk::driver_vendor::RADV:
case vk::driver_vendor::AMD:
#if !defined(__linux__)
// Intel chipsets would fail on BSD in most cases, and DRM_IOCTL_i915_GEM_USERPTR is unimplemented there
case vk::driver_vendor::ANV:
#endif
if (backend_config.supports_passthrough_dma)
{
rsx_log.error("AMDGPU kernel driver on linux and INTEL driver on some platforms cannot support passthrough DMA buffers.");
backend_config.supports_passthrough_dma = false;
}
break;
#endif
case vk::driver_vendor::MVK:
// Async compute crashes immediately on Apple GPUs
rsx_log.error("Apple GPUs are incompatible with the current implementation of asynchronous texture decoding.");
backend_config.supports_asynchronous_compute = false;
break;
case vk::driver_vendor::INTEL:
// As expected host allocations won't work on INTEL despite the extension being present
if (backend_config.supports_passthrough_dma)
{
rsx_log.error("INTEL driver does not support passthrough DMA buffers");
backend_config.supports_passthrough_dma = false;
}
break;
default: break;
}
if (backend_config.supports_asynchronous_compute)
{
// Run only if async compute can be used.
g_fxo->init<vk::AsyncTaskScheduler>(g_cfg.video.vk.asynchronous_scheduler);
}
if (backend_config.supports_host_gpu_labels)
{
if (backend_config.supports_passthrough_dma)
{
m_host_object_data = std::make_unique<vk::buffer>(*m_device,
0x10000,
memory_map.host_visible_coherent, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
VK_BUFFER_USAGE_TRANSFER_DST_BIT, 0,
VMM_ALLOCATION_POOL_SYSTEM);
m_host_data_ptr = new (m_host_object_data->map(0, 0x10000)) vk::host_data_t();
ensure(m_host_data_ptr->magic == 0xCAFEBABE);
}
else
{
rsx_log.error("Your GPU/driver does not support extensions required to enable passthrough DMA emulation. Host GPU labels will be disabled.");
backend_config.supports_host_gpu_labels = false;
}
}
if (!backend_config.supports_host_gpu_labels &&
!backend_config.supports_asynchronous_compute)
{
// Disable passthrough DMA unless we enable a feature that requires it.
// I'm avoiding an explicit checkbox for this until I figure out why host labels don't fix all problems with passthrough.
backend_config.supports_passthrough_dma = false;
}
}
VKGSRender::~VKGSRender()
{
if (m_device == VK_NULL_HANDLE)
{
//Initialization failed
return;
}
// Flush DMA queue
while (!g_fxo->get<rsx::dma_manager>().sync())
{
do_local_task(rsx::FIFO_state::lock_wait);
}
//Wait for device to finish up with resources
vkDeviceWaitIdle(*m_device);
// Globals. TODO: Refactor lifetime management
if (backend_config.supports_asynchronous_compute)
{
g_fxo->get<vk::AsyncTaskScheduler>().destroy();
}
// Host data
if (m_host_object_data)
{
m_host_object_data->unmap();
m_host_object_data.reset();
}
// Clear flush requests
m_flush_requests.clear_pending_flag();
// Shaders
vk::destroy_pipe_compiler(); // Ensure no pending shaders being compiled
vk::finalize_compiler_context(); // Shut down the glslang compiler
m_prog_buffer->clear(); // Delete shader objects
m_shader_interpreter.destroy();
m_persistent_attribute_storage.reset();
m_volatile_attribute_storage.reset();
m_vertex_layout_storage.reset();
// Upscaler (references some global resources)
m_upscaler.reset();
// Heaps
m_attrib_ring_info.destroy();
m_fragment_env_ring_info.destroy();
m_vertex_env_ring_info.destroy();
m_fragment_texture_params_ring_info.destroy();
m_vertex_layout_ring_info.destroy();
m_fragment_constants_ring_info.destroy();
m_transform_constants_ring_info.destroy();
m_index_buffer_ring_info.destroy();
m_texture_upload_buffer_ring_info.destroy();
m_vertex_instructions_buffer.destroy();
m_fragment_instructions_buffer.destroy();
m_raster_env_ring_info.destroy();
// Fallback bindables
null_buffer.reset();
null_buffer_view.reset();
if (m_current_frame == &m_aux_frame_context)
{
// Return resources back to the owner
m_current_frame = &frame_context_storage[m_current_queue_index];
m_current_frame->swap_storage(m_aux_frame_context);
m_current_frame->grab_resources(m_aux_frame_context);
}
m_aux_frame_context.buffer_views_to_clean.clear();
// NOTE: aux_context uses descriptor pools borrowed from the main queues and any allocations will be automatically freed when pool is destroyed
for (auto &ctx : frame_context_storage)
{
vkDestroySemaphore((*m_device), ctx.present_wait_semaphore, nullptr);
vkDestroySemaphore((*m_device), ctx.acquire_signal_semaphore, nullptr);
ctx.descriptor_pool.destroy();
ctx.buffer_views_to_clean.clear();
}
// Textures
m_rtts.destroy();
m_texture_cache.destroy();
m_stencil_mirror_sampler.reset();
// Overlay text handler
m_text_writer.reset();
//Pipeline descriptors
vkDestroyPipelineLayout(*m_device, pipeline_layout, nullptr);
vkDestroyDescriptorSetLayout(*m_device, descriptor_layouts, nullptr);
// Queries
m_occlusion_query_manager.reset();
m_cond_render_buffer.reset();
// Command buffer
m_primary_cb_list.destroy();
m_secondary_cb_list.destroy();
m_command_buffer_pool.destroy();
m_secondary_command_buffer_pool.destroy();
// Global resources
vk::destroy_global_resources();
// Device handles/contexts
m_swapchain->destroy();
m_instance.destroy();
#if defined(HAVE_X11) && defined(HAVE_VULKAN)
if (m_display_handle)
XCloseDisplay(m_display_handle);
#endif
}
bool VKGSRender::on_access_violation(u32 address, bool is_writing)
{
vk::texture_cache::thrashed_set result;
{
const rsx::invalidation_cause cause = is_writing ? rsx::invalidation_cause::deferred_write : rsx::invalidation_cause::deferred_read;
result = m_texture_cache.invalidate_address(*m_secondary_cb_list.get(), address, cause);
}
if (result.invalidate_samplers)
{
std::lock_guard lock(m_sampler_mutex);
m_samplers_dirty.store(true);
}
if (!result.violation_handled)
{
return zcull_ctrl->on_access_violation(address);
}
if (result.num_flushable > 0)
{
if (g_fxo->get<rsx::dma_manager>().is_current_thread())
{
// The offloader thread cannot handle flush requests
ensure(!(m_queue_status & flush_queue_state::deadlock));
m_offloader_fault_range = g_fxo->get<rsx::dma_manager>().get_fault_range(is_writing);
m_offloader_fault_cause = (is_writing) ? rsx::invalidation_cause::write : rsx::invalidation_cause::read;
g_fxo->get<rsx::dma_manager>().set_mem_fault_flag();
m_queue_status |= flush_queue_state::deadlock;
m_eng_interrupt_mask |= rsx::backend_interrupt;
// Wait for deadlock to clear
while (m_queue_status & flush_queue_state::deadlock)
{
utils::pause();
}
g_fxo->get<rsx::dma_manager>().clear_mem_fault_flag();
return true;
}
bool has_queue_ref = false;
if (!is_current_thread()) [[likely]]
{
// Always submit primary cb to ensure state consistency (flush pending changes such as image transitions)
vm::temporary_unlock();
std::lock_guard lock(m_flush_queue_mutex);
m_flush_requests.post(false);
m_eng_interrupt_mask |= rsx::backend_interrupt;
has_queue_ref = true;
}
else
{
if (vk::is_uninterruptible())
{
rsx_log.error("Fault in uninterruptible code!");
}
// Flush primary cb queue to sync pending changes (e.g image transitions!)
flush_command_queue();
}
if (has_queue_ref)
{
// Wait for the RSX thread to process request if it hasn't already
m_flush_requests.producer_wait();
}
m_texture_cache.flush_all(*m_secondary_cb_list.next(), result);
if (has_queue_ref)
{
// Release RSX thread
m_flush_requests.remove_one();
}
}
return true;
}
void VKGSRender::on_invalidate_memory_range(const utils::address_range &range, rsx::invalidation_cause cause)
{
std::lock_guard lock(m_secondary_cb_guard);
auto data = m_texture_cache.invalidate_range(*m_secondary_cb_list.next(), range, cause);
AUDIT(data.empty());
if (cause == rsx::invalidation_cause::unmap)
{
if (data.violation_handled)
{
m_texture_cache.purge_unreleased_sections();
{
std::lock_guard lock(m_sampler_mutex);
m_samplers_dirty.store(true);
}
}
vk::unmap_dma(range.start, range.length());
}
}
void VKGSRender::on_semaphore_acquire_wait()
{
if (m_flush_requests.pending() ||
(async_flip_requested & flip_request::emu_requested) ||
(m_queue_status & flush_queue_state::deadlock))
{
do_local_task(rsx::FIFO_state::lock_wait);
}
}
bool VKGSRender::on_vram_exhausted(rsx::problem_severity severity)
{
ensure(!vk::is_uninterruptible() && rsx::get_current_renderer()->is_current_thread());
bool texture_cache_relieved = false;
if (severity >= rsx::problem_severity::fatal && m_texture_cache.is_overallocated())
{
// Evict some unused textures. Do not evict any active references
std::set<u32> exclusion_list;
auto scan_array = [&](const auto& texture_array)
{
for (auto i = 0ull; i < texture_array.size(); ++i)
{
const auto& tex = texture_array[i];
const auto addr = rsx::get_address(tex.offset(), tex.location());
exclusion_list.insert(addr);
}
};
scan_array(rsx::method_registers.fragment_textures);
scan_array(rsx::method_registers.vertex_textures);
// Hold the secondary lock guard to prevent threads from trying to touch access violation handler stuff
std::lock_guard lock(m_secondary_cb_guard);
rsx_log.warning("Texture cache is overallocated. Will evict unnecessary textures.");
texture_cache_relieved = m_texture_cache.evict_unused(exclusion_list);
}
texture_cache_relieved |= m_texture_cache.handle_memory_pressure(severity);
if (severity == rsx::problem_severity::low)
{
// Low severity only handles invalidating unused textures
return texture_cache_relieved;
}
bool surface_cache_relieved = false;
if (severity >= rsx::problem_severity::moderate)
{
// Check if we need to spill
const auto mem_info = m_device->get_memory_mapping();
if (severity >= rsx::problem_severity::fatal && // Only spill for fatal errors
mem_info.device_local != mem_info.host_visible_coherent && // Do not spill if it is an IGP, there is nowhere to spill to
m_rtts.is_overallocated()) // Surface cache must be over-allocated by the design quota
{
// Queue a VRAM spill operation.
m_rtts.spill_unused_memory();
}
// Moderate severity and higher also starts removing stale render target objects
if (m_rtts.handle_memory_pressure(*m_current_command_buffer, severity))
{
surface_cache_relieved = true;
m_rtts.free_invalidated(*m_current_command_buffer, severity);
}
if (severity >= rsx::problem_severity::fatal && surface_cache_relieved && !m_samplers_dirty)
{
// If surface cache was modified destructively, then we must reload samplers touching the surface cache.
bool invalidate_samplers = false;
auto scan_array = [&](const auto& texture_array, const auto& sampler_states)
{
for (auto i = 0ull; i < texture_array.size() && !invalidate_samplers; ++i)
{
if (texture_array[i].enabled() && sampler_states[i])
{
invalidate_samplers = (sampler_states[i]->upload_context == rsx::texture_upload_context::framebuffer_storage);
}
}
};
scan_array(rsx::method_registers.fragment_textures, fs_sampler_state);
scan_array(rsx::method_registers.vertex_textures, vs_sampler_state);
if (invalidate_samplers)
{
m_samplers_dirty.store(true);
}
}
}
const bool any_cache_relieved = (texture_cache_relieved || surface_cache_relieved);
if (any_cache_relieved && severity >= rsx::problem_severity::fatal)
{
// Imminent crash, full GPU sync is the least of our problems
flush_command_queue(true, true);
}
return any_cache_relieved;
}
void VKGSRender::notify_tile_unbound(u32 tile)
{
//TODO: Handle texture writeback
if (false)
{
u32 addr = rsx::get_address(tiles[tile].offset, tiles[tile].location);
on_notify_memory_unmapped(addr, tiles[tile].size);
m_rtts.invalidate_surface_address(addr, false);
}
{
std::lock_guard lock(m_sampler_mutex);
m_samplers_dirty.store(true);
}
}
void VKGSRender::check_heap_status(u32 flags)
{
ensure(flags);
bool heap_critical;
if (flags == VK_HEAP_CHECK_ALL)
{
heap_critical = m_attrib_ring_info.is_critical() ||
m_texture_upload_buffer_ring_info.is_critical() ||
m_fragment_env_ring_info.is_critical() ||
m_vertex_env_ring_info.is_critical() ||
m_fragment_texture_params_ring_info.is_critical() ||
m_vertex_layout_ring_info.is_critical() ||
m_fragment_constants_ring_info.is_critical() ||
m_transform_constants_ring_info.is_critical() ||
m_index_buffer_ring_info.is_critical() ||
m_raster_env_ring_info.is_critical();
}
else
{
heap_critical = false;
u32 test = 1u << std::countr_zero(flags);
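// Walk the requested flag bits one at a time, stopping early once any heap is found to be critical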
do
{
switch (flags & test)
{
case 0:
break;
case VK_HEAP_CHECK_TEXTURE_UPLOAD_STORAGE:
heap_critical = m_texture_upload_buffer_ring_info.is_critical();
break;
case VK_HEAP_CHECK_VERTEX_STORAGE:
heap_critical = m_attrib_ring_info.is_critical() || m_index_buffer_ring_info.is_critical();
break;
case VK_HEAP_CHECK_VERTEX_ENV_STORAGE:
heap_critical = m_vertex_env_ring_info.is_critical();
break;
case VK_HEAP_CHECK_FRAGMENT_ENV_STORAGE:
heap_critical = m_fragment_env_ring_info.is_critical() || m_raster_env_ring_info.is_critical();
break;
case VK_HEAP_CHECK_TEXTURE_ENV_STORAGE:
heap_critical = m_fragment_texture_params_ring_info.is_critical();
break;
case VK_HEAP_CHECK_VERTEX_LAYOUT_STORAGE:
heap_critical = m_vertex_layout_ring_info.is_critical();
break;
case VK_HEAP_CHECK_TRANSFORM_CONSTANTS_STORAGE:
heap_critical = m_transform_constants_ring_info.is_critical();
break;
case VK_HEAP_CHECK_FRAGMENT_CONSTANTS_STORAGE:
heap_critical = m_fragment_constants_ring_info.is_critical();
break;
default:
fmt::throw_exception("Unexpected heap flag set! (0x%X)", test);
}
flags &= ~test;
test <<= 1;
}
while (flags && !heap_critical);
}
if (heap_critical)
{
m_profiler.start();
vk::frame_context_t *target_frame = nullptr;
if (!m_queued_frames.empty())
{
if (m_current_frame != &m_aux_frame_context)
{
target_frame = m_queued_frames.front();
}
}
if (target_frame == nullptr)
{
flush_command_queue(true);
m_vertex_cache->purge();
m_index_buffer_ring_info.reset_allocation_stats();
m_fragment_env_ring_info.reset_allocation_stats();
m_vertex_env_ring_info.reset_allocation_stats();
m_fragment_texture_params_ring_info.reset_allocation_stats();
m_vertex_layout_ring_info.reset_allocation_stats();
m_fragment_constants_ring_info.reset_allocation_stats();
m_transform_constants_ring_info.reset_allocation_stats();
m_attrib_ring_info.reset_allocation_stats();
m_texture_upload_buffer_ring_info.reset_allocation_stats();
m_raster_env_ring_info.reset_allocation_stats();
m_current_frame->reset_heap_ptrs();
m_last_heap_sync_time = rsx::get_shared_tag();
}
else
{
// Flush the frame context
frame_context_cleanup(target_frame);
}
m_frame_stats.flip_time += m_profiler.duration();
}
}
void VKGSRender::check_present_status()
{
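// Retire queued frames whose swap command buffers have already completed on the GPU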
while (!m_queued_frames.empty())
{
auto ctx = m_queued_frames.front();
if (!ctx->swap_command_buffer->poke())
{
return;
}
frame_context_cleanup(ctx);
}
}
void VKGSRender::check_descriptors()
{
// Ease resource pressure if the number of draw calls becomes too high or we are running low on memory resources
const auto required_descriptors = rsx::method_registers.current_draw_clause.pass_count();
if (!m_current_frame->descriptor_pool.can_allocate(required_descriptors, m_current_frame->used_descriptors))
{
// Should hard sync before resetting descriptors for spec compliance
flush_command_queue(true);
m_current_frame->descriptor_pool.reset(0);
m_current_frame->used_descriptors = 0;
}
}
VkDescriptorSet VKGSRender::allocate_descriptor_set()
{
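// The shader interpreter manages its own descriptor sets; everything else allocates from the per-frame pool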
if (!m_shader_interpreter.is_interpreter(m_program)) [[likely]]
{
return m_current_frame->descriptor_pool.allocate(descriptor_layouts, VK_TRUE, m_current_frame->used_descriptors++);
}
else
{
return m_shader_interpreter.allocate_descriptor_set();
}
}
void VKGSRender::set_viewport()
{
const auto [clip_width, clip_height] = rsx::apply_resolution_scale<true>(
rsx::method_registers.surface_clip_width(), rsx::method_registers.surface_clip_height());
const auto zclip_near = rsx::method_registers.clip_min();
const auto zclip_far = rsx::method_registers.clip_max();
//NOTE: The scale_offset matrix already has viewport matrix factored in
m_viewport.x = 0;
m_viewport.y = 0;
m_viewport.width = clip_width;
m_viewport.height = clip_height;
if (m_device->get_unrestricted_depth_range_support())
{
m_viewport.minDepth = zclip_near;
m_viewport.maxDepth = zclip_far;
}
else
{
m_viewport.minDepth = 0.f;
m_viewport.maxDepth = 1.f;
}
m_graphics_state &= ~(rsx::pipeline_state::zclip_config_state_dirty);
}
void VKGSRender::set_scissor(bool clip_viewport)
{
areau scissor;
if (get_scissor(scissor, clip_viewport))
{
m_scissor.extent.height = scissor.height();
m_scissor.extent.width = scissor.width();
m_scissor.offset.x = scissor.x1;
m_scissor.offset.y = scissor.y1;
}
}
void VKGSRender::bind_viewport()
{
if (m_graphics_state & rsx::pipeline_state::zclip_config_state_dirty)
{
if (m_device->get_unrestricted_depth_range_support())
{
m_viewport.minDepth = rsx::method_registers.clip_min();
m_viewport.maxDepth = rsx::method_registers.clip_max();
}
m_graphics_state &= ~(rsx::pipeline_state::zclip_config_state_dirty);
}
vkCmdSetViewport(*m_current_command_buffer, 0, 1, &m_viewport);
vkCmdSetScissor(*m_current_command_buffer, 0, 1, &m_scissor);
}
void VKGSRender::on_init_thread()
{
if (m_device == VK_NULL_HANDLE)
{
fmt::throw_exception("No vulkan device was created");
}
GSRender::on_init_thread();
zcull_ctrl.reset(static_cast<::rsx::reports::ZCULL_control*>(this));
if (!m_overlay_manager)
{
m_frame->hide();
m_shaders_cache->load(nullptr, pipeline_layout);
m_frame->show();
}
else
{
rsx::shader_loading_dialog_native dlg(this);
// TODO: Handle window resize messages during loading on GPUs without OUT_OF_DATE_KHR support
m_shaders_cache->load(&dlg, pipeline_layout);
}
}
void VKGSRender::on_exit()
{
GSRender::on_exit();
zcull_ctrl.release();
}
void VKGSRender::clear_surface(u32 mask)
{
if (skip_current_frame || swapchain_unavailable) return;
// If stencil write mask is disabled, remove clear_stencil bit
if (!rsx::method_registers.stencil_mask()) mask &= ~RSX_GCM_CLEAR_STENCIL_BIT;
// Ignore invalid clear flags
if (!(mask & RSX_GCM_CLEAR_ANY_MASK)) return;
u8 ctx = rsx::framebuffer_creation_context::context_draw;
if (mask & RSX_GCM_CLEAR_COLOR_MASK) ctx |= rsx::framebuffer_creation_context::context_clear_color;
if (mask & RSX_GCM_CLEAR_DEPTH_STENCIL_MASK) ctx |= rsx::framebuffer_creation_context::context_clear_depth;
init_buffers(rsx::framebuffer_creation_context{ctx});
if (!framebuffer_status_valid) return;
//float depth_clear = 1.f;
u32 stencil_clear = 0;
u32 depth_stencil_mask = 0;
std::vector<VkClearAttachment> clear_descriptors;
VkClearValue depth_stencil_clear_values = {}, color_clear_values = {};
u16 scissor_x = static_cast<u16>(m_scissor.offset.x);
u16 scissor_w = static_cast<u16>(m_scissor.extent.width);
u16 scissor_y = static_cast<u16>(m_scissor.offset.y);
u16 scissor_h = static_cast<u16>(m_scissor.extent.height);
const u16 fb_width = m_draw_fbo->width();
const u16 fb_height = m_draw_fbo->height();
//clip region
std::tie(scissor_x, scissor_y, scissor_w, scissor_h) = rsx::clip_region<u16>(fb_width, fb_height, scissor_x, scissor_y, scissor_w, scissor_h, true);
VkClearRect region = { { { scissor_x, scissor_y }, { scissor_w, scissor_h } }, 0, 1 };
const bool full_frame = (scissor_w == fb_width && scissor_h == fb_height);
bool update_color = false, update_z = false;
auto surface_depth_format = rsx::method_registers.surface_depth_fmt();
if (auto ds = std::get<1>(m_rtts.m_bound_depth_stencil); mask & RSX_GCM_CLEAR_DEPTH_STENCIL_MASK)
{
if (mask & RSX_GCM_CLEAR_DEPTH_BIT)
{
u32 max_depth_value = get_max_depth_value(surface_depth_format);
u32 clear_depth = rsx::method_registers.z_clear_value(is_depth_stencil_format(surface_depth_format));
float depth_clear = static_cast<float>(clear_depth) / max_depth_value;
depth_stencil_clear_values.depthStencil.depth = depth_clear;
depth_stencil_clear_values.depthStencil.stencil = stencil_clear;
depth_stencil_mask |= VK_IMAGE_ASPECT_DEPTH_BIT;
}
if (is_depth_stencil_format(surface_depth_format))
{
if (mask & RSX_GCM_CLEAR_STENCIL_BIT)
{
u8 clear_stencil = rsx::method_registers.stencil_clear_value();
depth_stencil_clear_values.depthStencil.stencil = clear_stencil;
depth_stencil_mask |= VK_IMAGE_ASPECT_STENCIL_BIT;
if (ds->samples() > 1)
{
if (full_frame) ds->stencil_init_flags &= 0xFF;
ds->stencil_init_flags |= clear_stencil;
}
}
}
if ((depth_stencil_mask && depth_stencil_mask != ds->aspect()) || !full_frame)
{
// At least one aspect is not being cleared or the clear does not cover the full frame
// Steps to initialize memory are required
if (ds->state_flags & rsx::surface_state_flags::erase_bkgnd && // Needs initialization
ds->old_contents.empty() && !g_cfg.video.read_depth_buffer) // No way to load data from memory, so no initialization given
{
// Only one aspect was cleared. Make sure to memory initialize the other before removing dirty flag
const auto ds_mask = (mask & RSX_GCM_CLEAR_DEPTH_STENCIL_MASK);
if (ds_mask == RSX_GCM_CLEAR_DEPTH_BIT && (ds->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT))
{
// Depth was cleared, initialize stencil
depth_stencil_clear_values.depthStencil.stencil = 0xFF;
depth_stencil_mask |= VK_IMAGE_ASPECT_STENCIL_BIT;
}
else if (ds_mask == RSX_GCM_CLEAR_STENCIL_BIT)
{
// Stencil was cleared, initialize depth
depth_stencil_clear_values.depthStencil.depth = 1.f;
depth_stencil_mask |= VK_IMAGE_ASPECT_DEPTH_BIT;
}
}
else
{
// Barrier required before any writes
ds->write_barrier(*m_current_command_buffer);
}
}
}
if (auto colormask = (mask & RSX_GCM_CLEAR_COLOR_MASK))
{
if (!m_draw_buffers.empty())
{
bool use_fast_clear = false;
u8 clear_a = rsx::method_registers.clear_color_a();
u8 clear_r = rsx::method_registers.clear_color_r();
u8 clear_g = rsx::method_registers.clear_color_g();
u8 clear_b = rsx::method_registers.clear_color_b();
switch (rsx::method_registers.surface_color())
{
case rsx::surface_color_format::x32:
case rsx::surface_color_format::w16z16y16x16:
case rsx::surface_color_format::w32z32y32x32:
{
//NOP
colormask = 0;
break;
}
case rsx::surface_color_format::b8:
{
rsx::get_b8_clear_color(clear_r, clear_g, clear_b, clear_a);
colormask = rsx::get_b8_clearmask(colormask);
use_fast_clear = (colormask == RSX_GCM_CLEAR_RED_BIT);
break;
}
case rsx::surface_color_format::g8b8:
{
rsx::get_g8b8_clear_color(clear_r, clear_g, clear_b, clear_a);
colormask = rsx::get_g8b8_r8g8_clearmask(colormask);
use_fast_clear = (colormask == (RSX_GCM_CLEAR_RED_BIT | RSX_GCM_CLEAR_GREEN_BIT));
break;
}
case rsx::surface_color_format::a8b8g8r8:
case rsx::surface_color_format::x8b8g8r8_o8b8g8r8:
case rsx::surface_color_format::x8b8g8r8_z8b8g8r8:
{
rsx::get_abgr8_clear_color(clear_r, clear_g, clear_b, clear_a);
colormask = rsx::get_abgr8_clearmask(colormask);
[[fallthrough]];
}
default:
{
use_fast_clear = (colormask == RSX_GCM_CLEAR_COLOR_MASK);
break;
}
}
if (colormask)
{
if (!use_fast_clear || !full_frame)
{
// If we're not clobbering all of the memory, a barrier is required
for (const auto& index : m_rtts.m_bound_render_target_ids)
{
m_rtts.m_bound_render_targets[index].second->write_barrier(*m_current_command_buffer);
}
}
color_clear_values.color.float32[0] = static_cast<float>(clear_r) / 255;
color_clear_values.color.float32[1] = static_cast<float>(clear_g) / 255;
color_clear_values.color.float32[2] = static_cast<float>(clear_b) / 255;
color_clear_values.color.float32[3] = static_cast<float>(clear_a) / 255;
if (use_fast_clear)
{
for (u32 index = 0; index < m_draw_buffers.size(); ++index)
{
clear_descriptors.push_back({ VK_IMAGE_ASPECT_COLOR_BIT, index, color_clear_values });
}
}
else
{
color4f clear_color =
{
color_clear_values.color.float32[0],
color_clear_values.color.float32[1],
color_clear_values.color.float32[2],
color_clear_values.color.float32[3]
};
auto attachment_clear_pass = vk::get_overlay_pass<vk::attachment_clear_pass>();
attachment_clear_pass->run(*m_current_command_buffer, m_draw_fbo, region.rect, colormask, clear_color, get_render_pass());
}
update_color = true;
}
}
}
if (depth_stencil_mask)
{
if ((depth_stencil_mask & VK_IMAGE_ASPECT_STENCIL_BIT) &&
rsx::method_registers.stencil_mask() != 0xff)
{
// Partial stencil clear. Disables fast stencil clear
auto ds = std::get<1>(m_rtts.m_bound_depth_stencil);
auto key = vk::get_renderpass_key({ ds });
auto renderpass = vk::get_renderpass(*m_device, key);
vk::get_overlay_pass<vk::stencil_clear_pass>()->run(
*m_current_command_buffer, ds, region.rect,
depth_stencil_clear_values.depthStencil.stencil,
rsx::method_registers.stencil_mask(), renderpass);
depth_stencil_mask &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
}
if (depth_stencil_mask)
{
clear_descriptors.push_back({ static_cast<VkImageAspectFlags>(depth_stencil_mask), 0, depth_stencil_clear_values });
}
update_z = true;
}
if (update_color || update_z)
{
m_rtts.on_write({ update_color, update_color, update_color, update_color }, update_z);
}
if (!clear_descriptors.empty())
{
begin_render_pass();
vkCmdClearAttachments(*m_current_command_buffer, ::size32(clear_descriptors), clear_descriptors.data(), 1, &region);
}
}
void VKGSRender::flush_command_queue(bool hard_sync, bool do_not_switch)
{
close_and_submit_command_buffer();
if (hard_sync)
{
// wait for the latest instruction to execute
m_current_command_buffer->reset();
// Clear all command buffer statuses
m_primary_cb_list.poke_all();
// Drain present queue
while (!m_queued_frames.empty())
{
check_present_status();
}
m_flush_requests.clear_pending_flag();
}
if (!do_not_switch)
{
// Grab next cb in line and make it usable
// NOTE: Even in the case of a hard sync, this is required to free any waiters on the CB (ZCULL)
m_current_command_buffer = m_primary_cb_list.next();
m_current_command_buffer->reset();
}
else
{
// Special hard-sync where we must preserve the CB. This can happen when an emergency event handler is invoked and needs to flush to hw.
ensure(hard_sync);
}
// Just in case a queued frame holds a ref to this cb, drain the present queue
check_present_status();
if (m_occlusion_query_active)
{
m_current_command_buffer->flags |= vk::command_buffer::cb_load_occluson_task;
}
m_current_command_buffer->begin();
}
bool VKGSRender::release_GCM_label(u32 address, u32 args)
{
if (!backend_config.supports_host_gpu_labels)
{
return false;
}
auto drain_label_queue = [this]()
{
while (m_host_data_ptr->last_label_release_event > m_host_data_ptr->commands_complete_event)
{
utils::pause();
if (thread_ctrl::state() == thread_state::aborting)
{
break;
}
}
};
ensure(m_host_data_ptr);
if (m_host_data_ptr->texture_load_complete_event == m_host_data_ptr->texture_load_request_event)
{
// All texture loads already seen by the host GPU
// Wait for all previously submitted labels to be flushed
drain_label_queue();
return false;
}
const auto mapping = vk::map_dma(address, 4);
const auto write_data = std::bit_cast<u32, be_t<u32>>(args);
if (!dynamic_cast<vk::memory_block_host*>(mapping.second->memory.get()))
{
// NVIDIA GPUs can disappoint when DMA blocks straddle VirtualAlloc boundaries.
// Take the L and try the fallback.
rsx_log.warning("Host label update at 0x%x was not possible.", address);
drain_label_queue();
return false;
}
m_host_data_ptr->last_label_release_event = m_host_data_ptr->inc_counter();
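// If a texture upload was requested after the last label submission, write the label through the primary command buffer and flush;
// otherwise a small secondary submission that also bumps commands_complete_event is enough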
if (m_host_data_ptr->texture_load_request_event > m_host_data_ptr->last_label_submit_event)
{
if (vk::is_renderpass_open(*m_current_command_buffer))
{
vk::end_renderpass(*m_current_command_buffer);
}
vkCmdUpdateBuffer(*m_current_command_buffer, mapping.second->value, mapping.first, 4, &write_data);
flush_command_queue();
}
else
{
auto cmd = m_secondary_cb_list.next();
cmd->begin();
vkCmdUpdateBuffer(*cmd, mapping.second->value, mapping.first, 4, &write_data);
vkCmdUpdateBuffer(*cmd, m_host_object_data->value, ::offset32(&vk::host_data_t::commands_complete_event), 8, const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
cmd->end();
vk::queue_submit_t submit_info = { m_device->get_graphics_queue(), nullptr };
cmd->submit(submit_info);
m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
}
return true;
}
void VKGSRender::sync_hint(rsx::FIFO_hint hint, rsx::reports::sync_hint_payload_t payload)
{
rsx::thread::sync_hint(hint, payload);
if (!(m_current_command_buffer->flags & vk::command_buffer::cb_has_occlusion_task))
{
// Occlusion queries not enabled, do nothing
return;
}
// Occlusion test result evaluation is coming up, avoid a hard sync
switch (hint)
{
case rsx::FIFO_hint::hint_conditional_render_eval:
{
// If a flush request is already enqueued, do nothing
if (m_flush_requests.pending())
{
return;
}
// If the result is not going to be read by CELL, do nothing
const auto ref_addr = static_cast<u32>(payload.address);
if (!zcull_ctrl->is_query_result_urgent(ref_addr))
{
// No effect on CELL behaviour, it will be faster to handle this in RSX code
return;
}
// OK, cell will be accessing the results, probably.
// Try to avoid flush spam, it is more costly to flush the CB than it is to just upload the vertex data
// This is supposed to be an optimization after all.
const auto now = rsx::uclock();
if ((now - m_last_cond_render_eval_hint) > 50)
{
// Schedule a sync on the next loop iteration
m_flush_requests.post(false);
m_flush_requests.remove_one();
}
m_last_cond_render_eval_hint = now;
break;
}
case rsx::FIFO_hint::hint_zcull_sync:
{
// Check if the required report is synced to this CB
auto& data = m_occlusion_map[payload.query->driver_handle];
// NOTE: Currently, a special condition exists where the indices can be empty even with active draw count.
// This is caused by async compiler and should be removed when ubershaders are added in
if (!data.is_current(m_current_command_buffer) || data.indices.empty())
{
return;
}
// Unavoidable hard sync coming up, flush immediately
// This heavyweight hint should be used with caution
std::lock_guard lock(m_flush_queue_mutex);
flush_command_queue();
if (m_flush_requests.pending())
{
// Clear without wait
m_flush_requests.clear_pending_flag();
}
break;
}
}
}
void VKGSRender::do_local_task(rsx::FIFO_state state)
{
if (m_queue_status & flush_queue_state::deadlock)
{
// Clear offloader deadlock
// NOTE: It is not possible to handle regular flush requests before this is cleared
// NOTE: This may cause graphics corruption due to unsynchronized modification
on_invalidate_memory_range(m_offloader_fault_range, m_offloader_fault_cause);
m_queue_status.clear(flush_queue_state::deadlock);
}
if (m_queue_status & flush_queue_state::flushing)
{
// Abort recursive CB submit requests.
// When flushing flag is already set, only deadlock events may be processed.
return;
}
else if (m_flush_requests.pending())
{
if (m_flush_queue_mutex.try_lock())
{
// TODO: Determine if a hard sync is necessary
// Pipeline barriers later may do a better job synchronizing than wholly stalling the pipeline
flush_command_queue();
m_flush_requests.clear_pending_flag();
m_flush_requests.consumer_wait();
m_flush_queue_mutex.unlock();
}
}
else if (!in_begin_end && state != rsx::FIFO_state::lock_wait)
{
if (m_graphics_state & rsx::pipeline_state::framebuffer_reads_dirty)
{
//This will re-engage locks and break the texture cache if another thread is waiting in the access violation handler!
//Only call when there are no waiters
m_texture_cache.do_update();
m_graphics_state &= ~rsx::pipeline_state::framebuffer_reads_dirty;
}
}
rsx::thread::do_local_task(state);
switch (state)
{
case rsx::FIFO_state::lock_wait:
// Critical check finished
return;
//case rsx::FIFO_state::spinning:
//case rsx::FIFO_state::empty:
// We have some time, check the present queue
//check_present_status();
//break;
default:
break;
}
if (m_overlay_manager)
{
if (!in_begin_end && async_flip_requested & flip_request::native_ui)
{
flush_command_queue(true);
rsx::display_flip_info_t info{};
info.buffer = current_display_buffer;
flip(info);
}
}
}
bool VKGSRender::load_program()
{
if (m_graphics_state & rsx::pipeline_state::invalidate_pipeline_bits)
{
get_current_fragment_program(fs_sampler_state);
ensure(current_fragment_program.valid);
get_current_vertex_program(vs_sampler_state);
m_graphics_state &= ~rsx::pipeline_state::invalidate_pipeline_bits;
}
auto &vertex_program = current_vertex_program;
auto &fragment_program = current_fragment_program;
auto old_program = m_program;
vk::pipeline_props properties{};
// Input assembly
bool emulated_primitive_type;
properties.state.set_primitive_type(vk::get_appropriate_topology(rsx::method_registers.current_draw_clause.primitive, emulated_primitive_type));
const bool restarts_valid = rsx::method_registers.current_draw_clause.command == rsx::draw_command::indexed && !emulated_primitive_type && !rsx::method_registers.current_draw_clause.is_disjoint_primitive;
if (rsx::method_registers.restart_index_enabled() && !vk::emulate_primitive_restart(rsx::method_registers.current_draw_clause.primitive) && restarts_valid)
properties.state.enable_primitive_restart();
// Rasterizer state
properties.state.set_attachment_count(::size32(m_draw_buffers));
properties.state.set_front_face(vk::get_front_face(rsx::method_registers.front_face_mode()));
properties.state.enable_depth_clamp(rsx::method_registers.depth_clamp_enabled() || !rsx::method_registers.depth_clip_enabled());
properties.state.enable_depth_bias(true);
properties.state.enable_depth_bounds_test(m_device->get_depth_bounds_support());
if (rsx::method_registers.depth_test_enabled())
{
//NOTE: Like stencil, depth write is meaningless without depth test
properties.state.set_depth_mask(rsx::method_registers.depth_write_enabled());
properties.state.enable_depth_test(vk::get_compare_func(rsx::method_registers.depth_func()));
}
if (rsx::method_registers.logic_op_enabled())
properties.state.enable_logic_op(vk::get_logic_op(rsx::method_registers.logic_operation()));
if (rsx::method_registers.cull_face_enabled())
properties.state.enable_cull_face(vk::get_cull_face(rsx::method_registers.cull_face_mode()));
for (uint index = 0; index < m_draw_buffers.size(); ++index)
{
bool color_mask_b = rsx::method_registers.color_mask_b(index);
bool color_mask_g = rsx::method_registers.color_mask_g(index);
bool color_mask_r = rsx::method_registers.color_mask_r(index);
bool color_mask_a = rsx::method_registers.color_mask_a(index);
switch (rsx::method_registers.surface_color())
{
case rsx::surface_color_format::b8:
rsx::get_b8_colormask(color_mask_r, color_mask_g, color_mask_b, color_mask_a);
break;
case rsx::surface_color_format::g8b8:
rsx::get_g8b8_r8g8_colormask(color_mask_r, color_mask_g, color_mask_b, color_mask_a);
break;
default:
break;
}
properties.state.set_color_mask(index, color_mask_r, color_mask_g, color_mask_b, color_mask_a);
}
bool mrt_blend_enabled[] =
{
rsx::method_registers.blend_enabled(),
rsx::method_registers.blend_enabled_surface_1(),
rsx::method_registers.blend_enabled_surface_2(),
rsx::method_registers.blend_enabled_surface_3()
};
VkBlendFactor sfactor_rgb, sfactor_a, dfactor_rgb, dfactor_a;
VkBlendOp equation_rgb, equation_a;
if (mrt_blend_enabled[0] || mrt_blend_enabled[1] || mrt_blend_enabled[2] || mrt_blend_enabled[3])
{
sfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_rgb());
sfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_sfactor_a());
dfactor_rgb = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_rgb());
dfactor_a = vk::get_blend_factor(rsx::method_registers.blend_func_dfactor_a());
equation_rgb = vk::get_blend_op(rsx::method_registers.blend_equation_rgb());
equation_a = vk::get_blend_op(rsx::method_registers.blend_equation_a());
for (u8 idx = 0; idx < m_draw_buffers.size(); ++idx)
{
if (mrt_blend_enabled[idx])
{
properties.state.enable_blend(idx, sfactor_rgb, sfactor_a, dfactor_rgb, dfactor_a, equation_rgb, equation_a);
}
}
}
if (rsx::method_registers.stencil_test_enabled())
{
if (!rsx::method_registers.two_sided_stencil_test_enabled())
{
properties.state.enable_stencil_test(
vk::get_stencil_op(rsx::method_registers.stencil_op_fail()),
vk::get_stencil_op(rsx::method_registers.stencil_op_zfail()),
vk::get_stencil_op(rsx::method_registers.stencil_op_zpass()),
vk::get_compare_func(rsx::method_registers.stencil_func()),
0xFF, 0xFF); //write mask, func_mask, ref are dynamic
}
else
{
properties.state.enable_stencil_test_separate(0,
vk::get_stencil_op(rsx::method_registers.stencil_op_fail()),
vk::get_stencil_op(rsx::method_registers.stencil_op_zfail()),
vk::get_stencil_op(rsx::method_registers.stencil_op_zpass()),
vk::get_compare_func(rsx::method_registers.stencil_func()),
0xFF, 0xFF); //write mask, func_mask, ref are dynamic
properties.state.enable_stencil_test_separate(1,
vk::get_stencil_op(rsx::method_registers.back_stencil_op_fail()),
vk::get_stencil_op(rsx::method_registers.back_stencil_op_zfail()),
vk::get_stencil_op(rsx::method_registers.back_stencil_op_zpass()),
vk::get_compare_func(rsx::method_registers.back_stencil_func()),
0xFF, 0xFF); //write mask, func_mask, ref are dynamic
}
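// For MSAA depth-stencil surfaces that are not already marked for a full stencil transfer, check whether any stencil op can actually modify stencil contents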
if (auto ds = m_rtts.m_bound_depth_stencil.second;
ds && ds->samples() > 1 && !(ds->stencil_init_flags & 0xFF00))
{
if (properties.state.ds.front.failOp != VK_STENCIL_OP_KEEP ||
properties.state.ds.front.depthFailOp != VK_STENCIL_OP_KEEP ||
properties.state.ds.front.passOp != VK_STENCIL_OP_KEEP ||
properties.state.ds.back.failOp != VK_STENCIL_OP_KEEP ||
properties.state.ds.back.depthFailOp != VK_STENCIL_OP_KEEP ||
properties.state.ds.back.passOp != VK_STENCIL_OP_KEEP)
{
// Set the (1 << 8) flag to signal that a full bit-wise stencil transfer is required
ds->stencil_init_flags |= (1 << 8);
}
}
}
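// The renderpass key encodes the rasterization sample count in bits [19:16]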
const auto rasterization_samples = u8((m_current_renderpass_key >> 16) & 0xF);
if (backend_config.supports_hw_a2c || rasterization_samples > 1)
{
const bool alpha_to_one_enable = rsx::method_registers.msaa_alpha_to_one_enabled() && backend_config.supports_hw_a2one;
properties.state.set_multisample_state(
rasterization_samples,
rsx::method_registers.msaa_sample_mask(),
rsx::method_registers.msaa_enabled(),
rsx::method_registers.msaa_alpha_to_coverage_enabled(),
alpha_to_one_enable);
// A problem observed on multiple GPUs is that interior geometry edges can resolve 0 samples unless we force a shading rate of 1.
// For whatever reason, the way MSAA images are 'resolved' on PS3 bypasses this issue.
// NOTE: We do not do an image resolve at all; the output is merely 'exploded' and the guest application is responsible for doing the resolve in software, as it is on real hardware.
properties.state.set_multisample_shading_rate(1.f);
}
properties.renderpass_key = m_current_renderpass_key;
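// If the bound program is not the interpreter and the pipeline properties are unchanged, reuse the current pipeline without a cache lookup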
if (!m_interpreter_state && m_program) [[likely]]
{
if (!m_shader_interpreter.is_interpreter(m_program) &&
m_pipeline_properties == properties)
{
// Nothing changed
return true;
}
}
const auto shadermode = g_cfg.video.shadermode.get();
m_vertex_prog = nullptr;
m_fragment_prog = nullptr;
if (shadermode != shader_mode::interpreter_only) [[likely]]
{
vk::enter_uninterruptible();
// Load current program from cache
std::tie(m_program, m_vertex_prog, m_fragment_prog) = m_prog_buffer->get_graphics_pipeline(vertex_program, fragment_program, properties,
shadermode != shader_mode::recompiler, true, pipeline_layout);
vk::leave_uninterruptible();
if (m_prog_buffer->check_cache_missed())
{
// Notify the user with HUD notification
if (g_cfg.misc.show_shader_compilation_hint)
{
if (m_overlay_manager)
{
if (auto dlg = m_overlay_manager->get<rsx::overlays::shader_compile_notification>())
{
// Extend duration
dlg->touch();
}
else
{
// Create dialog but do not show immediately
m_overlay_manager->create<rsx::overlays::shader_compile_notification>();
}
}
}
}
}
else
{
m_program = nullptr;
}
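// No pipeline is available (async compilation still in flight) or interpreter-only mode is active; fall back to the shader interpreter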
if (!m_program && (shadermode == shader_mode::async_with_interpreter || shadermode == shader_mode::interpreter_only))
{
if (!m_shader_interpreter.is_interpreter(old_program))
{
m_interpreter_state = rsx::invalidate_pipeline_bits;
}
m_program = m_shader_interpreter.get(properties, current_fp_metadata);
}
m_pipeline_properties = properties;
return m_program != nullptr;
}
void VKGSRender::load_program_env()
{
if (!m_program)
{
fmt::throw_exception("Unreachable right now");
}
const u32 fragment_constants_size = current_fp_metadata.program_constants_buffer_length;
const bool update_transform_constants = !!(m_graphics_state & rsx::pipeline_state::transform_constants_dirty);
const bool update_fragment_constants = !!(m_graphics_state & rsx::pipeline_state::fragment_constants_dirty);
const bool update_vertex_env = !!(m_graphics_state & rsx::pipeline_state::vertex_state_dirty);
const bool update_fragment_env = !!(m_graphics_state & rsx::pipeline_state::fragment_state_dirty);
const bool update_fragment_texture_env = !!(m_graphics_state & rsx::pipeline_state::fragment_texture_state_dirty);
const bool update_instruction_buffers = (!!m_interpreter_state && m_shader_interpreter.is_interpreter(m_program));
const bool update_raster_env = (rsx::method_registers.polygon_stipple_enabled() && !!(m_graphics_state & rsx::pipeline_state::polygon_stipple_pattern_dirty));
if (update_vertex_env)
{
check_heap_status(VK_HEAP_CHECK_VERTEX_ENV_STORAGE);
// Vertex state
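// Layout: scale-offset data at +0, user clip planes at +64, transform branch bits at +128, point size at +132, clip min/max at +136/+140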
const auto mem = m_vertex_env_ring_info.alloc<256>(256);
auto buf = static_cast<u8*>(m_vertex_env_ring_info.map(mem, 148));
fill_scale_offset_data(buf, false);
fill_user_clip_data(buf + 64);
*(reinterpret_cast<u32*>(buf + 128)) = rsx::method_registers.transform_branch_bits();
*(reinterpret_cast<f32*>(buf + 132)) = rsx::method_registers.point_size() * rsx::get_resolution_scale();
*(reinterpret_cast<f32*>(buf + 136)) = rsx::method_registers.clip_min();
*(reinterpret_cast<f32*>(buf + 140)) = rsx::method_registers.clip_max();
m_vertex_env_ring_info.unmap();
m_vertex_env_buffer_info = { m_vertex_env_ring_info.heap->value, mem, 144 };
}
if (update_transform_constants)
{
// Transform constants
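// Upload the full 8 KiB constant block if the program indexes constants dynamically (or no vertex program object is available); otherwise upload only the referenced constants (16 bytes each)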
const usz transform_constants_size = (!m_vertex_prog || m_vertex_prog->has_indexed_constants) ? 8192 : m_vertex_prog->constant_ids.size() * 16;
if (transform_constants_size)
{
check_heap_status(VK_HEAP_CHECK_TRANSFORM_CONSTANTS_STORAGE);
const auto alignment = m_device->gpu().get_limits().minUniformBufferOffsetAlignment;
auto mem = m_transform_constants_ring_info.alloc<1>(utils::align(transform_constants_size, alignment));
auto buf = m_transform_constants_ring_info.map(mem, transform_constants_size);
const std::vector<u16>& constant_ids = (transform_constants_size == 8192) ? std::vector<u16>{} : m_vertex_prog->constant_ids;
fill_vertex_program_constants_data(buf, constant_ids);
m_transform_constants_ring_info.unmap();
m_vertex_constants_buffer_info = { m_transform_constants_ring_info.heap->value, mem, transform_constants_size };
}
}
if (update_fragment_constants && !update_instruction_buffers)
{
check_heap_status(VK_HEAP_CHECK_FRAGMENT_CONSTANTS_STORAGE);
// Fragment constants
if (fragment_constants_size)
{
auto mem = m_fragment_constants_ring_info.alloc<256>(fragment_constants_size);
auto buf = m_fragment_constants_ring_info.map(mem, fragment_constants_size);
m_prog_buffer->fill_fragment_constants_buffer({ reinterpret_cast<float*>(buf), fragment_constants_size },
*ensure(m_fragment_prog), current_fragment_program, true);
m_fragment_constants_ring_info.unmap();
m_fragment_constants_buffer_info = { m_fragment_constants_ring_info.heap->value, mem, fragment_constants_size };
}
else
{
m_fragment_constants_buffer_info = { m_fragment_constants_ring_info.heap->value, 0, 32 };
}
}
if (update_fragment_env)
{
check_heap_status(VK_HEAP_CHECK_FRAGMENT_ENV_STORAGE);
auto mem = m_fragment_env_ring_info.alloc<256>(256);
auto buf = m_fragment_env_ring_info.map(mem, 32);
fill_fragment_state_buffer(buf, current_fragment_program);
m_fragment_env_ring_info.unmap();
m_fragment_env_buffer_info = { m_fragment_env_ring_info.heap->value, mem, 32 };
}
if (update_fragment_texture_env)
{
check_heap_status(VK_HEAP_CHECK_TEXTURE_ENV_STORAGE);
auto mem = m_fragment_texture_params_ring_info.alloc<256>(512);
auto buf = m_fragment_texture_params_ring_info.map(mem, 512);
current_fragment_program.texture_params.write_to(buf, current_fp_metadata.referenced_textures_mask);
m_fragment_texture_params_ring_info.unmap();
m_fragment_texture_params_buffer_info = { m_fragment_texture_params_ring_info.heap->value, mem, 512 };
}
if (update_raster_env)
{
check_heap_status(VK_HEAP_CHECK_FRAGMENT_ENV_STORAGE);
auto mem = m_raster_env_ring_info.alloc<256>(256);
auto buf = m_raster_env_ring_info.map(mem, 128);
std::memcpy(buf, rsx::method_registers.polygon_stipple_pattern(), 128);
m_raster_env_ring_info.unmap();
m_raster_env_buffer_info = { m_raster_env_ring_info.heap->value, mem, 128 };
m_graphics_state &= ~(rsx::pipeline_state::polygon_stipple_pattern_dirty);
}
if (update_instruction_buffers)
{
if (m_interpreter_state & rsx::vertex_program_dirty)
{
// Attach vertex buffer data
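// 16-byte header (base address, entry point, output mask, two-sided lighting flag) followed by the raw vertex program ucode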
const auto vp_block_length = current_vp_metadata.ucode_length + 16;
auto vp_mapping = m_vertex_instructions_buffer.alloc<256>(vp_block_length);
auto vp_buf = static_cast<u8*>(m_vertex_instructions_buffer.map(vp_mapping, vp_block_length));
auto vp_config = reinterpret_cast<u32*>(vp_buf);
vp_config[0] = current_vertex_program.base_address;
vp_config[1] = current_vertex_program.entry;
vp_config[2] = current_vertex_program.output_mask;
vp_config[3] = rsx::method_registers.two_side_light_en() ? 1u : 0u;
std::memcpy(vp_buf + 16, current_vertex_program.data.data(), current_vp_metadata.ucode_length);
m_vertex_instructions_buffer.unmap();
m_vertex_instructions_buffer_info = { m_vertex_instructions_buffer.heap->value, vp_mapping, vp_block_length };
}
if (m_interpreter_state & rsx::fragment_program_dirty)
{
// Attach fragment buffer data
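// 16-byte header (shader control word, texture dimensions mask) followed by the raw fragment program ucode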
const auto fp_block_length = current_fp_metadata.program_ucode_length + 16;
auto fp_mapping = m_fragment_instructions_buffer.alloc<256>(fp_block_length);
auto fp_buf = static_cast<u8*>(m_fragment_instructions_buffer.map(fp_mapping, fp_block_length));
// Control mask
const auto control_masks = reinterpret_cast<u32*>(fp_buf);
control_masks[0] = rsx::method_registers.shader_control();
control_masks[1] = current_fragment_program.texture_state.texture_dimensions;
std::memcpy(fp_buf + 16, current_fragment_program.get_data(), current_fragment_program.ucode_length);
m_fragment_instructions_buffer.unmap();
m_fragment_instructions_buffer_info = { m_fragment_instructions_buffer.heap->value, fp_mapping, fp_block_length };
}
}
const auto& binding_table = m_device->get_pipeline_binding_table();
m_program->bind_uniform(m_vertex_env_buffer_info, binding_table.vertex_params_bind_slot, m_current_frame->descriptor_set);
m_program->bind_uniform(m_vertex_constants_buffer_info, binding_table.vertex_constant_buffers_bind_slot, m_current_frame->descriptor_set);
m_program->bind_uniform(m_fragment_env_buffer_info, binding_table.fragment_state_bind_slot, m_current_frame->descriptor_set);
m_program->bind_uniform(m_fragment_texture_params_buffer_info, binding_table.fragment_texture_params_bind_slot, m_current_frame->descriptor_set);
m_program->bind_uniform(m_raster_env_buffer_info, binding_table.rasterizer_env_bind_slot, m_current_frame->descriptor_set);
if (!m_shader_interpreter.is_interpreter(m_program))
{
m_program->bind_uniform(m_fragment_constants_buffer_info, binding_table.fragment_constant_buffers_bind_slot, m_current_frame->descriptor_set);
}
else
{
m_program->bind_buffer(m_vertex_instructions_buffer_info, m_shader_interpreter.get_vertex_instruction_location(), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set);
m_program->bind_buffer(m_fragment_instructions_buffer_info, m_shader_interpreter.get_fragment_instruction_location(), VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set);
}
if (vk::emulate_conditional_rendering())
{
auto predicate = m_cond_render_buffer ? m_cond_render_buffer->value : vk::get_scratch_buffer(*m_current_command_buffer, 4)->value;
m_program->bind_buffer({ predicate, 0, 4 }, binding_table.conditional_render_predicate_slot, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_current_frame->descriptor_set);
}
//Clear flags
const u32 handled_flags = (rsx::pipeline_state::fragment_state_dirty | rsx::pipeline_state::vertex_state_dirty | rsx::pipeline_state::transform_constants_dirty | rsx::pipeline_state::fragment_constants_dirty | rsx::pipeline_state::fragment_texture_state_dirty);
m_graphics_state &= ~handled_flags;
}
void VKGSRender::update_vertex_env(u32 id, const vk::vertex_upload_info& vertex_info)
{
// Actual allocation must have been done previously
u32 base_offset;
const u32 offset32 = static_cast<u32>(m_vertex_layout_stream_info.offset);
const u32 range32 = static_cast<u32>(m_vertex_layout_stream_info.range);
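// Reuse the existing texel buffer view if the current stream range fits inside it; otherwise retire it and create a new view over the layout heap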
if (!m_vertex_layout_storage || !m_vertex_layout_storage->in_range(offset32, range32, base_offset))
{
ensure(m_texbuffer_view_size >= m_vertex_layout_stream_info.range);
if (m_vertex_layout_storage)
m_current_frame->buffer_views_to_clean.push_back(std::move(m_vertex_layout_storage));
const usz alloc_addr = m_vertex_layout_stream_info.offset;
const usz view_size = (alloc_addr + m_texbuffer_view_size) > m_vertex_layout_ring_info.size() ? m_vertex_layout_ring_info.size() - alloc_addr : m_texbuffer_view_size;
m_vertex_layout_storage = std::make_unique<vk::buffer_view>(*m_device, m_vertex_layout_ring_info.heap->value, VK_FORMAT_R32G32_UINT, alloc_addr, view_size);
base_offset = 0;
}
u8 data_size = 16;
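// Push constants consumed by the vertex shader:
// [0] vertex index base, [1] vertex index offset, [2] draw id,
// [3] texel offset of this draw's layout block in the R32G32_UINT view (128 bytes = 16 texels per draw),
// [4] conditional render active flag (only when emulating conditional rendering)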
u32 draw_info[5];
draw_info[0] = vertex_info.vertex_index_base;
draw_info[1] = vertex_info.vertex_index_offset;
draw_info[2] = id;
draw_info[3] = (id * 16) + (base_offset / 8);
if (vk::emulate_conditional_rendering())
{
draw_info[4] = cond_render_ctrl.hw_cond_active ? 1 : 0;
data_size = 20;
}
vkCmdPushConstants(*m_current_command_buffer, pipeline_layout, VK_SHADER_STAGE_VERTEX_BIT, 0, data_size, draw_info);
const usz data_offset = (id * 128) + m_vertex_layout_stream_info.offset;
auto dst = m_vertex_layout_ring_info.map(data_offset, 128);
fill_vertex_layout_state(m_vertex_layout, vertex_info.first_vertex, vertex_info.allocated_vertex_count, static_cast<s32*>(dst),
vertex_info.persistent_window_offset, vertex_info.volatile_window_offset);
m_vertex_layout_ring_info.unmap();
}
void VKGSRender::init_buffers(rsx::framebuffer_creation_context context, bool)
{
prepare_rtts(context);
}
void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore wait_semaphore, VkSemaphore signal_semaphore, VkPipelineStageFlags pipeline_stage_flags)
{
ensure(!m_queue_status.test_and_set(flush_queue_state::flushing));
// Workaround for deadlock occurring during RSX offloader fault
// TODO: Restructure command submission infrastructure to avoid this condition
const bool sync_success = g_fxo->get<rsx::dma_manager>().sync();
const VkBool32 force_flush = !sync_success;
if (vk::test_status_interrupt(vk::heap_dirty))
{
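// Synchronize every dirty ring buffer on a secondary command buffer submitted ahead of the primary one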
if (m_attrib_ring_info.is_dirty() ||
m_fragment_env_ring_info.is_dirty() ||
m_vertex_env_ring_info.is_dirty() ||
m_fragment_texture_params_ring_info.is_dirty() ||
m_vertex_layout_ring_info.is_dirty() ||
m_fragment_constants_ring_info.is_dirty() ||
m_index_buffer_ring_info.is_dirty() ||
m_transform_constants_ring_info.is_dirty() ||
m_texture_upload_buffer_ring_info.is_dirty() ||
m_raster_env_ring_info.is_dirty())
{
auto secondary_command_buffer = m_secondary_cb_list.next();
secondary_command_buffer->begin();
m_attrib_ring_info.sync(*secondary_command_buffer);
m_fragment_env_ring_info.sync(*secondary_command_buffer);
m_vertex_env_ring_info.sync(*secondary_command_buffer);
m_fragment_texture_params_ring_info.sync(*secondary_command_buffer);
m_vertex_layout_ring_info.sync(*secondary_command_buffer);
m_fragment_constants_ring_info.sync(*secondary_command_buffer);
m_index_buffer_ring_info.sync(*secondary_command_buffer);
m_transform_constants_ring_info.sync(*secondary_command_buffer);
m_texture_upload_buffer_ring_info.sync(*secondary_command_buffer);
m_raster_env_ring_info.sync(*secondary_command_buffer);
secondary_command_buffer->end();
vk::queue_submit_t submit_info{ m_device->get_graphics_queue(), nullptr };
secondary_command_buffer->submit(submit_info, force_flush);
}
vk::clear_status_interrupt(vk::heap_dirty);
}
#if 0 // Currently unreachable
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_conditional_render)
{
ensure(m_render_pass_open);
m_device->_vkCmdEndConditionalRenderingEXT(*m_current_command_buffer);
}
#endif
// End any active renderpasses; the caller should handle reopening
if (vk::is_renderpass_open(*m_current_command_buffer))
{
close_render_pass();
}
// End open queries. Flags will be automatically reset by the submit routine
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_open_query)
{
auto open_query = m_occlusion_map[m_active_query_info->driver_handle].indices.back();
m_occlusion_query_manager->end_query(*m_current_command_buffer, open_query);
m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
}
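// If labels were released since the last submission, record an update of commands_complete_event with the newest release event id before closing the command buffer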
if (m_host_data_ptr && m_host_data_ptr->last_label_release_event > m_host_data_ptr->last_label_submit_event)
{
vkCmdUpdateBuffer(*m_current_command_buffer,
m_host_object_data->value,
::offset32(&vk::host_data_t::commands_complete_event),
sizeof(u64),
const_cast<u64*>(&m_host_data_ptr->last_label_release_event));
m_host_data_ptr->last_label_submit_event = m_host_data_ptr->last_label_release_event;
}
m_current_command_buffer->end();
m_current_command_buffer->tag();
// Supporting concurrent access vastly simplifies this logic.
// Instead of doing CB slice injection, we can just chain these together logically with the async stream going first
vk::queue_submit_t primary_submit_info{ m_device->get_graphics_queue(), pFence };
vk::queue_submit_t secondary_submit_info{};
if (wait_semaphore)
{
primary_submit_info.wait_on(wait_semaphore, pipeline_stage_flags);
}
auto& async_scheduler = g_fxo->get<vk::AsyncTaskScheduler>();
if (async_scheduler.is_recording())
{
if (async_scheduler.is_host_mode())
{
const VkSemaphore async_sema = *async_scheduler.get_sema();
secondary_submit_info.queue_signal(async_sema);
primary_submit_info.wait_on(async_sema, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
// Delay object destruction by one cycle
vk::get_resource_manager()->push_down_current_scope();
}
async_scheduler.flush(secondary_submit_info, force_flush);
}
if (signal_semaphore)
{
primary_submit_info.queue_signal(signal_semaphore);
}
m_current_command_buffer->submit(primary_submit_info, force_flush);
m_queue_status.clear(flush_queue_state::flushing);
}
void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
{
const bool clipped_scissor = (context == rsx::framebuffer_creation_context::context_draw);
if (m_current_framebuffer_context == context && !m_rtts_dirty && m_draw_fbo)
{
// Fast path
// Framebuffer usage has not changed, framebuffer exists and config regs have not changed
set_scissor(clipped_scissor);
return;
}
m_rtts_dirty = false;
framebuffer_status_valid = false;
m_framebuffer_state_contested = false;
get_framebuffer_layout(context, m_framebuffer_layout);
if (!framebuffer_status_valid)
{
return;
}
if (m_draw_fbo && m_framebuffer_layout.ignore_change)
{
// Nothing has changed, we're still using the same framebuffer
// Update flags to match current
set_scissor(clipped_scissor);
return;
}
m_rtts.prepare_render_target(*m_current_command_buffer,
m_framebuffer_layout.color_format, m_framebuffer_layout.depth_format,
m_framebuffer_layout.width, m_framebuffer_layout.height,
m_framebuffer_layout.target, m_framebuffer_layout.aa_mode, m_framebuffer_layout.raster_type,
m_framebuffer_layout.color_addresses, m_framebuffer_layout.zeta_address,
m_framebuffer_layout.actual_color_pitch, m_framebuffer_layout.actual_zeta_pitch,
(*m_device), *m_current_command_buffer);
// Reset framebuffer information
const auto color_bpp = get_format_block_size_in_bytes(m_framebuffer_layout.color_format);
const auto samples = get_format_sample_count(m_framebuffer_layout.aa_mode);
for (u8 i = 0; i < rsx::limits::color_buffers_count; ++i)
{
// Flush old address if we keep missing it
if (m_surface_info[i].pitch && g_cfg.video.write_color_buffers)
{
const utils::address_range rsx_range = m_surface_info[i].get_memory_range();
m_texture_cache.set_memory_read_flags(rsx_range, rsx::memory_read_flags::flush_once);
m_texture_cache.flush_if_cache_miss_likely(*m_current_command_buffer, rsx_range);
}
m_surface_info[i].address = m_surface_info[i].pitch = 0;
m_surface_info[i].width = m_framebuffer_layout.width;
m_surface_info[i].height = m_framebuffer_layout.height;
m_surface_info[i].color_format = m_framebuffer_layout.color_format;
m_surface_info[i].bpp = color_bpp;
m_surface_info[i].samples = samples;
}
//Process depth surface as well
{
if (m_depth_surface_info.pitch && g_cfg.video.write_depth_buffer)
{
const utils::address_range surface_range = m_depth_surface_info.get_memory_range();
m_texture_cache.set_memory_read_flags(surface_range, rsx::memory_read_flags::flush_once);
m_texture_cache.flush_if_cache_miss_likely(*m_current_command_buffer, surface_range);
}
m_depth_surface_info.address = m_depth_surface_info.pitch = 0;
m_depth_surface_info.width = m_framebuffer_layout.width;
m_depth_surface_info.height = m_framebuffer_layout.height;
m_depth_surface_info.depth_format = m_framebuffer_layout.depth_format;
m_depth_surface_info.bpp = get_format_block_size_in_bytes(m_framebuffer_layout.depth_format);
m_depth_surface_info.samples = samples;
}
//Bind created rtts as current fbo...
const auto draw_buffers = rsx::utility::get_rtt_indexes(m_framebuffer_layout.target);
m_draw_buffers.clear();
m_fbo_images.clear();
for (u8 index : draw_buffers)
{
if (auto surface = std::get<1>(m_rtts.m_bound_render_targets[index]))
{
m_fbo_images.push_back(surface);
m_surface_info[index].address = m_framebuffer_layout.color_addresses[index];
m_surface_info[index].pitch = m_framebuffer_layout.actual_color_pitch[index];
ensure(surface->rsx_pitch == m_framebuffer_layout.actual_color_pitch[index]);
m_texture_cache.notify_surface_changed(m_surface_info[index].get_memory_range(m_framebuffer_layout.aa_factors));
m_draw_buffers.push_back(index);
}
}
if (std::get<0>(m_rtts.m_bound_depth_stencil) != 0)
{
auto ds = std::get<1>(m_rtts.m_bound_depth_stencil);
m_fbo_images.push_back(ds);
m_depth_surface_info.address = m_framebuffer_layout.zeta_address;
m_depth_surface_info.pitch = m_framebuffer_layout.actual_zeta_pitch;
ensure(ds->rsx_pitch == m_framebuffer_layout.actual_zeta_pitch);
m_texture_cache.notify_surface_changed(m_depth_surface_info.get_memory_range(m_framebuffer_layout.aa_factors));
}
// Before messing with memory properties, flush command queue if there are dma transfers queued up
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_dma_transfer)
{
flush_command_queue();
}
if (!m_rtts.superseded_surfaces.empty())
{
for (auto& surface : m_rtts.superseded_surfaces)
{
m_texture_cache.discard_framebuffer_memory_region(*m_current_command_buffer, surface->get_memory_range());
}
m_rtts.superseded_surfaces.clear();
}
const auto color_fmt_info = get_compatible_gcm_format(m_framebuffer_layout.color_format);
for (u8 index : m_draw_buffers)
{
if (!m_surface_info[index].address || !m_surface_info[index].pitch) continue;
const utils::address_range surface_range = m_surface_info[index].get_memory_range();
if (g_cfg.video.write_color_buffers)
{
m_texture_cache.lock_memory_region(
*m_current_command_buffer, m_rtts.m_bound_render_targets[index].second, surface_range, true,
m_surface_info[index].width, m_surface_info[index].height, m_framebuffer_layout.actual_color_pitch[index],
color_fmt_info.first, color_fmt_info.second);
}
else
{
m_texture_cache.commit_framebuffer_memory_region(*m_current_command_buffer, surface_range);
}
}
if (m_depth_surface_info.address && m_depth_surface_info.pitch)
{
const utils::address_range surface_range = m_depth_surface_info.get_memory_range();
if (g_cfg.video.write_depth_buffer)
{
const u32 gcm_format = (m_depth_surface_info.depth_format == rsx::surface_depth_format::z16) ? CELL_GCM_TEXTURE_DEPTH16 : CELL_GCM_TEXTURE_DEPTH24_D8;
m_texture_cache.lock_memory_region(
*m_current_command_buffer, m_rtts.m_bound_depth_stencil.second, surface_range, true,
m_depth_surface_info.width, m_depth_surface_info.height, m_framebuffer_layout.actual_zeta_pitch, gcm_format, true);
}
else
{
m_texture_cache.commit_framebuffer_memory_region(*m_current_command_buffer, surface_range);
}
}
if (!m_rtts.orphaned_surfaces.empty())
{
u32 gcm_format;
bool swap_bytes;
for (auto& surface : m_rtts.orphaned_surfaces)
{
const bool lock = surface->is_depth_surface() ? !!g_cfg.video.write_depth_buffer :
!!g_cfg.video.write_color_buffers;
if (!lock) [[likely]]
{
m_texture_cache.commit_framebuffer_memory_region(*m_current_command_buffer, surface->get_memory_range());
continue;
}
if (surface->is_depth_surface())
{
gcm_format = (surface->get_surface_depth_format() == rsx::surface_depth_format::z16) ? CELL_GCM_TEXTURE_DEPTH16 : CELL_GCM_TEXTURE_DEPTH24_D8;
swap_bytes = true;
}
else
{
auto info = get_compatible_gcm_format(surface->get_surface_color_format());
gcm_format = info.first;
swap_bytes = info.second;
}
m_texture_cache.lock_memory_region(
*m_current_command_buffer, surface, surface->get_memory_range(), false,
surface->get_surface_width<rsx::surface_metrics::pixels>(), surface->get_surface_height<rsx::surface_metrics::pixels>(), surface->get_rsx_pitch(),
gcm_format, swap_bytes);
}
m_rtts.orphaned_surfaces.clear();
}
m_current_renderpass_key = vk::get_renderpass_key(m_fbo_images);
m_cached_renderpass = vk::get_renderpass(*m_device, m_current_renderpass_key);
// Search old framebuffers for this same configuration
const auto [fbo_width, fbo_height] = rsx::apply_resolution_scale<true>(m_framebuffer_layout.width, m_framebuffer_layout.height);
if (m_draw_fbo)
{
// Release old ref
m_draw_fbo->release();
}
m_draw_fbo = vk::get_framebuffer(*m_device, fbo_width, fbo_height, VK_FALSE, m_cached_renderpass, m_fbo_images);
m_draw_fbo->add_ref();
set_viewport();
set_scissor(clipped_scissor);
check_zcull_status(true);
}
void VKGSRender::renderctl(u32 request_code, void* args)
{
switch (request_code)
{
case vk::rctrl_queue_submit:
{
const auto packet = reinterpret_cast<vk::queue_submit_t*>(args);
vk::queue_submit(packet);
free(packet);
break;
}
case vk::rctrl_run_gc:
{
auto eid = reinterpret_cast<u64>(args);
vk::on_event_completed(eid, true);
break;
}
default:
fmt::throw_exception("Unhandled request code 0x%x", request_code);
}
}
bool VKGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate)
{
if (swapchain_unavailable)
return false;
// Verify enough memory exists before attempting to handle data transfer
check_heap_status(VK_HEAP_CHECK_TEXTURE_UPLOAD_STORAGE);
if (m_texture_cache.blit(src, dst, interpolate, m_rtts, *m_current_command_buffer))
{
m_samplers_dirty.store(true);
m_current_command_buffer->set_flag(vk::command_buffer::cb_has_blit_transfer);
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_dma_transfer)
{
// A DMA transfer has been queued onto this cb
// This likely means that we're done with the transfers to the target (writes_likely_completed=1)
flush_command_queue();
}
return true;
}
return false;
}
void VKGSRender::begin_occlusion_query(rsx::reports::occlusion_query_info* query)
{
ensure(!m_occlusion_query_active);
query->result = 0;
//query->sync_timestamp = get_system_time();
m_active_query_info = query;
m_occlusion_query_active = true;
m_current_command_buffer->flags |= vk::command_buffer::cb_load_occluson_task;
}
void VKGSRender::end_occlusion_query(rsx::reports::occlusion_query_info* query)
{
ensure(query == m_active_query_info);
// NOTE: Flushing the queue is very expensive; do not flush just because the query stopped
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_open_query)
{
// End query
auto open_query = m_occlusion_map[m_active_query_info->driver_handle].indices.back();
m_occlusion_query_manager->end_query(*m_current_command_buffer, open_query);
m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
}
// Clear occlusion load flag
m_current_command_buffer->flags &= ~vk::command_buffer::cb_load_occluson_task;
m_occlusion_query_active = false;
m_active_query_info = nullptr;
}
bool VKGSRender::check_occlusion_query_status(rsx::reports::occlusion_query_info* query)
{
if (!query->num_draws)
return true;
auto &data = m_occlusion_map[query->driver_handle];
if (data.indices.empty())
return true;
if (data.is_current(m_current_command_buffer))
return false;
const u32 oldest = data.indices.front();
return m_occlusion_query_manager->check_query_status(oldest);
}
void VKGSRender::get_occlusion_query_result(rsx::reports::occlusion_query_info* query)
{
auto &data = m_occlusion_map[query->driver_handle];
if (data.indices.empty())
return;
if (query->num_draws)
{
if (data.is_current(m_current_command_buffer))
{
std::lock_guard lock(m_flush_queue_mutex);
flush_command_queue();
if (m_flush_requests.pending())
{
m_flush_requests.clear_pending_flag();
}
rsx_log.warning("[Performance warning] Unexpected ZCULL read caused a hard sync");
busy_wait();
}
data.sync();
// Gather data
for (const auto occlusion_id : data.indices)
{
query->result += m_occlusion_query_manager->get_query_result(occlusion_id);
if (query->result && !g_cfg.video.precise_zpass_count)
{
// We only need one hit unless precise zcull is requested
break;
}
}
}
m_occlusion_query_manager->free_queries(*m_current_command_buffer, data.indices);
data.indices.clear();
}
void VKGSRender::discard_occlusion_query(rsx::reports::occlusion_query_info* query)
{
if (m_active_query_info == query)
{
end_occlusion_query(query);
}
auto &data = m_occlusion_map[query->driver_handle];
if (data.indices.empty())
return;
m_occlusion_query_manager->free_queries(*m_current_command_buffer, data.indices);
data.indices.clear();
}
void VKGSRender::emergency_query_cleanup(vk::command_buffer* commands)
{
ensure(commands == static_cast<vk::command_buffer*>(m_current_command_buffer));
if (m_current_command_buffer->flags & vk::command_buffer::cb_has_open_query)
{
auto open_query = m_occlusion_map[m_active_query_info->driver_handle].indices.back();
m_occlusion_query_manager->end_query(*m_current_command_buffer, open_query);
m_current_command_buffer->flags &= ~vk::command_buffer::cb_has_open_query;
}
}
void VKGSRender::begin_conditional_rendering(const std::vector<rsx::reports::occlusion_query_info*>& sources)
{
ensure(!sources.empty());
// Flag check whether to calculate all entries or only one
bool partial_eval;
// Try to avoid regenerating the data if it's a repeat/spam
// NOTE: The incoming list is reversed with the first entry being the newest
if (m_cond_render_sync_tag == sources.front()->sync_tag)
{
// Already synced; check for a subdraw, which is possible if the last sync happened while the query was active
if (!m_active_query_info || m_active_query_info != sources.front())
{
rsx::thread::begin_conditional_rendering(sources);
return;
}
// Partial evaluation only
partial_eval = true;
}
else
{
m_cond_render_sync_tag = sources.front()->sync_tag;
partial_eval = false;
}
// Time to aggregate
if (!m_cond_render_buffer)
{
auto& memory_props = m_device->get_memory_mapping();
auto usage_flags = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
if (m_device->get_conditional_render_support())
{
usage_flags |= VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT;
}
m_cond_render_buffer = std::make_unique<vk::buffer>(
*m_device, 4,
memory_props.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
usage_flags, 0, VMM_ALLOCATION_POOL_UNDEFINED);
}
VkPipelineStageFlags dst_stage;
VkAccessFlags dst_access;
u32 dst_offset = 0;
u32 num_hw_queries = 0;
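// When doing a partial evaluation, only the newest query (sources.front()) is considered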
usz first = 0;
usz last = (!partial_eval) ? sources.size() : 1;
// Count the number of queries available. This is an "opening" evaluation; if there is only one source, read it as-is.
// The idea is to avoid scheduling a compute task unless we have to.
for (usz i = first; i < last; ++i)
{
auto& query_info = m_occlusion_map[sources[i]->driver_handle];
num_hw_queries += ::size32(query_info.indices);
}
if (m_device->get_conditional_render_support())
{
dst_stage = VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT;
dst_access = VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT;
}
else
{
dst_stage = VK_PIPELINE_STAGE_VERTEX_SHADER_BIT;
dst_access = VK_ACCESS_SHADER_READ_BIT;
}
if (num_hw_queries == 1 && !partial_eval) [[ likely ]]
{
// Accept the first available query handle as the source of truth. No aggregation is required.
for (usz i = first; i < last; ++i)
{
auto& query_info = m_occlusion_map[sources[i]->driver_handle];
if (!query_info.indices.empty())
{
const auto& index = query_info.indices.front();
m_occlusion_query_manager->get_query_result_indirect(*m_current_command_buffer, index, m_cond_render_buffer->value, 0);
vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4,
VK_PIPELINE_STAGE_TRANSFER_BIT, dst_stage,
VK_ACCESS_TRANSFER_WRITE_BIT, dst_access);
rsx::thread::begin_conditional_rendering(sources);
return;
}
}
// This is unreachable unless something went horribly wrong
fmt::throw_exception("Unreachable");
}
else if (num_hw_queries > 0)
{
// We'll need to do some result aggregation using a compute shader.
auto scratch = vk::get_scratch_buffer(*m_current_command_buffer, num_hw_queries * 4);
for (usz i = first; i < last; ++i)
{
auto& query_info = m_occlusion_map[sources[i]->driver_handle];
for (const auto& index : query_info.indices)
{
m_occlusion_query_manager->get_query_result_indirect(*m_current_command_buffer, index, scratch->value, dst_offset);
dst_offset += 4;
}
}
// Sanity check
ensure(dst_offset <= scratch->size());
if (!partial_eval)
{
// Fast path should have been caught above
ensure(dst_offset > 4);
// Clear result to zero
vkCmdFillBuffer(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4, 0);
vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_WRITE_BIT);
}
vk::insert_buffer_memory_barrier(*m_current_command_buffer, scratch->value, 0, dst_offset,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
vk::get_compute_task<vk::cs_aggregator>()->run(*m_current_command_buffer, m_cond_render_buffer.get(), scratch, dst_offset / 4);
vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, dst_stage,
VK_ACCESS_SHADER_WRITE_BIT, dst_access);
}
else if (m_program)
{
// This can sometimes happen when shaders are compiling; only log if there is a program hit
rsx_log.warning("Dubious query data pushed to cond render! Please report to developers (q.pending=%d)", sources.front()->pending);
}
rsx::thread::begin_conditional_rendering(sources);
}
void VKGSRender::end_conditional_rendering()
{
thread::end_conditional_rendering();
}