From 50f72b4e42364af54e287dba01de37999e4a9fc1 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Fri, 1 Apr 2016 21:52:39 -0500 Subject: [PATCH] Enable native MSAA Copy back EDRAM buffers in order by base offset. --- src/xenia/gpu/vulkan/pipeline_cache.cc | 94 ++++-- src/xenia/gpu/vulkan/pipeline_cache.h | 5 + src/xenia/gpu/vulkan/render_cache.cc | 284 +++++++++++------- src/xenia/gpu/vulkan/render_cache.h | 28 +- .../gpu/vulkan/vulkan_command_processor.cc | 88 +++--- .../gpu/vulkan/vulkan_command_processor.h | 1 + 6 files changed, 333 insertions(+), 167 deletions(-) diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc index efcaf5b46..19db3cd4f 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/pipeline_cache.cc @@ -187,6 +187,10 @@ PipelineCache::UpdateStatus PipelineCache::ConfigurePipeline( VkCommandBuffer command_buffer, const RenderState* render_state, VulkanShader* vertex_shader, VulkanShader* pixel_shader, PrimitiveType primitive_type, VkPipeline* pipeline_out) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + assert_not_null(pipeline_out); // Perform a pass over all registers and state updating our cached structures. @@ -323,6 +327,10 @@ VkShaderModule PipelineCache::GetGeometryShader(PrimitiveType primitive_type, bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, bool full_update) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + auto& regs = set_dynamic_state_registers_; bool window_offset_dirty = SetShadowRegister(&regs.pa_sc_window_offset, @@ -393,20 +401,25 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, auto surface_msaa = static_cast<MsaaSamples>((regs.rb_surface_info >> 16) & 0x3); // TODO(benvanik): ?? + // FIXME: Some games depend on these for proper clears (e.g. only clearing + // half the size they actually want with 4x MSAA), but others don't.
+ // Figure out how these games are expecting clears to be done. float window_width_scalar = 1; float window_height_scalar = 1; switch (surface_msaa) { case MsaaSamples::k1X: break; case MsaaSamples::k2X: - window_width_scalar = 2; + // ?? + window_width_scalar = window_height_scalar = 1.41421356f; break; case MsaaSamples::k4X: - window_width_scalar = 2; - window_height_scalar = 2; + window_width_scalar = window_height_scalar = 2; break; } + // window_width_scalar = window_height_scalar = 1; + // Whether each of the viewport settings are enabled. // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf bool vport_xscale_enable = (regs.pa_cl_vte_cntl & (1 << 0)) > 0; @@ -434,6 +447,7 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, float voy = vport_yoffset_enable ? regs.pa_cl_vport_yoffset : 0; float vsx = vport_xscale_enable ? regs.pa_cl_vport_xscale : 1; float vsy = vport_yscale_enable ? regs.pa_cl_vport_yscale : 1; + window_width_scalar = window_height_scalar = 1; float vpw = 2 * window_width_scalar * vsx; float vph = -2 * window_height_scalar * vsy; @@ -481,25 +495,25 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, vkCmdSetBlendConstants(command_buffer, regs.rb_blend_rgba); } - // VK_DYNAMIC_STATE_LINE_WIDTH - vkCmdSetLineWidth(command_buffer, 1.0f); + if (full_update) { + // VK_DYNAMIC_STATE_LINE_WIDTH + vkCmdSetLineWidth(command_buffer, 1.0f); - // VK_DYNAMIC_STATE_DEPTH_BIAS - vkCmdSetDepthBias(command_buffer, 0.0f, 0.0f, 0.0f); + // VK_DYNAMIC_STATE_DEPTH_BIAS + vkCmdSetDepthBias(command_buffer, 0.0f, 0.0f, 0.0f); - // VK_DYNAMIC_STATE_DEPTH_BOUNDS - vkCmdSetDepthBounds(command_buffer, 0.0f, 1.0f); + // VK_DYNAMIC_STATE_DEPTH_BOUNDS + vkCmdSetDepthBounds(command_buffer, 0.0f, 1.0f); - // VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK - vkCmdSetStencilCompareMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); + // VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK + vkCmdSetStencilCompareMask(command_buffer, 
VK_STENCIL_FRONT_AND_BACK, 0); - // VK_DYNAMIC_STATE_STENCIL_REFERENCE - vkCmdSetStencilReference(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); + // VK_DYNAMIC_STATE_STENCIL_REFERENCE + vkCmdSetStencilReference(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); - // VK_DYNAMIC_STATE_STENCIL_WRITE_MASK - vkCmdSetStencilWriteMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); - - // TODO(benvanik): push constants. + // VK_DYNAMIC_STATE_STENCIL_WRITE_MASK + vkCmdSetStencilWriteMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); + } bool push_constants_dirty = full_update || viewport_state_dirty; push_constants_dirty |= @@ -530,7 +544,7 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, push_constants.window_scale[1] = -1.0f; } else { push_constants.window_scale[0] = 1.0f / 2560.0f; - push_constants.window_scale[1] = -1.0f / 2560.0f; + push_constants.window_scale[1] = 1.0f / 2560.0f; } // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf @@ -756,7 +770,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateVertexInputState( : VK_FORMAT_A2R10G10B10_UNORM_PACK32; break; case VertexFormat::k_10_11_11: - assert_always("unsupported?"); + // assert_always("unsupported?"); vertex_attrib_descr.format = VK_FORMAT_B10G11R11_UFLOAT_PACK32; break; case VertexFormat::k_11_11_10: @@ -934,6 +948,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR); dirty |= SetShadowRegister(&regs.multi_prim_ib_reset_index, XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX); + dirty |= SetShadowRegister(&regs.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL); regs.primitive_type = primitive_type; XXH64_update(&hash_state_, &regs, sizeof(regs)); if (!dirty) { @@ -947,7 +962,13 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( // TODO(benvanik): right setting? state_info.depthClampEnable = VK_FALSE; - // TODO(benvanik): use in depth-only mode? + // Discard rasterizer output in depth-only mode.
+ // TODO(DrChat): Figure out how to make this work properly. + /* + auto enable_mode = static_cast<xenos::ModeControl>(regs.rb_modecontrol & 0x7); + state_info.rasterizerDiscardEnable = + enable_mode == xenos::ModeControl::kColorDepth ? VK_FALSE : VK_TRUE; + //*/ state_info.rasterizerDiscardEnable = VK_FALSE; bool poly_mode = ((regs.pa_su_sc_mode_cntl >> 3) & 0x3) != 0; @@ -1004,20 +1025,49 @@ PipelineCache::UpdateStatus PipelineCache::UpdateMultisampleState() { auto& regs = update_multisample_state_regs_; auto& state_info = update_multisample_state_info_; + bool dirty = false; + dirty |= SetShadowRegister(&regs.pa_sc_aa_config, XE_GPU_REG_PA_SC_AA_CONFIG); + dirty |= SetShadowRegister(&regs.pa_su_sc_mode_cntl, + XE_GPU_REG_PA_SU_SC_MODE_CNTL); + dirty |= SetShadowRegister(&regs.rb_surface_info, XE_GPU_REG_RB_SURFACE_INFO); + XXH64_update(&hash_state_, &regs, sizeof(regs)); + if (!dirty) { + return UpdateStatus::kCompatible; + } + state_info.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; state_info.pNext = nullptr; state_info.flags = 0; // PA_SC_AA_CONFIG MSAA_NUM_SAMPLES // PA_SU_SC_MODE_CNTL MSAA_ENABLE - state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + // state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + //* + auto msaa_num_samples = + static_cast<MsaaSamples>((regs.rb_surface_info >> 16) & 0x3); + switch (msaa_num_samples) { + case MsaaSamples::k1X: + state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + break; + case MsaaSamples::k2X: + state_info.rasterizationSamples = VK_SAMPLE_COUNT_2_BIT; + break; + case MsaaSamples::k4X: + state_info.rasterizationSamples = VK_SAMPLE_COUNT_4_BIT; + break; + default: + assert_unhandled_case(msaa_num_samples); + break; + } + //*/ + state_info.sampleShadingEnable = VK_FALSE; state_info.minSampleShading = 0; state_info.pSampleMask = nullptr; state_info.alphaToCoverageEnable = VK_FALSE; state_info.alphaToOneEnable = VK_FALSE; - return UpdateStatus::kCompatible; + return UpdateStatus::kMismatch; } PipelineCache::UpdateStatus
PipelineCache::UpdateDepthStencilState() { diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h index 66b2e87ef..f240b9c0d 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.h +++ b/src/xenia/gpu/vulkan/pipeline_cache.h @@ -211,6 +211,7 @@ class PipelineCache { uint32_t pa_sc_screen_scissor_tl; uint32_t pa_sc_screen_scissor_br; uint32_t multi_prim_ib_reset_index; + uint32_t rb_modecontrol; UpdateRasterizationStateRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } @@ -218,6 +219,10 @@ class PipelineCache { VkPipelineRasterizationStateCreateInfo update_rasterization_state_info_; struct UpdateMultisampleStateeRegisters { + uint32_t pa_sc_aa_config; + uint32_t pa_su_sc_mode_cntl; + uint32_t rb_surface_info; + UpdateMultisampleStateeRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } } update_multisample_state_regs_; diff --git a/src/xenia/gpu/vulkan/render_cache.cc b/src/xenia/gpu/vulkan/render_cache.cc index 334a1215f..7e0528866 100644 --- a/src/xenia/gpu/vulkan/render_cache.cc +++ b/src/xenia/gpu/vulkan/render_cache.cc @@ -165,8 +165,23 @@ CachedTileView::CachedTileView(ui::vulkan::VulkanDevice* device, image_info.extent.depth = 1; image_info.mipLevels = 1; image_info.arrayLayers = 1; - image_info.samples = - static_cast<VkSampleCountFlagBits>(VK_SAMPLE_COUNT_1_BIT); + // image_info.samples = VK_SAMPLE_COUNT_1_BIT; + //* + auto msaa_samples = static_cast<MsaaSamples>(key.msaa_samples); + switch (msaa_samples) { + case MsaaSamples::k1X: + image_info.samples = VK_SAMPLE_COUNT_1_BIT; + break; + case MsaaSamples::k2X: + image_info.samples = VK_SAMPLE_COUNT_2_BIT; + break; + case MsaaSamples::k4X: + image_info.samples = VK_SAMPLE_COUNT_4_BIT; + break; + default: + assert_unhandled_case(msaa_samples); + } + //*/ image_info.tiling = VK_IMAGE_TILING_OPTIMAL; image_info.usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | @@ -322,13 +337,29 @@ CachedRenderPass::CachedRenderPass(VkDevice device, :
device_(device) { std::memcpy(&config, &desired_config, sizeof(config)); + VkSampleCountFlagBits sample_count; + switch (desired_config.surface_msaa) { + case MsaaSamples::k1X: + sample_count = VK_SAMPLE_COUNT_1_BIT; + break; + case MsaaSamples::k2X: + sample_count = VK_SAMPLE_COUNT_2_BIT; + break; + case MsaaSamples::k4X: + sample_count = VK_SAMPLE_COUNT_4_BIT; + break; + default: + assert_unhandled_case(desired_config.surface_msaa); + break; + } + // Initialize all attachments to default unused. // As we set layout(location=RT) in shaders we must always provide 4. VkAttachmentDescription attachments[5]; for (int i = 0; i < 4; ++i) { attachments[i].flags = 0; attachments[i].format = VK_FORMAT_UNDEFINED; - attachments[i].samples = VK_SAMPLE_COUNT_1_BIT; + attachments[i].samples = sample_count; attachments[i].loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; attachments[i].storeOp = VK_ATTACHMENT_STORE_OP_STORE; attachments[i].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; @@ -339,7 +370,7 @@ CachedRenderPass::CachedRenderPass(VkDevice device, auto& depth_stencil_attachment = attachments[4]; depth_stencil_attachment.flags = 0; depth_stencil_attachment.format = VK_FORMAT_UNDEFINED; - depth_stencil_attachment.samples = VK_SAMPLE_COUNT_1_BIT; + depth_stencil_attachment.samples = sample_count; depth_stencil_attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; depth_stencil_attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; depth_stencil_attachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; @@ -404,6 +435,10 @@ CachedRenderPass::~CachedRenderPass() { bool CachedRenderPass::IsCompatible( const RenderConfiguration& desired_config) const { + if (config.surface_msaa != desired_config.surface_msaa) { + return false; + } + for (int i = 0; i < 4; ++i) { // TODO(benvanik): allow compatible vulkan formats. 
if (config.color[i].format != desired_config.color[i].format) { @@ -503,12 +538,18 @@ bool RenderCache::dirty() const { regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; dirty |= cur_regs.pa_sc_window_scissor_br != regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; + dirty |= (cur_regs.rb_depthcontrol & (0x4 | 0x2)) != + (regs[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2)); return dirty; } const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, VulkanShader* vertex_shader, VulkanShader* pixel_shader) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + assert_null(current_command_buffer_); current_command_buffer_ = command_buffer; @@ -520,6 +561,7 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, bool dirty = false; dirty |= SetShadowRegister(&regs.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL); dirty |= SetShadowRegister(&regs.rb_surface_info, XE_GPU_REG_RB_SURFACE_INFO); + dirty |= SetShadowRegister(&regs.rb_color_mask, XE_GPU_REG_RB_COLOR_MASK); dirty |= SetShadowRegister(&regs.rb_color_info, XE_GPU_REG_RB_COLOR_INFO); dirty |= SetShadowRegister(&regs.rb_color1_info, XE_GPU_REG_RB_COLOR1_INFO); dirty |= SetShadowRegister(&regs.rb_color2_info, XE_GPU_REG_RB_COLOR2_INFO); @@ -529,7 +571,11 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL); dirty |= SetShadowRegister(&regs.pa_sc_window_scissor_br, XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR); - regs.rb_depthcontrol = register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32; + dirty |= + (regs.rb_depthcontrol & (0x4 | 0x2)) != + (register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2)); + regs.rb_depthcontrol = + register_file_->values[XE_GPU_REG_RB_DEPTHCONTROL].u32 & (0x4 | 0x2); if (!dirty && current_state_.render_pass) { // No registers have changed so we can reuse the previous render pass - // just begin with what we had.
@@ -549,7 +595,10 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, // Speculatively see if targets are actually used so we can skip copies for (int i = 0; i < 4; i++) { - config->color[i].used = pixel_shader->writes_color_target(i); + uint32_t color_mask = (regs.rb_color_mask >> (i * 4)) & 0xF; + config->color[i].used = + config->mode_control == xenos::ModeControl::kColorDepth && + color_mask != 0; } config->depth_stencil.used = !!(regs.rb_depthcontrol & (0x4 | 0x2)); @@ -558,66 +607,20 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, current_state_.framebuffer = framebuffer; current_state_.framebuffer_handle = framebuffer->handle; - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = edram_buffer_; - barrier.offset = 0; - barrier.size = 0; - - // Copy EDRAM buffer into render targets with tight packing. - VkBufferImageCopy region; - region.bufferRowLength = 0; - region.bufferImageHeight = 0; - region.imageOffset = {0, 0, 0}; - // Depth auto depth_target = current_state_.framebuffer->depth_stencil_attachment; if (depth_target && current_state_.config.depth_stencil.used) { - region.imageSubresource = { - VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1}; - region.bufferOffset = depth_target->key.tile_offset * 5120; - - // Wait for any potential copies to finish. 
- barrier.offset = region.bufferOffset; - barrier.size = depth_target->key.tile_width * 80 * - depth_target->key.tile_height * 16 * 4; - vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, - &barrier, 0, nullptr); - - region.imageExtent = {depth_target->key.tile_width * 80u, - depth_target->key.tile_height * 16u, 1}; - vkCmdCopyBufferToImage(command_buffer, edram_buffer_, depth_target->image, - VK_IMAGE_LAYOUT_GENERAL, 1, &region); + UpdateTileView(command_buffer, depth_target, true); } // Color - region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; for (int i = 0; i < 4; i++) { auto target = current_state_.framebuffer->color_attachments[i]; if (!target || !current_state_.config.color[i].used) { continue; } - region.bufferOffset = target->key.tile_offset * 5120; - - // Wait for any potential copies to finish. - barrier.offset = region.bufferOffset; - barrier.size = - target->key.tile_width * 80 * target->key.tile_height * 16 * 4; - vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, - VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, - &barrier, 0, nullptr); - - region.imageExtent = {target->key.tile_width * 80u, - target->key.tile_height * 16u, 1}; - vkCmdCopyBufferToImage(command_buffer, edram_buffer_, target->image, - VK_IMAGE_LAYOUT_GENERAL, 1, &region); + UpdateTileView(command_buffer, target, true); } } if (!render_pass) { @@ -758,6 +761,7 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer, color_key.tile_width = xe::round_up(config->surface_pitch_px, 80) / 80; color_key.tile_height = xe::round_up(config->surface_height_px, 16) / 16; color_key.color_or_depth = 1; + color_key.msaa_samples = static_cast<uint16_t>(config->surface_msaa); color_key.edram_format = static_cast<uint16_t>(config->color[i].format); target_color_attachments[i] = FindOrCreateTileView(command_buffer, color_key); @@ -774,6 +778,8 @@ bool RenderCache::ConfigureRenderPass(VkCommandBuffer
command_buffer, depth_stencil_key.tile_height = xe::round_up(config->surface_height_px, 16) / 16; depth_stencil_key.color_or_depth = 0; + depth_stencil_key.msaa_samples = + static_cast<uint16_t>(config->surface_msaa); depth_stencil_key.edram_format = static_cast<uint16_t>(config->depth_stencil.format); auto target_depth_stencil_attachment = @@ -810,6 +816,51 @@ CachedTileView* RenderCache::FindOrCreateTileView( return tile_view; } +void RenderCache::UpdateTileView(VkCommandBuffer command_buffer, + CachedTileView* view, bool load, + bool insert_barrier) { + if (insert_barrier) { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = nullptr; + if (load) { + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + } else { + barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + } + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = edram_buffer_; + barrier.offset = view->key.tile_offset * 5120; + barrier.size = view->key.tile_width * 80 * view->key.tile_height * 16 * 4; + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1, + &barrier, 0, nullptr); + } + + VkBufferImageCopy region; + region.bufferOffset = view->key.tile_offset * 5120; + region.bufferRowLength = 0; + region.bufferImageHeight = 0; + region.imageSubresource = {0, 0, 0, 1}; + region.imageSubresource.aspectMask = + view->key.color_or_depth + ?
VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + region.imageOffset = {0, 0, 0}; + region.imageExtent = {view->key.tile_width * 80u, view->key.tile_height * 16u, + 1}; + if (load) { + vkCmdCopyBufferToImage(command_buffer, edram_buffer_, view->image, + VK_IMAGE_LAYOUT_GENERAL, 1, &region); + } else { + vkCmdCopyImageToBuffer(command_buffer, view->image, VK_IMAGE_LAYOUT_GENERAL, + edram_buffer_, 1, &region); + } +} + CachedTileView* RenderCache::FindTileView(const TileViewKey& view_key) const { // Check the cache. // TODO(benvanik): better lookup. @@ -837,35 +888,31 @@ void RenderCache::EndRenderPass() { // can't get the correct height atm) and we may end up overwriting the valid // contents of another render target by mistake! Need to reorder copy commands // to avoid this. - VkBufferImageCopy region; - region.bufferRowLength = 0; - region.bufferImageHeight = 0; - region.imageOffset = {0, 0, 0}; - // Depth/stencil + + std::vector<CachedTileView*> cached_views; + + // Depth auto depth_target = current_state_.framebuffer->depth_stencil_attachment; if (depth_target && current_state_.config.depth_stencil.used) { - region.imageSubresource = { - VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1}; - region.bufferOffset = depth_target->key.tile_offset * 5120; - region.imageExtent = {depth_target->key.tile_width * 80u, - depth_target->key.tile_height * 16u, 1}; - vkCmdCopyImageToBuffer(current_command_buffer_, depth_target->image, - VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, &region); + cached_views.push_back(depth_target); } // Color - region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; for (int i = 0; i < 4; i++) { auto target = current_state_.framebuffer->color_attachments[i]; if (!target || !current_state_.config.color[i].used) { continue; } - region.bufferOffset = target->key.tile_offset * 5120; - region.imageExtent = {target->key.tile_width * 80u, - target->key.tile_height * 16u, 1}; -
vkCmdCopyImageToBuffer(current_command_buffer_, target->image, - VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, &region); + cached_views.push_back(target); + } + + std::sort( + cached_views.begin(), cached_views.end(), + [](CachedTileView const* a, CachedTileView const* b) { return *a < *b; }); + + for (auto view : cached_views) { + UpdateTileView(current_command_buffer_, view, false, false); } current_command_buffer_ = nullptr; @@ -920,6 +967,7 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer, &buffer_barrier, 0, nullptr); // Issue the copy command. + // TODO(DrChat): Stencil copies. VkBufferImageCopy region; region.bufferOffset = edram_base * 5120; region.bufferImageHeight = 0; @@ -928,8 +976,7 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer, region.imageExtent = extents; region.imageSubresource = {0, 0, 0, 1}; region.imageSubresource.aspectMask = - color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT - : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; vkCmdCopyBufferToImage(command_buffer, edram_buffer_, image, image_layout, 1, &region); @@ -947,13 +994,15 @@ void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer, void RenderCache::BlitToImage(VkCommandBuffer command_buffer, uint32_t edram_base, uint32_t pitch, - uint32_t height, VkImage image, - VkImageLayout image_layout, bool color_or_depth, - uint32_t format, VkFilter filter, - VkOffset3D offset, VkExtent3D extents) { + uint32_t height, MsaaSamples num_samples, + VkImage image, VkImageLayout image_layout, + bool color_or_depth, uint32_t format, + VkFilter filter, VkOffset3D offset, + VkExtent3D extents) { // Grab a tile view that represents the source image. TileViewKey key; key.color_or_depth = color_or_depth ?
1 : 0; + key.msaa_samples = static_cast<uint16_t>(num_samples); key.edram_format = format; key.tile_offset = edram_base; key.tile_width = xe::round_up(pitch, 80) / 80; @@ -979,14 +1028,14 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer, // Update the tile view with current EDRAM contents. // TODO: Heuristics to determine if this copy is avoidable. + // TODO(DrChat): Stencil copies. VkBufferImageCopy buffer_copy; buffer_copy.bufferOffset = edram_base * 5120; buffer_copy.bufferImageHeight = 0; buffer_copy.bufferRowLength = 0; buffer_copy.imageSubresource = {0, 0, 0, 1}; buffer_copy.imageSubresource.aspectMask = - color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT - : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; buffer_copy.imageExtent = {key.tile_width * 80u, key.tile_height * 16u, 1u}; buffer_copy.imageOffset = {0, 0, 0}; vkCmdCopyBufferToImage(command_buffer, edram_buffer_, tile_view->image, @@ -1018,26 +1067,48 @@ void RenderCache::BlitToImage(VkCommandBuffer command_buffer, assert_true(extents.height <= key.tile_height * 16u); // Now issue the blit to the destination. - // TODO: Resolve to destination if necessary. - VkImageBlit image_blit; - image_blit.srcSubresource = {0, 0, 0, 1}; - image_blit.srcSubresource.aspectMask = - color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT - : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; - image_blit.srcOffsets[0] = {0, 0, 0}; - image_blit.srcOffsets[1] = {int32_t(extents.width), int32_t(extents.height), - int32_t(extents.depth)}; + if (num_samples == MsaaSamples::k1X) { + VkImageBlit image_blit; + image_blit.srcSubresource = {0, 0, 0, 1}; + image_blit.srcSubresource.aspectMask = + color_or_depth + ?
VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + image_blit.srcOffsets[0] = {0, 0, 0}; + image_blit.srcOffsets[1] = {int32_t(extents.width), int32_t(extents.height), + int32_t(extents.depth)}; - image_blit.dstSubresource = {0, 0, 0, 1}; - image_blit.dstSubresource.aspectMask = - color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT - : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; - image_blit.dstOffsets[0] = offset; - image_blit.dstOffsets[1] = {offset.x + int32_t(extents.width), - offset.y + int32_t(extents.height), - offset.z + int32_t(extents.depth)}; - vkCmdBlitImage(command_buffer, tile_view->image, VK_IMAGE_LAYOUT_GENERAL, - image, image_layout, 1, &image_blit, filter); + image_blit.dstSubresource = {0, 0, 0, 1}; + image_blit.dstSubresource.aspectMask = + color_or_depth + ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + image_blit.dstOffsets[0] = offset; + image_blit.dstOffsets[1] = {offset.x + int32_t(extents.width), + offset.y + int32_t(extents.height), + offset.z + int32_t(extents.depth)}; + vkCmdBlitImage(command_buffer, tile_view->image, VK_IMAGE_LAYOUT_GENERAL, + image, image_layout, 1, &image_blit, filter); + } else { + VkImageResolve image_resolve; + image_resolve.srcSubresource = {0, 0, 0, 1}; + image_resolve.srcSubresource.aspectMask = + color_or_depth + ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + image_resolve.srcOffset = {0, 0, 0}; + + image_resolve.dstSubresource = {0, 0, 0, 1}; + image_resolve.dstSubresource.aspectMask = + color_or_depth + ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + image_resolve.dstOffset = offset; + + image_resolve.extent = extents; + vkCmdResolveImage(command_buffer, tile_view->image, VK_IMAGE_LAYOUT_GENERAL, + image, image_layout, 1, &image_resolve); + } // Transition the image back into its previous layout. 
image_barrier.srcAccessMask = image_barrier.dstAccessMask; @@ -1052,13 +1123,14 @@ void RenderCache::ClearEDRAMColor(VkCommandBuffer command_buffer, uint32_t edram_base, ColorRenderTargetFormat format, uint32_t pitch, uint32_t height, - float* color) { + MsaaSamples num_samples, float* color) { // TODO: For formats <= 4 bpp, we can directly fill the EDRAM buffer. Just // need to detect this and calculate a value. // Grab a tile view (as we need to clear an image first) TileViewKey key; key.color_or_depth = 1; + key.msaa_samples = static_cast<uint16_t>(num_samples); key.edram_format = static_cast<uint16_t>(format); key.tile_offset = edram_base; key.tile_width = xe::round_up(pitch, 80) / 80; @@ -1091,13 +1163,15 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, uint32_t edram_base, DepthRenderTargetFormat format, uint32_t pitch, uint32_t height, - float depth, uint32_t stencil) { + MsaaSamples num_samples, float depth, + uint32_t stencil) { // TODO: For formats <= 4 bpp, we can directly fill the EDRAM buffer. Just // need to detect this and calculate a value. // Grab a tile view (as we need to clear an image first) TileViewKey key; key.color_or_depth = 0; + key.msaa_samples = static_cast<uint16_t>(num_samples); key.edram_format = static_cast<uint16_t>(format); key.tile_offset = edram_base; key.tile_width = xe::round_up(pitch, 80) / 80; @@ -1117,12 +1191,13 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, VK_IMAGE_LAYOUT_GENERAL, &clear_value, 1, &range); // Copy image back into EDRAM buffer + // TODO(DrChat): Stencil copies.
VkBufferImageCopy copy_range; copy_range.bufferOffset = edram_base * 5120; copy_range.bufferImageHeight = 0; copy_range.bufferRowLength = 0; copy_range.imageSubresource = { - VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0, 1, + VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0, 1, }; copy_range.imageExtent = {key.tile_width * 80u, key.tile_height * 16u, 1u}; copy_range.imageOffset = {0, 0, 0}; @@ -1131,6 +1206,11 @@ void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, &copy_range); } +void RenderCache::FillEDRAM(VkCommandBuffer command_buffer, uint32_t value) { + vkCmdFillBuffer(command_buffer, edram_buffer_, 0, kEdramBufferCapacity, + value); +} + bool RenderCache::SetShadowRegister(uint32_t* dest, uint32_t register_name) { uint32_t value = register_file_->values[register_name].u32; if (*dest == value) { diff --git a/src/xenia/gpu/vulkan/render_cache.h b/src/xenia/gpu/vulkan/render_cache.h index 2e8d1c5fe..86edac7bc 100644 --- a/src/xenia/gpu/vulkan/render_cache.h +++ b/src/xenia/gpu/vulkan/render_cache.h @@ -38,9 +38,9 @@ struct TileViewKey { // 1 if format is ColorRenderTargetFormat, else DepthRenderTargetFormat. uint16_t color_or_depth : 1; // Surface MSAA samples - // uint16_t msaa_samples : 2; + uint16_t msaa_samples : 2; // Either ColorRenderTargetFormat or DepthRenderTargetFormat. uint16_t edram_format : 15; // 13; + uint16_t edram_format : 13; }; static_assert(sizeof(TileViewKey) == 8, "Key must be tightly packed"); @@ -69,6 +69,10 @@ class CachedTileView { return *a == *b; } + bool operator<(const CachedTileView& other) const { + return key.tile_offset < other.key.tile_offset; + } + private: VkDevice device_ = nullptr; }; @@ -278,22 +282,26 @@ class RenderCache { // Queues commands to blit EDRAM contents into an image. // The command buffer must not be inside of a render pass when calling this.
void BlitToImage(VkCommandBuffer command_buffer, uint32_t edram_base, - uint32_t pitch, uint32_t height, VkImage image, - VkImageLayout image_layout, bool color_or_depth, - uint32_t format, VkFilter filter, VkOffset3D offset, - VkExtent3D extents); + uint32_t pitch, uint32_t height, MsaaSamples num_samples, + VkImage image, VkImageLayout image_layout, + bool color_or_depth, uint32_t format, VkFilter filter, + VkOffset3D offset, VkExtent3D extents); // Queues commands to clear EDRAM contents with a solid color. // The command buffer must not be inside of a render pass when calling this. void ClearEDRAMColor(VkCommandBuffer command_buffer, uint32_t edram_base, ColorRenderTargetFormat format, uint32_t pitch, - uint32_t height, float* color); + uint32_t height, MsaaSamples num_samples, float* color); // Queues commands to clear EDRAM contents with depth/stencil values. // The command buffer must not be inside of a render pass when calling this. void ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, uint32_t edram_base, DepthRenderTargetFormat format, uint32_t pitch, - uint32_t height, float depth, uint32_t stencil); + uint32_t height, MsaaSamples num_samples, + float depth, uint32_t stencil); + // Queues commands to fill EDRAM contents with a constant value. + // The command buffer must not be inside of a render pass when calling this. + void FillEDRAM(VkCommandBuffer command_buffer, uint32_t value); private: // Parses the current state into a configuration object. @@ -306,6 +314,9 @@ class RenderCache { CachedTileView* FindOrCreateTileView(VkCommandBuffer command_buffer, const TileViewKey& view_key); + void UpdateTileView(VkCommandBuffer command_buffer, CachedTileView* view, + bool load, bool insert_barrier = true); + // Gets or creates a render pass and frame buffer for the given configuration. // This attempts to reuse as much as possible across render passes and // framebuffers. 
@@ -335,6 +346,7 @@ class RenderCache { struct ShadowRegisters { uint32_t rb_modecontrol; uint32_t rb_surface_info; + uint32_t rb_color_mask; uint32_t rb_color_info; uint32_t rb_color1_info; uint32_t rb_color2_info; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 1d559d896..fd604733b 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -152,19 +152,8 @@ void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, // TODO(benvanik): move to CP or to host (trace dump, etc). // This only needs to surround a vkQueueSubmit. - static uint32_t frame = 0; - if (device_->is_renderdoc_attached() && - (FLAGS_vulkan_renderdoc_capture_all || - trace_state_ == TraceState::kSingleFrame)) { - if (queue_mutex_) { - queue_mutex_->lock(); - } - - device_->BeginRenderDocFrameCapture(); - - if (queue_mutex_) { - queue_mutex_->unlock(); - } + if (queue_mutex_) { + queue_mutex_->lock(); } // TODO(DrChat): If setup buffer is empty, don't bother queueing it up. @@ -182,45 +171,37 @@ void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, submit_info.signalSemaphoreCount = 0; submit_info.pSignalSemaphores = nullptr; if (queue_mutex_) { - queue_mutex_->lock(); + // queue_mutex_->lock(); } status = vkQueueSubmit(queue_, 1, &submit_info, *current_batch_fence_); if (queue_mutex_) { - queue_mutex_->unlock(); + // queue_mutex_->unlock(); } CheckResult(status, "vkQueueSubmit"); + // TODO(DrChat): Disable this completely. 
VkFence fences[] = {*current_batch_fence_}; status = vkWaitForFences(*device_, 1, fences, true, -1); CheckResult(status, "vkWaitForFences"); - if (device_->is_renderdoc_attached() && - (FLAGS_vulkan_renderdoc_capture_all || - trace_state_ == TraceState::kSingleFrame)) { - if (queue_mutex_) { - queue_mutex_->lock(); - } - + if (device_->is_renderdoc_attached() && capturing_) { device_->EndRenderDocFrameCapture(); + capturing_ = false; // HACK(DrChat): Used b/c I disabled trace saving code in the CP. // Remove later. if (!trace_writer_.is_open()) { trace_state_ = TraceState::kDisabled; } - - if (queue_mutex_) { - queue_mutex_->unlock(); - } + } + if (queue_mutex_) { + queue_mutex_->unlock(); } // Scavenging. current_command_buffer_ = nullptr; current_setup_buffer_ = nullptr; - while (command_buffer_pool_->has_pending()) { - command_buffer_pool_->Scavenge(); - xe::threading::MaybeYield(); - } + command_buffer_pool_->Scavenge(); texture_cache_->Scavenge(); current_batch_fence_ = nullptr; @@ -331,6 +312,22 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, vkBeginCommandBuffer(current_setup_buffer_, &command_buffer_begin_info); CheckResult(status, "vkBeginCommandBuffer"); + static uint32_t frame = 0; + if (device_->is_renderdoc_attached() && !capturing_ && + (FLAGS_vulkan_renderdoc_capture_all || + trace_state_ == TraceState::kSingleFrame)) { + if (queue_mutex_) { + queue_mutex_->lock(); + } + + capturing_ = true; + device_->BeginRenderDocFrameCapture(); + + if (queue_mutex_) { + queue_mutex_->unlock(); + } + } + started_command_buffer = true; } auto command_buffer = current_command_buffer_; @@ -357,6 +354,10 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, current_render_state_ = render_cache_->BeginRenderPass( command_buffer, vertex_shader, pixel_shader); if (!current_render_state_) { + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = 
nullptr; return false; } } @@ -378,18 +379,30 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, // Pass registers to the shaders. if (!PopulateConstants(command_buffer, vertex_shader, pixel_shader)) { render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; return false; } // Upload and bind index buffer data (if we have any). if (!PopulateIndexBuffer(command_buffer, index_buffer_info)) { render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; return false; } // Upload and bind all vertex buffer data. if (!PopulateVertexBuffers(command_buffer, vertex_shader)) { render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; return false; } @@ -423,6 +436,10 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, bool VulkanCommandProcessor::PopulateConstants(VkCommandBuffer command_buffer, VulkanShader* vertex_shader, VulkanShader* pixel_shader) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + // Upload the constants the shaders require. // These are optional, and if none are defined 0 will be returned. auto constant_offsets = buffer_cache_->UploadConstantRegisters( @@ -742,7 +759,7 @@ bool VulkanCommandProcessor::IssueCopy() { tex_info.size_2d.input_height = dest_block_height; tex_info.size_2d.input_pitch = copy_dest_pitch * 4; auto texture = texture_cache_->DemandResolveTexture( - tex_info, ColorFormatToTextureFormat(copy_dest_format), nullptr, nullptr); + tex_info, ColorFormatToTextureFormat(copy_dest_format), nullptr); if (texture->image_layout == VK_IMAGE_LAYOUT_UNDEFINED) { // Transition the image to a general layout. 
VkImageMemoryBarrier image_barrier; @@ -810,8 +827,9 @@ bool VulkanCommandProcessor::IssueCopy() { case CopyCommand::kConvert: render_cache_->BlitToImage( command_buffer, edram_base, surface_pitch, resolve_extent.height, - texture->image, texture->image_layout, copy_src_select <= 3, - src_format, VK_FILTER_LINEAR, resolve_offset, resolve_extent); + surface_msaa, texture->image, texture->image_layout, + copy_src_select <= 3, src_format, VK_FILTER_LINEAR, resolve_offset, + resolve_extent); break; case CopyCommand::kConstantOne: @@ -839,7 +857,7 @@ bool VulkanCommandProcessor::IssueCopy() { // TODO(DrChat): Do we know the surface height at this point? render_cache_->ClearEDRAMColor(command_buffer, color_edram_base, color_format, surface_pitch, - resolve_extent.height, color); + resolve_extent.height, surface_msaa, color); } if (depth_clear_enabled) { @@ -850,7 +868,7 @@ bool VulkanCommandProcessor::IssueCopy() { // TODO(DrChat): Do we know the surface height at this point? render_cache_->ClearEDRAMDepthStencil( command_buffer, depth_edram_base, depth_format, surface_pitch, - resolve_extent.height, depth, stencil); + resolve_extent.height, surface_msaa, depth, stencil); } return true; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index c87c515c0..287e4f65e 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -94,6 +94,7 @@ class VulkanCommandProcessor : public CommandProcessor { // Last copy base address, for debugging only. uint32_t last_copy_base_ = 0; + bool capturing_ = false; std::unique_ptr<BufferCache> buffer_cache_; std::unique_ptr<PipelineCache> pipeline_cache_;