diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 95744b49c..172c63660 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -101,6 +101,10 @@ void D3D12CommandProcessor::RestoreEdramSnapshot(const void* snapshot) { uint32_t D3D12CommandProcessor::GetCurrentColorMask( uint32_t shader_writes_color_targets) const { auto& regs = *register_file_; + if (regs.Get().edram_mode != + xenos::ModeControl::kColorDepth) { + return 0; + } uint32_t color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32 & 0xFFFF; for (uint32_t i = 0; i < 4; ++i) { if (!(shader_writes_color_targets & (1 << i))) { @@ -1801,12 +1805,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, SCOPE_profile_cpu_f("gpu"); #endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES - xenos::ModeControl enable_mode = regs.Get().edram_mode; - if (enable_mode == xenos::ModeControl::kIgnore) { - // Ignored. - return true; - } - if (enable_mode == xenos::ModeControl::kCopy) { + xenos::ModeControl edram_mode = regs.Get().edram_mode; + if (edram_mode == xenos::ModeControl::kCopy) { // Special copy handling. return IssueCopy(); } @@ -1818,64 +1818,63 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, return true; } - // Shaders will have already been defined by previous loads. - // We need them to do just about anything so validate here. + // Vertex shader. auto vertex_shader = static_cast(active_vertex_shader()); - auto pixel_shader = static_cast(active_pixel_shader()); if (!vertex_shader) { // Always need a vertex shader. return false; } - // Depth-only mode doesn't need a pixel shader. - if (enable_mode == xenos::ModeControl::kDepth) { - pixel_shader = nullptr; - } else if (!pixel_shader) { - // Need a pixel shader in normal color mode. 
- return false; - } - // Gather shader ucode information to get the color mask, which is needed by - // the render target cache, and memexport configuration, and also get the - // current shader modification bits. - DxbcShaderTranslator::Modification vertex_shader_modification; - DxbcShaderTranslator::Modification pixel_shader_modification; - if (!pipeline_cache_->AnalyzeShaderUcodeAndGetCurrentModifications( - vertex_shader, pixel_shader, vertex_shader_modification, - pixel_shader_modification)) { - return false; - } - D3D12Shader::D3D12Translation* vertex_shader_translation = - static_cast( - vertex_shader->GetOrCreateTranslation( - vertex_shader_modification.value)); - D3D12Shader::D3D12Translation* pixel_shader_translation = - pixel_shader ? static_cast( - pixel_shader->GetOrCreateTranslation( - pixel_shader_modification.value)) - : nullptr; - bool tessellated = vertex_shader_modification.host_vertex_shader_type != - Shader::HostVertexShaderType::kVertex; - - // Check if memexport is used. If it is, we can't skip draw calls that have no - // visual effect. 
+ pipeline_cache_->AnalyzeShaderUcode(*vertex_shader); bool memexport_used_vertex = !vertex_shader->memexport_stream_constants().empty(); - bool memexport_used_pixel = - pixel_shader != nullptr && - !pixel_shader->memexport_stream_constants().empty(); - bool memexport_used = memexport_used_vertex || memexport_used_pixel; - + DxbcShaderTranslator::Modification vertex_shader_modification; + if (!pipeline_cache_->GetCurrentShaderModification( + *vertex_shader, vertex_shader_modification)) { + // No valid host vertex shader type for the current state. + return false; + } + bool tessellated = vertex_shader_modification.host_vertex_shader_type != + Shader::HostVertexShaderType::kVertex; bool primitive_polygonal = xenos::IsPrimitivePolygonal(tessellated, primitive_type); - auto sq_program_cntl = regs.Get(); - auto pa_su_sc_mode_cntl = regs.Get(); - if (!memexport_used_vertex && - (sq_program_cntl.vs_export_mode == - xenos::VertexShaderExportMode::kMultipass || - (primitive_polygonal && pa_su_sc_mode_cntl.cull_front && - pa_su_sc_mode_cntl.cull_back))) { - // All faces are culled - can't be expressed in the pipeline. - return true; + + // Pixel shader. + D3D12Shader* pixel_shader = nullptr; + if (draw_util::IsRasterizationPotentiallyDone(regs, primitive_polygonal)) { + // See xenos::ModeControl for explanation why the pixel shader is only used + // when it's kColorDepth here. + if (edram_mode == xenos::ModeControl::kColorDepth) { + pixel_shader = static_cast(active_pixel_shader()); + if (pixel_shader) { + pipeline_cache_->AnalyzeShaderUcode(*pixel_shader); + if (!draw_util::IsPixelShaderNeededWithRasterization(*pixel_shader, + regs)) { + pixel_shader = nullptr; + } + } + } + } else { + // Disabling pixel shader for this case is also required by the pipeline + // cache. + if (!memexport_used_vertex) { + // This draw has no effect. 
+ return true; + } } + bool memexport_used_pixel; + DxbcShaderTranslator::Modification pixel_shader_modification; + if (pixel_shader) { + memexport_used_pixel = !pixel_shader->memexport_stream_constants().empty(); + if (!pipeline_cache_->GetCurrentShaderModification( + *pixel_shader, pixel_shader_modification)) { + return false; + } + } else { + memexport_used_pixel = false; + pixel_shader_modification = DxbcShaderTranslator::Modification(0); + } + + bool memexport_used = memexport_used_vertex || memexport_used_pixel; BeginSubmission(true); @@ -1953,6 +1949,15 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, } // Translate the shaders and create the pipeline if needed. + D3D12Shader::D3D12Translation* vertex_shader_translation = + static_cast( + vertex_shader->GetOrCreateTranslation( + vertex_shader_modification.value)); + D3D12Shader::D3D12Translation* pixel_shader_translation = + pixel_shader ? static_cast( + pixel_shader->GetOrCreateTranslation( + pixel_shader_modification.value)) + : nullptr; void* pipeline_handle; ID3D12RootSignature* root_signature; if (!pipeline_cache_->ConfigurePipeline( @@ -2844,7 +2849,7 @@ void D3D12CommandProcessor::UpdateFixedFunctionState( Register stencil_ref_mask_reg; auto pa_su_sc_mode_cntl = regs.Get(); if (primitive_polygonal && - regs.Get().backface_enable && + draw_util::GetDepthControlForCurrentEdramMode(regs).backface_enable && pa_su_sc_mode_cntl.cull_front && !pa_su_sc_mode_cntl.cull_back) { stencil_ref_mask_reg = XE_GPU_REG_RB_STENCILREFMASK_BF; } else { @@ -2880,7 +2885,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( float rb_alpha_ref = regs[XE_GPU_REG_RB_ALPHA_REF].f32; auto rb_colorcontrol = regs.Get(); auto rb_depth_info = regs.Get(); - auto rb_depthcontrol = regs.Get(); + auto rb_depthcontrol = draw_util::GetDepthControlForCurrentEdramMode(regs); auto rb_stencilrefmask = regs.Get(); auto rb_stencilrefmask_bf = regs.Get(XE_GPU_REG_RB_STENCILREFMASK_BF); @@ -3068,24 +3073,11 @@ 
void D3D12CommandProcessor::UpdateSystemConstantValues( } // Conversion to Direct3D 12 normalized device coordinates. - // Kill all primitives if multipass or both faces are culled, but still need - // to do memexport. - if (sq_program_cntl.vs_export_mode == - xenos::VertexShaderExportMode::kMultipass || - (primitive_polygonal && pa_su_sc_mode_cntl.cull_front && - pa_su_sc_mode_cntl.cull_back)) { - float nan_value = std::nanf(""); - for (uint32_t i = 0; i < 3; ++i) { - dirty |= !std::isnan(system_constants_.ndc_scale[i]); - system_constants_.ndc_scale[i] = nan_value; - } - } else { - for (uint32_t i = 0; i < 3; ++i) { - dirty |= system_constants_.ndc_scale[i] != viewport_info.ndc_scale[i]; - dirty |= system_constants_.ndc_offset[i] != viewport_info.ndc_offset[i]; - system_constants_.ndc_scale[i] = viewport_info.ndc_scale[i]; - system_constants_.ndc_offset[i] = viewport_info.ndc_offset[i]; - } + for (uint32_t i = 0; i < 3; ++i) { + dirty |= system_constants_.ndc_scale[i] != viewport_info.ndc_scale[i]; + dirty |= system_constants_.ndc_offset[i] != viewport_info.ndc_offset[i]; + system_constants_.ndc_scale[i] = viewport_info.ndc_scale[i]; + system_constants_.ndc_offset[i] = viewport_info.ndc_offset[i]; } // Point size. 
diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index c29dd4c0d..5e2289306 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -33,6 +33,7 @@ #include "xenia/base/string_buffer.h" #include "xenia/base/xxhash.h" #include "xenia/gpu/d3d12/d3d12_command_processor.h" +#include "xenia/gpu/draw_util.h" #include "xenia/gpu/gpu_flags.h" #include "xenia/ui/d3d12/d3d12_util.h" @@ -857,32 +858,30 @@ D3D12Shader* PipelineCache::LoadShader(xenos::ShaderType shader_type, return shader; } -bool PipelineCache::AnalyzeShaderUcodeAndGetCurrentModifications( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, - DxbcShaderTranslator::Modification& vertex_shader_modification_out, - DxbcShaderTranslator::Modification& pixel_shader_modification_out) { - Shader::HostVertexShaderType host_vertex_shader_type = - GetCurrentHostVertexShaderTypeIfValid(); - if (host_vertex_shader_type == Shader::HostVertexShaderType(-1)) { - return false; - } +bool PipelineCache::GetCurrentShaderModification( + const Shader& shader, + DxbcShaderTranslator::Modification& modification_out) const { + assert_true(shader.is_ucode_analyzed()); const auto& regs = register_file_; auto sq_program_cntl = regs.Get(); - - vertex_shader->AnalyzeUcode(ucode_disasm_buffer_); - vertex_shader_modification_out = DxbcShaderTranslator::Modification( - shader_translator_->GetDefaultModification( - xenos::ShaderType::kVertex, - vertex_shader->GetDynamicAddressableRegisterCount( - sq_program_cntl.vs_num_reg), - host_vertex_shader_type)); - - if (pixel_shader) { - pixel_shader->AnalyzeUcode(ucode_disasm_buffer_); + if (shader.type() == xenos::ShaderType::kVertex) { + Shader::HostVertexShaderType host_vertex_shader_type = + GetCurrentHostVertexShaderTypeIfValid(); + if (host_vertex_shader_type == Shader::HostVertexShaderType(-1)) { + return false; + } + modification_out = DxbcShaderTranslator::Modification( + 
shader_translator_->GetDefaultModification( + xenos::ShaderType::kVertex, + shader.GetDynamicAddressableRegisterCount( + sq_program_cntl.vs_num_reg), + host_vertex_shader_type)); + } else { + assert_true(shader.type() == xenos::ShaderType::kPixel); DxbcShaderTranslator::Modification pixel_shader_modification( shader_translator_->GetDefaultModification( xenos::ShaderType::kPixel, - pixel_shader->GetDynamicAddressableRegisterCount( + shader.GetDynamicAddressableRegisterCount( sq_program_cntl.ps_num_reg))); if (!edram_rov_used_) { using DepthStencilMode = @@ -891,7 +890,7 @@ bool PipelineCache::AnalyzeShaderUcodeAndGetCurrentModifications( flags::DepthFloat24Conversion::kOnOutputTruncating || depth_float24_conversion_ == flags::DepthFloat24Conversion::kOnOutputRounding) && - regs.Get().z_enable && + draw_util::GetDepthControlForCurrentEdramMode(regs).z_enable && regs.Get().depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { pixel_shader_modification.depth_stencil_mode = @@ -900,11 +899,10 @@ bool PipelineCache::AnalyzeShaderUcodeAndGetCurrentModifications( ? 
DepthStencilMode::kFloat24Truncating : DepthStencilMode::kFloat24Rounding; } else { - auto rb_colorcontrol = regs.Get(); - if (pixel_shader->implicit_early_z_write_allowed() && - (!rb_colorcontrol.alpha_test_enable || - rb_colorcontrol.alpha_func == xenos::CompareFunction::kAlways) && - !rb_colorcontrol.alpha_to_mask_enable) { + if (shader.implicit_early_z_write_allowed() && + (!shader.writes_color_target(0) || + !draw_util::DoesCoverageDependOnAlpha( + regs.Get()))) { pixel_shader_modification.depth_stencil_mode = DepthStencilMode::kEarlyHint; } else { @@ -913,11 +911,7 @@ bool PipelineCache::AnalyzeShaderUcodeAndGetCurrentModifications( } } } - pixel_shader_modification_out = pixel_shader_modification; - } else { - pixel_shader_modification_out = DxbcShaderTranslator::Modification( - shader_translator_->GetDefaultModification(xenos::ShaderType::kPixel, - 0)); + modification_out = pixel_shader_modification; } return true; } @@ -1336,6 +1330,21 @@ bool PipelineCache::GetCurrentStateDescription( bool tessellated = DxbcShaderTranslator::Modification(vertex_shader->modification()) .host_vertex_shader_type != Shader::HostVertexShaderType::kVertex; + bool primitive_polygonal = + xenos::IsPrimitivePolygonal(tessellated, primitive_type); + bool rasterization_enabled = + draw_util::IsRasterizationPotentiallyDone(regs, primitive_polygonal); + // In Direct3D, rasterization (along with pixel counting) is disabled by + // disabling the pixel shader and depth / stencil. However, if rasterization + // should be disabled, the pixel shader must be disabled externally, to ensure + // things like texture binding layout is correct for the shader actually being + // used (don't replace anything here). + if (!rasterization_enabled) { + assert_null(pixel_shader); + if (pixel_shader) { + return false; + } + } // Root signature. 
runtime_description_out.root_signature = command_processor_.GetRootSignature( @@ -1347,17 +1356,11 @@ bool PipelineCache::GetCurrentStateDescription( return false; } - // Shaders. + // Vertex shader. runtime_description_out.vertex_shader = vertex_shader; description_out.vertex_shader_hash = vertex_shader->shader().ucode_data_hash(); description_out.vertex_shader_modification = vertex_shader->modification(); - if (pixel_shader) { - runtime_description_out.pixel_shader = pixel_shader; - description_out.pixel_shader_hash = - pixel_shader->shader().ucode_data_hash(); - description_out.pixel_shader_modification = pixel_shader->modification(); - } // Index buffer strip cut value. if (pa_su_sc_mode_cntl.multi_prim_ib_ena) { @@ -1411,8 +1414,20 @@ bool PipelineCache::GetCurrentStateDescription( } } - bool primitive_polygonal = - xenos::IsPrimitivePolygonal(tessellated, primitive_type); + // The rest doesn't matter when rasterization is disabled (thus no writing to + // anywhere from post-geometry stages and no samples are counted). + if (!rasterization_enabled) { + description_out.cull_mode = PipelineCullMode::kDisableRasterization; + return true; + } + + // Pixel shader. + if (pixel_shader) { + runtime_description_out.pixel_shader = pixel_shader; + description_out.pixel_shader_hash = + pixel_shader->shader().ucode_data_hash(); + description_out.pixel_shader_modification = pixel_shader->modification(); + } // Rasterizer state. // Because Direct3D 12 doesn't support per-side fill mode and depth bias, the @@ -1428,7 +1443,8 @@ bool PipelineCache::GetCurrentStateDescription( // developer didn't want to fill the whole primitive and use wireframe (like // Xenos fill mode 1). // Here we also assume that only one side is culled - if two sides are culled, - // the D3D12 command processor will drop such draw early. + // rasterization will be disabled externally, or the draw call will be dropped + // early if the vertex shader doesn't export to memory. 
bool cull_front, cull_back; float poly_offset = 0.0f, poly_offset_scale = 0.0f; if (primitive_polygonal) { @@ -1436,6 +1452,9 @@ bool PipelineCache::GetCurrentStateDescription( cull_front = pa_su_sc_mode_cntl.cull_front != 0; cull_back = pa_su_sc_mode_cntl.cull_back != 0; if (cull_front) { + // The case when both faces are culled should be handled by disabling + // rasterization. + assert_false(cull_back); description_out.cull_mode = PipelineCullMode::kFront; } else if (cull_back) { description_out.cull_mode = PipelineCullMode::kBack; @@ -1522,7 +1541,8 @@ bool PipelineCache::GetCurrentStateDescription( // Depth/stencil. No stencil, always passing depth test and no depth writing // means depth disabled. if (render_targets[4].format != DXGI_FORMAT_UNKNOWN) { - auto rb_depthcontrol = regs.Get(); + auto rb_depthcontrol = + draw_util::GetDepthControlForCurrentEdramMode(regs); if (rb_depthcontrol.z_enable) { description_out.depth_func = rb_depthcontrol.zfunc; description_out.depth_write = rb_depthcontrol.z_write_enable; @@ -1864,6 +1884,9 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_BACK; break; default: + assert_true(description.cull_mode == PipelineCullMode::kNone || + description.cull_mode == + PipelineCullMode::kDisableRasterization); state_desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE; break; } @@ -1990,6 +2013,23 @@ ID3D12PipelineState* PipelineCache::CreateD3D12Pipeline( } } + // Disable rasterization if needed (parameter combinations that make no + // difference when rasterization is disabled have already been handled in + // GetCurrentStateDescription) the way it's disabled in Direct3D by design + // (disabling a pixel shader and depth / stencil). 
+ // TODO(Triang3l): When it happens to be that a combination of parameters + // (no host pixel shader and depth / stencil without ROV) would disable + // rasterization when it's still needed (for occlusion query sample counting), + // ensure rasterization happens (by binding an empty pixel shader, or maybe + // via ForcedSampleCount when not using 2x MSAA - its requirements for + // OMSetRenderTargets need some investigation though). + if (description.cull_mode == PipelineCullMode::kDisableRasterization) { + state_desc.PS.pShaderBytecode = nullptr; + state_desc.PS.BytecodeLength = 0; + state_desc.DepthStencilState.DepthEnable = FALSE; + state_desc.DepthStencilState.StencilEnable = FALSE; + } + // Create the D3D12 pipeline state object. auto device = command_processor_.GetD3D12Context().GetD3D12Provider().GetDevice(); diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index 9a733e40a..67ba0a993 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -63,14 +63,19 @@ class PipelineCache { D3D12Shader* LoadShader(xenos::ShaderType shader_type, const uint32_t* host_address, uint32_t dword_count); + // Analyze shader microcode on the translator thread. + void AnalyzeShaderUcode(Shader& shader) { + shader.AnalyzeUcode(ucode_disasm_buffer_); + } - // Ensures microcode is analyzed, retrieves the shader modifications for the - // current state, and returns whether they are valid. - bool AnalyzeShaderUcodeAndGetCurrentModifications( - D3D12Shader* vertex_shader, D3D12Shader* pixel_shader, - DxbcShaderTranslator::Modification& vertex_shader_modification_out, - DxbcShaderTranslator::Modification& pixel_shader_modification_out); + // Retrieves the shader modification for the current state, and returns + // whether it is valid. The shader must have microcode analyzed. 
+ bool GetCurrentShaderModification( + const Shader& shader, + DxbcShaderTranslator::Modification& modification_out) const; + // If draw_util::IsRasterizationPotentiallyDone is false, the pixel shader + // MUST be made nullptr BEFORE calling this! bool ConfigurePipeline( D3D12Shader::D3D12Translation* vertex_shader, D3D12Shader::D3D12Translation* pixel_shader, @@ -134,6 +139,8 @@ class PipelineCache { kNone, kFront, kBack, + // Special case, handled via disabling the pixel shader and depth / stencil. + kDisableRasterization, }; enum class PipelineBlendFactor : uint32_t { @@ -234,6 +241,8 @@ class PipelineCache { IDxcUtils* dxc_utils = nullptr, IDxcCompiler* dxc_compiler = nullptr); + // If draw_util::IsRasterizationPotentiallyDone is false, the pixel shader + // MUST be made nullptr BEFORE calling this! bool GetCurrentStateDescription( D3D12Shader::D3D12Translation* vertex_shader, D3D12Shader::D3D12Translation* pixel_shader, diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index f5a4e0c6b..989e20a64 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -647,7 +647,7 @@ bool RenderTargetCache::UpdateRenderTargets( formats_are_64bpp[i] = xenos::IsColorRenderTargetFormat64bpp( xenos::ColorRenderTargetFormat(formats[i])); } - auto rb_depthcontrol = regs.Get(); + auto rb_depthcontrol = draw_util::GetDepthControlForCurrentEdramMode(regs); auto rb_depth_info = regs.Get(); // 0x1 = stencil test, 0x2 = depth test. 
enabled[4] = rb_depthcontrol.stencil_enable || rb_depthcontrol.z_enable; diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index a9d9fff92..1b7cf33c8 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -111,6 +111,78 @@ int32_t FloatToD3D11Fixed16p8(float f32) { return result.s; } +bool IsRasterizationPotentiallyDone(const RegisterFile& regs, + bool primitive_polygonal) { + // TODO(Triang3l): Investigate ModeControl::kIgnore better, with respect to + // sample counting. Let's assume sample counting is a part of depth / stencil, + // thus disabled too. + xenos::ModeControl edram_mode = regs.Get().edram_mode; + if (edram_mode != xenos::ModeControl::kColorDepth && + edram_mode != xenos::ModeControl::kDepth) { + return false; + } + auto sq_program_cntl = regs.Get(); + if (sq_program_cntl.vs_export_mode == + xenos::VertexShaderExportMode::kMultipass) { + return false; + } + if (primitive_polygonal) { + auto pa_su_sc_mode_cntl = regs.Get(); + if (pa_su_sc_mode_cntl.cull_front && pa_su_sc_mode_cntl.cull_back) { + // Both faces are culled. + return false; + } + } + return true; +} + +bool IsPixelShaderNeededWithRasterization(const Shader& shader, + const RegisterFile& regs) { + assert_true(shader.type() == xenos::ShaderType::kPixel); + assert_true(shader.is_ucode_analyzed()); + + // See xenos::ModeControl for explanation why the pixel shader is only used + // when it's kColorDepth here. + if (regs.Get().edram_mode != + xenos::ModeControl::kColorDepth) { + return false; + } + + // Discarding (explicitly or through alphatest or alpha to coverage) has side + // effects on pixel counting. + // + // Depth output only really matters if depth test is active, but it's used + // extremely rarely, and pretty much always intentionally - for simplicity, + // consider it as always mattering. + // + // Memory export is an obvious intentional side effect. 
+ if (shader.kills_pixels() || shader.writes_depth() || + !shader.memexport_stream_constants().empty() || + (shader.writes_color_target(0) && + DoesCoverageDependOnAlpha(regs.Get()))) { + return true; + } + + // Check if a color target is actually written. + uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32; + uint32_t rts_remaining = shader.writes_color_targets(); + uint32_t rt_index; + while (xe::bit_scan_forward(rts_remaining, &rt_index)) { + rts_remaining &= ~(uint32_t(1) << rt_index); + uint32_t format_component_count = GetColorRenderTargetFormatComponentCount( + regs.Get( + reg::RB_COLOR_INFO::rt_register_indices[rt_index]) + .color_format); + if ((rb_color_mask >> (rt_index * 4)) & + ((uint32_t(1) << format_component_count) - 1)) { + return true; + } + } + + // Only depth / stencil passthrough potentially. + return false; +} + void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x, float pixel_size_y, bool origin_bottom_left, float x_max, float y_max, bool allow_reverse_z, @@ -271,7 +343,8 @@ void GetHostViewportInfo(const RegisterFile& regs, float pixel_size_x, ndc_scale_z = -ndc_scale_z; ndc_offset_z = 1.0f - ndc_offset_z; } - if (convert_z_to_float24 && regs.Get().z_enable && + if (convert_z_to_float24 && + GetDepthControlForCurrentEdramMode(regs).z_enable && regs.Get().depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { // Need to adjust the bounds that the resulting depth values will be clamped diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index c47640a20..f3cff25d2 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -16,6 +16,7 @@ #include "xenia/base/assert.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/registers.h" +#include "xenia/gpu/shader.h" #include "xenia/gpu/trace_writer.h" #include "xenia/gpu/xenos.h" #include "xenia/memory.h" @@ -33,6 +34,45 @@ namespace draw_util { // for use with the top-left rasterization rule later. 
int32_t FloatToD3D11Fixed16p8(float f32); +// Whether with the current state, any samples to rasterize (for any reason, not +// only to write something to a render target, but also to do sample counting or +// pixel shader memexport) can be generated. Finally dropping draw calls can +// only be done if the vertex shader doesn't memexport. +bool IsRasterizationPotentiallyDone(const RegisterFile& regs, + bool primitive_polygonal); + +inline reg::RB_DEPTHCONTROL GetDepthControlForCurrentEdramMode( + const RegisterFile& regs) { + xenos::ModeControl edram_mode = regs.Get().edram_mode; + if (edram_mode != xenos::ModeControl::kColorDepth && + edram_mode != xenos::ModeControl::kDepth) { + // Both depth and stencil disabled (EDRAM depth and stencil ignored). + reg::RB_DEPTHCONTROL disabled; + disabled.value = 0; + return disabled; + } + return regs.Get(); +} + +inline bool DoesCoverageDependOnAlpha(reg::RB_COLORCONTROL rb_colorcontrol) { + return (rb_colorcontrol.alpha_test_enable && + rb_colorcontrol.alpha_func != xenos::CompareFunction::kAlways) || + rb_colorcontrol.alpha_to_mask_enable; +} + +// Whether the pixel shader is needed on the host or can be disabled to speed up +// depth pre-passes and shadowmaps. The shader must have its ucode analyzed. If +// not IsRasterizationPotentiallyDone, this shouldn't be called, and assumed +// false instead. Helps reject the pixel shader in some cases - memexport draws +// in Halo 3, and also most of some 1-point draws not covering anything done for +// some reason in different games with a leftover pixel shader from the previous +// draw, but with SQ_PROGRAM_CNTL destroyed, reducing the number of +// unpredictable unneeded translations of random shaders with different host +// modification bits, such as register count and depth format-related (though +// shaders with side effects on depth or memory export will still be preserved). 
+bool IsPixelShaderNeededWithRasterization(const Shader& shader, + const RegisterFile& regs); + struct ViewportInfo { // The returned viewport will always be in the positive quarter-plane for // simplicity of clamping to the maximum size supported by the host, negative diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 9f849ee8b..c9e737a5f 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -892,11 +892,11 @@ class Shader { // TODO(Triang3l): Investigate what happens to memexport when the pixel // fails the depth/stencil test, but in Direct3D 11 UAV writes disable early // depth/stencil. - return !writes_depth() && !kills_pixels() && + return !kills_pixels() && !writes_depth() && memexport_stream_constants().empty(); } - // Whether each color render target is written to on any exection path. + // Whether each color render target is written to on any execution path. uint32_t writes_color_targets() const { return writes_color_targets_; } bool writes_color_target(uint32_t i) const { return (writes_color_targets() & (uint32_t(1) << i)) != 0; @@ -954,8 +954,9 @@ class Shader { // compiled when a new material appears in the game, and having the order of // draws also matter in such unpredictable way would break this rule; limit // the effect to shaders with dynamic register addressing only, which are - // extremely rare), also some info needed for drawing is collected during the - // ucode analysis. + // extremely rare; however care should be taken regarding depth format-related + // translation modifications in this case), also some info needed for drawing + // is collected during the ucode analysis. 
bool is_ucode_analyzed_ = false; std::string ucode_disassembly_; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index f8e178f15..f30e0b5c8 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -297,6 +297,20 @@ constexpr bool IsColorRenderTargetFormat64bpp(ColorRenderTargetFormat format) { format == ColorRenderTargetFormat::k_32_32_FLOAT; } +inline uint32_t GetColorRenderTargetFormatComponentCount( + ColorRenderTargetFormat format) { + switch (format) { + case ColorRenderTargetFormat::k_32_FLOAT: + return 1; + case ColorRenderTargetFormat::k_16_16: + case ColorRenderTargetFormat::k_16_16_FLOAT: + case ColorRenderTargetFormat::k_32_32_FLOAT: + return 2; + default: + return 4; + } +} + enum class DepthRenderTargetFormat : uint32_t { kD24S8 = 0, // 20e4 [0, 2). @@ -749,6 +763,26 @@ enum class PolygonType : uint32_t { enum class ModeControl : uint32_t { kIgnore = 0, kColorDepth = 4, + // TODO(Triang3l): Verify whether kDepth means the pixel shader is ignored + // completely even if it writes depth, exports to memory or kills pixels. + // Hints suggesting that it should be completely ignored (which is desirable + // on real hardware to avoid scheduling the pixel shader at all and waiting + // for it especially since the Xbox 360 doesn't have early per-sample depth / + // stencil, only early hi-Z / hi-stencil, and other registers possibly + // toggling pixel shader execution are yet to be found): + // - Most of depth pre-pass draws in Call of Duty 4 use the kDepth mode with + // a `oC0 = tfetch2D(tf0, r0.xy) * r1` shader, some use `oC0 = r0` though. + // However, when alphatested surfaces are drawn, kColorDepth is explicitly + // used with the same shader performing the texture fetch. + // - Red Dead Redemption has some kDepth draws with alphatest enabled, but the + // shader is `oC0 = r0`, which makes no sense (alphatest based on an + // interpolant from the vertex shader) as no texture alpha cutout is + // involved. 
+ // - Red Dead Redemption also has kDepth draws with pretty complex shaders + // clearly for use only in the color pass - even fetching and filtering a + // shadowmap. + // For now, based on these, let's assume the pixel shader is never used with + // kDepth. kDepth = 5, kCopy = 6, };