/** ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** * Copyright 2018 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ #include "xenia/gpu/d3d12/render_target_cache.h" #include #include #include #include "xenia/base/assert.h" #include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/memory.h" #include "xenia/base/profiling.h" #include "xenia/gpu/d3d12/d3d12_command_processor.h" #include "xenia/gpu/texture_info.h" #include "xenia/gpu/texture_util.h" #include "xenia/ui/d3d12/d3d12_util.h" DEFINE_bool(d3d12_16bit_rtv_full_range, true, "Use full -32...32 range for RG16 and RGBA16 render targets " "(at the expense of blending correctness) without ROV.", "D3D12"); DEFINE_bool(d3d12_resolution_scale_resolve_edge_clamp, true, "When using resolution scale, apply the hack that duplicates the " "right/lower subpixel in the left and top sides of render target " "resolve areas to eliminate the gap caused by half-pixel offset " "(this is necessary for certain games like GTA IV to work).", "D3D12"); DECLARE_bool(d3d12_half_pixel_offset); namespace xe { namespace gpu { namespace d3d12 { // Generated with `xb buildhlsl`. 
#include "xenia/gpu/d3d12/shaders/dxbc/edram_clear_32bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_clear_64bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_clear_depth_float_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_32bpp_2x_resolve_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_32bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_64bpp_2x_resolve_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_64bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_7e3_2x_resolve_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_7e3_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_unorm_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_store_color_32bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_store_color_64bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_store_color_7e3_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_unorm_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_tile_sample_32bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_tile_sample_64bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/resolve_ps.h"
#include "xenia/gpu/d3d12/shaders/dxbc/resolve_vs.h"

#if 0
constexpr uint32_t RenderTargetCache::kHeap4MBPages;
#endif
constexpr uint32_t RenderTargetCache::kRenderTargetDescriptorHeapSize;

// Per-EDRAMLoadStoreMode shader table. Each entry lists, in order: the load
// shader (bytecode, size, pipeline name), the store shader (bytecode, size,
// pipeline name) and the load shader used for 2x-resolution-scaled resolves
// (bytecode, size, pipeline name). The depth modes have no 2x resolve shader
// (nullptr, 0, nullptr).
const RenderTargetCache::EDRAMLoadStoreModeInfo
    RenderTargetCache::edram_load_store_mode_info_[size_t(
        RenderTargetCache::EDRAMLoadStoreMode::kCount)] = {
        {edram_load_color_32bpp_cs, sizeof(edram_load_color_32bpp_cs),
         L"EDRAM Load 32bpp Color", edram_store_color_32bpp_cs,
         sizeof(edram_store_color_32bpp_cs), L"EDRAM Store 32bpp Color",
         edram_load_color_32bpp_2x_resolve_cs,
         sizeof(edram_load_color_32bpp_2x_resolve_cs),
         L"EDRAM Load 32bpp Color for 2x Resolve"},
        {edram_load_color_64bpp_cs, sizeof(edram_load_color_64bpp_cs),
         L"EDRAM Load 64bpp Color", edram_store_color_64bpp_cs,
         sizeof(edram_store_color_64bpp_cs), L"EDRAM Store 64bpp Color",
         edram_load_color_64bpp_2x_resolve_cs,
         sizeof(edram_load_color_64bpp_2x_resolve_cs),
         L"EDRAM Load 64bpp Color for 2x Resolve"},
        {edram_load_color_7e3_cs, sizeof(edram_load_color_7e3_cs),
         L"EDRAM Load 7e3 Color", edram_store_color_7e3_cs,
         sizeof(edram_store_color_7e3_cs), L"EDRAM Store 7e3 Color",
         edram_load_color_7e3_2x_resolve_cs,
         sizeof(edram_load_color_7e3_2x_resolve_cs),
         L"EDRAM Load 7e3 Color for 2x Resolve"},
        {edram_load_depth_unorm_cs, sizeof(edram_load_depth_unorm_cs),
         L"EDRAM Load UNorm Depth", edram_store_depth_unorm_cs,
         sizeof(edram_store_depth_unorm_cs), L"EDRAM Store UNorm Depth",
         nullptr, 0, nullptr},
        {edram_load_depth_float_cs, sizeof(edram_load_depth_float_cs),
         L"EDRAM Load Float Depth", edram_store_depth_float_cs,
         sizeof(edram_store_depth_float_cs), L"EDRAM Store Float Depth",
         nullptr, 0, nullptr},
};

// Only stores the command processor and the register file; all Direct3D 12
// objects are created in Initialize.
RenderTargetCache::RenderTargetCache(D3D12CommandProcessor* command_processor,
                                     RegisterFile* register_file)
    : command_processor_(command_processor), register_file_(register_file) {}

RenderTargetCache::~RenderTargetCache() { Shutdown(); }

// Creates the EDRAM buffer and its descriptors, the load/store, clear and
// converting resolve root signatures, and the EDRAM compute pipelines.
// Returns false if any object couldn't be created; Shutdown is called before
// returning in that case, so nothing partially created is left behind.
bool RenderTargetCache::Initialize(const TextureCache* texture_cache) {
  // EDRAM buffer size depends on this.
  resolution_scale_2x_ = texture_cache->IsResolutionScale2X();
  assert_false(resolution_scale_2x_ &&
               !command_processor_->IsROVUsedForEDRAM());

  auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
  auto device = provider->GetDevice();

  // Create the buffer for reinterpreting EDRAM contents.
  // No need to clear it in the first frame, memory is zeroed out when allocated
  // on Windows.
  D3D12_RESOURCE_DESC edram_buffer_desc;
  ui::d3d12::util::FillBufferResourceDesc(
      edram_buffer_desc, GetEDRAMBufferSize(),
      D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
  // The first operation will likely be drawing with ROV or a load without ROV.
  edram_buffer_state_ = command_processor_->IsROVUsedForEDRAM()
                            ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
                            : D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
  if (FAILED(device->CreateCommittedResource(
          &ui::d3d12::util::kHeapPropertiesDefault, D3D12_HEAP_FLAG_NONE,
          &edram_buffer_desc, edram_buffer_state_, nullptr,
          IID_PPV_ARGS(&edram_buffer_)))) {
    XELOGE("Failed to create the EDRAM buffer");
    Shutdown();
    return false;
  }
  edram_buffer_modified_ = false;

  // Create non-shader-visible descriptors of the EDRAM buffer for copying.
  D3D12_DESCRIPTOR_HEAP_DESC edram_buffer_descriptor_heap_desc;
  edram_buffer_descriptor_heap_desc.Type =
      D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
  edram_buffer_descriptor_heap_desc.NumDescriptors =
      uint32_t(EDRAMBufferDescriptorIndex::kCount);
  edram_buffer_descriptor_heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE;
  edram_buffer_descriptor_heap_desc.NodeMask = 0;
  if (FAILED(device->CreateDescriptorHeap(
          &edram_buffer_descriptor_heap_desc,
          IID_PPV_ARGS(&edram_buffer_descriptor_heap_)))) {
    XELOGE("Failed to create the descriptor heap for EDRAM buffer views");
    Shutdown();
    return false;
  }
  edram_buffer_descriptor_heap_start_ =
      edram_buffer_descriptor_heap_->GetCPUDescriptorHandleForHeapStart();
  ui::d3d12::util::CreateRawBufferSRV(
      device,
      provider->OffsetViewDescriptor(
          edram_buffer_descriptor_heap_start_,
          uint32_t(EDRAMBufferDescriptorIndex::kRawSRV)),
      edram_buffer_, GetEDRAMBufferSize());
  ui::d3d12::util::CreateRawBufferUAV(
      device,
      provider->OffsetViewDescriptor(
          edram_buffer_descriptor_heap_start_,
          uint32_t(EDRAMBufferDescriptorIndex::kRawUAV)),
      edram_buffer_, GetEDRAMBufferSize());
  // Typed view of the whole buffer as an array of 32-bit unsigned integers.
  D3D12_UNORDERED_ACCESS_VIEW_DESC edram_buffer_uint32_uav_desc;
  edram_buffer_uint32_uav_desc.Format = DXGI_FORMAT_R32_UINT;
  edram_buffer_uint32_uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
  edram_buffer_uint32_uav_desc.Buffer.FirstElement = 0;
  edram_buffer_uint32_uav_desc.Buffer.NumElements =
      GetEDRAMBufferSize() / sizeof(uint32_t);
  edram_buffer_uint32_uav_desc.Buffer.StructureByteStride = 0;
  edram_buffer_uint32_uav_desc.Buffer.CounterOffsetInBytes = 0;
  edram_buffer_uint32_uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
  device->CreateUnorderedAccessView(
      edram_buffer_, nullptr, &edram_buffer_uint32_uav_desc,
      provider->OffsetViewDescriptor(
          edram_buffer_descriptor_heap_start_,
          uint32_t(EDRAMBufferDescriptorIndex::kUint32UAV)));

  // Create the root signature for EDRAM buffer load/store.
  D3D12_ROOT_PARAMETER load_store_root_parameters[2];
  // Parameter 0 is constants (changed for each render target binding).
  load_store_root_parameters[0].ParameterType =
      D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS;
  load_store_root_parameters[0].Constants.ShaderRegister = 0;
  load_store_root_parameters[0].Constants.RegisterSpace = 0;
  load_store_root_parameters[0].Constants.Num32BitValues =
      sizeof(EDRAMLoadStoreRootConstants) / sizeof(uint32_t);
  load_store_root_parameters[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
  // Parameter 1 is source and target.
  D3D12_DESCRIPTOR_RANGE load_store_root_ranges[2];
  load_store_root_ranges[0].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV;
  load_store_root_ranges[0].NumDescriptors = 1;
  load_store_root_ranges[0].BaseShaderRegister = 0;
  load_store_root_ranges[0].RegisterSpace = 0;
  load_store_root_ranges[0].OffsetInDescriptorsFromTableStart = 0;
  load_store_root_ranges[1].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_UAV;
  load_store_root_ranges[1].NumDescriptors = 1;
  load_store_root_ranges[1].BaseShaderRegister = 0;
  load_store_root_ranges[1].RegisterSpace = 0;
  load_store_root_ranges[1].OffsetInDescriptorsFromTableStart = 1;
  load_store_root_parameters[1].ParameterType =
      D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
  load_store_root_parameters[1].DescriptorTable.NumDescriptorRanges = 2;
  load_store_root_parameters[1].DescriptorTable.pDescriptorRanges =
      load_store_root_ranges;
  load_store_root_parameters[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
  D3D12_ROOT_SIGNATURE_DESC load_store_root_desc;
  load_store_root_desc.NumParameters =
      UINT(xe::countof(load_store_root_parameters));
  load_store_root_desc.pParameters = load_store_root_parameters;
  load_store_root_desc.NumStaticSamplers = 0;
  load_store_root_desc.pStaticSamplers = nullptr;
  load_store_root_desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE;
  edram_load_store_root_signature_ =
      ui::d3d12::util::CreateRootSignature(provider, load_store_root_desc);
  if (edram_load_store_root_signature_ == nullptr) {
    XELOGE("Failed to create the EDRAM load/store root signature");
    Shutdown();
    return false;
  }
  // Create the clear root signature (the same, but with the UAV only).
  // The descriptor table is reused: the range pointer is advanced past the SRV
  // range so that the UAV range becomes the only one, at table offset 0.
  load_store_root_ranges[1].OffsetInDescriptorsFromTableStart = 0;
  load_store_root_parameters[1].DescriptorTable.NumDescriptorRanges = 1;
  ++load_store_root_parameters[1].DescriptorTable.pDescriptorRanges;
  edram_clear_root_signature_ =
      ui::d3d12::util::CreateRootSignature(provider, load_store_root_desc);
  if (edram_clear_root_signature_ == nullptr) {
    XELOGE("Failed to create the EDRAM buffer clear root signature");
    Shutdown();
    return false;
  }

  // Create the pipelines.
  bool rov_used = command_processor_->IsROVUsedForEDRAM();
  // Load and store.
  for (uint32_t i = 0; i < uint32_t(EDRAMLoadStoreMode::kCount); ++i) {
    const EDRAMLoadStoreModeInfo& mode_info = edram_load_store_mode_info_[i];
    edram_load_pipelines_[i] = ui::d3d12::util::CreateComputePipeline(
        device, mode_info.load_shader, mode_info.load_shader_size,
        edram_load_store_root_signature_);
    // Store pipelines are created only for the non-ROV path.
    if (!rov_used) {
      edram_store_pipelines_[i] = ui::d3d12::util::CreateComputePipeline(
          device, mode_info.store_shader, mode_info.store_shader_size,
          edram_load_store_root_signature_);
    }
    // Load shader for resolution-scaled resolves (host pixels within samples to
    // samples within host pixels) doesn't always exist for each mode - depth is
    // not resolved using drawing, for example.
    bool load_2x_resolve_pipeline_used =
        resolution_scale_2x_ && mode_info.load_2x_resolve_shader != nullptr;
    if (load_2x_resolve_pipeline_used) {
      edram_load_2x_resolve_pipelines_[i] =
          ui::d3d12::util::CreateComputePipeline(
              device, mode_info.load_2x_resolve_shader,
              mode_info.load_2x_resolve_shader_size,
              edram_load_store_root_signature_);
    }
    if (edram_load_pipelines_[i] == nullptr ||
        (!rov_used && edram_store_pipelines_[i] == nullptr) ||
        (load_2x_resolve_pipeline_used &&
         edram_load_2x_resolve_pipelines_[i] == nullptr)) {
      XELOGE("Failed to create the EDRAM load/store pipelines for mode %u", i);
      Shutdown();
      return false;
    }
    edram_load_pipelines_[i]->SetName(mode_info.load_pipeline_name);
    if (edram_store_pipelines_[i] != nullptr) {
      edram_store_pipelines_[i]->SetName(mode_info.store_pipeline_name);
    }
    if (edram_load_2x_resolve_pipelines_[i] != nullptr) {
      // Name the 2x resolve pipeline itself. Naming edram_load_pipelines_[i]
      // here would overwrite the regular load pipeline's debug name and leave
      // the 2x resolve pipeline unnamed.
      edram_load_2x_resolve_pipelines_[i]->SetName(
          mode_info.load_2x_resolve_pipeline_name);
    }
  }
  // Tile single sample into a texture - 32 bits per pixel.
  edram_tile_sample_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
      device, edram_tile_sample_32bpp_cs, sizeof(edram_tile_sample_32bpp_cs),
      edram_load_store_root_signature_);
  if (edram_tile_sample_32bpp_pipeline_ == nullptr) {
    XELOGE("Failed to create the 32bpp EDRAM raw resolve pipeline");
    Shutdown();
    return false;
  }
  edram_tile_sample_32bpp_pipeline_->SetName(L"EDRAM Raw Resolve 32bpp");
  // Tile single sample into a texture - 64 bits per pixel.
  edram_tile_sample_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
      device, edram_tile_sample_64bpp_cs, sizeof(edram_tile_sample_64bpp_cs),
      edram_load_store_root_signature_);
  if (edram_tile_sample_64bpp_pipeline_ == nullptr) {
    XELOGE("Failed to create the 64bpp EDRAM raw resolve pipeline");
    Shutdown();
    return false;
  }
  edram_tile_sample_64bpp_pipeline_->SetName(L"EDRAM Raw Resolve 64bpp");
  // Clear 32-bit color or unorm depth.
edram_clear_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline( device, edram_clear_32bpp_cs, sizeof(edram_clear_32bpp_cs), edram_clear_root_signature_); if (edram_clear_32bpp_pipeline_ == nullptr) { XELOGE("Failed to create the EDRAM 32bpp clear pipeline"); Shutdown(); return false; } edram_clear_32bpp_pipeline_->SetName(L"EDRAM Clear 32bpp"); // Clear 64-bit color. edram_clear_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline( device, edram_clear_64bpp_cs, sizeof(edram_clear_64bpp_cs), edram_clear_root_signature_); if (edram_clear_64bpp_pipeline_ == nullptr) { XELOGE("Failed to create the EDRAM 64bpp clear pipeline"); Shutdown(); return false; } edram_clear_64bpp_pipeline_->SetName(L"EDRAM Clear 64bpp"); // Clear float depth. edram_clear_depth_float_pipeline_ = ui::d3d12::util::CreateComputePipeline( device, edram_clear_depth_float_cs, sizeof(edram_clear_depth_float_cs), edram_clear_root_signature_); if (edram_clear_depth_float_pipeline_ == nullptr) { XELOGE("Failed to create the EDRAM float depth clear pipeline"); Shutdown(); return false; } edram_clear_depth_float_pipeline_->SetName(L"EDRAM Clear Float Depth"); // Create the converting resolve root signature. D3D12_ROOT_PARAMETER resolve_root_parameters[2]; // Parameter 0 is constants. resolve_root_parameters[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; resolve_root_parameters[0].Constants.ShaderRegister = 0; resolve_root_parameters[0].Constants.RegisterSpace = 0; resolve_root_parameters[0].Constants.Num32BitValues = sizeof(ResolveRootConstants) / sizeof(uint32_t); resolve_root_parameters[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_PIXEL; // Parameter 1 is the source render target. 
D3D12_DESCRIPTOR_RANGE resolve_root_srv_range; resolve_root_srv_range.RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV; resolve_root_srv_range.NumDescriptors = 1; resolve_root_srv_range.BaseShaderRegister = 0; resolve_root_srv_range.RegisterSpace = 0; resolve_root_srv_range.OffsetInDescriptorsFromTableStart = 0; resolve_root_parameters[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; resolve_root_parameters[1].DescriptorTable.NumDescriptorRanges = 1; resolve_root_parameters[1].DescriptorTable.pDescriptorRanges = &resolve_root_srv_range; resolve_root_parameters[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_PIXEL; // Static sampler for resolving AA using bilinear filtering. D3D12_STATIC_SAMPLER_DESC resolve_sampler_desc; resolve_sampler_desc.Filter = D3D12_FILTER_MIN_MAG_MIP_LINEAR; resolve_sampler_desc.AddressU = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; resolve_sampler_desc.AddressV = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; resolve_sampler_desc.AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; resolve_sampler_desc.MipLODBias = 0.0f; resolve_sampler_desc.MaxAnisotropy = 1; resolve_sampler_desc.ComparisonFunc = D3D12_COMPARISON_FUNC_NEVER; resolve_sampler_desc.BorderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_BLACK; resolve_sampler_desc.MinLOD = 0.0f; resolve_sampler_desc.MaxLOD = 0.0f; resolve_sampler_desc.ShaderRegister = 0; resolve_sampler_desc.RegisterSpace = 0; resolve_sampler_desc.ShaderVisibility = D3D12_SHADER_VISIBILITY_PIXEL; D3D12_ROOT_SIGNATURE_DESC resolve_root_desc; resolve_root_desc.NumParameters = UINT(xe::countof(resolve_root_parameters)); resolve_root_desc.pParameters = resolve_root_parameters; resolve_root_desc.NumStaticSamplers = 1; resolve_root_desc.pStaticSamplers = &resolve_sampler_desc; resolve_root_desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_DENY_VERTEX_SHADER_ROOT_ACCESS; resolve_root_signature_ = ui::d3d12::util::CreateRootSignature(provider, resolve_root_desc); if (resolve_root_signature_ == nullptr) { XELOGE("Failed to create the converting resolve root 
signature"); Shutdown(); return false; } ClearBindings(); return true; } void RenderTargetCache::Shutdown() { ClearCache(); for (auto& resolve_pipeline : resolve_pipelines_) { resolve_pipeline.pipeline->Release(); } resolve_pipelines_.clear(); ui::d3d12::util::ReleaseAndNull(resolve_root_signature_); ui::d3d12::util::ReleaseAndNull(edram_tile_sample_64bpp_pipeline_); ui::d3d12::util::ReleaseAndNull(edram_tile_sample_32bpp_pipeline_); ui::d3d12::util::ReleaseAndNull(edram_clear_depth_float_pipeline_); ui::d3d12::util::ReleaseAndNull(edram_clear_64bpp_pipeline_); ui::d3d12::util::ReleaseAndNull(edram_clear_32bpp_pipeline_); for (uint32_t i = 0; i < uint32_t(EDRAMLoadStoreMode::kCount); ++i) { ui::d3d12::util::ReleaseAndNull(edram_store_pipelines_[i]); ui::d3d12::util::ReleaseAndNull(edram_load_pipelines_[i]); } ui::d3d12::util::ReleaseAndNull(edram_clear_root_signature_); ui::d3d12::util::ReleaseAndNull(edram_load_store_root_signature_); ui::d3d12::util::ReleaseAndNull(edram_buffer_descriptor_heap_); ui::d3d12::util::ReleaseAndNull(edram_buffer_); } void RenderTargetCache::ClearCache() { for (auto resolve_target_pair : resolve_targets_) { ResolveTarget* resolve_target = resolve_target_pair.second; resolve_target->resource->Release(); delete resolve_target; } resolve_targets_.clear(); COUNT_profile_set("gpu/render_target_cache/resolve_targets", 0); for (auto render_target_pair : render_targets_) { RenderTarget* render_target = render_target_pair.second; render_target->resource->Release(); delete render_target; } render_targets_.clear(); COUNT_profile_set("gpu/render_target_cache/render_targets", 0); while (descriptor_heaps_depth_ != nullptr) { auto heap = descriptor_heaps_depth_; heap->heap->Release(); descriptor_heaps_depth_ = heap->previous; delete heap; } while (descriptor_heaps_color_ != nullptr) { auto heap = descriptor_heaps_color_; heap->heap->Release(); descriptor_heaps_color_ = heap->previous; delete heap; } #if 0 for (uint32_t i = 0; i < xe::countof(heaps_); 
++i) { if (heaps_[i] != nullptr) { heaps_[i]->Release(); heaps_[i] = nullptr; } } #endif } void RenderTargetCache::BeginFrame() { // A frame does not always end in a resolve (for example, when memexport // readback happens) or something else that would surely submit the UAV // barrier, so we need to preserve the `current_` variables. if (!command_processor_->IsROVUsedForEDRAM()) { ClearBindings(); } } bool RenderTargetCache::UpdateRenderTargets(const D3D12Shader* pixel_shader) { // There are two kinds of render target binding updates in this implementation // in case something has been changed - full and partial. // // For the RTV/DSV path, a full update involves flushing all the currently // bound render targets that have been modified to the EDRAM buffer, // allocating all the newly bound render targets in the heaps, loading them // from the EDRAM buffer and binding them. // // For the ROV path, a full update places a UAV barrier because across draws, // pixels with different SV_Positions or different sample counts (thus without // interlocking between each other) may access the same data now. Not having // the barriers causes visual glitches in many games, such as Halo 3 where the // right side of the menu and shadow maps get corrupted (at least on Nvidia). // // ("Bound" here means ever used since the last full update - and in this case // it's bound to the Direct3D 12 command list in the RTV/DSV path.) // // However, Banjo-Kazooie interleaves color/depth and depth-only writes every // draw call, and doing a full update whenever the color mask is changed is // too expensive. So, we shouldn't do a full update if the game only toggles // color writes and depth testing. Instead, we're only adding or re-enabling // render targets if color writes are being enabled (adding involves loading // the contents from the EDRAM, while re-enabling does nothing on the D3D // side). 
// // There are cases when simply toggling render targets may still require EDRAM // stores and thus a full update. Here's an example situation: // Draw 1: // - 32bpp RT0 0-10 MB // - 32bpp RT1 3-10 MB // - 1280x720 viewport // Draw 2: // - 32bpp RT0 0-10 MB // - Inactive RT1 // - 1280x1440 viewport // Draw 3: // - 32bpp RT0 0-10 MB // - 32bpp RT1 3-10 MB // - 1280x720 viewport // In this case, before draw 2, RT1 must be written to the EDRAM buffer, and // RT0 must be loaded, and also before draw 3 RT1 must receive the changes // made to the lower part of RT0. So, before draws 2 and 3, full updates must // be done. // // Direct3D 12 also requires all render targets to have the same size, so the // height is calculated from the EDRAM space available to the last render // target available in it. However, to make toggling render targets like in // the Banjo-Kazooie case possible, the height may be decreased only in full // updates. // TODO(Triang3l): Check if it's safe to calculate the smallest EDRAM region // without aliasing and use it for the height. This won't work if games // actually alias active render targets for some reason. // // To summarize, a full update happens if: // - Starting a new frame. // - Drawing after resolving. // - Surface pitch changed. // - Sample count changed. // - Render target is disabled and another render target got more space than // is currently available in the textures (RTV/DSV only). // - EDRAM base of a currently used RT changed. // - Format of a currently used RT changed (RTV/DSV) or pixel size of a // currently used RT changed (ROV). // - Current viewport contains unsaved data from previously used render // targets. // - New render target overlaps unsaved data from other bound render targets. // // "Previously used" and "new" in the last 2 conditions is important so if the // game has render targets aliased in the same draw call, there won't be a // full update every draw. 
// // A partial update happens if: // - New render target is added, but doesn't overlap unsaved data from other // currently or previously used render targets, and it doesn't require a // bigger size. auto& regs = *register_file_; #if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); #endif // FINE_GRAINED_DRAW_SCOPES bool rov_used = command_processor_->IsROVUsedForEDRAM(); auto rb_surface_info = regs.Get(); uint32_t surface_pitch = std::min(rb_surface_info.surface_pitch, 2560u); if (surface_pitch == 0) { // TODO(Triang3l): Do something if a memexport-only draw has 0 surface // pitch (never seen in any game so far, not sure if even legal). return false; } uint32_t msaa_samples_x = rb_surface_info.msaa_samples >= MsaaSamples::k4X ? 2 : 1; uint32_t msaa_samples_y = rb_surface_info.msaa_samples >= MsaaSamples::k2X ? 2 : 1; // Extract color/depth info in an unified way. bool enabled[5]; uint32_t edram_bases[5]; uint32_t formats[5]; bool formats_are_64bpp[5]; uint32_t color_mask = command_processor_->GetCurrentColorMask(pixel_shader); for (uint32_t i = 0; i < 4; ++i) { enabled[i] = (color_mask & (0xF << (i * 4))) != 0; auto color_info = regs.Get( reg::RB_COLOR_INFO::rt_register_indices[i]); edram_bases[i] = std::min(color_info.color_base, 2048u); formats[i] = uint32_t(GetBaseColorFormat(color_info.color_format)); formats_are_64bpp[i] = IsColorFormat64bpp(ColorRenderTargetFormat(formats[i])); } auto rb_depthcontrol = regs.Get(); auto rb_depth_info = regs.Get(); // 0x1 = stencil test, 0x2 = depth test. enabled[4] = rb_depthcontrol.stencil_enable || rb_depthcontrol.z_enable; edram_bases[4] = std::min(rb_depth_info.depth_base, 2048u); formats[4] = uint32_t(rb_depth_info.depth_format); formats_are_64bpp[4] = false; // Don't mark depth regions as dirty if not writing the depth. // TODO(Triang3l): Make a common function for checking if stencil writing is // really done? 
bool depth_readonly = !rb_depthcontrol.stencil_enable && !rb_depthcontrol.z_write_enable; bool full_update = false; // Check the following full update conditions: // - Starting a new frame. // - Drawing after resolving. // - Surface pitch changed. // - Sample count changed. // Draws are skipped if the surface pitch is 0, so a full update can be forced // in the beginning of the frame or after resolves by setting the current // pitch to 0. if (current_surface_pitch_ != surface_pitch || current_msaa_samples_ != rb_surface_info.msaa_samples) { full_update = true; } // Get the maximum height of each render target in EDRAM rows to help // clamp the dirty region heights. uint32_t edram_row_tiles_32bpp = (surface_pitch * msaa_samples_x + 79) / 80; uint32_t edram_row_tiles[5]; uint32_t edram_max_rows = UINT32_MAX; for (uint32_t i = 0; i < 5; ++i) { edram_row_tiles[i] = edram_row_tiles_32bpp * (formats_are_64bpp[i] ? 2 : 1); if (enabled[i]) { // Direct3D 12 doesn't allow render targets with different sizes, so // calculate the height from the render target closest to the end of // EDRAM. edram_max_rows = std::min(edram_max_rows, (2048 - edram_bases[i]) / edram_row_tiles[i]); } } if (edram_max_rows == UINT32_MAX) { // No render targets needed - likely a memexport-only draw, just keep using // the current state (or 0 if nothing bound yet, but nothing will be bound // anyway so it won't matter). edram_max_rows = current_edram_max_rows_; } else { if (edram_max_rows == 0) { // Some render target is totally in the end of EDRAM - can't create // textures with 0 height. return false; } } // Don't create render targets larger than x2560. edram_max_rows = std::min(edram_max_rows, 160u * msaa_samples_y); // Check the following full update conditions: // - Render target is disabled and another render target got more space than // is currently available in the textures (RTV/DSV only). 
if (!rov_used && edram_max_rows > current_edram_max_rows_) { full_update = true; } // Get EDRAM usage of the current draw so dirty regions can be calculated. // See D3D12CommandProcessor::UpdateFixedFunctionState for more info. int32_t window_offset_y = regs.Get().window_y_offset; auto pa_cl_vte_cntl = regs.Get(); float viewport_scale_y = pa_cl_vte_cntl.vport_y_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 : 1280.0f; float viewport_offset_y = pa_cl_vte_cntl.vport_y_offset_ena ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 : std::abs(viewport_scale_y); if (regs.Get().vtx_window_offset_enable) { viewport_offset_y += float(window_offset_y); } uint32_t viewport_bottom = uint32_t(std::max( 0.0f, std::ceil(viewport_offset_y + std::abs(viewport_scale_y)))); uint32_t scissor_bottom = regs.Get().br_y; if (!regs.Get().window_offset_disable) { scissor_bottom = std::max(int32_t(scissor_bottom) + window_offset_y, 0); } uint32_t dirty_bottom = std::min(std::min(viewport_bottom, scissor_bottom), 2560u); uint32_t edram_dirty_rows = std::min((dirty_bottom * msaa_samples_y + 15) >> 4, edram_max_rows); // Check the following full update conditions: // - EDRAM base of a currently used RT changed. // - Format of a currently used RT changed (RTV/DSV) or pixel size of a // currently used RT changed (ROV). // Also build a list of render targets to attach in a partial update. 
uint32_t render_targets_to_attach = 0; if (!full_update) { for (uint32_t i = 0; i < 5; ++i) { if (!enabled[i]) { continue; } const RenderTargetBinding& binding = current_bindings_[i]; if (binding.is_bound) { if (binding.edram_base != edram_bases[i]) { full_update = true; break; } if (rov_used) { if (i != 4) { full_update |= IsColorFormat64bpp(binding.color_format) != formats_are_64bpp[i]; } } else { full_update |= binding.format != formats[i]; } if (full_update) { break; } } else { render_targets_to_attach |= 1 << i; } } } // Check the following full update conditions here: // - Current viewport contains unsaved data from previously used render // targets. // - New render target overlaps unsaved data from other bound render // targets. if (!full_update) { for (uint32_t i = 0; i < 5; ++i) { const RenderTargetBinding& binding_1 = current_bindings_[i]; uint32_t edram_dirty_rows_1; if (binding_1.is_bound) { if (enabled[i]) { continue; } // Checking if now overlapping a previously used render target. // binding_1 is the previously used render target. edram_dirty_rows_1 = binding_1.edram_dirty_rows; } else { if (!(render_targets_to_attach & (1 << i))) { continue; } // Checking if the new render target is overlapping any bound one. // binding_1 is the new render target. edram_dirty_rows_1 = edram_dirty_rows; } for (uint32_t j = 0; j < 5; ++j) { const RenderTargetBinding& binding_2 = current_bindings_[j]; if (!binding_2.is_bound) { continue; } uint32_t edram_dirty_rows_2; if (binding_1.is_bound) { if (!enabled[j]) { continue; } // Checking if now overlapping a previously used render target. // binding_2 is a currently used render target. edram_dirty_rows_2 = edram_dirty_rows; } else { // Checking if the new render target is overlapping any bound one. // binding_2 is another bound render target. edram_dirty_rows_2 = binding_2.edram_dirty_rows; } // Do a full update if there is overlap. 
if (edram_bases[i] < edram_bases[j] + edram_dirty_rows_2 * edram_row_tiles[j] && edram_bases[j] < edram_bases[i] + edram_dirty_rows_1 * edram_row_tiles[i]) { full_update = true; break; } } if (full_update) { break; } } } // Need to change the bindings. if (full_update || render_targets_to_attach) { #if 0 uint32_t heap_usage[5] = {}; #endif if (full_update) { if (rov_used) { // Place a UAV barrier because across draws, pixels with different // SV_Positions or different sample counts (thus without interlocking // between each other) may access the same data now. CommitEDRAMBufferUAVWrites(false); } else { // Export the currently bound render targets before we ruin the // bindings. StoreRenderTargetsToEDRAM(); } ClearBindings(); current_surface_pitch_ = surface_pitch; current_msaa_samples_ = rb_surface_info.msaa_samples; if (!rov_used) { current_edram_max_rows_ = edram_max_rows; } // If updating fully, need to reattach all the render targets and allocate // from scratch. for (uint32_t i = 0; i < 5; ++i) { if (enabled[i]) { render_targets_to_attach |= 1 << i; } } } else { #if 0 if (!rov_used) { // If updating partially, only need to attach new render targets. for (uint32_t i = 0; i < 5; ++i) { const RenderTargetBinding& binding = current_bindings_[i]; if (!binding.is_bound) { continue; } const RenderTarget* render_target = binding.render_target; if (render_target != nullptr) { // There are no holes between 4 MB pages in each heap. heap_usage[render_target->heap_page_first / kHeap4MBPages] += render_target->heap_page_count; } } } #endif } XELOGGPU("RT Cache: %s update - pitch %u, samples %u, RTs to attach %u", full_update ? "Full" : "Partial", surface_pitch, rb_surface_info.msaa_samples, render_targets_to_attach); #if 0 auto device = command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice(); #endif // Allocate new render targets and add them to the bindings list. 
for (uint32_t i = 0; i < 5; ++i) { if (!(render_targets_to_attach & (1 << i))) { continue; } RenderTargetBinding& binding = current_bindings_[i]; binding.is_bound = true; binding.edram_base = edram_bases[i]; binding.edram_dirty_rows = 0; binding.format = formats[i]; binding.render_target = nullptr; if (!rov_used) { RenderTargetKey key; key.width_ss_div_80 = edram_row_tiles_32bpp; key.height_ss_div_16 = current_edram_max_rows_; key.is_depth = i == 4 ? 1 : 0; key.format = formats[i]; D3D12_RESOURCE_DESC resource_desc; if (!GetResourceDesc(key, resource_desc)) { // Invalid format. continue; } #if 0 // Calculate the number of 4 MB pages of the heaps this RT will use. D3D12_RESOURCE_ALLOCATION_INFO allocation_info = device->GetResourceAllocationInfo(0, 1, &resource_desc); if (allocation_info.SizeInBytes == 0 || allocation_info.SizeInBytes > (kHeap4MBPages << 22)) { assert_always(); continue; } uint32_t heap_page_count = (uint32_t(allocation_info.SizeInBytes) + ((4 << 20) - 1)) >> 22; // Find the heap page range for this render target. uint32_t heap_page_first = UINT32_MAX; for (uint32_t j = 0; j < 5; ++j) { if (heap_usage[j] + heap_page_count <= kHeap4MBPages) { heap_page_first = j * kHeap4MBPages + heap_usage[j]; break; } } if (heap_page_first == UINT32_MAX) { assert_always(); continue; } // Get the render target. binding.render_target = FindOrCreateRenderTarget(key, heap_page_first); if (binding.render_target == nullptr) { continue; } heap_usage[heap_page_first / kHeap4MBPages] += heap_page_count; // Inform Direct3D that we're reusing the heap for this render target. command_processor_->PushAliasingBarrier( nullptr, binding.render_target->resource); #else // If multiple render targets have the same format, assign different // instance numbers to them. 
uint32_t instance = 0; if (i != 4) { for (uint32_t j = 0; j < i; ++j) { const RenderTargetBinding& other_binding = current_bindings_[j]; if (other_binding.is_bound && other_binding.render_target != nullptr && other_binding.format == formats[i]) { ++instance; } } } binding.render_target = FindOrCreateRenderTarget(key, instance); #endif } } if (!rov_used) { // Sample positions when loading depth must match sample positions when // drawing. command_processor_->SetSamplePositions(rb_surface_info.msaa_samples); // Load the contents of the new render targets from the EDRAM buffer (will // change the state of the render targets to copy destination). RenderTarget* load_render_targets[5]; uint32_t load_edram_bases[5]; uint32_t load_render_target_count = 0; for (uint32_t i = 0; i < 5; ++i) { if (!(render_targets_to_attach & (1 << i))) { continue; } RenderTarget* render_target = current_bindings_[i].render_target; if (render_target == nullptr) { continue; } load_render_targets[load_render_target_count] = render_target; load_edram_bases[load_render_target_count] = edram_bases[i]; ++load_render_target_count; } if (load_render_target_count != 0) { LoadRenderTargetsFromEDRAM(load_render_target_count, load_render_targets, load_edram_bases); } // Transition the render targets to the appropriate state if needed, // compress the list of the render target because null RTV descriptors are // broken in Direct3D 12 and bind the render targets to the command list. 
D3D12_CPU_DESCRIPTOR_HANDLE rtv_handles[4]; uint32_t rtv_count = 0; for (uint32_t i = 0; i < 4; ++i) { const RenderTargetBinding& binding = current_bindings_[i]; RenderTarget* render_target = binding.render_target; if (!binding.is_bound || render_target == nullptr) { continue; } XELOGGPU("RT Color %u: base %u, format %u", i, edram_bases[i], formats[i]); command_processor_->PushTransitionBarrier( render_target->resource, render_target->state, D3D12_RESOURCE_STATE_RENDER_TARGET); render_target->state = D3D12_RESOURCE_STATE_RENDER_TARGET; rtv_handles[rtv_count] = render_target->handle; current_pipeline_render_targets_[rtv_count].guest_render_target = i; current_pipeline_render_targets_[rtv_count].format = GetColorDXGIFormat(ColorRenderTargetFormat(formats[i])); ++rtv_count; } for (uint32_t i = rtv_count; i < 4; ++i) { current_pipeline_render_targets_[i].guest_render_target = i; current_pipeline_render_targets_[i].format = DXGI_FORMAT_UNKNOWN; } const D3D12_CPU_DESCRIPTOR_HANDLE* dsv_handle; const RenderTargetBinding& depth_binding = current_bindings_[4]; RenderTarget* depth_render_target = depth_binding.render_target; current_pipeline_render_targets_[4].guest_render_target = 4; if (depth_binding.is_bound && depth_render_target != nullptr) { XELOGGPU("RT Depth: base %u, format %u", edram_bases[4], formats[4]); command_processor_->PushTransitionBarrier( depth_render_target->resource, depth_render_target->state, D3D12_RESOURCE_STATE_DEPTH_WRITE); depth_render_target->state = D3D12_RESOURCE_STATE_DEPTH_WRITE; dsv_handle = &depth_binding.render_target->handle; current_pipeline_render_targets_[4].format = GetDepthDXGIFormat(DepthRenderTargetFormat(formats[4])); } else { dsv_handle = nullptr; current_pipeline_render_targets_[4].format = DXGI_FORMAT_UNKNOWN; } command_processor_->SubmitBarriers(); command_processor_->GetDeferredCommandList()->D3DOMSetRenderTargets( rtv_count, rtv_handles, FALSE, dsv_handle); } } // Update the dirty regions. 
for (uint32_t i = 0; i < 5; ++i) {
    if (!enabled[i] || (i == 4 && depth_readonly)) {
      continue;
    }
    RenderTargetBinding& binding = current_bindings_[i];
    if (!rov_used && binding.render_target == nullptr) {
      // Nothing to store to the EDRAM buffer if there was an error.
      continue;
    }
    binding.edram_dirty_rows =
        std::max(binding.edram_dirty_rows, edram_dirty_rows);
  }

  if (rov_used) {
    // The buffer will be used for ROV drawing now.
    TransitionEDRAMBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
    edram_buffer_modified_ = true;
  }

  return true;
}

// Performs a resolve of the surface selected by the copy control register:
// determines the source surface (color 0-3 or depth), builds the resolve
// rectangle from the vertices in vertex fetch constant 0, applies the window
// offset and the window scissor, then calls ResolveCopy and ResolveClear.
//
// written_address_out and written_length_out are zeroed on entry and are then
// passed to ResolveCopy, which fills them.
//
// Returns true when there is nothing to do (zero surface pitch or an empty
// rectangle), false when copy_src_select is greater than 4, and otherwise the
// combined result of ResolveCopy and the ResolveClear calls.
//
// NOTE(review): every `regs.Get()` call below (and the `xe::load(...)` call)
// appears without an explicit template argument even though the results are
// used as different register structures - the arguments look like they were
// lost when this copy of the file was produced; confirm against the upstream
// source before building.
bool RenderTargetCache::Resolve(SharedMemory* shared_memory,
                                TextureCache* texture_cache, Memory* memory,
                                uint32_t& written_address_out,
                                uint32_t& written_length_out) {
  written_address_out = written_length_out = 0;

  if (!command_processor_->IsROVUsedForEDRAM()) {
    // Save the currently bound render targets to the EDRAM buffer that will be
    // used as the resolve source and clear bindings to allow render target
    // resources to be reused as source textures for format conversion,
    // resolving samples, to let format conversion bind other render targets,
    // and so after a clear new data will be loaded.
    StoreRenderTargetsToEDRAM();
    ClearBindings();
  }

  auto& regs = *register_file_;

  // Get the render target properties.
  auto rb_surface_info = regs.Get();
  uint32_t surface_pitch = std::min(rb_surface_info.surface_pitch, 2560u);
  if (surface_pitch == 0) {
    return true;
  }
  // Depth info is always needed because color resolve may also clear depth.
  auto rb_depth_info = regs.Get();
  // Source surface index: 0-3 are color render targets, 4 is depth.
  uint32_t surface_index = regs.Get().copy_src_select;
  if (surface_index > 4) {
    assert_always();
    return false;
  }
  bool surface_is_depth = surface_index == 4;
  uint32_t surface_edram_base;
  uint32_t surface_format;
  if (surface_is_depth) {
    surface_edram_base = rb_depth_info.depth_base;
    surface_format = uint32_t(rb_depth_info.depth_format);
  } else {
    auto color_info = regs.Get(
        reg::RB_COLOR_INFO::rt_register_indices[surface_index]);
    surface_edram_base = color_info.color_base;
    surface_format = uint32_t(GetBaseColorFormat(color_info.color_format));
  }

  // Get the resolve region since both copying and clearing need it.
  // HACK: Vertices to use are always in vf0.
  const auto& fetch = regs.Get(
      XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0);
  assert_true(fetch.type == 3);
  assert_true(fetch.endian == Endian::k8in32);
  assert_true(fetch.size == 6);
  const uint8_t* src_vertex_address =
      memory->TranslatePhysical(fetch.address << 2);
  // Six floats are read: three (x, y) pairs.
  float vertices[6];
  // Most vertices have a negative half pixel offset applied, which we reverse.
  float vertex_offset = regs.Get().pix_center ? 0.0f : 0.5f;
  for (uint32_t i = 0; i < 6; ++i) {
    vertices[i] =
        xenos::GpuSwap(xe::load(src_vertex_address + i * sizeof(float)),
                       Endian(fetch.endian)) +
        vertex_offset;
  }
  // Xenos only supports rectangle copies (luckily).
  //
  // The rectangle is for both the source and the destination, according to how
  // it's used in Tales of Vesperia.
  //
  // Direct3D 9 gives the rectangle in source render target coordinates (for
  // example, in Halo 3 the sniper rifle scope has a (128,64)->(448,256)
  // rectangle). It doesn't adjust the EDRAM base pointer, otherwise (taking
  // into account that 4x MSAA is used for the scope) it would have been
  // (8,0)->(328,192), but it's not. However, it adjusts the destination texture
  // address so (0,0) relative to the destination address is (0,0) relative to
  // the render target. When copying, we need to adjust the pointer to the first
  // 32x32 tile that will actually be modified, by adding the value of
  // XGAddress2DTiledOffset called for left/top & ~31. The pitch and height in
  // RB_COPY_DEST_PITCH are actually specified for the region starting from the
  // first modified 32x32 tile - it does not include the padding! (In the Halo 3
  // sniper rifle scope example, the pitch and height are specified as 320x192,
  // which is the size of the rectangle.)
  //
  // Window scissor must also be applied - in the jigsaw puzzle in Banjo-Tooie,
  // there are 1280x720 resolve rectangles, but only the scissored 1280x256
  // needs to be copied, otherwise it overflows even beyond the EDRAM, and the
  // depth buffer is visible on the screen. It also ensures the coordinates are
  // not negative (in F.E.A.R., for example, the right tile is resolved with
  // vertices (-640,0)->(640,720), however, the destination texture pointer is
  // adjusted properly to the right half of the texture, and the source render
  // target has a pitch of 800).
  auto pa_sc_window_offset = regs.Get();
  // Bounding box of the three vertices.
  D3D12_RECT rect;
  rect.left = LONG(std::min(std::min(vertices[0], vertices[2]), vertices[4]));
  rect.right = LONG(std::max(std::max(vertices[0], vertices[2]), vertices[4]));
  rect.top = LONG(std::min(std::min(vertices[1], vertices[3]), vertices[5]));
  rect.bottom = LONG(std::max(std::max(vertices[1], vertices[3]), vertices[5]));
  if (regs.Get().vtx_window_offset_enable) {
    rect.left += pa_sc_window_offset.window_x_offset;
    rect.right += pa_sc_window_offset.window_x_offset;
    rect.top += pa_sc_window_offset.window_y_offset;
    rect.bottom += pa_sc_window_offset.window_y_offset;
  }
  D3D12_RECT scissor;
  auto pa_sc_window_scissor_tl = regs.Get();
  auto pa_sc_window_scissor_br = regs.Get();
  scissor.left = pa_sc_window_scissor_tl.tl_x;
  scissor.right = pa_sc_window_scissor_br.br_x;
  scissor.top = pa_sc_window_scissor_tl.tl_y;
  scissor.bottom = pa_sc_window_scissor_br.br_y;
  if (!pa_sc_window_scissor_tl.window_offset_disable) {
    // Apply the window offset to the scissor and keep every edge non-negative.
    scissor.left = std::max(
        LONG(scissor.left + pa_sc_window_offset.window_x_offset), LONG(0));
    scissor.right = std::max(
        LONG(scissor.right + pa_sc_window_offset.window_x_offset), LONG(0));
    scissor.top = std::max(
        LONG(scissor.top + pa_sc_window_offset.window_y_offset), LONG(0));
    scissor.bottom = std::max(
        LONG(scissor.bottom + pa_sc_window_offset.window_y_offset), LONG(0));
  }
  // Intersect the resolve rectangle with the scissor.
  rect.left = std::max(rect.left, scissor.left);
  rect.right = std::min(rect.right, scissor.right);
  rect.top = std::max(rect.top, scissor.top);
  rect.bottom = std::min(rect.bottom, scissor.bottom);

  XELOGGPU(
      "Resolve: (%d,%d)->(%d,%d) of RT %u (pitch %u, %u sample%s, format %u) "
      "at %u",
      rect.left, rect.top, rect.right, rect.bottom, surface_index,
      surface_pitch, 1 << uint32_t(rb_surface_info.msaa_samples),
      rb_surface_info.msaa_samples != MsaaSamples::k1X ? "s" : "",
      surface_format, surface_edram_base);

  if (rect.left >= rect.right || rect.top >= rect.bottom) {
    // Nothing to copy.
    return true;
  }

  if (command_processor_->IsROVUsedForEDRAM()) {
    // Commit ROV writes.
    CommitEDRAMBufferUAVWrites(false);
  }

  // GetEDRAMLayout in ResolveCopy and ResolveClear will perform the needed
  // clamping to the source render target size.
  bool result = ResolveCopy(shared_memory, texture_cache, surface_edram_base,
                            surface_pitch, rb_surface_info.msaa_samples,
                            surface_is_depth, surface_format, rect,
                            written_address_out, written_length_out);
  // Clear the color RT if needed.
  if (!surface_is_depth) {
    result &=
        ResolveClear(surface_edram_base, surface_pitch,
                     rb_surface_info.msaa_samples, false, surface_format, rect);
  }
  // Clear the depth RT if needed (may be cleared alongside color).
  result &= ResolveClear(rb_depth_info.depth_base, surface_pitch,
                         rb_surface_info.msaa_samples, true,
                         uint32_t(rb_depth_info.depth_format), rect);
  return result;
}

// Copies the resolve rectangle from the EDRAM surface at edram_base to the
// destination described by the copy destination registers.
//
// Two paths exist (see the long comment in the body): a raw copy done by a
// compute shader, and a conversion path that loads the EDRAM contents into a
// render target resource, draws it into a resolve target of the destination
// format, and has the texture cache tile the result.
//
// written_address_out and written_length_out are zeroed on entry. The raw path
// sets them to the modified destination range; the conversion path passes
// them to TextureCache::TileResolvedTexture.
//
// Returns true on success or when there is nothing to copy (zero destination
// pitch/height, empty clamped rectangle, or GetEDRAMLayout reporting nothing
// to copy). Returns false when the copy command is neither kRaw nor kConvert,
// the destination address is not 4-byte-aligned, a depth resolve selects more
// than one sample, or a required resource, descriptor or pipeline could not be
// obtained.
//
// NOTE(review): as in Resolve, the `regs.Get()` calls appear to have lost
// their template arguments in this copy of the file - confirm against the
// upstream source.
bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
                                    TextureCache* texture_cache,
                                    uint32_t edram_base, uint32_t surface_pitch,
                                    MsaaSamples msaa_samples, bool is_depth,
                                    uint32_t src_format, const D3D12_RECT& rect,
                                    uint32_t& written_address_out,
                                    uint32_t& written_length_out) {
  written_address_out = written_length_out = 0;

  auto& regs = *register_file_;

  auto rb_copy_control = regs.Get();

  if (rb_copy_control.copy_command != xenos::CopyCommand::kRaw &&
      rb_copy_control.copy_command != xenos::CopyCommand::kConvert) {
    // TODO(Triang3l): Handle kConstantOne and kNull.
    assert_always();
    return false;
  }

  auto command_list = command_processor_->GetDeferredCommandList();

  // Get format info.
  auto rb_copy_dest_info = regs.Get();
  TextureFormat src_texture_format;
  bool src_64bpp;
  if (is_depth) {
    src_texture_format =
        DepthRenderTargetToTextureFormat(DepthRenderTargetFormat(src_format));
    src_64bpp = false;
  } else {
    // Force k_16_16 and k_16_16_16_16 RTs to be always resolved via drawing,
    // because resolving to a k_16_16 or a k_16_16_16_16 texture should result
    // in unsigned texture data, unlike the render target which is signed.
    if (ColorRenderTargetFormat(src_format) ==
        ColorRenderTargetFormat::k_16_16) {
      src_texture_format = TextureFormat::k_16_16_EDRAM;
    } else if (ColorRenderTargetFormat(src_format) ==
               ColorRenderTargetFormat::k_16_16_16_16) {
      src_texture_format = TextureFormat::k_16_16_16_16_EDRAM;
    } else {
      src_texture_format = GetBaseFormat(ColorRenderTargetToTextureFormat(
          ColorRenderTargetFormat(src_format)));
    }
    src_64bpp = IsColorFormat64bpp(ColorRenderTargetFormat(src_format));
  }
  assert_true(src_texture_format != TextureFormat::kUnknown);
  // The destination format is specified as k_8_8_8_8 when resolving depth, but
  // no format conversion is done for depth, so ignore it.
  TextureFormat dest_format =
      is_depth
          ? src_texture_format
          : GetBaseFormat(TextureFormat(rb_copy_dest_info.copy_dest_format));
  const FormatInfo* dest_format_info = FormatInfo::Get(dest_format);

  // Get the destination region and clamp the source region to it.
  auto rb_copy_dest_pitch = regs.Get();
  uint32_t dest_pitch = rb_copy_dest_pitch.copy_dest_pitch;
  uint32_t dest_height = rb_copy_dest_pitch.copy_dest_height;
  if (dest_pitch == 0 || dest_height == 0) {
    // Nothing to copy.
    return true;
  }
  // The destination pitch and height are counted from the 32-aligned left/top
  // of the rectangle, so the right/bottom limits are computed from there.
  D3D12_RECT copy_rect;
  copy_rect.left = rect.left;
  copy_rect.top = rect.top;
  copy_rect.right =
      std::min(rect.right, (rect.left & ~LONG(31)) + LONG(dest_pitch));
  copy_rect.bottom =
      std::min(rect.bottom, (rect.top & ~LONG(31)) + LONG(dest_height));
  if (copy_rect.left >= copy_rect.right || copy_rect.top >= copy_rect.bottom) {
    // Nothing to copy.
    return true;
  }

  // Validate and clamp the source region, skip parts that don't need to be
  // copied and calculate the number of threads needed for copying/loading.
  // copy_rect will be modified and will become only the source rectangle, for
  // the destination region, use the original rect from the arguments.
  uint32_t surface_pitch_tiles, row_width_ss_div_80, rows;
  if (!GetEDRAMLayout(surface_pitch, msaa_samples, src_64bpp, edram_base,
                      copy_rect, surface_pitch_tiles, row_width_ss_div_80,
                      rows)) {
    // Nothing to copy.
    return true;
  }

  // Get the destination location and adjust it to the first 32x32 tile modified
  // by the resolve (the pitch and the height are relative to that tile, not to
  // 0,0 of the resolve rectangle).
  uint32_t dest_address = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32 & 0x1FFFFFFF;
  // An example of a 3D resolve destination is the color grading LUT (used
  // starting from the developer/publisher intro) in Dead Space 3.
  if (rb_copy_dest_info.copy_dest_array) {
    dest_address += texture_util::GetTiledOffset3D(
        int(rect.left & ~LONG(31)), int(rect.top & ~LONG(31)), 0, dest_pitch,
        dest_height, xe::log2_floor(dest_format_info->bits_per_pixel >> 3));
  } else {
    dest_address += texture_util::GetTiledOffset2D(
        int(rect.left & ~LONG(31)), int(rect.top & ~LONG(31)), dest_pitch,
        xe::log2_floor(dest_format_info->bits_per_pixel >> 3));
  }
  if (dest_address & 0x3) {
    assert_always();
    // Not 4-aligning may break UAV access significantly, let's hope games don't
    // resolve to 8bpp or 16bpp textures at very odd locations.
    return false;
  }
  uint32_t dest_z =
      rb_copy_dest_info.copy_dest_array ? rb_copy_dest_info.copy_dest_slice : 0;

  // See what samples we need and what we should do with them.
  xenos::CopySampleSelect sample_select = rb_copy_control.copy_sample_select;
  if (is_depth && sample_select > xenos::CopySampleSelect::k3) {
    assert_always();
    return false;
  }
  int32_t dest_exp_bias;
  if (is_depth) {
    dest_exp_bias = 0;
  } else {
    dest_exp_bias = rb_copy_dest_info.copy_dest_exp_bias;
    if (ColorRenderTargetFormat(src_format) ==
            ColorRenderTargetFormat::k_16_16 ||
        ColorRenderTargetFormat(src_format) ==
            ColorRenderTargetFormat::k_16_16_16_16) {
      // On the Xbox 360, k_16_16_EDRAM and k_16_16_16_16_EDRAM internally have
      // -32...32 range, but they're emulated using normalized RG16/RGBA16, so
      // sampling the host render target gives 1/32 of what is actually stored
      // there on the guest side.
      // http://www.students.science.uu.nl/~3220516/advancedgraphics/papers/inferred_lighting.pdf
      if (command_processor_->IsROVUsedForEDRAM() ||
          cvars::d3d12_16bit_rtv_full_range) {
        dest_exp_bias += 5;
      }
    }
  }
  bool dest_swap = !is_depth && rb_copy_dest_info.copy_dest_swap;

  XELOGGPU(
      "Resolve: Copying samples %u to 0x%.8X (%ux%u, %cD), destination Z %u, "
      "destination format %s, exponent bias %d, red and blue %sswapped",
      uint32_t(sample_select), dest_address, dest_pitch, dest_height,
      rb_copy_dest_info.copy_dest_array ? '3' : '2', dest_z,
      dest_format_info->name, dest_exp_bias, dest_swap ? "" : "not ");

  // There are 2 paths for resolving in this function - they don't necessarily
  // have to map directly to kRaw and kConvert CopyCommands.
  // - Raw - when extracting a single color to a texture of the same format as
  //   the EDRAM surface and exponent bias is not applied, or when resolving a
  //   depth buffer (games read only one sample of it - resolving multiple
  //   samples of a depth buffer is meaningless anyway - and apparently there's
  //   no format conversion as well because k_8_8_8_8 is specified in the
  //   destination format in the register, which is obviously not true, and the
  //   texture is then read as k_24_8 or k_24_8_FLOAT). Swapping red and blue is
  //   possible in this mode.
  // - Conversion - when a simple copy is not enough. The EDRAM region is loaded
  //   to a render target resource, which is then used as a texture in a shader
  //   performing the resolve (by sampling the texture on or between pixels with
  //   bilinear filtering), applying exponent bias and swapping red and blue in
  //   a format-agnostic way, then the resulting color is written to a temporary
  //   RTV of the destination format.

  auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
  auto device = provider->GetDevice();
  uint32_t resolution_scale_log2 = resolution_scale_2x_ ? 1 : 0;
  // Check if we need to apply the hack to remove the gap on the left and top
  // sides of the screen caused by half-pixel offset becoming whole pixel offset
  // with scaled rendering resolution.
  bool resolution_scale_edge_clamp =
      resolution_scale_2x_ &&
      cvars::d3d12_resolution_scale_resolve_edge_clamp &&
      cvars::d3d12_half_pixel_offset && !regs.Get().pix_center;
  if (sample_select <= xenos::CopySampleSelect::k3 &&
      src_texture_format == dest_format && dest_exp_bias == 0) {
    // *************************************************************************
    // Raw copy
    // *************************************************************************
    XELOGGPU("Resolve: Copying using a compute shader");

    // Calculate the size of the region that specifically is being resolved.
    // Can't just use the texture height for size calculation because it's
    // sometimes bigger than needed (in Red Dead Redemption, a UI texture used
    // for the letterbox bars alpha is located within a 1280x720 resolve target,
    // but only 1280x208 is being resolved, and with scaled resolution the UI
    // texture gets ignored). This doesn't apply to 3D resolves, however,
    // because their tiling is more complex - some excess data will even be
    // marked as resolved for them if resolving not to (0,0).
    uint32_t dest_size;
    uint32_t dest_modified_start = dest_address;
    uint32_t dest_modified_length;
    if (rb_copy_dest_info.copy_dest_array) {
      // Depth granularity is 4 (though TiledAddress chaining is possible with 8
      // granularity).
      dest_size = texture_util::GetGuestMipSliceStorageSize(
          xe::align(dest_pitch, 32u), xe::align(dest_height, 32u), 4, true,
          dest_format, nullptr, false);
      // Slices 4 and above modify the second 4-slice group; the view size
      // below (dest_size) covers both groups.
      if (dest_z >= 4) {
        dest_modified_start += dest_size;
      }
      dest_modified_length = dest_size;
      dest_size *= 2;
    } else {
      dest_size = texture_util::GetGuestMipSliceStorageSize(
          xe::align(dest_pitch, 32u),
          xe::align(
              uint32_t((rect.top & 31) + copy_rect.bottom - copy_rect.top),
              32u),
          1, true, dest_format, nullptr, false);
      dest_modified_length = dest_size;
    }

    // Make sure we have the memory to write to. dest_address (and thus
    // dest_modified_start) already adjusted to the first modified 32x32 tile.
    if (resolution_scale_2x_) {
      if (!texture_cache->EnsureScaledResolveBufferResident(
              dest_modified_start, dest_modified_length)) {
        return false;
      }
    } else {
      if (!shared_memory->MakeTilesResident(dest_modified_start,
                                            dest_modified_length)) {
        return false;
      }
    }

    // Write the source and destination descriptors.
    D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
    D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
    if (command_processor_->RequestViewDescriptors(
            0, 2, 2, descriptor_cpu_start, descriptor_gpu_start) == 0) {
      return false;
    }
    TransitionEDRAMBuffer(D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
    WriteEDRAMRawSRVDescriptor(descriptor_cpu_start);
    if (resolution_scale_2x_) {
      texture_cache->UseScaledResolveBufferForWriting();
      // Can't address more than 512 MB directly on Nvidia - binding only a part
      // of the buffer.
      texture_cache->CreateScaledResolveBufferRawUAV(
          provider->OffsetViewDescriptor(descriptor_cpu_start, 1),
          dest_address >> 12,
          ((dest_address + dest_size - 1) >> 12) - (dest_address >> 12) + 1);
    } else {
      shared_memory->UseForWriting();
      shared_memory->WriteRawUAVDescriptor(
          provider->OffsetViewDescriptor(descriptor_cpu_start, 1));
    }
    command_processor_->SubmitBarriers();

    // Dispatch the computation.
    command_list->D3DSetComputeRootSignature(edram_load_store_root_signature_);
    EDRAMLoadStoreRootConstants root_constants;
    // Address is adjusted to the first modified tile, so using & 31 as the
    // destination offset.
    // Packed as: size in the low bits, destination offset within the tile at
    // bit 12, destination Z at bit 17 (X only), source offset at bit 20.
    root_constants.tile_sample_dimensions[0] =
        uint32_t(copy_rect.right - copy_rect.left) |
        ((uint32_t(rect.left) & 31) << 12) | (dest_z << 17) |
        (uint32_t(copy_rect.left) << 20);
    root_constants.tile_sample_dimensions[1] =
        uint32_t(copy_rect.bottom - copy_rect.top) |
        ((uint32_t(rect.top) & 31) << 12) | (uint32_t(copy_rect.top) << 20);
    root_constants.tile_sample_dest_base = dest_address;
    if (resolution_scale_2x_) {
      // Can't address more than 512 MB directly on Nvidia - binding only a part
      // of the buffer.
      root_constants.tile_sample_dest_base -= dest_address & ~0xFFFu;
    }
    assert_true(dest_pitch <= 8192);
    root_constants.tile_sample_dest_info =
        ((dest_pitch + 31) >> 5) |
        (rb_copy_dest_info.copy_dest_array ? (((dest_height + 31) >> 5) << 9)
                                           : 0) |
        (uint32_t(sample_select) << 18) |
        (uint32_t(rb_copy_dest_info.copy_dest_endian) << 20);
    if (dest_swap) {
      root_constants.tile_sample_dest_info |= (1 << 23) | (src_format << 24);
    }
    root_constants.base_samples_2x_depth_pitch =
        edram_base | (resolution_scale_log2 << 13) |
        (resolution_scale_edge_clamp ? (1 << 14) : 0) |
        (is_depth ? (1 << 15) : 0) | (surface_pitch_tiles << 16);
    // Bit 11 is set for 2x MSAA and above, bit 12 additionally for 4x.
    if (msaa_samples >= MsaaSamples::k2X) {
      root_constants.base_samples_2x_depth_pitch |= 1 << 11;
      if (msaa_samples >= MsaaSamples::k4X) {
        root_constants.base_samples_2x_depth_pitch |= 1 << 12;
      }
    }
    command_list->D3DSetComputeRoot32BitConstants(
        0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
    command_list->D3DSetComputeRootDescriptorTable(1, descriptor_gpu_start);
    command_processor_->SetComputePipeline(
        src_64bpp ? edram_tile_sample_64bpp_pipeline_
                  : edram_tile_sample_32bpp_pipeline_);
    // 1 group per destination 80x16 region.
    uint32_t group_count_x = row_width_ss_div_80, group_count_y = rows;
    // The counts above are in samples; halve (rounding up) vertically for 2x
    // MSAA and above, and also horizontally for 4x.
    if (msaa_samples >= MsaaSamples::k2X) {
      group_count_y = (group_count_y + 1) >> 1;
      if (msaa_samples >= MsaaSamples::k4X) {
        group_count_x = (group_count_x + 1) >> 1;
      }
    }
    // With 2x scaling, destination width and height are 2x bigger, and 1 group
    // is 80x16 destination pixels after applying the resolution scale.
    group_count_x <<= resolution_scale_log2;
    group_count_y <<= resolution_scale_log2;
    command_list->D3DDispatch(group_count_x, group_count_y, 1);

    // Commit the write.
    command_processor_->PushUAVBarrier(
        resolution_scale_2x_ ? texture_cache->GetScaledResolveBuffer()
                             : shared_memory->GetBuffer());

    // Invalidate textures and mark the range as scaled if needed.
    texture_cache->MarkRangeAsResolved(dest_modified_start,
                                       dest_modified_length);
    written_address_out = dest_modified_start;
    written_length_out = dest_modified_length;
  } else {
    // *************************************************************************
    // Conversion and AA resolving
    // *************************************************************************
    XELOGGPU("Resolve: Copying via drawing");

    // Get everything we need for the conversion.

    // DXGI format (also checking whether this resolve is possible).
    DXGI_FORMAT dest_dxgi_format =
        texture_cache->GetResolveDXGIFormat(dest_format);
    if (dest_dxgi_format == DXGI_FORMAT_UNKNOWN) {
      XELOGE(
          "No resolve pipeline for destination format %s - tell Xenia "
          "developers!",
          FormatInfo::Get(dest_format)->name);
      return false;
    }

    // Resolve pipeline.
    ID3D12PipelineState* resolve_pipeline =
        GetResolvePipeline(dest_dxgi_format);
    if (resolve_pipeline == nullptr) {
      return false;
    }

    // Key of the source render target: sized to the loaded EDRAM region,
    // doubled in both dimensions with 2x resolution scale.
    RenderTargetKey render_target_key;
    render_target_key.width_ss_div_80 = row_width_ss_div_80;
    render_target_key.height_ss_div_16 = rows;
    if (resolution_scale_2x_) {
      render_target_key.width_ss_div_80 *= 2;
      render_target_key.height_ss_div_16 *= 2;
    }
    render_target_key.is_depth = false;
    render_target_key.format = src_format;

    // Render target for loading the EDRAM buffer contents as a texture.
    RenderTarget* render_target =
        FindOrCreateRenderTarget(render_target_key, 0);
    if (render_target == nullptr) {
      return false;
    }
    const D3D12_PLACED_SUBRESOURCE_FOOTPRINT& footprint =
        render_target->footprints[0];

    // Size of the resolved area.
    uint32_t copy_width = copy_rect.right - copy_rect.left;
    uint32_t copy_height = copy_rect.bottom - copy_rect.top;

    // Resolve target for output merger format conversion.
#if 0
    ResolveTarget* resolve_target =
        FindOrCreateResolveTarget(copy_width, copy_height, dest_dxgi_format,
                                  render_target->heap_page_count);
#else
    ResolveTarget* resolve_target =
        FindOrCreateResolveTarget(copy_width, copy_height, dest_dxgi_format);
#endif
    if (resolve_target == nullptr) {
      return false;
    }

    // Descriptors. 2 for EDRAM load, 1 for conversion.
    D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
    D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
    if (command_processor_->RequestViewDescriptors(
            0, 3, 3, descriptor_cpu_start, descriptor_gpu_start) == 0) {
      return false;
    }

    // Buffer for copying.
    // One scratch buffer is reused for both the EDRAM -> render target load
    // and the resolve target -> guest memory copy, so it is sized for the
    // larger of the two. No early returns may follow until it is released at
    // the end of this branch.
    D3D12_RESOURCE_STATES copy_buffer_state =
        D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
    ID3D12Resource* copy_buffer = command_processor_->RequestScratchGPUBuffer(
        std::max(render_target->copy_buffer_size,
                 resolve_target->copy_buffer_size),
        copy_buffer_state);
    if (copy_buffer == nullptr) {
      return false;
    }

    // Load the EDRAM buffer contents to the copy buffer.

    TransitionEDRAMBuffer(D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
    command_processor_->SubmitBarriers();

    command_list->D3DSetComputeRootSignature(edram_load_store_root_signature_);

    EDRAMLoadStoreRootConstants load_root_constants;
    load_root_constants.rt_color_depth_offset = uint32_t(footprint.Offset);
    load_root_constants.rt_color_depth_pitch =
        uint32_t(footprint.Footprint.RowPitch);
    load_root_constants.base_samples_2x_depth_pitch =
        edram_base | (resolution_scale_log2 << 13) |
        (surface_pitch_tiles << 16);
    if (msaa_samples >= MsaaSamples::k2X) {
      load_root_constants.base_samples_2x_depth_pitch |= 1 << 11;
      if (msaa_samples >= MsaaSamples::k4X) {
        load_root_constants.base_samples_2x_depth_pitch |= 1 << 12;
      }
    }
    command_list->D3DSetComputeRoot32BitConstants(
        0, sizeof(load_root_constants) / sizeof(uint32_t), &load_root_constants,
        0);

    WriteEDRAMRawSRVDescriptor(descriptor_cpu_start);
    ui::d3d12::util::CreateRawBufferUAV(
        device, provider->OffsetViewDescriptor(descriptor_cpu_start, 1),
        copy_buffer, render_target->copy_buffer_size);
    command_list->D3DSetComputeRootDescriptorTable(1, descriptor_gpu_start);

    EDRAMLoadStoreMode mode = GetLoadStoreMode(false, src_format);
    command_processor_->SetComputePipeline(
        resolution_scale_2x_ ? edram_load_2x_resolve_pipelines_[size_t(mode)]
                             : edram_load_pipelines_[size_t(mode)]);
    // 1 group per 80x16 samples, with both 1x and 2x resolution scales.
    command_list->D3DDispatch(row_width_ss_div_80, rows, 1);
    command_processor_->PushUAVBarrier(copy_buffer);

    // Go to the next descriptor set.

    descriptor_cpu_start =
        provider->OffsetViewDescriptor(descriptor_cpu_start, 2);
    descriptor_gpu_start =
        provider->OffsetViewDescriptor(descriptor_gpu_start, 2);

    // Copy the EDRAM buffer contents to the source texture.

#if 0
    command_processor_->PushAliasingBarrier(nullptr, render_target->resource);
#endif
    command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state,
                                              D3D12_RESOURCE_STATE_COPY_SOURCE);
    copy_buffer_state = D3D12_RESOURCE_STATE_COPY_SOURCE;
    command_processor_->PushTransitionBarrier(render_target->resource,
                                              render_target->state,
                                              D3D12_RESOURCE_STATE_COPY_DEST);
    render_target->state = D3D12_RESOURCE_STATE_COPY_DEST;
    command_processor_->SubmitBarriers();
    D3D12_TEXTURE_COPY_LOCATION location_source, location_dest;
    location_source.pResource = copy_buffer;
    location_source.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
    location_source.PlacedFootprint = render_target->footprints[0];
    location_dest.pResource = render_target->resource;
    location_dest.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
    location_dest.SubresourceIndex = 0;
    command_list->CopyTexture(location_dest, location_source);

    // Do the resolve. Render targets unbound already, safe to call
    // OMSetRenderTargets.

#if 0
    command_processor_->PushAliasingBarrier(nullptr, resolve_target->resource);
#endif
    command_processor_->PushTransitionBarrier(
        render_target->resource, render_target->state,
        D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE);
    render_target->state = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
    command_processor_->PushTransitionBarrier(
        resolve_target->resource, resolve_target->state,
        D3D12_RESOURCE_STATE_RENDER_TARGET);
    resolve_target->state = D3D12_RESOURCE_STATE_RENDER_TARGET;

    command_list->D3DSetGraphicsRootSignature(resolve_root_signature_);

    ResolveRootConstants resolve_root_constants;
    // Horizontal sample doubling only happens with 4x MSAA, vertical with 2x
    // and 4x.
    uint32_t samples_x_log2 = msaa_samples >= MsaaSamples::k4X ? 1 : 0;
    uint32_t samples_y_log2 = msaa_samples >= MsaaSamples::k2X ? 1 : 0;
    // Offset in the low 16 bits, size in the high 16 bits, both scaled by the
    // sample count along the axis and by the resolution scale.
    resolve_root_constants.rect_samples_lw =
        (copy_rect.left << (samples_x_log2 + resolution_scale_log2)) |
        (copy_width << (16 + samples_x_log2 + resolution_scale_log2));
    resolve_root_constants.rect_samples_th =
        (copy_rect.top << (samples_y_log2 + resolution_scale_log2)) |
        (copy_height << (16 + samples_y_log2 + resolution_scale_log2));
    resolve_root_constants.source_size =
        (render_target_key.width_ss_div_80 * 80) |
        (render_target_key.height_ss_div_16 << (4 + 16));
    resolve_root_constants.resolve_info =
        samples_y_log2 | (samples_x_log2 << 1) |
        (resolution_scale_edge_clamp ? (1 << 6) : 0) |
        ((uint32_t(dest_exp_bias) & 0x3F) << 7);
    // Bits 2-3 and 4-5 of resolve_info select the sampling offset along each
    // axis as 0, 1 or 2; 1 is the value used when no offset is needed.
    if (msaa_samples == MsaaSamples::k1X) {
      // No offset.
      resolve_root_constants.resolve_info |= (1 << 2) | (1 << 4);
    } else if (msaa_samples == MsaaSamples::k2X) {
      // -0.5 or +0.5 samples vertical offset if getting only one sample.
      if (sample_select == xenos::CopySampleSelect::k0) {
        resolve_root_constants.resolve_info |= (0 << 2) | (1 << 4);
      } else if (sample_select == xenos::CopySampleSelect::k1) {
        resolve_root_constants.resolve_info |= (2 << 2) | (1 << 4);
      } else {
        resolve_root_constants.resolve_info |= (1 << 2) | (1 << 4);
      }
    } else {
      // -0.5 or +0.5 samples offsets if getting one or two samples.
      switch (sample_select) {
        case xenos::CopySampleSelect::k0:
          resolve_root_constants.resolve_info |= (0 << 2) | (0 << 4);
          break;
        case xenos::CopySampleSelect::k1:
          resolve_root_constants.resolve_info |= (2 << 2) | (0 << 4);
          break;
        case xenos::CopySampleSelect::k2:
          resolve_root_constants.resolve_info |= (0 << 2) | (2 << 4);
          break;
        case xenos::CopySampleSelect::k3:
          resolve_root_constants.resolve_info |= (2 << 2) | (2 << 4);
          break;
        case xenos::CopySampleSelect::k01:
          resolve_root_constants.resolve_info |= (1 << 2) | (0 << 4);
          break;
        case xenos::CopySampleSelect::k23:
          resolve_root_constants.resolve_info |= (1 << 2) | (2 << 4);
          break;
        default:
          resolve_root_constants.resolve_info |= (1 << 2) | (1 << 4);
          break;
      }
    }
    command_list->D3DSetGraphicsRoot32BitConstants(
        0, sizeof(resolve_root_constants) / sizeof(uint32_t),
        &resolve_root_constants, 0);

    D3D12_SHADER_RESOURCE_VIEW_DESC rt_srv_desc;
    rt_srv_desc.Format =
        GetColorDXGIFormat(ColorRenderTargetFormat(src_format));
    rt_srv_desc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
    UINT swizzle = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
    if (dest_swap) {
      // Red/blue swap is applied through the SRV component mapping, only for
      // the formats listed here.
      switch (ColorRenderTargetFormat(src_format)) {
        case ColorRenderTargetFormat::k_8_8_8_8:
        case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
        case ColorRenderTargetFormat::k_2_10_10_10:
        case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
        case ColorRenderTargetFormat::k_16_16_16_16:
        case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
        case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10:
        case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
          swizzle = D3D12_ENCODE_SHADER_4_COMPONENT_MAPPING(2, 1, 0, 3);
          break;
        default:
          break;
      }
    }
    if (dest_format == TextureFormat::k_6_5_5) {
      // Green bits of the resolve target used for blue, and blue bits used for
      // green.
      swizzle = D3D12_ENCODE_SHADER_4_COMPONENT_MAPPING(
          D3D12_DECODE_SHADER_4_COMPONENT_MAPPING(0, swizzle),
          D3D12_DECODE_SHADER_4_COMPONENT_MAPPING(2, swizzle),
          D3D12_DECODE_SHADER_4_COMPONENT_MAPPING(1, swizzle),
          D3D12_DECODE_SHADER_4_COMPONENT_MAPPING(3, swizzle));
    }
    rt_srv_desc.Shader4ComponentMapping = swizzle;
    rt_srv_desc.Texture2D.MostDetailedMip = 0;
    rt_srv_desc.Texture2D.MipLevels = 1;
    rt_srv_desc.Texture2D.PlaneSlice = 0;
    rt_srv_desc.Texture2D.ResourceMinLODClamp = 0.0f;
    device->CreateShaderResourceView(render_target->resource, &rt_srv_desc,
                                     descriptor_cpu_start);
    command_list->D3DSetGraphicsRootDescriptorTable(1, descriptor_gpu_start);

    command_processor_->SubmitBarriers();
    command_processor_->SetSamplePositions(MsaaSamples::k1X);
    command_processor_->SetExternalGraphicsPipeline(resolve_pipeline);
    command_list->D3DOMSetRenderTargets(1, &resolve_target->rtv_handle, TRUE,
                                        nullptr);
    // The viewport and the scissor cover exactly the (scaled) copied area.
    D3D12_VIEWPORT viewport;
    viewport.TopLeftX = 0.0f;
    viewport.TopLeftY = 0.0f;
    viewport.Width = float(copy_width << resolution_scale_log2);
    viewport.Height = float(copy_height << resolution_scale_log2);
    viewport.MinDepth = 0.0f;
    viewport.MaxDepth = 1.0f;
    command_list->RSSetViewport(viewport);
    D3D12_RECT scissor;
    scissor.left = 0;
    scissor.top = 0;
    scissor.right = copy_width << resolution_scale_log2;
    scissor.bottom = copy_height << resolution_scale_log2;
    command_list->RSSetScissorRect(scissor);
    command_list->D3DIASetPrimitiveTopology(
        D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
    command_list->D3DDrawInstanced(3, 1, 0, 0);
    if (command_processor_->IsROVUsedForEDRAM()) {
      // Clean up - the ROV path doesn't need render targets bound and has
      // non-zero ForcedSampleCount.
      command_list->D3DOMSetRenderTargets(0, nullptr, FALSE, nullptr);
    }

    // Copy the resolve target to the buffer.

    command_processor_->PushTransitionBarrier(resolve_target->resource,
                                              resolve_target->state,
                                              D3D12_RESOURCE_STATE_COPY_SOURCE);
    resolve_target->state = D3D12_RESOURCE_STATE_COPY_SOURCE;
    command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state,
                                              D3D12_RESOURCE_STATE_COPY_DEST);
    copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST;
    command_processor_->SubmitBarriers();
    location_source.pResource = resolve_target->resource;
    location_source.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
    location_source.SubresourceIndex = 0;
    location_dest.pResource = copy_buffer;
    location_dest.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
    location_dest.PlacedFootprint = resolve_target->footprint;
    command_list->CopyTexture(location_dest, location_source);

    // Tile the resolved texture. The texture cache expects the buffer to be a
    // non-pixel-shader SRV.

    command_processor_->PushTransitionBarrier(
        copy_buffer, copy_buffer_state,
        D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
    copy_buffer_state = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
    // dest_address already adjusted, so offsets are & 31.
    texture_cache->TileResolvedTexture(
        dest_format, dest_address, dest_pitch, dest_height,
        rb_copy_dest_info.copy_dest_array != 0, uint32_t(rect.left) & 31,
        uint32_t(rect.top) & 31, dest_z, copy_width, copy_height,
        rb_copy_dest_info.copy_dest_endian, copy_buffer,
        resolve_target->copy_buffer_size, resolve_target->footprint,
        &written_address_out, &written_length_out);

    // Done with the copy buffer.

    command_processor_->ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state);
  }

  return true;
}

bool RenderTargetCache::ResolveClear(uint32_t edram_base,
                                     uint32_t surface_pitch,
                                     MsaaSamples msaa_samples, bool is_depth,
                                     uint32_t format, const D3D12_RECT& rect) {
  auto& regs = *register_file_;

  // Check if clearing is enabled.
auto rb_copy_control = regs.Get(); if (is_depth) { if (!rb_copy_control.depth_clear_enable) { return true; } } else { if (!rb_copy_control.color_clear_enable) { return true; } } XELOGGPU("Resolve: Clearing the %s render target", is_depth ? "depth" : "color"); // Calculate the layout. bool is_64bpp = !is_depth && IsColorFormat64bpp(ColorRenderTargetFormat(format)); D3D12_RECT clear_rect = rect; uint32_t surface_pitch_tiles, row_width_ss_div_80, rows; if (!GetEDRAMLayout(surface_pitch, msaa_samples, is_64bpp, edram_base, clear_rect, surface_pitch_tiles, row_width_ss_div_80, rows)) { // Nothing to clear. return true; } uint32_t samples_x_log2 = msaa_samples >= MsaaSamples::k4X ? 1 : 0; uint32_t samples_y_log2 = msaa_samples >= MsaaSamples::k2X ? 1 : 0; // Get everything needed for clearing. auto command_list = command_processor_->GetDeferredCommandList(); auto device = command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice(); D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start; D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start; if (command_processor_->RequestViewDescriptors(0, 1, 1, descriptor_cpu_start, descriptor_gpu_start) == 0) { return false; } // Submit the clear. TransitionEDRAMBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS); command_processor_->SubmitBarriers(); EDRAMLoadStoreRootConstants root_constants; root_constants.clear_rect_lt = (clear_rect.left << samples_x_log2) | (clear_rect.top << (16 + samples_y_log2)); root_constants.clear_rect_rb = (clear_rect.right << samples_x_log2) | (clear_rect.bottom << (16 + samples_y_log2)); root_constants.base_samples_2x_depth_pitch = edram_base | (samples_y_log2 << 11) | (samples_x_log2 << 12) | (resolution_scale_2x_ ? (1 << 13) : 0) | (is_depth ? (1 << 15) : 0) | (surface_pitch_tiles << 16); // When ROV is used, there's no 32-bit depth buffer. 
if (!command_processor_->IsROVUsedForEDRAM() && is_depth && DepthRenderTargetFormat(format) == DepthRenderTargetFormat::kD24FS8) { root_constants.clear_depth24 = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32; // 20e4 [0,2), based on CFloat24 from d3dref9.dll and on 6e4 in DirectXTex. uint32_t depth24 = root_constants.clear_depth24 >> 8; if (depth24 == 0) { root_constants.clear_depth32 = 0; } else { uint32_t mantissa = depth24 & 0xFFFFFu, exponent = depth24 >> 20; if (exponent == 0) { // Normalize the value in the resulting float. // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x100000) == 0) uint32_t mantissa_lzcnt = xe::lzcnt(mantissa) - (32u - 21u); exponent = 1u - mantissa_lzcnt; mantissa = (mantissa << mantissa_lzcnt) & 0xFFFFFu; } root_constants.clear_depth32 = ((exponent + 112u) << 23) | (mantissa << 3); } command_processor_->SetComputePipeline(edram_clear_depth_float_pipeline_); } else if (is_64bpp) { // TODO(Triang3l): Check which 32-bit portion is in which register. root_constants.clear_color_high = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32; root_constants.clear_color_low = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32; command_processor_->SetComputePipeline(edram_clear_64bpp_pipeline_); } else { Register reg = is_depth ? XE_GPU_REG_RB_DEPTH_CLEAR : XE_GPU_REG_RB_COLOR_CLEAR; root_constants.clear_color_high = regs[reg].u32; command_processor_->SetComputePipeline(edram_clear_32bpp_pipeline_); } command_list->D3DSetComputeRootSignature(edram_clear_root_signature_); command_list->D3DSetComputeRoot32BitConstants( 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); WriteEDRAMRawUAVDescriptor(descriptor_cpu_start); command_list->D3DSetComputeRootDescriptorTable(1, descriptor_gpu_start); // 1 group per 80x16 samples. Resolution scale handled in the shader itself. 
command_list->D3DDispatch(row_width_ss_div_80, rows, 1); CommitEDRAMBufferUAVWrites(true); return true; } ID3D12PipelineState* RenderTargetCache::GetResolvePipeline( DXGI_FORMAT dest_format) { // Try to find an existing pipeline. for (auto& resolve_pipeline : resolve_pipelines_) { if (resolve_pipeline.dest_format == dest_format) { return resolve_pipeline.pipeline; } } // Create a new pipeline. auto device = command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice(); D3D12_GRAPHICS_PIPELINE_STATE_DESC pipeline_desc = {}; pipeline_desc.pRootSignature = resolve_root_signature_; pipeline_desc.VS.pShaderBytecode = resolve_vs; pipeline_desc.VS.BytecodeLength = sizeof(resolve_vs); pipeline_desc.PS.pShaderBytecode = resolve_ps; pipeline_desc.PS.BytecodeLength = sizeof(resolve_ps); pipeline_desc.BlendState.RenderTarget[0].RenderTargetWriteMask = D3D12_COLOR_WRITE_ENABLE_ALL; pipeline_desc.SampleMask = UINT_MAX; pipeline_desc.RasterizerState.FillMode = D3D12_FILL_MODE_SOLID; pipeline_desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE; pipeline_desc.RasterizerState.DepthClipEnable = TRUE; pipeline_desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; pipeline_desc.NumRenderTargets = 1; pipeline_desc.RTVFormats[0] = dest_format; pipeline_desc.SampleDesc.Count = 1; ID3D12PipelineState* pipeline; if (FAILED(device->CreateGraphicsPipelineState(&pipeline_desc, IID_PPV_ARGS(&pipeline)))) { XELOGE("Failed to create the resolve pipeline for DXGI format %u", dest_format); return nullptr; } ResolvePipeline new_resolve_pipeline; new_resolve_pipeline.pipeline = pipeline; new_resolve_pipeline.dest_format = dest_format; resolve_pipelines_.push_back(new_resolve_pipeline); return pipeline; } RenderTargetCache::ResolveTarget* RenderTargetCache::FindOrCreateResolveTarget( #if 0 uint32_t width_unscaled, uint32_t height_unscaled, DXGI_FORMAT format, uint32_t min_heap_page_first) { #else uint32_t width_unscaled, uint32_t height_unscaled, DXGI_FORMAT format #endif ) { 
#if 0 assert_true(min_heap_page_first < kHeap4MBPages * 5); #endif if (width_unscaled == 0 || height_unscaled == 0 || width_unscaled > 2160 || height_unscaled > 2160) { assert_always(); return nullptr; } uint32_t width_scaled = width_unscaled, height_scaled = height_unscaled; if (resolution_scale_2x_) { width_scaled *= 2; height_scaled *= 2; } ResolveTargetKey key; key.width_div_32 = (width_scaled + 31) >> 5; key.height_div_32 = (height_scaled + 31) >> 5; key.format = format; // Try to find an existing target that isn't overlapping the resolve source. #if 0 auto found_range = resolve_targets_.equal_range(key.value); for (auto iter = found_range.first; iter != found_range.second; ++iter) { ResolveTarget* found_resolve_target = iter->second; if (found_resolve_target->heap_page_first >= min_heap_page_first) { return found_resolve_target; } } #else auto found_iter = resolve_targets_.find(key.value); if (found_iter != resolve_targets_.end()) { return found_iter->second; } #endif // Ensure the new resolve target can get an RTV descriptor. if (!EnsureRTVHeapAvailable(false)) { return nullptr; } // Allocate a new resolve target. 
auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); auto device = provider->GetDevice(); D3D12_RESOURCE_DESC resource_desc; resource_desc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D; resource_desc.Alignment = 0; resource_desc.Width = key.width_div_32 << 5; resource_desc.Height = key.height_div_32 << 5; resource_desc.DepthOrArraySize = 1; resource_desc.MipLevels = 1; resource_desc.Format = format; resource_desc.SampleDesc.Count = 1; resource_desc.SampleDesc.Quality = 0; resource_desc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN; resource_desc.Flags = D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET; #if 0 D3D12_RESOURCE_ALLOCATION_INFO allocation_info = device->GetResourceAllocationInfo(0, 1, &resource_desc); uint32_t heap_page_count = (uint32_t(allocation_info.SizeInBytes) + ((4 << 20) - 1)) >> 22; if (heap_page_count == 0 || heap_page_count > kHeap4MBPages) { assert_always(); XELOGE( "%ux%u resolve target with DXGI format %u can't fit in a heap, " "needs %u bytes - tell Xenia developers to increase the heap size!", uint32_t(resource_desc.Width), resource_desc.Height, format, uint32_t(allocation_info.SizeInBytes)); return nullptr; } if (kHeap4MBPages - (min_heap_page_first % kHeap4MBPages) < heap_page_count) { // Go to the next heap if no free space in the current one. min_heap_page_first = xe::round_up(min_heap_page_first, kHeap4MBPages); assert_true(min_heap_page_first < kHeap4MBPages * 5); } // Create the memory heap if it doesn't exist yet. uint32_t heap_index = min_heap_page_first / kHeap4MBPages; if (!MakeHeapResident(heap_index)) { return nullptr; } #endif // Create it. // The first action likely to be done is resolve. 
D3D12_RESOURCE_STATES state = D3D12_RESOURCE_STATE_RENDER_TARGET; ID3D12Resource* resource; #if 0 if (FAILED(device->CreatePlacedResource( heaps_[heap_index], (min_heap_page_first % kHeap4MBPages) << 22, &resource_desc, state, nullptr, IID_PPV_ARGS(&resource)))) { XELOGE( "Failed to create a placed resource for %ux%u resolve target with DXGI " "format %u at heap 4 MB pages %u:%u", uint32_t(resource_desc.Width), resource_desc.Height, format, min_heap_page_first, min_heap_page_first + heap_page_count - 1); return nullptr; } #else if (FAILED(device->CreateCommittedResource( &ui::d3d12::util::kHeapPropertiesDefault, D3D12_HEAP_FLAG_NONE, &resource_desc, state, nullptr, IID_PPV_ARGS(&resource)))) { XELOGE( "Failed to create a committed resource for %ux%u resolve target with " "DXGI format %u", uint32_t(resource_desc.Width), resource_desc.Height, format); return nullptr; } #endif // Create the RTV. D3D12_CPU_DESCRIPTOR_HANDLE rtv_handle = provider->OffsetRTVDescriptor(descriptor_heaps_color_->start_handle, descriptor_heaps_color_->descriptors_used); D3D12_RENDER_TARGET_VIEW_DESC rtv_desc; rtv_desc.Format = format; rtv_desc.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D; rtv_desc.Texture2D.MipSlice = 0; rtv_desc.Texture2D.PlaneSlice = 0; device->CreateRenderTargetView(resource, &rtv_desc, rtv_handle); ++descriptor_heaps_color_->descriptors_used; // Add the new resolve target to the cache. ResolveTarget* resolve_target = new ResolveTarget; resolve_target->resource = resource; resolve_target->state = state; resolve_target->rtv_handle.ptr = rtv_handle.ptr; resolve_target->key.value = key.value; #if 0 resolve_target->heap_page_first = min_heap_page_first; #endif UINT64 copy_buffer_size; device->GetCopyableFootprints(&resource_desc, 0, 1, 0, &resolve_target->footprint, nullptr, nullptr, ©_buffer_size); // Safety (though if width and height are aligned to 32 it will be fine, but // just in case this changes). 
copy_buffer_size = xe::align(copy_buffer_size, UINT64(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT)); resolve_target->copy_buffer_size = uint32_t(copy_buffer_size); resolve_targets_.insert(std::make_pair(key.value, resolve_target)); COUNT_profile_set("gpu/render_target_cache/resolve_targets", resolve_targets_.size()); return resolve_target; } void RenderTargetCache::UnbindRenderTargets() { if (command_processor_->IsROVUsedForEDRAM()) { return; } StoreRenderTargetsToEDRAM(); ClearBindings(); } void RenderTargetCache::WriteEDRAMUint32UAVDescriptor( D3D12_CPU_DESCRIPTOR_HANDLE handle) { auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); auto device = provider->GetDevice(); device->CopyDescriptorsSimple( 1, handle, provider->OffsetViewDescriptor( edram_buffer_descriptor_heap_start_, uint32_t(EDRAMBufferDescriptorIndex::kUint32UAV)), D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); } void RenderTargetCache::EndFrame() { UnbindRenderTargets(); } ColorRenderTargetFormat RenderTargetCache::GetBaseColorFormat( ColorRenderTargetFormat format) { switch (format) { case ColorRenderTargetFormat::k_8_8_8_8_GAMMA: return ColorRenderTargetFormat::k_8_8_8_8; case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: return ColorRenderTargetFormat::k_2_10_10_10; case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: return ColorRenderTargetFormat::k_2_10_10_10_FLOAT; default: return format; } } DXGI_FORMAT RenderTargetCache::GetColorDXGIFormat( ColorRenderTargetFormat format) { switch (format) { case ColorRenderTargetFormat::k_8_8_8_8: case ColorRenderTargetFormat::k_8_8_8_8_GAMMA: return DXGI_FORMAT_R8G8B8A8_UNORM; case ColorRenderTargetFormat::k_2_10_10_10: case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: return DXGI_FORMAT_R10G10B10A2_UNORM; case ColorRenderTargetFormat::k_2_10_10_10_FLOAT: case ColorRenderTargetFormat::k_16_16_16_16_FLOAT: case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: return DXGI_FORMAT_R16G16B16A16_FLOAT; case 
ColorRenderTargetFormat::k_16_16: return DXGI_FORMAT_R16G16_SNORM; case ColorRenderTargetFormat::k_16_16_16_16: return DXGI_FORMAT_R16G16B16A16_SNORM; case ColorRenderTargetFormat::k_16_16_FLOAT: return DXGI_FORMAT_R16G16_FLOAT; case ColorRenderTargetFormat::k_32_FLOAT: return DXGI_FORMAT_R32_FLOAT; case ColorRenderTargetFormat::k_32_32_FLOAT: return DXGI_FORMAT_R32G32_FLOAT; default: break; } return DXGI_FORMAT_UNKNOWN; } uint32_t RenderTargetCache::GetEDRAMBufferSize() const { uint32_t size = 2048 * 5120; if (!command_processor_->IsROVUsedForEDRAM()) { // Two 10 MB pages, one containing color and integer depth data, another // with 32-bit float depth when 20e4 depth is used to allow for multipass // drawing without precision loss in case of EDRAM store/load. size *= 2; } if (resolution_scale_2x_) { size *= 4; } return size; } void RenderTargetCache::TransitionEDRAMBuffer(D3D12_RESOURCE_STATES new_state) { command_processor_->PushTransitionBarrier(edram_buffer_, edram_buffer_state_, new_state); edram_buffer_state_ = new_state; } void RenderTargetCache::CommitEDRAMBufferUAVWrites(bool force) { if (edram_buffer_modified_ || force) { command_processor_->PushUAVBarrier(edram_buffer_); } edram_buffer_modified_ = false; } void RenderTargetCache::WriteEDRAMRawSRVDescriptor( D3D12_CPU_DESCRIPTOR_HANDLE handle) { auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); auto device = provider->GetDevice(); device->CopyDescriptorsSimple( 1, handle, provider->OffsetViewDescriptor( edram_buffer_descriptor_heap_start_, uint32_t(EDRAMBufferDescriptorIndex::kRawSRV)), D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); } void RenderTargetCache::WriteEDRAMRawUAVDescriptor( D3D12_CPU_DESCRIPTOR_HANDLE handle) { auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); auto device = provider->GetDevice(); device->CopyDescriptorsSimple( 1, handle, provider->OffsetViewDescriptor( edram_buffer_descriptor_heap_start_, 
uint32_t(EDRAMBufferDescriptorIndex::kRawUAV)), D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); } void RenderTargetCache::ClearBindings() { current_surface_pitch_ = 0; current_msaa_samples_ = MsaaSamples::k1X; current_edram_max_rows_ = 0; std::memset(current_bindings_, 0, sizeof(current_bindings_)); } #if 0 bool RenderTargetCache::MakeHeapResident(uint32_t heap_index) { if (heap_index >= 5) { assert_always(); return false; } if (heaps_[heap_index] != nullptr) { return true; } auto device = command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice(); D3D12_HEAP_DESC heap_desc = {}; heap_desc.SizeInBytes = kHeap4MBPages << 22; heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT; // TODO(Triang3l): If real MSAA is added, alignment must be 4 MB. heap_desc.Alignment = 0; heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_RT_DS_TEXTURES; if (FAILED( device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heaps_[heap_index])))) { XELOGE("Failed to create a %u MB heap for render targets", kHeap4MBPages * 4); return false; } return true; } #endif bool RenderTargetCache::EnsureRTVHeapAvailable(bool is_depth) { auto& heap = is_depth ? descriptor_heaps_depth_ : descriptor_heaps_color_; if (heap != nullptr && heap->descriptors_used < kRenderTargetDescriptorHeapSize) { return true; } auto device = command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice(); D3D12_DESCRIPTOR_HEAP_DESC heap_desc; heap_desc.Type = is_depth ? D3D12_DESCRIPTOR_HEAP_TYPE_DSV : D3D12_DESCRIPTOR_HEAP_TYPE_RTV; heap_desc.NumDescriptors = kRenderTargetDescriptorHeapSize; heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE; heap_desc.NodeMask = 0; ID3D12DescriptorHeap* new_d3d_heap; if (FAILED(device->CreateDescriptorHeap(&heap_desc, IID_PPV_ARGS(&new_d3d_heap)))) { XELOGE("Failed to create a heap for %u %s buffer descriptors", kRenderTargetDescriptorHeapSize, is_depth ? 
"depth" : "color"); return false; } RenderTargetDescriptorHeap* new_heap = new RenderTargetDescriptorHeap; new_heap->heap = new_d3d_heap; new_heap->start_handle = new_d3d_heap->GetCPUDescriptorHandleForHeapStart(); new_heap->descriptors_used = 0; new_heap->previous = heap; heap = new_heap; return true; } bool RenderTargetCache::GetResourceDesc(RenderTargetKey key, D3D12_RESOURCE_DESC& desc) { if (key.width_ss_div_80 == 0 || key.height_ss_div_16 == 0) { return false; } DXGI_FORMAT dxgi_format = key.is_depth ? GetDepthDXGIFormat(DepthRenderTargetFormat(key.format)) : GetColorDXGIFormat(ColorRenderTargetFormat(key.format)); if (dxgi_format == DXGI_FORMAT_UNKNOWN) { return false; } desc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D; // TODO(Triang3l): If real MSAA is added, alignment must be 4 MB. desc.Alignment = 0; desc.Width = key.width_ss_div_80 * 80; desc.Height = key.height_ss_div_16 * 16; desc.DepthOrArraySize = 1; desc.MipLevels = 1; desc.Format = dxgi_format; desc.SampleDesc.Count = 1; desc.SampleDesc.Quality = 0; desc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN; desc.Flags = key.is_depth ? D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL : D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET; return true; } RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget( #if 0 RenderTargetKey key, uint32_t heap_page_first #else RenderTargetKey key, uint32_t instance #endif ) { #if 0 assert_true(heap_page_first < kHeap4MBPages * 5); #endif // Try to find an existing render target. 
auto found_range = render_targets_.equal_range(key.value); for (auto iter = found_range.first; iter != found_range.second; ++iter) { RenderTarget* found_render_target = iter->second; #if 0 if (found_render_target->heap_page_first == heap_page_first) { return found_render_target; } #else if (found_render_target->instance == instance) { return found_render_target; } #endif } D3D12_RESOURCE_DESC resource_desc; if (!GetResourceDesc(key, resource_desc)) { return nullptr; } auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); auto device = provider->GetDevice(); #if 0 // Get the number of heap pages needed for the render target. D3D12_RESOURCE_ALLOCATION_INFO allocation_info = device->GetResourceAllocationInfo(0, 1, &resource_desc); uint32_t heap_page_count = (uint32_t(allocation_info.SizeInBytes) + ((4 << 20) - 1)) >> 22; if (heap_page_count == 0 || (heap_page_first % kHeap4MBPages) + heap_page_count > kHeap4MBPages) { assert_always(); return nullptr; } #endif // Ensure we can create a new descriptor in the render target heap. if (!EnsureRTVHeapAvailable(key.is_depth)) { return nullptr; } #if 0 // Create the memory heap if it doesn't exist yet. uint32_t heap_index = heap_page_first / kHeap4MBPages; if (!MakeHeapResident(heap_index)) { return nullptr; } #endif // The first action likely to be done is EDRAM buffer load. D3D12_RESOURCE_STATES state = D3D12_RESOURCE_STATE_COPY_DEST; ID3D12Resource* resource; #if 0 if (FAILED(device->CreatePlacedResource( heaps_[heap_index], (heap_page_first % kHeap4MBPages) << 22, &resource_desc, state, nullptr, IID_PPV_ARGS(&resource)))) { XELOGE( "Failed to create a placed resource for %ux%u %s render target with " "format %u at heap 4 MB pages %u:%u", uint32_t(resource_desc.Width), resource_desc.Height, key.is_depth ? 
"depth" : "color", key.format, heap_page_first, heap_page_first + heap_page_count - 1); return nullptr; } #else if (FAILED(device->CreateCommittedResource( &ui::d3d12::util::kHeapPropertiesDefault, D3D12_HEAP_FLAG_NONE, &resource_desc, state, nullptr, IID_PPV_ARGS(&resource)))) { XELOGE( "Failed to create a committed resource for %ux%u %s render target with " "format %u", uint32_t(resource_desc.Width), resource_desc.Height, key.is_depth ? "depth" : "color", key.format); return nullptr; } #endif // Create the descriptor for the render target. D3D12_CPU_DESCRIPTOR_HANDLE descriptor_handle; if (key.is_depth) { descriptor_handle = provider->OffsetDSVDescriptor( descriptor_heaps_depth_->start_handle, descriptor_heaps_depth_->descriptors_used); D3D12_DEPTH_STENCIL_VIEW_DESC dsv_desc; dsv_desc.Format = resource_desc.Format; dsv_desc.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE2D; dsv_desc.Flags = D3D12_DSV_FLAG_NONE; dsv_desc.Texture2D.MipSlice = 0; device->CreateDepthStencilView(resource, &dsv_desc, descriptor_handle); ++descriptor_heaps_depth_->descriptors_used; } else { descriptor_handle = provider->OffsetRTVDescriptor( descriptor_heaps_color_->start_handle, descriptor_heaps_color_->descriptors_used); D3D12_RENDER_TARGET_VIEW_DESC rtv_desc; rtv_desc.Format = resource_desc.Format; rtv_desc.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D; rtv_desc.Texture2D.MipSlice = 0; rtv_desc.Texture2D.PlaneSlice = 0; device->CreateRenderTargetView(resource, &rtv_desc, descriptor_handle); ++descriptor_heaps_color_->descriptors_used; } // Get the layout for copying to the EDRAM buffer. 
RenderTarget* render_target = new RenderTarget; render_target->resource = resource; render_target->state = state; render_target->handle = descriptor_handle; render_target->key = key; #if 0 render_target->heap_page_first = heap_page_first; render_target->heap_page_count = heap_page_count; #else render_target->instance = instance; #endif UINT64 copy_buffer_size; device->GetCopyableFootprints(&resource_desc, 0, key.is_depth ? 2 : 1, 0, render_target->footprints, nullptr, nullptr, ©_buffer_size); render_target->copy_buffer_size = uint32_t(copy_buffer_size); render_targets_.insert(std::make_pair(key.value, render_target)); COUNT_profile_set("gpu/render_target_cache/render_targets", render_targets_.size()); #if 0 XELOGGPU( "Created %ux%u %s render target with format %u at heap 4 MB pages %u:%u", uint32_t(resource_desc.Width), resource_desc.Height, key.is_depth ? "depth" : "color", key.format, heap_page_first, heap_page_first + heap_page_count - 1); #else XELOGGPU("Created %ux%u %s render target with format %u", uint32_t(resource_desc.Width), resource_desc.Height, key.is_depth ? "depth" : "color", key.format); #endif return render_target; } bool RenderTargetCache::GetEDRAMLayout( uint32_t pitch_pixels, MsaaSamples msaa_samples, bool is_64bpp, uint32_t& base_in_out, D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out, uint32_t& row_width_ss_div_80_out, uint32_t& rows_out) { if (pitch_pixels == 0 || rect_in_out.right <= 0 || rect_in_out.bottom <= 0 || rect_in_out.top >= rect_in_out.bottom) { return false; } pitch_pixels = std::min(pitch_pixels, 2560u); D3D12_RECT rect = rect_in_out; rect.left = std::max(rect.left, LONG(0)); rect.top = std::max(rect.top, LONG(0)); rect.right = std::min(rect.right, LONG(pitch_pixels)); if (rect.left >= rect.right) { return false; } uint32_t samples_x_log2 = msaa_samples >= MsaaSamples::k4X ? 1 : 0; uint32_t samples_y_log2 = msaa_samples >= MsaaSamples::k2X ? 1 : 0; uint32_t sample_size_log2 = is_64bpp ? 
1 : 0; uint32_t pitch_tiles = (((pitch_pixels << samples_x_log2) + 79) / 80) << sample_size_log2; // Adjust the base and the rectangle to skip tiles to the left of the left // bound of the rectangle and to the top of the top bound. uint32_t base = base_in_out; uint32_t skip = rect.top << samples_y_log2 >> 4; base += skip * pitch_tiles; skip <<= 4 - samples_y_log2; rect.top -= skip; rect.bottom -= skip; skip = (rect.left << samples_x_log2) / 80; base += skip << sample_size_log2; skip *= 80 >> samples_x_log2; rect.left -= skip; rect.right -= skip; // Calculate the number of 16-sample rows this rectangle spans. uint32_t rows = ((rect.bottom << samples_y_log2) + 15) >> 4; uint32_t rows_max = (2048 - base) / pitch_tiles; if (rows_max == 0) { return false; } if (rows > rows_max) { // Clamp the rectangle if it's partially outside of EDRAM. rows = rows_max; rect.bottom = rows_max << (4 - samples_y_log2); } base_in_out = base; rect_in_out = rect; pitch_tiles_out = pitch_tiles; row_width_ss_div_80_out = ((rect.right << samples_x_log2) + 79) / 80; rows_out = rows; return true; } RenderTargetCache::EDRAMLoadStoreMode RenderTargetCache::GetLoadStoreMode( bool is_depth, uint32_t format) { if (is_depth) { return DepthRenderTargetFormat(format) == DepthRenderTargetFormat::kD24FS8 ? EDRAMLoadStoreMode::kDepthFloat : EDRAMLoadStoreMode::kDepthUnorm; } ColorRenderTargetFormat color_format = ColorRenderTargetFormat(format); if (color_format == ColorRenderTargetFormat::k_2_10_10_10_FLOAT || color_format == ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16) { return EDRAMLoadStoreMode::kColor7e3; } return IsColorFormat64bpp(color_format) ? 
EDRAMLoadStoreMode::kColor64bpp : EDRAMLoadStoreMode::kColor32bpp; } void RenderTargetCache::StoreRenderTargetsToEDRAM() { if (command_processor_->IsROVUsedForEDRAM()) { return; } auto command_list = command_processor_->GetDeferredCommandList(); // Extract only the render targets that need to be stored, transition them to // copy sources and calculate copy buffer size. uint32_t store_bindings[5]; uint32_t store_binding_count = 0; uint32_t copy_buffer_size = 0; for (uint32_t i = 0; i < 5; ++i) { const RenderTargetBinding& binding = current_bindings_[i]; RenderTarget* render_target = binding.render_target; if (!binding.is_bound || render_target == nullptr || binding.edram_dirty_rows < 0) { continue; } store_bindings[store_binding_count++] = i; copy_buffer_size = std::max(copy_buffer_size, render_target->copy_buffer_size); } if (store_binding_count == 0) { return; } // Allocate descriptors for the buffers. D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start; D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start; if (command_processor_->RequestViewDescriptors(0, 2, 2, descriptor_cpu_start, descriptor_gpu_start) == 0) { return; } // Get the buffer for copying. D3D12_RESOURCE_STATES copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST; ID3D12Resource* copy_buffer = command_processor_->RequestScratchGPUBuffer( copy_buffer_size, copy_buffer_state); if (copy_buffer == nullptr) { return; } // Transition the render targets that need to be stored to copy sources and // the EDRAM buffer to a UAV. for (uint32_t i = 0; i < store_binding_count; ++i) { RenderTarget* render_target = current_bindings_[store_bindings[i]].render_target; command_processor_->PushTransitionBarrier(render_target->resource, render_target->state, D3D12_RESOURCE_STATE_COPY_SOURCE); render_target->state = D3D12_RESOURCE_STATE_COPY_SOURCE; } TransitionEDRAMBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS); // Set up the bindings. 
auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); auto device = provider->GetDevice(); command_list->D3DSetComputeRootSignature(edram_load_store_root_signature_); ui::d3d12::util::CreateRawBufferSRV(device, descriptor_cpu_start, copy_buffer, copy_buffer_size); WriteEDRAMRawUAVDescriptor( provider->OffsetViewDescriptor(descriptor_cpu_start, 1)); command_list->D3DSetComputeRootDescriptorTable(1, descriptor_gpu_start); // Sort the bindings in ascending order of EDRAM base so data in the render // targets placed farther in EDRAM isn't lost in case of overlap. std::sort(store_bindings, store_bindings + store_binding_count, [this](uint32_t a, uint32_t b) { uint32_t base_a = current_bindings_[a].edram_base; uint32_t base_b = current_bindings_[b].edram_base; if (base_a == base_b) { // If EDRAM bases are the same (not really a valid usage, but // happens in Banjo-Tooie - in case color writing was enabled // for invalid render targets in some draw call), treat the // render targets with the lowest index as more important (it's // the primary one after all, while the rest are additional). // Depth buffer has lower priority, otherwise the Xbox Live // Arcade logo disappears. return a > b; } return base_a < base_b; }); // Calculate the dispatch width. uint32_t surface_pitch_ss = current_surface_pitch_ * (current_msaa_samples_ >= MsaaSamples::k4X ? 2 : 1); uint32_t surface_pitch_tiles = (surface_pitch_ss + 79) / 80; assert_true(surface_pitch_tiles != 0); // Store each render target. for (uint32_t i = 0; i < store_binding_count; ++i) { const RenderTargetBinding& binding = current_bindings_[store_bindings[i]]; const RenderTarget* render_target = binding.render_target; bool is_64bpp = false; // Transition the copy buffer to copy destination. 
command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state, D3D12_RESOURCE_STATE_COPY_DEST); copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST; command_processor_->SubmitBarriers(); // Copy from the render target planes and set up the layout. D3D12_TEXTURE_COPY_LOCATION location_source, location_dest; location_source.pResource = render_target->resource; location_source.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX; location_source.SubresourceIndex = 0; location_dest.pResource = copy_buffer; location_dest.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT; location_dest.PlacedFootprint = render_target->footprints[0]; // TODO(Triang3l): Box for color render targets. command_list->CopyTexture(location_dest, location_source); EDRAMLoadStoreRootConstants root_constants; uint32_t rt_pitch_tiles = surface_pitch_tiles; if (!render_target->key.is_depth && IsColorFormat64bpp( ColorRenderTargetFormat(render_target->key.format))) { rt_pitch_tiles *= 2; } // TODO(Triang3l): log2(sample count, resolution scale). root_constants.base_samples_2x_depth_pitch = binding.edram_base | (rt_pitch_tiles << 16); root_constants.rt_color_depth_offset = uint32_t(location_dest.PlacedFootprint.Offset); root_constants.rt_color_depth_pitch = location_dest.PlacedFootprint.Footprint.RowPitch; if (render_target->key.is_depth) { root_constants.base_samples_2x_depth_pitch |= 1 << 15; location_source.SubresourceIndex = 1; location_dest.PlacedFootprint = render_target->footprints[1]; command_list->CopyTexture(location_dest, location_source); root_constants.rt_stencil_offset = uint32_t(location_dest.PlacedFootprint.Offset); root_constants.rt_stencil_pitch = location_dest.PlacedFootprint.Footprint.RowPitch; } // Transition the copy buffer to SRV. command_processor_->PushTransitionBarrier( copy_buffer, copy_buffer_state, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE); copy_buffer_state = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE; command_processor_->SubmitBarriers(); // Store the data. 
command_list->D3DSetComputeRoot32BitConstants( 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth, render_target->key.format); command_processor_->SetComputePipeline( edram_store_pipelines_[size_t(mode)]); // 1 group per 80x16 samples. command_list->D3DDispatch(surface_pitch_tiles, binding.edram_dirty_rows, 1); // Commit the UAV write. CommitEDRAMBufferUAVWrites(true); } command_processor_->ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state); } void RenderTargetCache::LoadRenderTargetsFromEDRAM( uint32_t render_target_count, RenderTarget* const* render_targets, const uint32_t* edram_bases) { assert_true(render_target_count <= 5); if (render_target_count == 0 || render_target_count > 5) { return; } auto command_list = command_processor_->GetDeferredCommandList(); // Allocate descriptors for the buffers. D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start; D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start; if (command_processor_->RequestViewDescriptors(0, 2, 2, descriptor_cpu_start, descriptor_gpu_start) == 0) { return; } // Get the buffer for copying. uint32_t copy_buffer_size = 0; for (uint32_t i = 0; i < render_target_count; ++i) { copy_buffer_size = std::max(copy_buffer_size, render_targets[i]->copy_buffer_size); } D3D12_RESOURCE_STATES copy_buffer_state = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; ID3D12Resource* copy_buffer = command_processor_->RequestScratchGPUBuffer( copy_buffer_size, copy_buffer_state); if (copy_buffer == nullptr) { return; } // Transition the render targets to copy destinations and the EDRAM buffer to // a SRV. 
for (uint32_t i = 0; i < render_target_count; ++i) { RenderTarget* render_target = render_targets[i]; command_processor_->PushTransitionBarrier(render_target->resource, render_target->state, D3D12_RESOURCE_STATE_COPY_DEST); render_target->state = D3D12_RESOURCE_STATE_COPY_DEST; } TransitionEDRAMBuffer(D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE); // Set up the bindings. auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); auto device = provider->GetDevice(); command_list->D3DSetComputeRootSignature(edram_load_store_root_signature_); WriteEDRAMRawSRVDescriptor(descriptor_cpu_start); ui::d3d12::util::CreateRawBufferUAV( device, provider->OffsetViewDescriptor(descriptor_cpu_start, 1), copy_buffer, copy_buffer_size); command_list->D3DSetComputeRootDescriptorTable(1, descriptor_gpu_start); // Load each render target. for (uint32_t i = 0; i < render_target_count; ++i) { if (edram_bases[i] >= 2048) { // Something is wrong with the load. continue; } const RenderTarget* render_target = render_targets[i]; // Get the number of EDRAM tiles per row. uint32_t edram_pitch_tiles = render_target->key.width_ss_div_80; if (!render_target->key.is_depth && IsColorFormat64bpp( ColorRenderTargetFormat(render_target->key.format))) { edram_pitch_tiles *= 2; } // Clamp the height if somehow requested a render target that is too large. uint32_t edram_rows = std::min(render_target->key.height_ss_div_16, (2048u - edram_bases[i]) / edram_pitch_tiles); if (edram_rows == 0) { continue; } // Transition the copy buffer back to UAV if it's not the first load. command_processor_->PushTransitionBarrier( copy_buffer, copy_buffer_state, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); copy_buffer_state = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; // Load the data. command_processor_->SubmitBarriers(); EDRAMLoadStoreRootConstants root_constants; // TODO(Triang3l): log2(sample count, resolution scale). 
root_constants.base_samples_2x_depth_pitch = edram_bases[i] | (edram_pitch_tiles << 16); root_constants.rt_color_depth_offset = uint32_t(render_target->footprints[0].Offset); root_constants.rt_color_depth_pitch = render_target->footprints[0].Footprint.RowPitch; if (render_target->key.is_depth) { root_constants.base_samples_2x_depth_pitch |= 1 << 15; root_constants.rt_stencil_offset = uint32_t(render_target->footprints[1].Offset); root_constants.rt_stencil_pitch = render_target->footprints[1].Footprint.RowPitch; } command_list->D3DSetComputeRoot32BitConstants( 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth, render_target->key.format); command_processor_->SetComputePipeline(edram_load_pipelines_[size_t(mode)]); // 1 group per 80x16 samples. command_list->D3DDispatch(render_target->key.width_ss_div_80, edram_rows, 1); // Commit the UAV write and transition the copy buffer to copy source now. command_processor_->PushUAVBarrier(copy_buffer); command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state, D3D12_RESOURCE_STATE_COPY_SOURCE); copy_buffer_state = D3D12_RESOURCE_STATE_COPY_SOURCE; // Copy to the render target planes. 
command_processor_->SubmitBarriers(); D3D12_TEXTURE_COPY_LOCATION location_source, location_dest; location_source.pResource = copy_buffer; location_source.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT; location_source.PlacedFootprint = render_target->footprints[0]; location_dest.pResource = render_target->resource; location_dest.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX; location_dest.SubresourceIndex = 0; command_list->CopyTexture(location_dest, location_source); if (render_target->key.is_depth) { location_source.PlacedFootprint = render_target->footprints[1]; location_dest.SubresourceIndex = 1; command_list->CopyTexture(location_dest, location_source); } } command_processor_->ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state); } } // namespace d3d12 } // namespace gpu } // namespace xe