xenia/src/xenia/gpu/d3d12/render_target_cache.cc

2816 lines
117 KiB
C++

/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2018 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/gpu/d3d12/render_target_cache.h"
#include <algorithm>
#include <cmath>
#include <cstring>
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/gpu/texture_util.h"
#include "xenia/ui/d3d12/d3d12_util.h"
// Configuration variables owned by this file (category "D3D12").
//
// d3d12_16bit_rtv_full_range: defaults to true. Per its description, selects
// the full -32...32 range for RG16/RGBA16 render targets on the non-ROV path,
// trading away blending correctness.
DEFINE_bool(d3d12_16bit_rtv_full_range, true,
"Use full -32...32 range for RG16 and RGBA16 render targets "
"(at the expense of blending correctness) without ROV.",
"D3D12");
// d3d12_resolution_scale_resolve_edge_clamp: defaults to true. Per its
// description, enables the edge-duplication hack applied to resolve areas when
// resolution scaling is active.
DEFINE_bool(d3d12_resolution_scale_resolve_edge_clamp, true,
"When using resolution scale, apply the hack that duplicates the "
"right/lower subpixel in the left and top sides of render target "
"resolve areas to eliminate the gap caused by half-pixel offset "
"(this is necessary for certain games like GTA IV to work).",
"D3D12");
// Only declared here - presumably defined in another translation unit of the
// D3D12 backend (NOTE(review): confirm where the DEFINE_bool lives).
DECLARE_bool(d3d12_half_pixel_offset);
namespace xe {
namespace gpu {
namespace d3d12 {
// Generated with `xb buildhlsl`.
#include "xenia/gpu/d3d12/shaders/dxbc/edram_clear_32bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_clear_64bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_clear_depth_float_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_32bpp_2x_resolve_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_32bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_64bpp_2x_resolve_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_64bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_7e3_2x_resolve_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_color_7e3_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_float_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_load_depth_unorm_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_store_color_32bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_store_color_64bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_store_color_7e3_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_float_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_store_depth_unorm_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_tile_sample_32bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/edram_tile_sample_64bpp_cs.h"
#include "xenia/gpu/d3d12/shaders/dxbc/resolve_ps.h"
#include "xenia/gpu/d3d12/shaders/dxbc/resolve_vs.h"
// Out-of-class definitions of the static constexpr members declared in the
// header (before C++17 these are required if the constants are odr-used).
// kHeap4MBPages belongs to the heap-aliasing path that is compiled out with
// `#if 0` throughout this file, so its definition is disabled as well.
#if 0
constexpr uint32_t RenderTargetCache::kHeap4MBPages;
#endif
constexpr uint32_t RenderTargetCache::kRenderTargetDescriptorHeapSize;
// Per-mode table of the compute shaders used to move data between the EDRAM
// buffer and host render targets. One row per EDRAMLoadStoreMode value (the
// array is sized by EDRAMLoadStoreMode::kCount and indexed by the mode in
// Initialize()).
//
// Each row is initialized positionally with three (bytecode, bytecode size,
// debug name) triples, in this order:
//   1. the load shader,
//   2. the store shader,
//   3. the load shader for 2x-resolution-scale resolves - nullptr / 0 /
//      nullptr when the mode has none (the two depth rows below).
// NOTE(review): the field order is defined by EDRAMLoadStoreModeInfo in the
// header; keep this table in sync with it and with the enum order.
const RenderTargetCache::EDRAMLoadStoreModeInfo
RenderTargetCache::edram_load_store_mode_info_[size_t(
RenderTargetCache::EDRAMLoadStoreMode::kCount)] = {
// 32bpp color.
{edram_load_color_32bpp_cs, sizeof(edram_load_color_32bpp_cs),
L"EDRAM Load 32bpp Color", edram_store_color_32bpp_cs,
sizeof(edram_store_color_32bpp_cs), L"EDRAM Store 32bpp Color",
edram_load_color_32bpp_2x_resolve_cs,
sizeof(edram_load_color_32bpp_2x_resolve_cs),
L"EDRAM Load 32bpp Color for 2x Resolve"},
// 64bpp color.
{edram_load_color_64bpp_cs, sizeof(edram_load_color_64bpp_cs),
L"EDRAM Load 64bpp Color", edram_store_color_64bpp_cs,
sizeof(edram_store_color_64bpp_cs), L"EDRAM Store 64bpp Color",
edram_load_color_64bpp_2x_resolve_cs,
sizeof(edram_load_color_64bpp_2x_resolve_cs),
L"EDRAM Load 64bpp Color for 2x Resolve"},
// 7e3 color.
{edram_load_color_7e3_cs, sizeof(edram_load_color_7e3_cs),
L"EDRAM Load 7e3 Color", edram_store_color_7e3_cs,
sizeof(edram_store_color_7e3_cs), L"EDRAM Store 7e3 Color",
edram_load_color_7e3_2x_resolve_cs,
sizeof(edram_load_color_7e3_2x_resolve_cs),
L"EDRAM Load 7e3 Color for 2x Resolve"},
// Unorm depth - no 2x resolve load shader.
{edram_load_depth_unorm_cs, sizeof(edram_load_depth_unorm_cs),
L"EDRAM Load UNorm Depth", edram_store_depth_unorm_cs,
sizeof(edram_store_depth_unorm_cs), L"EDRAM Store UNorm Depth",
nullptr, 0, nullptr},
// Float depth - no 2x resolve load shader.
{edram_load_depth_float_cs, sizeof(edram_load_depth_float_cs),
L"EDRAM Load Float Depth", edram_store_depth_float_cs,
sizeof(edram_store_depth_float_cs), L"EDRAM Store Float Depth",
nullptr, 0, nullptr},
};
// Stores the command processor and register file pointers for later use.
// No Direct3D objects are created here - that is done by Initialize().
RenderTargetCache::RenderTargetCache(D3D12CommandProcessor* command_processor,
RegisterFile* register_file)
: command_processor_(command_processor), register_file_(register_file) {}
// Releases everything the cache owns by delegating to Shutdown().
RenderTargetCache::~RenderTargetCache() { Shutdown(); }
// Creates every device object the cache needs up front: the EDRAM buffer and
// its non-shader-visible views, the load/store and clear root signatures, the
// EDRAM compute pipelines and the converting resolve root signature.
//
// texture_cache is only queried for whether 2x resolution scaling is enabled,
// which the EDRAM buffer size depends on.
//
// Returns true on success. On any failure an error is logged, Shutdown() is
// called to release whatever has been created so far, and false is returned.
bool RenderTargetCache::Initialize(const TextureCache* texture_cache) {
  // EDRAM buffer size depends on this.
  resolution_scale_2x_ = texture_cache->IsResolutionScale2X();
  // 2x resolution scale is only expected together with the ROV path.
  assert_false(resolution_scale_2x_ &&
               !command_processor_->IsROVUsedForEDRAM());

  auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
  auto device = provider->GetDevice();

  // Create the buffer for reinterpreting EDRAM contents.
  // No need to clear it in the first frame, memory is zeroed out when allocated
  // on Windows.
  D3D12_RESOURCE_DESC edram_buffer_desc;
  ui::d3d12::util::FillBufferResourceDesc(
      edram_buffer_desc, GetEDRAMBufferSize(),
      D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
  // The first operation will likely be drawing with ROV or a load without ROV.
  edram_buffer_state_ = command_processor_->IsROVUsedForEDRAM()
                            ? D3D12_RESOURCE_STATE_UNORDERED_ACCESS
                            : D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
  if (FAILED(device->CreateCommittedResource(
          &ui::d3d12::util::kHeapPropertiesDefault, D3D12_HEAP_FLAG_NONE,
          &edram_buffer_desc, edram_buffer_state_, nullptr,
          IID_PPV_ARGS(&edram_buffer_)))) {
    XELOGE("Failed to create the EDRAM buffer");
    Shutdown();
    return false;
  }
  edram_buffer_modified_ = false;

  // Create non-shader-visible descriptors of the EDRAM buffer for copying.
  D3D12_DESCRIPTOR_HEAP_DESC edram_buffer_descriptor_heap_desc;
  edram_buffer_descriptor_heap_desc.Type =
      D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
  edram_buffer_descriptor_heap_desc.NumDescriptors =
      uint32_t(EDRAMBufferDescriptorIndex::kCount);
  edram_buffer_descriptor_heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE;
  edram_buffer_descriptor_heap_desc.NodeMask = 0;
  if (FAILED(device->CreateDescriptorHeap(
          &edram_buffer_descriptor_heap_desc,
          IID_PPV_ARGS(&edram_buffer_descriptor_heap_)))) {
    XELOGE("Failed to create the descriptor heap for EDRAM buffer views");
    Shutdown();
    return false;
  }
  edram_buffer_descriptor_heap_start_ =
      edram_buffer_descriptor_heap_->GetCPUDescriptorHandleForHeapStart();
  // Raw (byte address) SRV and UAV over the whole buffer.
  ui::d3d12::util::CreateRawBufferSRV(
      device,
      provider->OffsetViewDescriptor(
          edram_buffer_descriptor_heap_start_,
          uint32_t(EDRAMBufferDescriptorIndex::kRawSRV)),
      edram_buffer_, GetEDRAMBufferSize());
  ui::d3d12::util::CreateRawBufferUAV(
      device,
      provider->OffsetViewDescriptor(
          edram_buffer_descriptor_heap_start_,
          uint32_t(EDRAMBufferDescriptorIndex::kRawUAV)),
      edram_buffer_, GetEDRAMBufferSize());
  // Typed R32_UINT UAV over the whole buffer, one element per 32-bit word.
  D3D12_UNORDERED_ACCESS_VIEW_DESC edram_buffer_uint32_uav_desc;
  edram_buffer_uint32_uav_desc.Format = DXGI_FORMAT_R32_UINT;
  edram_buffer_uint32_uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
  edram_buffer_uint32_uav_desc.Buffer.FirstElement = 0;
  edram_buffer_uint32_uav_desc.Buffer.NumElements =
      GetEDRAMBufferSize() / sizeof(uint32_t);
  edram_buffer_uint32_uav_desc.Buffer.StructureByteStride = 0;
  edram_buffer_uint32_uav_desc.Buffer.CounterOffsetInBytes = 0;
  edram_buffer_uint32_uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;
  device->CreateUnorderedAccessView(
      edram_buffer_, nullptr, &edram_buffer_uint32_uav_desc,
      provider->OffsetViewDescriptor(
          edram_buffer_descriptor_heap_start_,
          uint32_t(EDRAMBufferDescriptorIndex::kUint32UAV)));

  // Create the root signature for EDRAM buffer load/store.
  D3D12_ROOT_PARAMETER load_store_root_parameters[2];
  // Parameter 0 is constants (changed for each render target binding).
  load_store_root_parameters[0].ParameterType =
      D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS;
  load_store_root_parameters[0].Constants.ShaderRegister = 0;
  load_store_root_parameters[0].Constants.RegisterSpace = 0;
  load_store_root_parameters[0].Constants.Num32BitValues =
      sizeof(EDRAMLoadStoreRootConstants) / sizeof(uint32_t);
  load_store_root_parameters[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
  // Parameter 1 is source and target.
  D3D12_DESCRIPTOR_RANGE load_store_root_ranges[2];
  load_store_root_ranges[0].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV;
  load_store_root_ranges[0].NumDescriptors = 1;
  load_store_root_ranges[0].BaseShaderRegister = 0;
  load_store_root_ranges[0].RegisterSpace = 0;
  load_store_root_ranges[0].OffsetInDescriptorsFromTableStart = 0;
  load_store_root_ranges[1].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_UAV;
  load_store_root_ranges[1].NumDescriptors = 1;
  load_store_root_ranges[1].BaseShaderRegister = 0;
  load_store_root_ranges[1].RegisterSpace = 0;
  load_store_root_ranges[1].OffsetInDescriptorsFromTableStart = 1;
  load_store_root_parameters[1].ParameterType =
      D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
  load_store_root_parameters[1].DescriptorTable.NumDescriptorRanges = 2;
  load_store_root_parameters[1].DescriptorTable.pDescriptorRanges =
      load_store_root_ranges;
  load_store_root_parameters[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
  D3D12_ROOT_SIGNATURE_DESC load_store_root_desc;
  load_store_root_desc.NumParameters =
      UINT(xe::countof(load_store_root_parameters));
  load_store_root_desc.pParameters = load_store_root_parameters;
  load_store_root_desc.NumStaticSamplers = 0;
  load_store_root_desc.pStaticSamplers = nullptr;
  load_store_root_desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE;
  edram_load_store_root_signature_ =
      ui::d3d12::util::CreateRootSignature(provider, load_store_root_desc);
  if (edram_load_store_root_signature_ == nullptr) {
    XELOGE("Failed to create the EDRAM load/store root signature");
    Shutdown();
    return false;
  }

  // Create the clear root signature (the same, but with the UAV only).
  // The descriptor above is reused: the UAV range is moved to the start of the
  // table and the range pointer is advanced past the SRV range.
  load_store_root_ranges[1].OffsetInDescriptorsFromTableStart = 0;
  load_store_root_parameters[1].DescriptorTable.NumDescriptorRanges = 1;
  ++load_store_root_parameters[1].DescriptorTable.pDescriptorRanges;
  edram_clear_root_signature_ =
      ui::d3d12::util::CreateRootSignature(provider, load_store_root_desc);
  if (edram_clear_root_signature_ == nullptr) {
    XELOGE("Failed to create the EDRAM buffer clear root signature");
    Shutdown();
    return false;
  }

  // Create the pipelines.
  bool rov_used = command_processor_->IsROVUsedForEDRAM();
  // Load and store.
  for (uint32_t i = 0; i < uint32_t(EDRAMLoadStoreMode::kCount); ++i) {
    const EDRAMLoadStoreModeInfo& mode_info = edram_load_store_mode_info_[i];
    edram_load_pipelines_[i] = ui::d3d12::util::CreateComputePipeline(
        device, mode_info.load_shader, mode_info.load_shader_size,
        edram_load_store_root_signature_);
    // Store pipelines are only created for the non-ROV path.
    if (!rov_used) {
      edram_store_pipelines_[i] = ui::d3d12::util::CreateComputePipeline(
          device, mode_info.store_shader, mode_info.store_shader_size,
          edram_load_store_root_signature_);
    }
    // Load shader for resolution-scaled resolves (host pixels within samples to
    // samples within host pixels) doesn't always exist for each mode - depth is
    // not resolved using drawing, for example.
    bool load_2x_resolve_pipeline_used =
        resolution_scale_2x_ && mode_info.load_2x_resolve_shader != nullptr;
    if (load_2x_resolve_pipeline_used) {
      edram_load_2x_resolve_pipelines_[i] =
          ui::d3d12::util::CreateComputePipeline(
              device, mode_info.load_2x_resolve_shader,
              mode_info.load_2x_resolve_shader_size,
              edram_load_store_root_signature_);
    }
    if (edram_load_pipelines_[i] == nullptr ||
        (!rov_used && edram_store_pipelines_[i] == nullptr) ||
        (load_2x_resolve_pipeline_used &&
         edram_load_2x_resolve_pipelines_[i] == nullptr)) {
      XELOGE("Failed to create the EDRAM load/store pipelines for mode %u", i);
      Shutdown();
      return false;
    }
    // Debug names - each pipeline gets its own name from the mode table.
    edram_load_pipelines_[i]->SetName(mode_info.load_pipeline_name);
    if (edram_store_pipelines_[i] != nullptr) {
      edram_store_pipelines_[i]->SetName(mode_info.store_pipeline_name);
    }
    if (edram_load_2x_resolve_pipelines_[i] != nullptr) {
      // Name the 2x resolve pipeline itself; naming edram_load_pipelines_[i]
      // here would overwrite the regular load pipeline's name and leave this
      // one unnamed.
      edram_load_2x_resolve_pipelines_[i]->SetName(
          mode_info.load_2x_resolve_pipeline_name);
    }
  }

  // Tile single sample into a texture - 32 bits per pixel.
  edram_tile_sample_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
      device, edram_tile_sample_32bpp_cs, sizeof(edram_tile_sample_32bpp_cs),
      edram_load_store_root_signature_);
  if (edram_tile_sample_32bpp_pipeline_ == nullptr) {
    XELOGE("Failed to create the 32bpp EDRAM raw resolve pipeline");
    Shutdown();
    return false;
  }
  edram_tile_sample_32bpp_pipeline_->SetName(L"EDRAM Raw Resolve 32bpp");

  // Tile single sample into a texture - 64 bits per pixel.
  edram_tile_sample_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
      device, edram_tile_sample_64bpp_cs, sizeof(edram_tile_sample_64bpp_cs),
      edram_load_store_root_signature_);
  if (edram_tile_sample_64bpp_pipeline_ == nullptr) {
    XELOGE("Failed to create the 64bpp EDRAM raw resolve pipeline");
    Shutdown();
    return false;
  }
  edram_tile_sample_64bpp_pipeline_->SetName(L"EDRAM Raw Resolve 64bpp");

  // Clear 32-bit color or unorm depth.
  edram_clear_32bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
      device, edram_clear_32bpp_cs, sizeof(edram_clear_32bpp_cs),
      edram_clear_root_signature_);
  if (edram_clear_32bpp_pipeline_ == nullptr) {
    XELOGE("Failed to create the EDRAM 32bpp clear pipeline");
    Shutdown();
    return false;
  }
  edram_clear_32bpp_pipeline_->SetName(L"EDRAM Clear 32bpp");

  // Clear 64-bit color.
  edram_clear_64bpp_pipeline_ = ui::d3d12::util::CreateComputePipeline(
      device, edram_clear_64bpp_cs, sizeof(edram_clear_64bpp_cs),
      edram_clear_root_signature_);
  if (edram_clear_64bpp_pipeline_ == nullptr) {
    XELOGE("Failed to create the EDRAM 64bpp clear pipeline");
    Shutdown();
    return false;
  }
  edram_clear_64bpp_pipeline_->SetName(L"EDRAM Clear 64bpp");

  // Clear float depth.
  edram_clear_depth_float_pipeline_ = ui::d3d12::util::CreateComputePipeline(
      device, edram_clear_depth_float_cs, sizeof(edram_clear_depth_float_cs),
      edram_clear_root_signature_);
  if (edram_clear_depth_float_pipeline_ == nullptr) {
    XELOGE("Failed to create the EDRAM float depth clear pipeline");
    Shutdown();
    return false;
  }
  edram_clear_depth_float_pipeline_->SetName(L"EDRAM Clear Float Depth");

  // Create the converting resolve root signature.
  D3D12_ROOT_PARAMETER resolve_root_parameters[2];
  // Parameter 0 is constants.
  resolve_root_parameters[0].ParameterType =
      D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS;
  resolve_root_parameters[0].Constants.ShaderRegister = 0;
  resolve_root_parameters[0].Constants.RegisterSpace = 0;
  resolve_root_parameters[0].Constants.Num32BitValues =
      sizeof(ResolveRootConstants) / sizeof(uint32_t);
  resolve_root_parameters[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_PIXEL;
  // Parameter 1 is the source render target.
  D3D12_DESCRIPTOR_RANGE resolve_root_srv_range;
  resolve_root_srv_range.RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV;
  resolve_root_srv_range.NumDescriptors = 1;
  resolve_root_srv_range.BaseShaderRegister = 0;
  resolve_root_srv_range.RegisterSpace = 0;
  resolve_root_srv_range.OffsetInDescriptorsFromTableStart = 0;
  resolve_root_parameters[1].ParameterType =
      D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
  resolve_root_parameters[1].DescriptorTable.NumDescriptorRanges = 1;
  resolve_root_parameters[1].DescriptorTable.pDescriptorRanges =
      &resolve_root_srv_range;
  resolve_root_parameters[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_PIXEL;
  // Static sampler for resolving AA using bilinear filtering.
  D3D12_STATIC_SAMPLER_DESC resolve_sampler_desc;
  resolve_sampler_desc.Filter = D3D12_FILTER_MIN_MAG_MIP_LINEAR;
  resolve_sampler_desc.AddressU = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
  resolve_sampler_desc.AddressV = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
  resolve_sampler_desc.AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
  resolve_sampler_desc.MipLODBias = 0.0f;
  resolve_sampler_desc.MaxAnisotropy = 1;
  resolve_sampler_desc.ComparisonFunc = D3D12_COMPARISON_FUNC_NEVER;
  resolve_sampler_desc.BorderColor = D3D12_STATIC_BORDER_COLOR_OPAQUE_BLACK;
  resolve_sampler_desc.MinLOD = 0.0f;
  resolve_sampler_desc.MaxLOD = 0.0f;
  resolve_sampler_desc.ShaderRegister = 0;
  resolve_sampler_desc.RegisterSpace = 0;
  resolve_sampler_desc.ShaderVisibility = D3D12_SHADER_VISIBILITY_PIXEL;
  D3D12_ROOT_SIGNATURE_DESC resolve_root_desc;
  resolve_root_desc.NumParameters = UINT(xe::countof(resolve_root_parameters));
  resolve_root_desc.pParameters = resolve_root_parameters;
  resolve_root_desc.NumStaticSamplers = 1;
  resolve_root_desc.pStaticSamplers = &resolve_sampler_desc;
  resolve_root_desc.Flags =
      D3D12_ROOT_SIGNATURE_FLAG_DENY_VERTEX_SHADER_ROOT_ACCESS;
  resolve_root_signature_ =
      ui::d3d12::util::CreateRootSignature(provider, resolve_root_desc);
  if (resolve_root_signature_ == nullptr) {
    XELOGE("Failed to create the converting resolve root signature");
    Shutdown();
    return false;
  }

  // Start with no render targets bound.
  ClearBindings();

  return true;
}
void RenderTargetCache::Shutdown() {
ClearCache();
for (auto& resolve_pipeline : resolve_pipelines_) {
resolve_pipeline.pipeline->Release();
}
resolve_pipelines_.clear();
ui::d3d12::util::ReleaseAndNull(resolve_root_signature_);
ui::d3d12::util::ReleaseAndNull(edram_tile_sample_64bpp_pipeline_);
ui::d3d12::util::ReleaseAndNull(edram_tile_sample_32bpp_pipeline_);
ui::d3d12::util::ReleaseAndNull(edram_clear_depth_float_pipeline_);
ui::d3d12::util::ReleaseAndNull(edram_clear_64bpp_pipeline_);
ui::d3d12::util::ReleaseAndNull(edram_clear_32bpp_pipeline_);
for (uint32_t i = 0; i < uint32_t(EDRAMLoadStoreMode::kCount); ++i) {
ui::d3d12::util::ReleaseAndNull(edram_store_pipelines_[i]);
ui::d3d12::util::ReleaseAndNull(edram_load_pipelines_[i]);
}
ui::d3d12::util::ReleaseAndNull(edram_clear_root_signature_);
ui::d3d12::util::ReleaseAndNull(edram_load_store_root_signature_);
ui::d3d12::util::ReleaseAndNull(edram_buffer_descriptor_heap_);
ui::d3d12::util::ReleaseAndNull(edram_buffer_);
}
void RenderTargetCache::ClearCache() {
for (auto resolve_target_pair : resolve_targets_) {
ResolveTarget* resolve_target = resolve_target_pair.second;
resolve_target->resource->Release();
delete resolve_target;
}
resolve_targets_.clear();
COUNT_profile_set("gpu/render_target_cache/resolve_targets", 0);
for (auto render_target_pair : render_targets_) {
RenderTarget* render_target = render_target_pair.second;
render_target->resource->Release();
delete render_target;
}
render_targets_.clear();
COUNT_profile_set("gpu/render_target_cache/render_targets", 0);
while (descriptor_heaps_depth_ != nullptr) {
auto heap = descriptor_heaps_depth_;
heap->heap->Release();
descriptor_heaps_depth_ = heap->previous;
delete heap;
}
while (descriptor_heaps_color_ != nullptr) {
auto heap = descriptor_heaps_color_;
heap->heap->Release();
descriptor_heaps_color_ = heap->previous;
delete heap;
}
#if 0
for (uint32_t i = 0; i < xe::countof(heaps_); ++i) {
if (heaps_[i] != nullptr) {
heaps_[i]->Release();
heaps_[i] = nullptr;
}
}
#endif
}
// Per-frame setup: drops the current render target bindings on the RTV/DSV
// path only.
void RenderTargetCache::BeginFrame() {
  // With ROV, the `current_` variables have to survive across frames: a frame
  // does not always end in a resolve (for example, when memexport readback
  // happens) or in anything else that would surely submit the UAV barrier.
  if (command_processor_->IsROVUsedForEDRAM()) {
    return;
  }
  ClearBindings();
}
bool RenderTargetCache::UpdateRenderTargets(const D3D12Shader* pixel_shader) {
// There are two kinds of render target binding updates in this implementation
// in case something has been changed - full and partial.
//
// For the RTV/DSV path, a full update involves flushing all the currently
// bound render targets that have been modified to the EDRAM buffer,
// allocating all the newly bound render targets in the heaps, loading them
// from the EDRAM buffer and binding them.
//
// For the ROV path, a full update places a UAV barrier because across draws,
// pixels with different SV_Positions or different sample counts (thus without
// interlocking between each other) may access the same data now. Not having
// the barriers causes visual glitches in many games, such as Halo 3 where the
// right side of the menu and shadow maps get corrupted (at least on Nvidia).
//
// ("Bound" here means ever used since the last full update - and in this case
// it's bound to the Direct3D 12 command list in the RTV/DSV path.)
//
// However, Banjo-Kazooie interleaves color/depth and depth-only writes every
// draw call, and doing a full update whenever the color mask is changed is
// too expensive. So, we shouldn't do a full update if the game only toggles
// color writes and depth testing. Instead, we're only adding or re-enabling
// render targets if color writes are being enabled (adding involves loading
// the contents from the EDRAM, while re-enabling does nothing on the D3D
// side).
//
// There are cases when simply toggling render targets may still require EDRAM
// stores and thus a full update. Here's an example situation:
// Draw 1:
// - 32bpp RT0 0-10 MB
// - 32bpp RT1 3-10 MB
// - 1280x720 viewport
// Draw 2:
// - 32bpp RT0 0-10 MB
// - Inactive RT1
// - 1280x1440 viewport
// Draw 3:
// - 32bpp RT0 0-10 MB
// - 32bpp RT1 3-10 MB
// - 1280x720 viewport
// In this case, before draw 2, RT1 must be written to the EDRAM buffer, and
// RT0 must be loaded, and also before draw 3 RT1 must receive the changes
// made to the lower part of RT0. So, before draws 2 and 3, full updates must
// be done.
//
// Direct3D 12 also requires all render targets to have the same size, so the
// height is calculated from the EDRAM space available to the last render
// target available in it. However, to make toggling render targets like in
// the Banjo-Kazooie case possible, the height may be decreased only in full
// updates.
// TODO(Triang3l): Check if it's safe to calculate the smallest EDRAM region
// without aliasing and use it for the height. This won't work if games
// actually alias active render targets for some reason.
//
// To summarize, a full update happens if:
// - Starting a new frame.
// - Drawing after resolving.
// - Surface pitch changed.
// - Sample count changed.
// - Render target is disabled and another render target got more space than
// is currently available in the textures (RTV/DSV only).
// - EDRAM base of a currently used RT changed.
// - Format of a currently used RT changed (RTV/DSV) or pixel size of a
// currently used RT changed (ROV).
// - Current viewport contains unsaved data from previously used render
// targets.
// - New render target overlaps unsaved data from other bound render targets.
//
// "Previously used" and "new" in the last 2 conditions is important so if the
// game has render targets aliased in the same draw call, there won't be a
// full update every draw.
//
// A partial update happens if:
// - New render target is added, but doesn't overlap unsaved data from other
// currently or previously used render targets, and it doesn't require a
// bigger size.
auto& regs = *register_file_;
#if FINE_GRAINED_DRAW_SCOPES
SCOPE_profile_cpu_f("gpu");
#endif // FINE_GRAINED_DRAW_SCOPES
bool rov_used = command_processor_->IsROVUsedForEDRAM();
auto rb_surface_info = regs.Get<reg::RB_SURFACE_INFO>();
uint32_t surface_pitch = std::min(rb_surface_info.surface_pitch, 2560u);
if (surface_pitch == 0) {
// TODO(Triang3l): Do something if a memexport-only draw has 0 surface
// pitch (never seen in any game so far, not sure if even legal).
return false;
}
uint32_t msaa_samples_x =
rb_surface_info.msaa_samples >= MsaaSamples::k4X ? 2 : 1;
uint32_t msaa_samples_y =
rb_surface_info.msaa_samples >= MsaaSamples::k2X ? 2 : 1;
// Extract color/depth info in an unified way.
bool enabled[5];
uint32_t edram_bases[5];
uint32_t formats[5];
bool formats_are_64bpp[5];
uint32_t color_mask = command_processor_->GetCurrentColorMask(pixel_shader);
for (uint32_t i = 0; i < 4; ++i) {
enabled[i] = (color_mask & (0xF << (i * 4))) != 0;
auto color_info = regs.Get<reg::RB_COLOR_INFO>(
reg::RB_COLOR_INFO::rt_register_indices[i]);
edram_bases[i] = std::min(color_info.color_base, 2048u);
formats[i] = uint32_t(GetBaseColorFormat(color_info.color_format));
formats_are_64bpp[i] =
IsColorFormat64bpp(ColorRenderTargetFormat(formats[i]));
}
auto rb_depthcontrol = regs.Get<reg::RB_DEPTHCONTROL>();
auto rb_depth_info = regs.Get<reg::RB_DEPTH_INFO>();
// 0x1 = stencil test, 0x2 = depth test.
enabled[4] = rb_depthcontrol.stencil_enable || rb_depthcontrol.z_enable;
edram_bases[4] = std::min(rb_depth_info.depth_base, 2048u);
formats[4] = uint32_t(rb_depth_info.depth_format);
formats_are_64bpp[4] = false;
// Don't mark depth regions as dirty if not writing the depth.
// TODO(Triang3l): Make a common function for checking if stencil writing is
// really done?
bool depth_readonly =
!rb_depthcontrol.stencil_enable && !rb_depthcontrol.z_write_enable;
bool full_update = false;
// Check the following full update conditions:
// - Starting a new frame.
// - Drawing after resolving.
// - Surface pitch changed.
// - Sample count changed.
// Draws are skipped if the surface pitch is 0, so a full update can be forced
// in the beginning of the frame or after resolves by setting the current
// pitch to 0.
if (current_surface_pitch_ != surface_pitch ||
current_msaa_samples_ != rb_surface_info.msaa_samples) {
full_update = true;
}
// Get the maximum height of each render target in EDRAM rows to help
// clamp the dirty region heights.
uint32_t edram_row_tiles_32bpp = (surface_pitch * msaa_samples_x + 79) / 80;
uint32_t edram_row_tiles[5];
uint32_t edram_max_rows = UINT32_MAX;
for (uint32_t i = 0; i < 5; ++i) {
edram_row_tiles[i] = edram_row_tiles_32bpp * (formats_are_64bpp[i] ? 2 : 1);
if (enabled[i]) {
// Direct3D 12 doesn't allow render targets with different sizes, so
// calculate the height from the render target closest to the end of
// EDRAM.
edram_max_rows = std::min(edram_max_rows,
(2048 - edram_bases[i]) / edram_row_tiles[i]);
}
}
if (edram_max_rows == UINT32_MAX) {
// No render targets needed - likely a memexport-only draw, just keep using
// the current state (or 0 if nothing bound yet, but nothing will be bound
// anyway so it won't matter).
edram_max_rows = current_edram_max_rows_;
} else {
if (edram_max_rows == 0) {
// Some render target is totally in the end of EDRAM - can't create
// textures with 0 height.
return false;
}
}
// Don't create render targets larger than x2560.
edram_max_rows = std::min(edram_max_rows, 160u * msaa_samples_y);
// Check the following full update conditions:
// - Render target is disabled and another render target got more space than
// is currently available in the textures (RTV/DSV only).
if (!rov_used && edram_max_rows > current_edram_max_rows_) {
full_update = true;
}
// Get EDRAM usage of the current draw so dirty regions can be calculated.
// See D3D12CommandProcessor::UpdateFixedFunctionState for more info.
int32_t window_offset_y =
regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset;
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
float viewport_scale_y = pa_cl_vte_cntl.vport_y_scale_ena
? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32
: 1280.0f;
float viewport_offset_y = pa_cl_vte_cntl.vport_y_offset_ena
? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
: std::abs(viewport_scale_y);
if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
viewport_offset_y += float(window_offset_y);
}
uint32_t viewport_bottom = uint32_t(std::max(
0.0f, std::ceil(viewport_offset_y + std::abs(viewport_scale_y))));
uint32_t scissor_bottom = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>().br_y;
if (!regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable) {
scissor_bottom = std::max(int32_t(scissor_bottom) + window_offset_y, 0);
}
uint32_t dirty_bottom =
std::min(std::min(viewport_bottom, scissor_bottom), 2560u);
uint32_t edram_dirty_rows =
std::min((dirty_bottom * msaa_samples_y + 15) >> 4, edram_max_rows);
// Check the following full update conditions:
// - EDRAM base of a currently used RT changed.
// - Format of a currently used RT changed (RTV/DSV) or pixel size of a
// currently used RT changed (ROV).
// Also build a list of render targets to attach in a partial update.
uint32_t render_targets_to_attach = 0;
if (!full_update) {
for (uint32_t i = 0; i < 5; ++i) {
if (!enabled[i]) {
continue;
}
const RenderTargetBinding& binding = current_bindings_[i];
if (binding.is_bound) {
if (binding.edram_base != edram_bases[i]) {
full_update = true;
break;
}
if (rov_used) {
if (i != 4) {
full_update |= IsColorFormat64bpp(binding.color_format) !=
formats_are_64bpp[i];
}
} else {
full_update |= binding.format != formats[i];
}
if (full_update) {
break;
}
} else {
render_targets_to_attach |= 1 << i;
}
}
}
// Check the following full update conditions here:
// - Current viewport contains unsaved data from previously used render
// targets.
// - New render target overlaps unsaved data from other bound render
// targets.
if (!full_update) {
for (uint32_t i = 0; i < 5; ++i) {
const RenderTargetBinding& binding_1 = current_bindings_[i];
uint32_t edram_dirty_rows_1;
if (binding_1.is_bound) {
if (enabled[i]) {
continue;
}
// Checking if now overlapping a previously used render target.
// binding_1 is the previously used render target.
edram_dirty_rows_1 = binding_1.edram_dirty_rows;
} else {
if (!(render_targets_to_attach & (1 << i))) {
continue;
}
// Checking if the new render target is overlapping any bound one.
// binding_1 is the new render target.
edram_dirty_rows_1 = edram_dirty_rows;
}
for (uint32_t j = 0; j < 5; ++j) {
const RenderTargetBinding& binding_2 = current_bindings_[j];
if (!binding_2.is_bound) {
continue;
}
uint32_t edram_dirty_rows_2;
if (binding_1.is_bound) {
if (!enabled[j]) {
continue;
}
// Checking if now overlapping a previously used render target.
// binding_2 is a currently used render target.
edram_dirty_rows_2 = edram_dirty_rows;
} else {
// Checking if the new render target is overlapping any bound one.
// binding_2 is another bound render target.
edram_dirty_rows_2 = binding_2.edram_dirty_rows;
}
// Do a full update if there is overlap.
if (edram_bases[i] <
edram_bases[j] + edram_dirty_rows_2 * edram_row_tiles[j] &&
edram_bases[j] <
edram_bases[i] + edram_dirty_rows_1 * edram_row_tiles[i]) {
full_update = true;
break;
}
}
if (full_update) {
break;
}
}
}
// Need to change the bindings.
if (full_update || render_targets_to_attach) {
#if 0
uint32_t heap_usage[5] = {};
#endif
if (full_update) {
if (rov_used) {
// Place a UAV barrier because across draws, pixels with different
// SV_Positions or different sample counts (thus without interlocking
// between each other) may access the same data now.
CommitEDRAMBufferUAVWrites(false);
} else {
// Export the currently bound render targets before we ruin the
// bindings.
StoreRenderTargetsToEDRAM();
}
ClearBindings();
current_surface_pitch_ = surface_pitch;
current_msaa_samples_ = rb_surface_info.msaa_samples;
if (!rov_used) {
current_edram_max_rows_ = edram_max_rows;
}
// If updating fully, need to reattach all the render targets and allocate
// from scratch.
for (uint32_t i = 0; i < 5; ++i) {
if (enabled[i]) {
render_targets_to_attach |= 1 << i;
}
}
} else {
#if 0
if (!rov_used) {
// If updating partially, only need to attach new render targets.
for (uint32_t i = 0; i < 5; ++i) {
const RenderTargetBinding& binding = current_bindings_[i];
if (!binding.is_bound) {
continue;
}
const RenderTarget* render_target = binding.render_target;
if (render_target != nullptr) {
// There are no holes between 4 MB pages in each heap.
heap_usage[render_target->heap_page_first / kHeap4MBPages] +=
render_target->heap_page_count;
}
}
}
#endif
}
XELOGGPU("RT Cache: %s update - pitch %u, samples %u, RTs to attach %u",
full_update ? "Full" : "Partial", surface_pitch,
rb_surface_info.msaa_samples, render_targets_to_attach);
#if 0
auto device =
command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
#endif
// Allocate new render targets and add them to the bindings list.
for (uint32_t i = 0; i < 5; ++i) {
if (!(render_targets_to_attach & (1 << i))) {
continue;
}
RenderTargetBinding& binding = current_bindings_[i];
binding.is_bound = true;
binding.edram_base = edram_bases[i];
binding.edram_dirty_rows = 0;
binding.format = formats[i];
binding.render_target = nullptr;
if (!rov_used) {
RenderTargetKey key;
key.width_ss_div_80 = edram_row_tiles_32bpp;
key.height_ss_div_16 = current_edram_max_rows_;
key.is_depth = i == 4 ? 1 : 0;
key.format = formats[i];
D3D12_RESOURCE_DESC resource_desc;
if (!GetResourceDesc(key, resource_desc)) {
// Invalid format.
continue;
}
#if 0
// Calculate the number of 4 MB pages of the heaps this RT will use.
D3D12_RESOURCE_ALLOCATION_INFO allocation_info =
device->GetResourceAllocationInfo(0, 1, &resource_desc);
if (allocation_info.SizeInBytes == 0 ||
allocation_info.SizeInBytes > (kHeap4MBPages << 22)) {
assert_always();
continue;
}
uint32_t heap_page_count =
(uint32_t(allocation_info.SizeInBytes) + ((4 << 20) - 1)) >> 22;
// Find the heap page range for this render target.
uint32_t heap_page_first = UINT32_MAX;
for (uint32_t j = 0; j < 5; ++j) {
if (heap_usage[j] + heap_page_count <= kHeap4MBPages) {
heap_page_first = j * kHeap4MBPages + heap_usage[j];
break;
}
}
if (heap_page_first == UINT32_MAX) {
assert_always();
continue;
}
// Get the render target.
binding.render_target = FindOrCreateRenderTarget(key, heap_page_first);
if (binding.render_target == nullptr) {
continue;
}
heap_usage[heap_page_first / kHeap4MBPages] += heap_page_count;
// Inform Direct3D that we're reusing the heap for this render target.
command_processor_->PushAliasingBarrier(
nullptr, binding.render_target->resource);
#else
// If multiple render targets have the same format, assign different
// instance numbers to them.
uint32_t instance = 0;
if (i != 4) {
for (uint32_t j = 0; j < i; ++j) {
const RenderTargetBinding& other_binding = current_bindings_[j];
if (other_binding.is_bound &&
other_binding.render_target != nullptr &&
other_binding.format == formats[i]) {
++instance;
}
}
}
binding.render_target = FindOrCreateRenderTarget(key, instance);
#endif
}
}
if (!rov_used) {
// Sample positions when loading depth must match sample positions when
// drawing.
command_processor_->SetSamplePositions(rb_surface_info.msaa_samples);
// Load the contents of the new render targets from the EDRAM buffer (will
// change the state of the render targets to copy destination).
RenderTarget* load_render_targets[5];
uint32_t load_edram_bases[5];
uint32_t load_render_target_count = 0;
for (uint32_t i = 0; i < 5; ++i) {
if (!(render_targets_to_attach & (1 << i))) {
continue;
}
RenderTarget* render_target = current_bindings_[i].render_target;
if (render_target == nullptr) {
continue;
}
load_render_targets[load_render_target_count] = render_target;
load_edram_bases[load_render_target_count] = edram_bases[i];
++load_render_target_count;
}
if (load_render_target_count != 0) {
LoadRenderTargetsFromEDRAM(load_render_target_count,
load_render_targets, load_edram_bases);
}
// Transition the render targets to the appropriate state if needed,
// compress the list of the render target because null RTV descriptors are
// broken in Direct3D 12 and bind the render targets to the command list.
D3D12_CPU_DESCRIPTOR_HANDLE rtv_handles[4];
uint32_t rtv_count = 0;
for (uint32_t i = 0; i < 4; ++i) {
const RenderTargetBinding& binding = current_bindings_[i];
RenderTarget* render_target = binding.render_target;
if (!binding.is_bound || render_target == nullptr) {
continue;
}
XELOGGPU("RT Color %u: base %u, format %u", i, edram_bases[i],
formats[i]);
command_processor_->PushTransitionBarrier(
render_target->resource, render_target->state,
D3D12_RESOURCE_STATE_RENDER_TARGET);
render_target->state = D3D12_RESOURCE_STATE_RENDER_TARGET;
rtv_handles[rtv_count] = render_target->handle;
current_pipeline_render_targets_[rtv_count].guest_render_target = i;
current_pipeline_render_targets_[rtv_count].format =
GetColorDXGIFormat(ColorRenderTargetFormat(formats[i]));
++rtv_count;
}
for (uint32_t i = rtv_count; i < 4; ++i) {
current_pipeline_render_targets_[i].guest_render_target = i;
current_pipeline_render_targets_[i].format = DXGI_FORMAT_UNKNOWN;
}
const D3D12_CPU_DESCRIPTOR_HANDLE* dsv_handle;
const RenderTargetBinding& depth_binding = current_bindings_[4];
RenderTarget* depth_render_target = depth_binding.render_target;
current_pipeline_render_targets_[4].guest_render_target = 4;
if (depth_binding.is_bound && depth_render_target != nullptr) {
XELOGGPU("RT Depth: base %u, format %u", edram_bases[4], formats[4]);
command_processor_->PushTransitionBarrier(
depth_render_target->resource, depth_render_target->state,
D3D12_RESOURCE_STATE_DEPTH_WRITE);
depth_render_target->state = D3D12_RESOURCE_STATE_DEPTH_WRITE;
dsv_handle = &depth_binding.render_target->handle;
current_pipeline_render_targets_[4].format =
GetDepthDXGIFormat(DepthRenderTargetFormat(formats[4]));
} else {
dsv_handle = nullptr;
current_pipeline_render_targets_[4].format = DXGI_FORMAT_UNKNOWN;
}
command_processor_->SubmitBarriers();
command_processor_->GetDeferredCommandList()->D3DOMSetRenderTargets(
rtv_count, rtv_handles, FALSE, dsv_handle);
}
}
// Update the dirty regions.
for (uint32_t i = 0; i < 5; ++i) {
if (!enabled[i] || (i == 4 && depth_readonly)) {
continue;
}
RenderTargetBinding& binding = current_bindings_[i];
if (!rov_used && binding.render_target == nullptr) {
// Nothing to store to the EDRAM buffer if there was an error.
continue;
}
binding.edram_dirty_rows =
std::max(binding.edram_dirty_rows, edram_dirty_rows);
}
if (rov_used) {
// The buffer will be used for ROV drawing now.
TransitionEDRAMBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
edram_buffer_modified_ = true;
}
return true;
}
// Executes the resolve currently described by the register file: the selected
// EDRAM surface region is copied out through ResolveCopy, after which
// ResolveClear is invoked for the color surface (when the source is a color
// one) and for the depth surface.
//
// written_address_out and written_length_out are zeroed on entry and are then
// passed to ResolveCopy, which fills them.
//
// Returns true if there was nothing to resolve or if every step that was run
// succeeded; false for an invalid source index or if any step failed.
bool RenderTargetCache::Resolve(SharedMemory* shared_memory,
                                TextureCache* texture_cache, Memory* memory,
                                uint32_t& written_address_out,
                                uint32_t& written_length_out) {
  written_length_out = 0;
  written_address_out = 0;

  if (!command_processor_->IsROVUsedForEDRAM()) {
    // On the render target path, the EDRAM buffer is the resolve source, so
    // flush whatever is bound into it first. The bindings are dropped as well:
    // this frees the render target resources to serve as source textures for
    // format conversion and sample resolving, lets the conversion bind its own
    // render targets, and makes sure fresh data gets loaded after a clear.
    StoreRenderTargetsToEDRAM();
    ClearBindings();
  }

  auto& reg_file = *register_file_;

  // Surface configuration shared by the copy and the clears.
  auto surface_info = reg_file.Get<reg::RB_SURFACE_INFO>();
  uint32_t pitch = std::min(surface_info.surface_pitch, 2560u);
  if (pitch == 0) {
    return true;
  }

  // The depth registers are read regardless of the source, since a color
  // resolve can clear depth as well.
  auto depth_info = reg_file.Get<reg::RB_DEPTH_INFO>();

  uint32_t source_index =
      reg_file.Get<reg::RB_COPY_CONTROL>().copy_src_select;
  if (source_index > 4) {
    assert_always();
    return false;
  }
  // Index 4 selects the depth surface, 0...3 select color surfaces.
  bool source_is_depth = source_index == 4;
  uint32_t source_edram_base;
  uint32_t source_format;
  if (!source_is_depth) {
    auto color_info = reg_file.Get<reg::RB_COLOR_INFO>(
        reg::RB_COLOR_INFO::rt_register_indices[source_index]);
    source_edram_base = color_info.color_base;
    source_format = uint32_t(GetBaseColorFormat(color_info.color_format));
  } else {
    source_edram_base = depth_info.depth_base;
    source_format = uint32_t(depth_info.depth_format);
  }

  // The resolve region is required by both the copy and the clears.
  // HACK: The rectangle vertices are assumed to always be in vertex fetch
  // constant 0.
  const auto& fetch = reg_file.Get<xenos::xe_gpu_vertex_fetch_t>(
      XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0);
  assert_true(fetch.type == 3);
  assert_true(fetch.endian == Endian::k8in32);
  assert_true(fetch.size == 6);
  const uint8_t* vertex_data = memory->TranslatePhysical(fetch.address << 2);
  // The vertices usually come with a negative half-pixel offset already
  // applied - undo it unless pixel centers make that unnecessary.
  float half_pixel_fixup;
  if (reg_file.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
    half_pixel_fixup = 0.0f;
  } else {
    half_pixel_fixup = 0.5f;
  }
  // Three XY pairs: x0, y0, x1, y1, x2, y2.
  float xy[6];
  for (uint32_t i = 0; i < 6; ++i) {
    const auto guest_value = xenos::GpuSwap(
        xe::load<float>(vertex_data + i * sizeof(float)), Endian(fetch.endian));
    xy[i] = guest_value + half_pixel_fixup;
  }

  // Xenos resolves are always rectangular, which keeps this simple.
  //
  // A single rectangle describes both the source and the destination (this is
  // how Tales of Vesperia uses it).
  //
  // Direct3D 9 supplies the rectangle in the coordinate space of the source
  // render target and leaves the EDRAM base pointer untouched. Example: the
  // Halo 3 sniper rifle scope is resolved with (128,64)->(448,256); had the
  // base been adjusted (4x MSAA is in use for the scope), it would have been
  // (8,0)->(328,192). The destination texture address, however, is adjusted so
  // that (0,0) relative to it corresponds to (0,0) of the render target. For
  // the copy, the pointer therefore has to be advanced to the first 32x32 tile
  // that is actually written, by the XGAddress2DTiledOffset of left/top & ~31.
  // The pitch and the height in RB_COPY_DEST_PITCH describe the region
  // beginning at that first modified tile and exclude the padding (320x192 in
  // the Halo 3 example, which is exactly the rectangle size).
  //
  // The window scissor has to be honored too. The Banjo-Tooie jigsaw puzzle
  // issues 1280x720 resolve rectangles of which only the scissored 1280x256
  // should be copied - otherwise the copy runs past the end of the EDRAM and
  // the depth buffer shows up on the screen. Scissoring also gets rid of
  // negative coordinates: F.E.A.R. resolves its right tile with vertices
  // (-640,0)->(640,720), with the destination texture pointer already moved to
  // the right half of the texture and a source render target pitch of 800.
  auto window_offset = reg_file.Get<reg::PA_SC_WINDOW_OFFSET>();
  const float x_min = std::min(std::min(xy[0], xy[2]), xy[4]);
  const float x_max = std::max(std::max(xy[0], xy[2]), xy[4]);
  const float y_min = std::min(std::min(xy[1], xy[3]), xy[5]);
  const float y_max = std::max(std::max(xy[1], xy[3]), xy[5]);
  D3D12_RECT rect;
  rect.left = LONG(x_min);
  rect.top = LONG(y_min);
  rect.right = LONG(x_max);
  rect.bottom = LONG(y_max);
  if (reg_file.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
    rect.left += window_offset.window_x_offset;
    rect.top += window_offset.window_y_offset;
    rect.right += window_offset.window_x_offset;
    rect.bottom += window_offset.window_y_offset;
  }

  // Build the window scissor, shifted by the window offset (and kept
  // non-negative) unless the offset is disabled for it.
  auto scissor_tl = reg_file.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
  auto scissor_br = reg_file.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
  D3D12_RECT scissor;
  scissor.left = scissor_tl.tl_x;
  scissor.top = scissor_tl.tl_y;
  scissor.right = scissor_br.br_x;
  scissor.bottom = scissor_br.br_y;
  if (!scissor_tl.window_offset_disable) {
    scissor.left =
        std::max(LONG(scissor.left + window_offset.window_x_offset), LONG(0));
    scissor.top =
        std::max(LONG(scissor.top + window_offset.window_y_offset), LONG(0));
    scissor.right =
        std::max(LONG(scissor.right + window_offset.window_x_offset), LONG(0));
    scissor.bottom = std::max(
        LONG(scissor.bottom + window_offset.window_y_offset), LONG(0));
  }

  // Intersect the resolve rectangle with the scissor.
  rect.left = std::max(rect.left, scissor.left);
  rect.top = std::max(rect.top, scissor.top);
  rect.right = std::min(rect.right, scissor.right);
  rect.bottom = std::min(rect.bottom, scissor.bottom);

  const char* sample_plural_suffix =
      surface_info.msaa_samples != MsaaSamples::k1X ? "s" : "";
  XELOGGPU(
      "Resolve: (%d,%d)->(%d,%d) of RT %u (pitch %u, %u sample%s, format %u) "
      "at %u",
      rect.left, rect.top, rect.right, rect.bottom, source_index, pitch,
      1 << uint32_t(surface_info.msaa_samples), sample_plural_suffix,
      source_format, source_edram_base);

  const bool rect_has_area =
      rect.left < rect.right && rect.top < rect.bottom;
  if (!rect_has_area) {
    // The scissored rectangle is empty - nothing to copy.
    return true;
  }

  if (command_processor_->IsROVUsedForEDRAM()) {
    // Make the ROV writes visible before the EDRAM buffer is read.
    CommitEDRAMBufferUAVWrites(false);
  }

  // Clamping to the size of the source render target is left to
  // GetEDRAMLayout, which ResolveCopy and ResolveClear call.
  const bool copy_succeeded = ResolveCopy(
      shared_memory, texture_cache, source_edram_base, pitch,
      surface_info.msaa_samples, source_is_depth, source_format, rect,
      written_address_out, written_length_out);

  // Color clear, if requested - only when a color surface is the source.
  bool color_clear_succeeded = true;
  if (!source_is_depth) {
    color_clear_succeeded =
        ResolveClear(source_edram_base, pitch, surface_info.msaa_samples,
                     false, source_format, rect);
  }

  // Depth clear, if requested - may accompany a color resolve.
  const bool depth_clear_succeeded =
      ResolveClear(depth_info.depth_base, pitch, surface_info.msaa_samples,
                   true, uint32_t(depth_info.depth_format), rect);

  // Every step is always attempted; the result reflects all of them.
  return copy_succeeded && color_clear_succeeded && depth_clear_succeeded;
}
bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory,
TextureCache* texture_cache,
uint32_t edram_base, uint32_t surface_pitch,
MsaaSamples msaa_samples, bool is_depth,
uint32_t src_format, const D3D12_RECT& rect,
uint32_t& written_address_out,
uint32_t& written_length_out) {
written_address_out = written_length_out = 0;
auto& regs = *register_file_;
auto rb_copy_control = regs.Get<reg::RB_COPY_CONTROL>();
if (rb_copy_control.copy_command != xenos::CopyCommand::kRaw &&
rb_copy_control.copy_command != xenos::CopyCommand::kConvert) {
// TODO(Triang3l): Handle kConstantOne and kNull.
assert_always();
return false;
}
auto command_list = command_processor_->GetDeferredCommandList();
// Get format info.
auto rb_copy_dest_info = regs.Get<reg::RB_COPY_DEST_INFO>();
TextureFormat src_texture_format;
bool src_64bpp;
if (is_depth) {
src_texture_format =
DepthRenderTargetToTextureFormat(DepthRenderTargetFormat(src_format));
src_64bpp = false;
} else {
// Force k_16_16 and k_16_16_16_16 RTs to be always resolved via drawing,
// because resolving to a k_16_16 or a k_16_16_16_16 texture should result
// in unsigned texture data, unlike the render target which is signed.
if (ColorRenderTargetFormat(src_format) ==
ColorRenderTargetFormat::k_16_16) {
src_texture_format = TextureFormat::k_16_16_EDRAM;
} else if (ColorRenderTargetFormat(src_format) ==
ColorRenderTargetFormat::k_16_16_16_16) {
src_texture_format = TextureFormat::k_16_16_16_16_EDRAM;
} else {
src_texture_format = GetBaseFormat(ColorRenderTargetToTextureFormat(
ColorRenderTargetFormat(src_format)));
}
src_64bpp = IsColorFormat64bpp(ColorRenderTargetFormat(src_format));
}
assert_true(src_texture_format != TextureFormat::kUnknown);
// The destination format is specified as k_8_8_8_8 when resolving depth, but
// no format conversion is done for depth, so ignore it.
TextureFormat dest_format =
is_depth
? src_texture_format
: GetBaseFormat(TextureFormat(rb_copy_dest_info.copy_dest_format));
const FormatInfo* dest_format_info = FormatInfo::Get(dest_format);
// Get the destination region and clamp the source region to it.
auto rb_copy_dest_pitch = regs.Get<reg::RB_COPY_DEST_PITCH>();
uint32_t dest_pitch = rb_copy_dest_pitch.copy_dest_pitch;
uint32_t dest_height = rb_copy_dest_pitch.copy_dest_height;
if (dest_pitch == 0 || dest_height == 0) {
// Nothing to copy.
return true;
}
D3D12_RECT copy_rect;
copy_rect.left = rect.left;
copy_rect.top = rect.top;
copy_rect.right =
std::min(rect.right, (rect.left & ~LONG(31)) + LONG(dest_pitch));
copy_rect.bottom =
std::min(rect.bottom, (rect.top & ~LONG(31)) + LONG(dest_height));
if (copy_rect.left >= copy_rect.right || copy_rect.top >= copy_rect.bottom) {
// Nothing to copy.
return true;
}
// Validate and clamp the source region, skip parts that don't need to be
// copied and calculate the number of threads needed for copying/loading.
// copy_rect will be modified and will become only the source rectangle, for
// the destination region, use the original rect from the arguments.
uint32_t surface_pitch_tiles, row_width_ss_div_80, rows;
if (!GetEDRAMLayout(surface_pitch, msaa_samples, src_64bpp, edram_base,
copy_rect, surface_pitch_tiles, row_width_ss_div_80,
rows)) {
// Nothing to copy.
return true;
}
// Get the destination location and adjust it to the first 32x32 tile modified
// by the resolve (the pitch and the height are relative to that tile, not to
// 0,0 of the resolve rectangle).
uint32_t dest_address = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32 & 0x1FFFFFFF;
// An example of a 3D resolve destination is the color grading LUT (used
// starting from the developer/publisher intro) in Dead Space 3.
if (rb_copy_dest_info.copy_dest_array) {
dest_address += texture_util::GetTiledOffset3D(
int(rect.left & ~LONG(31)), int(rect.top & ~LONG(31)), 0, dest_pitch,
dest_height, xe::log2_floor(dest_format_info->bits_per_pixel >> 3));
} else {
dest_address += texture_util::GetTiledOffset2D(
int(rect.left & ~LONG(31)), int(rect.top & ~LONG(31)), dest_pitch,
xe::log2_floor(dest_format_info->bits_per_pixel >> 3));
}
if (dest_address & 0x3) {
assert_always();
// Not 4-aligning may break UAV access significantly, let's hope games don't
// resolve to 8bpp or 16bpp textures at very odd locations.
return false;
}
uint32_t dest_z =
rb_copy_dest_info.copy_dest_array ? rb_copy_dest_info.copy_dest_slice : 0;
// See what samples we need and what we should do with them.
xenos::CopySampleSelect sample_select = rb_copy_control.copy_sample_select;
if (is_depth && sample_select > xenos::CopySampleSelect::k3) {
assert_always();
return false;
}
int32_t dest_exp_bias;
if (is_depth) {
dest_exp_bias = 0;
} else {
dest_exp_bias = rb_copy_dest_info.copy_dest_exp_bias;
if (ColorRenderTargetFormat(src_format) ==
ColorRenderTargetFormat::k_16_16 ||
ColorRenderTargetFormat(src_format) ==
ColorRenderTargetFormat::k_16_16_16_16) {
// On the Xbox 360, k_16_16_EDRAM and k_16_16_16_16_EDRAM internally have
// -32...32 range, but they're emulated using normalized RG16/RGBA16, so
// sampling the host render target gives 1/32 of what is actually stored
// there on the guest side.
// http://www.students.science.uu.nl/~3220516/advancedgraphics/papers/inferred_lighting.pdf
if (command_processor_->IsROVUsedForEDRAM() ||
cvars::d3d12_16bit_rtv_full_range) {
dest_exp_bias += 5;
}
}
}
bool dest_swap = !is_depth && rb_copy_dest_info.copy_dest_swap;
XELOGGPU(
"Resolve: Copying samples %u to 0x%.8X (%ux%u, %cD), destination Z %u, "
"destination format %s, exponent bias %d, red and blue %sswapped",
uint32_t(sample_select), dest_address, dest_pitch, dest_height,
rb_copy_dest_info.copy_dest_array ? '3' : '2', dest_z,
dest_format_info->name, dest_exp_bias, dest_swap ? "" : "not ");
// There are 2 paths for resolving in this function - they don't necessarily
// have to map directly to kRaw and kConvert CopyCommands.
// - Raw - when extracting a single color to a texture of the same format as
// the EDRAM surface and exponent bias is not applied, or when resolving a
// depth buffer (games read only one sample of it - resolving multiple
// samples of a depth buffer is meaningless anyway - and apparently there's
// no format conversion as well because k_8_8_8_8 is specified in the
// destination format in the register, which is obviously not true, and the
// texture is then read as k_24_8 or k_24_8_FLOAT). Swapping red and blue is
// possible in this mode.
// - Conversion - when a simple copy is not enough. The EDRAM region is loaded
// to a render target resource, which is then used as a texture in a shader
// performing the resolve (by sampling the texture on or between pixels with
// bilinear filtering), applying exponent bias and swapping red and blue in
// a format-agnostic way, then the resulting color is written to a temporary
// RTV of the destination format.
auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
auto device = provider->GetDevice();
uint32_t resolution_scale_log2 = resolution_scale_2x_ ? 1 : 0;
// Check if we need to apply the hack to remove the gap on the left and top
// sides of the screen caused by half-pixel offset becoming whole pixel offset
// with scaled rendering resolution.
bool resolution_scale_edge_clamp =
resolution_scale_2x_ &&
cvars::d3d12_resolution_scale_resolve_edge_clamp &&
cvars::d3d12_half_pixel_offset &&
!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center;
if (sample_select <= xenos::CopySampleSelect::k3 &&
src_texture_format == dest_format && dest_exp_bias == 0) {
// *************************************************************************
// Raw copy
// *************************************************************************
XELOGGPU("Resolve: Copying using a compute shader");
// Calculate the size of the region that specifically is being resolved.
// Can't just use the texture height for size calculation because it's
// sometimes bigger than needed (in Red Dead Redemption, an UI texture used
// for the letterbox bars alpha is located within a 1280x720 resolve target,
// but only 1280x208 is being resolved, and with scaled resolution the UI
// texture gets ignored). This doesn't apply to 3D resolves, however,
// because their tiling is more complex - some excess data will even be
// marked as resolved for them if resolving not to (0,0).
uint32_t dest_size;
uint32_t dest_modified_start = dest_address;
uint32_t dest_modified_length;
if (rb_copy_dest_info.copy_dest_array) {
// Depth granularity is 4 (though TiledAddress chaining is possible with 8
// granularity).
dest_size = texture_util::GetGuestMipSliceStorageSize(
xe::align(dest_pitch, 32u), xe::align(dest_height, 32u), 4, true,
dest_format, nullptr, false);
if (dest_z >= 4) {
dest_modified_start += dest_size;
}
dest_modified_length = dest_size;
dest_size *= 2;
} else {
dest_size = texture_util::GetGuestMipSliceStorageSize(
xe::align(dest_pitch, 32u),
xe::align(
uint32_t((rect.top & 31) + copy_rect.bottom - copy_rect.top),
32u),
1, true, dest_format, nullptr, false);
dest_modified_length = dest_size;
}
// Make sure we have the memory to write to. dest_address (and thus
// dest_range_start) already adjusted to the first modified 32x32 tile.
if (resolution_scale_2x_) {
if (!texture_cache->EnsureScaledResolveBufferResident(
dest_modified_start, dest_modified_length)) {
return false;
}
} else {
if (!shared_memory->MakeTilesResident(dest_modified_start,
dest_modified_length)) {
return false;
}
}
// Write the source and destination descriptors.
D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
if (command_processor_->RequestViewDescriptors(
0, 2, 2, descriptor_cpu_start, descriptor_gpu_start) == 0) {
return false;
}
TransitionEDRAMBuffer(D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
WriteEDRAMRawSRVDescriptor(descriptor_cpu_start);
if (resolution_scale_2x_) {
texture_cache->UseScaledResolveBufferForWriting();
// Can't address more than 512 MB directly on Nvidia - binding only a part
// of the buffer.
texture_cache->CreateScaledResolveBufferRawUAV(
provider->OffsetViewDescriptor(descriptor_cpu_start, 1),
dest_address >> 12,
((dest_address + dest_size - 1) >> 12) - (dest_address >> 12) + 1);
} else {
shared_memory->UseForWriting();
shared_memory->WriteRawUAVDescriptor(
provider->OffsetViewDescriptor(descriptor_cpu_start, 1));
}
command_processor_->SubmitBarriers();
// Dispatch the computation.
command_list->D3DSetComputeRootSignature(edram_load_store_root_signature_);
EDRAMLoadStoreRootConstants root_constants;
// Address is adjusted to the first modified tile, so using & 31 as the
// destination offset.
root_constants.tile_sample_dimensions[0] =
uint32_t(copy_rect.right - copy_rect.left) |
((uint32_t(rect.left) & 31) << 12) | (dest_z << 17) |
(uint32_t(copy_rect.left) << 20);
root_constants.tile_sample_dimensions[1] =
uint32_t(copy_rect.bottom - copy_rect.top) |
((uint32_t(rect.top) & 31) << 12) | (uint32_t(copy_rect.top) << 20);
root_constants.tile_sample_dest_base = dest_address;
if (resolution_scale_2x_) {
// Can't address more than 512 MB directly on Nvidia - binding only a part
// of the buffer.
root_constants.tile_sample_dest_base -= dest_address & ~0xFFFu;
}
assert_true(dest_pitch <= 8192);
root_constants.tile_sample_dest_info =
((dest_pitch + 31) >> 5) |
(rb_copy_dest_info.copy_dest_array ? (((dest_height + 31) >> 5) << 9)
: 0) |
(uint32_t(sample_select) << 18) |
(uint32_t(rb_copy_dest_info.copy_dest_endian) << 20);
if (dest_swap) {
root_constants.tile_sample_dest_info |= (1 << 23) | (src_format << 24);
}
root_constants.base_samples_2x_depth_pitch =
edram_base | (resolution_scale_log2 << 13) |
(resolution_scale_edge_clamp ? (1 << 14) : 0) |
(is_depth ? (1 << 15) : 0) | (surface_pitch_tiles << 16);
if (msaa_samples >= MsaaSamples::k2X) {
root_constants.base_samples_2x_depth_pitch |= 1 << 11;
if (msaa_samples >= MsaaSamples::k4X) {
root_constants.base_samples_2x_depth_pitch |= 1 << 12;
}
}
command_list->D3DSetComputeRoot32BitConstants(
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
command_list->D3DSetComputeRootDescriptorTable(1, descriptor_gpu_start);
command_processor_->SetComputePipeline(
src_64bpp ? edram_tile_sample_64bpp_pipeline_
: edram_tile_sample_32bpp_pipeline_);
// 1 group per destination 80x16 region.
uint32_t group_count_x = row_width_ss_div_80, group_count_y = rows;
if (msaa_samples >= MsaaSamples::k2X) {
group_count_y = (group_count_y + 1) >> 1;
if (msaa_samples >= MsaaSamples::k4X) {
group_count_x = (group_count_x + 1) >> 1;
}
}
// With 2x scaling, destination width and height are 2x bigger, and 1 group
// is 80x16 destination pixels after applying the resolution scale.
group_count_x <<= resolution_scale_log2;
group_count_y <<= resolution_scale_log2;
command_list->D3DDispatch(group_count_x, group_count_y, 1);
// Commit the write.
command_processor_->PushUAVBarrier(
resolution_scale_2x_ ? texture_cache->GetScaledResolveBuffer()
: shared_memory->GetBuffer());
// Invalidate textures and mark the range as scaled if needed.
texture_cache->MarkRangeAsResolved(dest_modified_start,
dest_modified_length);
written_address_out = dest_modified_start;
written_length_out = dest_modified_length;
} else {
// *************************************************************************
// Conversion and AA resolving
// *************************************************************************
XELOGGPU("Resolve: Copying via drawing");
// Get everything we need for the conversion.
// DXGI format (also checking whether this resolve is possible).
DXGI_FORMAT dest_dxgi_format =
texture_cache->GetResolveDXGIFormat(dest_format);
if (dest_dxgi_format == DXGI_FORMAT_UNKNOWN) {
XELOGE(
"No resolve pipeline for destination format %s - tell Xenia "
"developers!",
FormatInfo::Get(dest_format)->name);
return false;
}
// Resolve pipeline.
ID3D12PipelineState* resolve_pipeline =
GetResolvePipeline(dest_dxgi_format);
if (resolve_pipeline == nullptr) {
return false;
}
RenderTargetKey render_target_key;
render_target_key.width_ss_div_80 = row_width_ss_div_80;
render_target_key.height_ss_div_16 = rows;
if (resolution_scale_2x_) {
render_target_key.width_ss_div_80 *= 2;
render_target_key.height_ss_div_16 *= 2;
}
render_target_key.is_depth = false;
render_target_key.format = src_format;
// Render target for loading the EDRAM buffer contents as a texture.
RenderTarget* render_target =
FindOrCreateRenderTarget(render_target_key, 0);
if (render_target == nullptr) {
return false;
}
const D3D12_PLACED_SUBRESOURCE_FOOTPRINT& footprint =
render_target->footprints[0];
// Size of the resolved area.
uint32_t copy_width = copy_rect.right - copy_rect.left;
uint32_t copy_height = copy_rect.bottom - copy_rect.top;
// Resolve target for output merger format conversion.
#if 0
ResolveTarget* resolve_target =
FindOrCreateResolveTarget(copy_width, copy_height, dest_dxgi_format,
render_target->heap_page_count);
#else
ResolveTarget* resolve_target =
FindOrCreateResolveTarget(copy_width, copy_height, dest_dxgi_format);
#endif
if (resolve_target == nullptr) {
return false;
}
// Descriptors. 2 for EDRAM load, 1 for conversion.
D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
if (command_processor_->RequestViewDescriptors(
0, 3, 3, descriptor_cpu_start, descriptor_gpu_start) == 0) {
return false;
}
// Buffer for copying.
D3D12_RESOURCE_STATES copy_buffer_state =
D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
ID3D12Resource* copy_buffer = command_processor_->RequestScratchGPUBuffer(
std::max(render_target->copy_buffer_size,
resolve_target->copy_buffer_size),
copy_buffer_state);
if (copy_buffer == nullptr) {
return false;
}
// Load the EDRAM buffer contents to the copy buffer.
TransitionEDRAMBuffer(D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
command_processor_->SubmitBarriers();
command_list->D3DSetComputeRootSignature(edram_load_store_root_signature_);
EDRAMLoadStoreRootConstants load_root_constants;
load_root_constants.rt_color_depth_offset = uint32_t(footprint.Offset);
load_root_constants.rt_color_depth_pitch =
uint32_t(footprint.Footprint.RowPitch);
load_root_constants.base_samples_2x_depth_pitch =
edram_base | (resolution_scale_log2 << 13) |
(surface_pitch_tiles << 16);
if (msaa_samples >= MsaaSamples::k2X) {
load_root_constants.base_samples_2x_depth_pitch |= 1 << 11;
if (msaa_samples >= MsaaSamples::k4X) {
load_root_constants.base_samples_2x_depth_pitch |= 1 << 12;
}
}
command_list->D3DSetComputeRoot32BitConstants(
0, sizeof(load_root_constants) / sizeof(uint32_t), &load_root_constants,
0);
WriteEDRAMRawSRVDescriptor(descriptor_cpu_start);
ui::d3d12::util::CreateRawBufferUAV(
device, provider->OffsetViewDescriptor(descriptor_cpu_start, 1),
copy_buffer, render_target->copy_buffer_size);
command_list->D3DSetComputeRootDescriptorTable(1, descriptor_gpu_start);
EDRAMLoadStoreMode mode = GetLoadStoreMode(false, src_format);
command_processor_->SetComputePipeline(
resolution_scale_2x_ ? edram_load_2x_resolve_pipelines_[size_t(mode)]
: edram_load_pipelines_[size_t(mode)]);
// 1 group per 80x16 samples, with both 1x and 2x resolution scales.
command_list->D3DDispatch(row_width_ss_div_80, rows, 1);
command_processor_->PushUAVBarrier(copy_buffer);
// Go to the next descriptor set.
descriptor_cpu_start =
provider->OffsetViewDescriptor(descriptor_cpu_start, 2);
descriptor_gpu_start =
provider->OffsetViewDescriptor(descriptor_gpu_start, 2);
// Copy the EDRAM buffer contents to the source texture.
#if 0
command_processor_->PushAliasingBarrier(nullptr, render_target->resource);
#endif
command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state,
D3D12_RESOURCE_STATE_COPY_SOURCE);
copy_buffer_state = D3D12_RESOURCE_STATE_COPY_SOURCE;
command_processor_->PushTransitionBarrier(render_target->resource,
render_target->state,
D3D12_RESOURCE_STATE_COPY_DEST);
render_target->state = D3D12_RESOURCE_STATE_COPY_DEST;
command_processor_->SubmitBarriers();
D3D12_TEXTURE_COPY_LOCATION location_source, location_dest;
location_source.pResource = copy_buffer;
location_source.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
location_source.PlacedFootprint = render_target->footprints[0];
location_dest.pResource = render_target->resource;
location_dest.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
location_dest.SubresourceIndex = 0;
command_list->CopyTexture(location_dest, location_source);
// Do the resolve. Render targets unbound already, safe to call
// OMSetRenderTargets.
#if 0
command_processor_->PushAliasingBarrier(nullptr, resolve_target->resource);
#endif
command_processor_->PushTransitionBarrier(
render_target->resource, render_target->state,
D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE);
render_target->state = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
command_processor_->PushTransitionBarrier(
resolve_target->resource, resolve_target->state,
D3D12_RESOURCE_STATE_RENDER_TARGET);
resolve_target->state = D3D12_RESOURCE_STATE_RENDER_TARGET;
command_list->D3DSetGraphicsRootSignature(resolve_root_signature_);
ResolveRootConstants resolve_root_constants;
uint32_t samples_x_log2 = msaa_samples >= MsaaSamples::k4X ? 1 : 0;
uint32_t samples_y_log2 = msaa_samples >= MsaaSamples::k2X ? 1 : 0;
resolve_root_constants.rect_samples_lw =
(copy_rect.left << (samples_x_log2 + resolution_scale_log2)) |
(copy_width << (16 + samples_x_log2 + resolution_scale_log2));
resolve_root_constants.rect_samples_th =
(copy_rect.top << (samples_y_log2 + resolution_scale_log2)) |
(copy_height << (16 + samples_y_log2 + resolution_scale_log2));
resolve_root_constants.source_size =
(render_target_key.width_ss_div_80 * 80) |
(render_target_key.height_ss_div_16 << (4 + 16));
resolve_root_constants.resolve_info =
samples_y_log2 | (samples_x_log2 << 1) |
(resolution_scale_edge_clamp ? (1 << 6) : 0) |
((uint32_t(dest_exp_bias) & 0x3F) << 7);
if (msaa_samples == MsaaSamples::k1X) {
// No offset.
resolve_root_constants.resolve_info |= (1 << 2) | (1 << 4);
} else if (msaa_samples == MsaaSamples::k2X) {
// -0.5 or +0.5 samples vertical offset if getting only one sample.
if (sample_select == xenos::CopySampleSelect::k0) {
resolve_root_constants.resolve_info |= (0 << 2) | (1 << 4);
} else if (sample_select == xenos::CopySampleSelect::k1) {
resolve_root_constants.resolve_info |= (2 << 2) | (1 << 4);
} else {
resolve_root_constants.resolve_info |= (1 << 2) | (1 << 4);
}
} else {
// -0.5 or +0.5 samples offsets if getting one or two samples.
switch (sample_select) {
case xenos::CopySampleSelect::k0:
resolve_root_constants.resolve_info |= (0 << 2) | (0 << 4);
break;
case xenos::CopySampleSelect::k1:
resolve_root_constants.resolve_info |= (2 << 2) | (0 << 4);
break;
case xenos::CopySampleSelect::k2:
resolve_root_constants.resolve_info |= (0 << 2) | (2 << 4);
break;
case xenos::CopySampleSelect::k3:
resolve_root_constants.resolve_info |= (2 << 2) | (2 << 4);
break;
case xenos::CopySampleSelect::k01:
resolve_root_constants.resolve_info |= (1 << 2) | (0 << 4);
break;
case xenos::CopySampleSelect::k23:
resolve_root_constants.resolve_info |= (1 << 2) | (2 << 4);
break;
default:
resolve_root_constants.resolve_info |= (1 << 2) | (1 << 4);
break;
}
}
command_list->D3DSetGraphicsRoot32BitConstants(
0, sizeof(resolve_root_constants) / sizeof(uint32_t),
&resolve_root_constants, 0);
D3D12_SHADER_RESOURCE_VIEW_DESC rt_srv_desc;
rt_srv_desc.Format =
GetColorDXGIFormat(ColorRenderTargetFormat(src_format));
rt_srv_desc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
UINT swizzle = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
if (dest_swap) {
switch (ColorRenderTargetFormat(src_format)) {
case ColorRenderTargetFormat::k_8_8_8_8:
case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
case ColorRenderTargetFormat::k_2_10_10_10:
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
case ColorRenderTargetFormat::k_16_16_16_16:
case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10:
case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
swizzle = D3D12_ENCODE_SHADER_4_COMPONENT_MAPPING(2, 1, 0, 3);
break;
default:
break;
}
}
if (dest_format == TextureFormat::k_6_5_5) {
// Green bits of the resolve target used for blue, and blue bits used for
// green.
swizzle = D3D12_ENCODE_SHADER_4_COMPONENT_MAPPING(
D3D12_DECODE_SHADER_4_COMPONENT_MAPPING(0, swizzle),
D3D12_DECODE_SHADER_4_COMPONENT_MAPPING(2, swizzle),
D3D12_DECODE_SHADER_4_COMPONENT_MAPPING(1, swizzle),
D3D12_DECODE_SHADER_4_COMPONENT_MAPPING(3, swizzle));
}
rt_srv_desc.Shader4ComponentMapping = swizzle;
rt_srv_desc.Texture2D.MostDetailedMip = 0;
rt_srv_desc.Texture2D.MipLevels = 1;
rt_srv_desc.Texture2D.PlaneSlice = 0;
rt_srv_desc.Texture2D.ResourceMinLODClamp = 0.0f;
device->CreateShaderResourceView(render_target->resource, &rt_srv_desc,
descriptor_cpu_start);
command_list->D3DSetGraphicsRootDescriptorTable(1, descriptor_gpu_start);
command_processor_->SubmitBarriers();
command_processor_->SetSamplePositions(MsaaSamples::k1X);
command_processor_->SetExternalGraphicsPipeline(resolve_pipeline);
command_list->D3DOMSetRenderTargets(1, &resolve_target->rtv_handle, TRUE,
nullptr);
D3D12_VIEWPORT viewport;
viewport.TopLeftX = 0.0f;
viewport.TopLeftY = 0.0f;
viewport.Width = float(copy_width << resolution_scale_log2);
viewport.Height = float(copy_height << resolution_scale_log2);
viewport.MinDepth = 0.0f;
viewport.MaxDepth = 1.0f;
command_list->RSSetViewport(viewport);
D3D12_RECT scissor;
scissor.left = 0;
scissor.top = 0;
scissor.right = copy_width << resolution_scale_log2;
scissor.bottom = copy_height << resolution_scale_log2;
command_list->RSSetScissorRect(scissor);
command_list->D3DIASetPrimitiveTopology(
D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
command_list->D3DDrawInstanced(3, 1, 0, 0);
if (command_processor_->IsROVUsedForEDRAM()) {
// Clean up - the ROV path doesn't need render targets bound and has
// non-zero ForcedSampleCount.
command_list->D3DOMSetRenderTargets(0, nullptr, FALSE, nullptr);
}
// Copy the resolve target to the buffer.
command_processor_->PushTransitionBarrier(resolve_target->resource,
resolve_target->state,
D3D12_RESOURCE_STATE_COPY_SOURCE);
resolve_target->state = D3D12_RESOURCE_STATE_COPY_SOURCE;
command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state,
D3D12_RESOURCE_STATE_COPY_DEST);
copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST;
command_processor_->SubmitBarriers();
location_source.pResource = resolve_target->resource;
location_source.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
location_source.SubresourceIndex = 0;
location_dest.pResource = copy_buffer;
location_dest.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
location_dest.PlacedFootprint = resolve_target->footprint;
command_list->CopyTexture(location_dest, location_source);
// Tile the resolved texture. The texture cache expects the buffer to be a
// non-pixel-shader SRV.
command_processor_->PushTransitionBarrier(
copy_buffer, copy_buffer_state,
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
copy_buffer_state = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
// dest_address already adjusted, so offsets are & 31.
texture_cache->TileResolvedTexture(
dest_format, dest_address, dest_pitch, dest_height,
rb_copy_dest_info.copy_dest_array != 0, uint32_t(rect.left) & 31,
uint32_t(rect.top) & 31, dest_z, copy_width, copy_height,
rb_copy_dest_info.copy_dest_endian, copy_buffer,
resolve_target->copy_buffer_size, resolve_target->footprint,
&written_address_out, &written_length_out);
// Done with the copy buffer.
command_processor_->ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state);
}
return true;
}
// Clears the part of the EDRAM buffer covered by a resolve rectangle with a
// compute shader, if clearing of the kind of surface being resolved (color or
// depth) is enabled in RB_COPY_CONTROL.
//
// edram_base    - base of the surface in EDRAM; adjusted locally by
//                 GetEDRAMLayout to skip the tiles before the rectangle.
// surface_pitch - pitch of the surface in pixels (passed to GetEDRAMLayout).
// msaa_samples  - sample count of the surface: 2x and above double the height
//                 in samples, 4x also doubles the width.
// is_depth      - true to clear depth (RB_DEPTH_CLEAR), false to clear color
//                 (RB_COLOR_CLEAR / RB_COLOR_CLEAR_LO).
// format        - DepthRenderTargetFormat if is_depth, ColorRenderTargetFormat
//                 otherwise.
// rect          - rectangle to clear, in pixels.
//
// Returns false only if the view descriptor couldn't be requested. Returns
// true both after submitting the clear and when there is nothing to clear
// (clearing disabled or the rectangle doesn't map to EDRAM).
bool RenderTargetCache::ResolveClear(uint32_t edram_base,
                                     uint32_t surface_pitch,
                                     MsaaSamples msaa_samples, bool is_depth,
                                     uint32_t format, const D3D12_RECT& rect) {
  auto& regs = *register_file_;

  // Check if clearing is enabled.
  auto rb_copy_control = regs.Get<reg::RB_COPY_CONTROL>();
  if (is_depth) {
    if (!rb_copy_control.depth_clear_enable) {
      return true;
    }
  } else {
    if (!rb_copy_control.color_clear_enable) {
      return true;
    }
  }

  XELOGGPU("Resolve: Clearing the %s render target",
           is_depth ? "depth" : "color");

  // Calculate the layout. GetEDRAMLayout modifies edram_base and clear_rect so
  // they are relative to the first tile that actually needs to be touched.
  bool is_64bpp =
      !is_depth && IsColorFormat64bpp(ColorRenderTargetFormat(format));
  D3D12_RECT clear_rect = rect;
  uint32_t surface_pitch_tiles, row_width_ss_div_80, rows;
  if (!GetEDRAMLayout(surface_pitch, msaa_samples, is_64bpp, edram_base,
                      clear_rect, surface_pitch_tiles, row_width_ss_div_80,
                      rows)) {
    // Nothing to clear.
    return true;
  }
  uint32_t samples_x_log2 = msaa_samples >= MsaaSamples::k4X ? 1 : 0;
  uint32_t samples_y_log2 = msaa_samples >= MsaaSamples::k2X ? 1 : 0;

  // Get everything needed for clearing.
  auto command_list = command_processor_->GetDeferredCommandList();
  D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
  D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
  if (command_processor_->RequestViewDescriptors(0, 1, 1, descriptor_cpu_start,
                                                 descriptor_gpu_start) == 0) {
    return false;
  }

  // Submit the clear.
  TransitionEDRAMBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
  command_processor_->SubmitBarriers();
  EDRAMLoadStoreRootConstants root_constants;
  // The rectangle is passed in samples: X in the low 16 bits, Y in the high.
  root_constants.clear_rect_lt = (clear_rect.left << samples_x_log2) |
                                 (clear_rect.top << (16 + samples_y_log2));
  root_constants.clear_rect_rb = (clear_rect.right << samples_x_log2) |
                                 (clear_rect.bottom << (16 + samples_y_log2));
  // Base in the low bits, then the sample count, resolution scale and depth
  // flags, and the pitch in tiles starting from bit 16.
  root_constants.base_samples_2x_depth_pitch =
      edram_base | (samples_y_log2 << 11) | (samples_x_log2 << 12) |
      (resolution_scale_2x_ ? (1 << 13) : 0) | (is_depth ? (1 << 15) : 0) |
      (surface_pitch_tiles << 16);
  // When ROV is used, there's no 32-bit depth buffer.
  if (!command_processor_->IsROVUsedForEDRAM() && is_depth &&
      DepthRenderTargetFormat(format) == DepthRenderTargetFormat::kD24FS8) {
    root_constants.clear_depth24 = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
    // 20e4 [0,2), based on CFloat24 from d3dref9.dll and on 6e4 in DirectXTex.
    // The depth is in the upper 24 bits of the register value.
    uint32_t depth24 = root_constants.clear_depth24 >> 8;
    if (depth24 == 0) {
      root_constants.clear_depth32 = 0;
    } else {
      uint32_t mantissa = depth24 & 0xFFFFFu, exponent = depth24 >> 20;
      if (exponent == 0) {
        // Normalize the value in the resulting float.
        // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x100000) == 0)
        // The exponent may wrap around below zero here as an unsigned value,
        // the addition of the bias below brings it back to the valid range.
        uint32_t mantissa_lzcnt = xe::lzcnt(mantissa) - (32u - 21u);
        exponent = 1u - mantissa_lzcnt;
        mantissa = (mantissa << mantissa_lzcnt) & 0xFFFFFu;
      }
      // Rebias the exponent and widen the 20-bit mantissa to 23 bits.
      root_constants.clear_depth32 =
          ((exponent + 112u) << 23) | (mantissa << 3);
    }
    command_processor_->SetComputePipeline(edram_clear_depth_float_pipeline_);
  } else if (is_64bpp) {
    // TODO(Triang3l): Check which 32-bit portion is in which register.
    root_constants.clear_color_high = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32;
    root_constants.clear_color_low = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32;
    command_processor_->SetComputePipeline(edram_clear_64bpp_pipeline_);
  } else {
    Register reg =
        is_depth ? XE_GPU_REG_RB_DEPTH_CLEAR : XE_GPU_REG_RB_COLOR_CLEAR;
    root_constants.clear_color_high = regs[reg].u32;
    command_processor_->SetComputePipeline(edram_clear_32bpp_pipeline_);
  }
  command_list->D3DSetComputeRootSignature(edram_clear_root_signature_);
  command_list->D3DSetComputeRoot32BitConstants(
      0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
  WriteEDRAMRawUAVDescriptor(descriptor_cpu_start);
  command_list->D3DSetComputeRootDescriptorTable(1, descriptor_gpu_start);
  // 1 group per 80x16 samples. Resolution scale handled in the shader itself.
  command_list->D3DDispatch(row_width_ss_div_80, rows, 1);
  CommitEDRAMBufferUAVWrites(true);
  return true;
}
// Returns the pipeline state for drawing a resolve into a render target of the
// given DXGI format. Pipelines are created on demand and kept in
// resolve_pipelines_, one per destination format. Returns nullptr if the
// pipeline state couldn't be created.
ID3D12PipelineState* RenderTargetCache::GetResolvePipeline(
    DXGI_FORMAT dest_format) {
  // Reuse the pipeline if one has already been created for this format.
  auto cached = std::find_if(resolve_pipelines_.begin(),
                             resolve_pipelines_.end(),
                             [dest_format](const ResolvePipeline& entry) {
                               return entry.dest_format == dest_format;
                             });
  if (cached != resolve_pipelines_.end()) {
    return cached->pipeline;
  }

  // Not cached yet - describe a new one. Everything not set explicitly is
  // zero-initialized.
  auto device =
      command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
  D3D12_GRAPHICS_PIPELINE_STATE_DESC desc = {};
  desc.pRootSignature = resolve_root_signature_;
  // Shaders.
  desc.VS.pShaderBytecode = resolve_vs;
  desc.VS.BytecodeLength = sizeof(resolve_vs);
  desc.PS.pShaderBytecode = resolve_ps;
  desc.PS.BytecodeLength = sizeof(resolve_ps);
  // Output - a single-sampled target of the requested format, all channels
  // written.
  desc.BlendState.RenderTarget[0].RenderTargetWriteMask =
      D3D12_COLOR_WRITE_ENABLE_ALL;
  desc.SampleMask = UINT_MAX;
  // Rasterizer.
  desc.RasterizerState.FillMode = D3D12_FILL_MODE_SOLID;
  desc.RasterizerState.CullMode = D3D12_CULL_MODE_NONE;
  desc.RasterizerState.DepthClipEnable = TRUE;
  desc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
  desc.NumRenderTargets = 1;
  desc.RTVFormats[0] = dest_format;
  desc.SampleDesc.Count = 1;

  ID3D12PipelineState* new_pipeline = nullptr;
  if (FAILED(device->CreateGraphicsPipelineState(
          &desc, IID_PPV_ARGS(&new_pipeline)))) {
    XELOGE("Failed to create the resolve pipeline for DXGI format %u",
           dest_format);
    return nullptr;
  }

  // Remember the pipeline for later requests with the same format.
  ResolvePipeline cache_entry;
  cache_entry.pipeline = new_pipeline;
  cache_entry.dest_format = dest_format;
  resolve_pipelines_.push_back(cache_entry);
  return new_pipeline;
}
// Returns a texture that resolves can be drawn to, for the given unscaled size
// and DXGI format, creating it (together with its RTV and its copy footprint)
// if no matching one is stored in resolve_targets_ yet.
//
// width_unscaled, height_unscaled - required size before resolution scaling,
//   each must be in the 1...2160 range. The size is doubled when
//   resolution_scale_2x_ is set, and then rounded up to a multiple of 32 both
//   for the lookup key and for the texture itself.
// format - DXGI format of the texture and of its render target view.
//
// Returns nullptr if the size is out of range, or if the RTV descriptor heap
// or the resource couldn't be created. The returned object is stored in
// resolve_targets_.
//
// The `#if 0` sections are a disabled variant that places resolve targets in
// shared heaps; the compiled code is in the `#else` sections and creates
// committed resources.
RenderTargetCache::ResolveTarget* RenderTargetCache::FindOrCreateResolveTarget(
#if 0
uint32_t width_unscaled, uint32_t height_unscaled, DXGI_FORMAT format,
uint32_t min_heap_page_first) {
#else
uint32_t width_unscaled, uint32_t height_unscaled, DXGI_FORMAT format
#endif
) {
#if 0
assert_true(min_heap_page_first < kHeap4MBPages * 5);
#endif
// Reject empty and too large sizes.
if (width_unscaled == 0 || height_unscaled == 0 || width_unscaled > 2160 ||
height_unscaled > 2160) {
assert_always();
return nullptr;
}
uint32_t width_scaled = width_unscaled, height_scaled = height_unscaled;
if (resolution_scale_2x_) {
width_scaled *= 2;
height_scaled *= 2;
}
// The key stores the size in 32-pixel units (rounded up), so requests that
// differ by less than that granularity map to the same resolve target.
ResolveTargetKey key;
key.width_div_32 = (width_scaled + 31) >> 5;
key.height_div_32 = (height_scaled + 31) >> 5;
key.format = format;
// Try to find an existing target that isn't overlapping the resolve source.
// (The overlap check exists only in the disabled heap variant - the compiled
// code returns the target stored for the key, if there is one.)
#if 0
auto found_range = resolve_targets_.equal_range(key.value);
for (auto iter = found_range.first; iter != found_range.second; ++iter) {
ResolveTarget* found_resolve_target = iter->second;
if (found_resolve_target->heap_page_first >= min_heap_page_first) {
return found_resolve_target;
}
}
#else
auto found_iter = resolve_targets_.find(key.value);
if (found_iter != resolve_targets_.end()) {
return found_iter->second;
}
#endif
// Ensure the new resolve target can get an RTV descriptor.
if (!EnsureRTVHeapAvailable(false)) {
return nullptr;
}
// Allocate a new resolve target.
auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
auto device = provider->GetDevice();
// A single-mip, single-sample 2D texture with the size from the key (a
// multiple of 32 in each dimension), usable as a render target.
D3D12_RESOURCE_DESC resource_desc;
resource_desc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
resource_desc.Alignment = 0;
resource_desc.Width = key.width_div_32 << 5;
resource_desc.Height = key.height_div_32 << 5;
resource_desc.DepthOrArraySize = 1;
resource_desc.MipLevels = 1;
resource_desc.Format = format;
resource_desc.SampleDesc.Count = 1;
resource_desc.SampleDesc.Quality = 0;
resource_desc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN;
resource_desc.Flags = D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET;
#if 0
D3D12_RESOURCE_ALLOCATION_INFO allocation_info =
device->GetResourceAllocationInfo(0, 1, &resource_desc);
uint32_t heap_page_count =
(uint32_t(allocation_info.SizeInBytes) + ((4 << 20) - 1)) >> 22;
if (heap_page_count == 0 || heap_page_count > kHeap4MBPages) {
assert_always();
XELOGE(
"%ux%u resolve target with DXGI format %u can't fit in a heap, "
"needs %u bytes - tell Xenia developers to increase the heap size!",
uint32_t(resource_desc.Width), resource_desc.Height, format,
uint32_t(allocation_info.SizeInBytes));
return nullptr;
}
if (kHeap4MBPages - (min_heap_page_first % kHeap4MBPages) < heap_page_count) {
// Go to the next heap if no free space in the current one.
min_heap_page_first = xe::round_up(min_heap_page_first, kHeap4MBPages);
assert_true(min_heap_page_first < kHeap4MBPages * 5);
}
// Create the memory heap if it doesn't exist yet.
uint32_t heap_index = min_heap_page_first / kHeap4MBPages;
if (!MakeHeapResident(heap_index)) {
return nullptr;
}
#endif
// Create it.
// The first action likely to be done is resolve.
D3D12_RESOURCE_STATES state = D3D12_RESOURCE_STATE_RENDER_TARGET;
ID3D12Resource* resource;
#if 0
if (FAILED(device->CreatePlacedResource(
heaps_[heap_index], (min_heap_page_first % kHeap4MBPages) << 22,
&resource_desc, state, nullptr, IID_PPV_ARGS(&resource)))) {
XELOGE(
"Failed to create a placed resource for %ux%u resolve target with DXGI "
"format %u at heap 4 MB pages %u:%u",
uint32_t(resource_desc.Width), resource_desc.Height, format,
min_heap_page_first, min_heap_page_first + heap_page_count - 1);
return nullptr;
}
#else
if (FAILED(device->CreateCommittedResource(
&ui::d3d12::util::kHeapPropertiesDefault, D3D12_HEAP_FLAG_NONE,
&resource_desc, state, nullptr, IID_PPV_ARGS(&resource)))) {
XELOGE(
"Failed to create a committed resource for %ux%u resolve target with "
"DXGI format %u",
uint32_t(resource_desc.Width), resource_desc.Height, format);
return nullptr;
}
#endif
// Create the RTV.
// It takes the next unused descriptor of the current color descriptor heap
// (EnsureRTVHeapAvailable above has made sure there is one).
D3D12_CPU_DESCRIPTOR_HANDLE rtv_handle =
provider->OffsetRTVDescriptor(descriptor_heaps_color_->start_handle,
descriptor_heaps_color_->descriptors_used);
D3D12_RENDER_TARGET_VIEW_DESC rtv_desc;
rtv_desc.Format = format;
rtv_desc.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D;
rtv_desc.Texture2D.MipSlice = 0;
rtv_desc.Texture2D.PlaneSlice = 0;
device->CreateRenderTargetView(resource, &rtv_desc, rtv_handle);
++descriptor_heaps_color_->descriptors_used;
// Add the new resolve target to the cache.
ResolveTarget* resolve_target = new ResolveTarget;
resolve_target->resource = resource;
resolve_target->state = state;
resolve_target->rtv_handle.ptr = rtv_handle.ptr;
resolve_target->key.value = key.value;
#if 0
resolve_target->heap_page_first = min_heap_page_first;
#endif
// Get the placed footprint of the texture and the size of the buffer needed
// for copying the texture to a buffer.
UINT64 copy_buffer_size;
device->GetCopyableFootprints(&resource_desc, 0, 1, 0,
&resolve_target->footprint, nullptr, nullptr,
&copy_buffer_size);
// Safety (though if width and height are aligned to 32 it will be fine, but
// just in case this changes).
copy_buffer_size =
xe::align(copy_buffer_size, UINT64(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT));
resolve_target->copy_buffer_size = uint32_t(copy_buffer_size);
resolve_targets_.insert(std::make_pair(key.value, resolve_target));
COUNT_profile_set("gpu/render_target_cache/resolve_targets",
resolve_targets_.size());
return resolve_target;
}
// Stores the currently bound render targets to EDRAM and resets the bindings.
// Does nothing when ROV is used for EDRAM.
void RenderTargetCache::UnbindRenderTargets() {
  if (!command_processor_->IsROVUsedForEDRAM()) {
    StoreRenderTargetsToEDRAM();
    ClearBindings();
  }
}
// Copies the uint32 UAV descriptor of the EDRAM buffer from the EDRAM buffer
// descriptor heap to the specified CBV/SRV/UAV descriptor handle.
void RenderTargetCache::WriteEDRAMUint32UAVDescriptor(
    D3D12_CPU_DESCRIPTOR_HANDLE handle) {
  auto d3d_provider =
      command_processor_->GetD3D12Context()->GetD3D12Provider();
  auto d3d_device = d3d_provider->GetDevice();
  // Locate the source descriptor within the EDRAM buffer descriptor heap.
  const auto source_handle = d3d_provider->OffsetViewDescriptor(
      edram_buffer_descriptor_heap_start_,
      uint32_t(EDRAMBufferDescriptorIndex::kUint32UAV));
  d3d_device->CopyDescriptorsSimple(1, handle, source_handle,
                                    D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
}
// Finishes the frame on the render target cache side by unbinding the render
// targets.
void RenderTargetCache::EndFrame() {
  UnbindRenderTargets();
}
// Maps the gamma and the "AS" variants of color render target formats to the
// formats they are based on. All other formats are returned unchanged.
ColorRenderTargetFormat RenderTargetCache::GetBaseColorFormat(
    ColorRenderTargetFormat format) {
  if (format == ColorRenderTargetFormat::k_8_8_8_8_GAMMA) {
    return ColorRenderTargetFormat::k_8_8_8_8;
  }
  if (format == ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10) {
    return ColorRenderTargetFormat::k_2_10_10_10;
  }
  if (format == ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16) {
    return ColorRenderTargetFormat::k_2_10_10_10_FLOAT;
  }
  return format;
}
// Returns the DXGI format of the host texture used for a color render target
// format, or DXGI_FORMAT_UNKNOWN if the format is not handled.
DXGI_FORMAT RenderTargetCache::GetColorDXGIFormat(
    ColorRenderTargetFormat format) {
  DXGI_FORMAT dxgi_format = DXGI_FORMAT_UNKNOWN;
  switch (format) {
    case ColorRenderTargetFormat::k_8_8_8_8:
    case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
      dxgi_format = DXGI_FORMAT_R8G8B8A8_UNORM;
      break;
    case ColorRenderTargetFormat::k_2_10_10_10:
    case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10:
      dxgi_format = DXGI_FORMAT_R10G10B10A2_UNORM;
      break;
    // The 2_10_10_10 float formats use the same host format as 16_16_16_16
    // float.
    case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
    case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
    case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
      dxgi_format = DXGI_FORMAT_R16G16B16A16_FLOAT;
      break;
    case ColorRenderTargetFormat::k_16_16:
      dxgi_format = DXGI_FORMAT_R16G16_SNORM;
      break;
    case ColorRenderTargetFormat::k_16_16_16_16:
      dxgi_format = DXGI_FORMAT_R16G16B16A16_SNORM;
      break;
    case ColorRenderTargetFormat::k_16_16_FLOAT:
      dxgi_format = DXGI_FORMAT_R16G16_FLOAT;
      break;
    case ColorRenderTargetFormat::k_32_FLOAT:
      dxgi_format = DXGI_FORMAT_R32_FLOAT;
      break;
    case ColorRenderTargetFormat::k_32_32_FLOAT:
      dxgi_format = DXGI_FORMAT_R32G32_FLOAT;
      break;
    default:
      // Leave DXGI_FORMAT_UNKNOWN.
      break;
  }
  return dxgi_format;
}
// Returns the size of the EDRAM buffer in bytes for the current configuration
// (ROV or render target path, 1x or 2x resolution scale).
uint32_t RenderTargetCache::GetEDRAMBufferSize() const {
  // One 10 MB page.
  const uint32_t page_size = 2048 * 5120;
  // Without ROV, two 10 MB pages are needed - one containing color and integer
  // depth data, another with 32-bit float depth when 20e4 depth is used to
  // allow for multipass drawing without precision loss in case of EDRAM
  // store/load.
  const uint32_t page_count = command_processor_->IsROVUsedForEDRAM() ? 1 : 2;
  // 2x resolution scale quadruples the amount of data.
  const uint32_t scale_factor = resolution_scale_2x_ ? 4 : 1;
  return page_size * page_count * scale_factor;
}
// Pushes a barrier transitioning the EDRAM buffer from its currently tracked
// state to new_state, and starts tracking new_state. The barrier is only
// pushed to the command processor here, not submitted.
void RenderTargetCache::TransitionEDRAMBuffer(D3D12_RESOURCE_STATES new_state) {
  const D3D12_RESOURCE_STATES old_state = edram_buffer_state_;
  command_processor_->PushTransitionBarrier(edram_buffer_, old_state,
                                            new_state);
  edram_buffer_state_ = new_state;
}
// Pushes a UAV barrier for the EDRAM buffer if it has been marked as modified
// or if force is true, and clears the modification flag in either case.
void RenderTargetCache::CommitEDRAMBufferUAVWrites(bool force) {
  const bool barrier_needed = force || edram_buffer_modified_;
  if (barrier_needed) {
    command_processor_->PushUAVBarrier(edram_buffer_);
  }
  edram_buffer_modified_ = false;
}
// Copies the raw SRV descriptor of the EDRAM buffer from the EDRAM buffer
// descriptor heap to the specified CBV/SRV/UAV descriptor handle.
void RenderTargetCache::WriteEDRAMRawSRVDescriptor(
    D3D12_CPU_DESCRIPTOR_HANDLE handle) {
  auto d3d_provider =
      command_processor_->GetD3D12Context()->GetD3D12Provider();
  auto d3d_device = d3d_provider->GetDevice();
  // Locate the source descriptor within the EDRAM buffer descriptor heap.
  const auto source_handle = d3d_provider->OffsetViewDescriptor(
      edram_buffer_descriptor_heap_start_,
      uint32_t(EDRAMBufferDescriptorIndex::kRawSRV));
  d3d_device->CopyDescriptorsSimple(1, handle, source_handle,
                                    D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
}
// Copies the raw UAV descriptor of the EDRAM buffer from the EDRAM buffer
// descriptor heap to the specified CBV/SRV/UAV descriptor handle.
void RenderTargetCache::WriteEDRAMRawUAVDescriptor(
    D3D12_CPU_DESCRIPTOR_HANDLE handle) {
  auto d3d_provider =
      command_processor_->GetD3D12Context()->GetD3D12Provider();
  auto d3d_device = d3d_provider->GetDevice();
  // Locate the source descriptor within the EDRAM buffer descriptor heap.
  const auto source_handle = d3d_provider->OffsetViewDescriptor(
      edram_buffer_descriptor_heap_start_,
      uint32_t(EDRAMBufferDescriptorIndex::kRawUAV));
  d3d_device->CopyDescriptorsSimple(1, handle, source_handle,
                                    D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
}
// Resets the tracked render target state: zeroes all the bindings and returns
// the surface pitch, the sample count and the maximum EDRAM row count to their
// initial values (0, 1x, 0).
void RenderTargetCache::ClearBindings() {
  std::memset(current_bindings_, 0, sizeof(current_bindings_));
  current_edram_max_rows_ = 0;
  current_msaa_samples_ = MsaaSamples::k1X;
  current_surface_pitch_ = 0;
}
// Disabled with `#if 0`, like the other placed-resource code in this file.
// Creates the render target heap with the specified index (must be below 5) if
// it hasn't been created yet. Returns false if the index is out of range or if
// the heap couldn't be created.
#if 0
bool RenderTargetCache::MakeHeapResident(uint32_t heap_index) {
if (heap_index >= 5) {
assert_always();
return false;
}
if (heaps_[heap_index] != nullptr) {
return true;
}
auto device =
command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
D3D12_HEAP_DESC heap_desc = {};
heap_desc.SizeInBytes = kHeap4MBPages << 22;
heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT;
// TODO(Triang3l): If real MSAA is added, alignment must be 4 MB.
heap_desc.Alignment = 0;
heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_RT_DS_TEXTURES;
if (FAILED(
device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heaps_[heap_index])))) {
XELOGE("Failed to create a %u MB heap for render targets",
kHeap4MBPages * 4);
return false;
}
return true;
}
#endif
bool RenderTargetCache::EnsureRTVHeapAvailable(bool is_depth) {
auto& heap = is_depth ? descriptor_heaps_depth_ : descriptor_heaps_color_;
if (heap != nullptr &&
heap->descriptors_used < kRenderTargetDescriptorHeapSize) {
return true;
}
auto device =
command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice();
D3D12_DESCRIPTOR_HEAP_DESC heap_desc;
heap_desc.Type = is_depth ? D3D12_DESCRIPTOR_HEAP_TYPE_DSV
: D3D12_DESCRIPTOR_HEAP_TYPE_RTV;
heap_desc.NumDescriptors = kRenderTargetDescriptorHeapSize;
heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE;
heap_desc.NodeMask = 0;
ID3D12DescriptorHeap* new_d3d_heap;
if (FAILED(device->CreateDescriptorHeap(&heap_desc,
IID_PPV_ARGS(&new_d3d_heap)))) {
XELOGE("Failed to create a heap for %u %s buffer descriptors",
kRenderTargetDescriptorHeapSize, is_depth ? "depth" : "color");
return false;
}
RenderTargetDescriptorHeap* new_heap = new RenderTargetDescriptorHeap;
new_heap->heap = new_d3d_heap;
new_heap->start_handle = new_d3d_heap->GetCPUDescriptorHandleForHeapStart();
new_heap->descriptors_used = 0;
new_heap->previous = heap;
heap = new_heap;
return true;
}
// Fills the D3D12 resource description of the texture for a render target key.
// Returns false, leaving desc untouched, if the key has a zero width or height
// or if its format has no DXGI equivalent.
bool RenderTargetCache::GetResourceDesc(RenderTargetKey key,
                                        D3D12_RESOURCE_DESC& desc) {
  const bool has_area =
      key.width_ss_div_80 != 0 && key.height_ss_div_16 != 0;
  if (!has_area) {
    return false;
  }

  // The format field of the key is interpreted according to is_depth.
  DXGI_FORMAT texture_format;
  if (key.is_depth) {
    texture_format = GetDepthDXGIFormat(DepthRenderTargetFormat(key.format));
  } else {
    texture_format = GetColorDXGIFormat(ColorRenderTargetFormat(key.format));
  }
  if (texture_format == DXGI_FORMAT_UNKNOWN) {
    return false;
  }

  // A single-mip, single-sample 2D texture. The key stores the width in units
  // of 80 and the height in units of 16.
  desc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
  // TODO(Triang3l): If real MSAA is added, alignment must be 4 MB.
  desc.Alignment = 0;
  desc.Width = key.width_ss_div_80 * 80;
  desc.Height = key.height_ss_div_16 * 16;
  desc.DepthOrArraySize = 1;
  desc.MipLevels = 1;
  desc.Format = texture_format;
  desc.SampleDesc.Count = 1;
  desc.SampleDesc.Quality = 0;
  desc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN;
  if (key.is_depth) {
    desc.Flags = D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL;
  } else {
    desc.Flags = D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET;
  }
  return true;
}
// Returns the render target texture for the given key and instance index,
// creating it (along with its RTV or DSV and its copy footprints) if it's not
// in render_targets_ yet.
//
// key      - describes the render target: width in units of 80, height in
//            units of 16, whether it's a depth target, and the format (see
//            GetResourceDesc).
// instance - index distinguishing different render targets that have the same
//            key.
//
// Returns nullptr if the key doesn't describe a valid resource, or if the
// descriptor heap or the resource couldn't be created. The returned object is
// stored in render_targets_.
//
// The `#if 0` sections are the disabled placed-resource (heap page) variant;
// the compiled `#else` sections create committed resources and identify render
// targets by the instance index.
RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget(
#if 0
RenderTargetKey key, uint32_t heap_page_first
#else
RenderTargetKey key, uint32_t instance
#endif
) {
#if 0
assert_true(heap_page_first < kHeap4MBPages * 5);
#endif
// Try to find an existing render target.
// Multiple render targets may be stored for the same key, so all of them are
// checked.
auto found_range = render_targets_.equal_range(key.value);
for (auto iter = found_range.first; iter != found_range.second; ++iter) {
RenderTarget* found_render_target = iter->second;
#if 0
if (found_render_target->heap_page_first == heap_page_first) {
return found_render_target;
}
#else
if (found_render_target->instance == instance) {
return found_render_target;
}
#endif
}
// Not found - describe the texture for the key (this fails for empty sizes
// and for formats without a DXGI equivalent).
D3D12_RESOURCE_DESC resource_desc;
if (!GetResourceDesc(key, resource_desc)) {
return nullptr;
}
auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
auto device = provider->GetDevice();
#if 0
// Get the number of heap pages needed for the render target.
D3D12_RESOURCE_ALLOCATION_INFO allocation_info =
device->GetResourceAllocationInfo(0, 1, &resource_desc);
uint32_t heap_page_count =
(uint32_t(allocation_info.SizeInBytes) + ((4 << 20) - 1)) >> 22;
if (heap_page_count == 0 ||
(heap_page_first % kHeap4MBPages) + heap_page_count > kHeap4MBPages) {
assert_always();
return nullptr;
}
#endif
// Ensure we can create a new descriptor in the render target heap.
if (!EnsureRTVHeapAvailable(key.is_depth)) {
return nullptr;
}
#if 0
// Create the memory heap if it doesn't exist yet.
uint32_t heap_index = heap_page_first / kHeap4MBPages;
if (!MakeHeapResident(heap_index)) {
return nullptr;
}
#endif
// The first action likely to be done is EDRAM buffer load.
D3D12_RESOURCE_STATES state = D3D12_RESOURCE_STATE_COPY_DEST;
ID3D12Resource* resource;
#if 0
if (FAILED(device->CreatePlacedResource(
heaps_[heap_index], (heap_page_first % kHeap4MBPages) << 22,
&resource_desc, state, nullptr, IID_PPV_ARGS(&resource)))) {
XELOGE(
"Failed to create a placed resource for %ux%u %s render target with "
"format %u at heap 4 MB pages %u:%u",
uint32_t(resource_desc.Width), resource_desc.Height,
key.is_depth ? "depth" : "color", key.format, heap_page_first,
heap_page_first + heap_page_count - 1);
return nullptr;
}
#else
if (FAILED(device->CreateCommittedResource(
&ui::d3d12::util::kHeapPropertiesDefault, D3D12_HEAP_FLAG_NONE,
&resource_desc, state, nullptr, IID_PPV_ARGS(&resource)))) {
XELOGE(
"Failed to create a committed resource for %ux%u %s render target with "
"format %u",
uint32_t(resource_desc.Width), resource_desc.Height,
key.is_depth ? "depth" : "color", key.format);
return nullptr;
}
#endif
// Create the descriptor for the render target.
// It takes the next unused descriptor of the current depth or color
// descriptor heap (EnsureRTVHeapAvailable above has made sure there is one).
D3D12_CPU_DESCRIPTOR_HANDLE descriptor_handle;
if (key.is_depth) {
descriptor_handle = provider->OffsetDSVDescriptor(
descriptor_heaps_depth_->start_handle,
descriptor_heaps_depth_->descriptors_used);
D3D12_DEPTH_STENCIL_VIEW_DESC dsv_desc;
dsv_desc.Format = resource_desc.Format;
dsv_desc.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE2D;
dsv_desc.Flags = D3D12_DSV_FLAG_NONE;
dsv_desc.Texture2D.MipSlice = 0;
device->CreateDepthStencilView(resource, &dsv_desc, descriptor_handle);
++descriptor_heaps_depth_->descriptors_used;
} else {
descriptor_handle = provider->OffsetRTVDescriptor(
descriptor_heaps_color_->start_handle,
descriptor_heaps_color_->descriptors_used);
D3D12_RENDER_TARGET_VIEW_DESC rtv_desc;
rtv_desc.Format = resource_desc.Format;
rtv_desc.ViewDimension = D3D12_RTV_DIMENSION_TEXTURE2D;
rtv_desc.Texture2D.MipSlice = 0;
rtv_desc.Texture2D.PlaneSlice = 0;
device->CreateRenderTargetView(resource, &rtv_desc, descriptor_handle);
++descriptor_heaps_color_->descriptors_used;
}
// Get the layout for copying to the EDRAM buffer.
RenderTarget* render_target = new RenderTarget;
render_target->resource = resource;
render_target->state = state;
render_target->handle = descriptor_handle;
render_target->key = key;
#if 0
render_target->heap_page_first = heap_page_first;
render_target->heap_page_count = heap_page_count;
#else
render_target->instance = instance;
#endif
// Depth targets request footprints for 2 subresources (presumably the depth
// and the stencil planes - confirm against the format), color targets for 1.
// The size of the buffer needed for copying is stored as 32-bit.
UINT64 copy_buffer_size;
device->GetCopyableFootprints(&resource_desc, 0, key.is_depth ? 2 : 1, 0,
render_target->footprints, nullptr, nullptr,
&copy_buffer_size);
render_target->copy_buffer_size = uint32_t(copy_buffer_size);
// Add the new render target to the cache.
render_targets_.insert(std::make_pair(key.value, render_target));
COUNT_profile_set("gpu/render_target_cache/render_targets",
render_targets_.size());
#if 0
XELOGGPU(
"Created %ux%u %s render target with format %u at heap 4 MB pages %u:%u",
uint32_t(resource_desc.Width), resource_desc.Height,
key.is_depth ? "depth" : "color", key.format, heap_page_first,
heap_page_first + heap_page_count - 1);
#else
XELOGGPU("Created %ux%u %s render target with format %u",
uint32_t(resource_desc.Width), resource_desc.Height,
key.is_depth ? "depth" : "color", key.format);
#endif
return render_target;
}
// Calculates how a rectangle of a surface maps to 80x16-sample EDRAM tiles.
//
// pitch_pixels  - surface pitch in pixels (clamped to 2560).
// msaa_samples  - 2x and above double the height in samples, 4x also doubles
//                 the width.
// is_64bpp      - whether each sample takes 64 bits, doubling the tile counts
//                 horizontally.
// base_in_out   - in: EDRAM base of the surface, in tiles; out: base advanced
//                 to the first tile containing the top-left corner of the
//                 rectangle.
// rect_in_out   - in: rectangle in pixels relative to the surface; out: the
//                 rectangle clamped to the surface and to EDRAM, relative to
//                 the adjusted base.
// pitch_tiles_out         - surface pitch in tiles.
// row_width_ss_div_80_out - number of 80-sample columns from the adjusted base
//                           to the right edge of the rectangle.
// rows_out                - number of 16-sample rows from the adjusted base to
//                           the bottom edge of the rectangle.
//
// Returns false, leaving all the outputs untouched, if the rectangle is empty
// or lies entirely outside the surface or the 2048 tiles of EDRAM.
bool RenderTargetCache::GetEDRAMLayout(
    uint32_t pitch_pixels, MsaaSamples msaa_samples, bool is_64bpp,
    uint32_t& base_in_out, D3D12_RECT& rect_in_out, uint32_t& pitch_tiles_out,
    uint32_t& row_width_ss_div_80_out, uint32_t& rows_out) {
  // Total number of tiles in EDRAM.
  const uint32_t kEDRAMTileCount = 2048;

  if (pitch_pixels == 0 || rect_in_out.right <= 0 || rect_in_out.bottom <= 0 ||
      rect_in_out.top >= rect_in_out.bottom) {
    return false;
  }
  pitch_pixels = std::min(pitch_pixels, 2560u);
  D3D12_RECT rect = rect_in_out;
  rect.left = std::max(rect.left, LONG(0));
  rect.top = std::max(rect.top, LONG(0));
  rect.right = std::min(rect.right, LONG(pitch_pixels));
  if (rect.left >= rect.right) {
    return false;
  }

  uint32_t samples_x_log2 = msaa_samples >= MsaaSamples::k4X ? 1 : 0;
  uint32_t samples_y_log2 = msaa_samples >= MsaaSamples::k2X ? 1 : 0;
  uint32_t sample_size_log2 = is_64bpp ? 1 : 0;
  // Non-zero because pitch_pixels is non-zero.
  uint32_t pitch_tiles = (((pitch_pixels << samples_x_log2) + 79) / 80)
                         << sample_size_log2;

  // Adjust the base and the rectangle to skip tiles to the left of the left
  // bound of the rectangle and to the top of the top bound.
  uint32_t base = base_in_out;
  // Whole 16-sample rows above the rectangle, then the same in pixels.
  uint32_t skip = rect.top << samples_y_log2 >> 4;
  base += skip * pitch_tiles;
  skip <<= 4 - samples_y_log2;
  rect.top -= skip;
  rect.bottom -= skip;
  // Whole 80-sample columns to the left of the rectangle, then the same in
  // pixels.
  skip = (rect.left << samples_x_log2) / 80;
  base += skip << sample_size_log2;
  skip *= 80 >> samples_x_log2;
  rect.left -= skip;
  rect.right -= skip;

  // If the adjusted base is at or beyond the end of EDRAM, the rectangle is
  // entirely outside of it. This must be checked before the subtraction below,
  // which is done in unsigned arithmetic and would wrap around to a huge row
  // limit for a base greater than the tile count, disabling the clamping.
  if (base >= kEDRAMTileCount) {
    return false;
  }

  // Calculate the number of 16-sample rows this rectangle spans.
  uint32_t rows = ((rect.bottom << samples_y_log2) + 15) >> 4;
  uint32_t rows_max = (kEDRAMTileCount - base) / pitch_tiles;
  if (rows_max == 0) {
    return false;
  }
  if (rows > rows_max) {
    // Clamp the rectangle if it's partially outside of EDRAM.
    rows = rows_max;
    rect.bottom = rows_max << (4 - samples_y_log2);
  }

  base_in_out = base;
  rect_in_out = rect;
  pitch_tiles_out = pitch_tiles;
  row_width_ss_div_80_out = ((rect.right << samples_x_log2) + 79) / 80;
  rows_out = rows;
  return true;
}
// Chooses the EDRAM load/store mode for a render target format. `format` is a
// DepthRenderTargetFormat if is_depth is true, and a ColorRenderTargetFormat
// otherwise.
RenderTargetCache::EDRAMLoadStoreMode RenderTargetCache::GetLoadStoreMode(
    bool is_depth, uint32_t format) {
  if (is_depth) {
    // Floating-point depth has its own mode, the rest is treated as unorm.
    if (DepthRenderTargetFormat(format) == DepthRenderTargetFormat::kD24FS8) {
      return EDRAMLoadStoreMode::kDepthFloat;
    }
    return EDRAMLoadStoreMode::kDepthUnorm;
  }

  const ColorRenderTargetFormat color_format = ColorRenderTargetFormat(format);
  switch (color_format) {
    // Both 2_10_10_10 float variants use the 7e3 mode.
    case ColorRenderTargetFormat::k_2_10_10_10_FLOAT:
    case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
      return EDRAMLoadStoreMode::kColor7e3;
    default:
      break;
  }
  // Other color formats only differ by the size of a sample.
  if (IsColorFormat64bpp(color_format)) {
    return EDRAMLoadStoreMode::kColor64bpp;
  }
  return EDRAMLoadStoreMode::kColor32bpp;
}
void RenderTargetCache::StoreRenderTargetsToEDRAM() {
if (command_processor_->IsROVUsedForEDRAM()) {
return;
}
auto command_list = command_processor_->GetDeferredCommandList();
// Extract only the render targets that need to be stored, transition them to
// copy sources and calculate copy buffer size.
uint32_t store_bindings[5];
uint32_t store_binding_count = 0;
uint32_t copy_buffer_size = 0;
for (uint32_t i = 0; i < 5; ++i) {
const RenderTargetBinding& binding = current_bindings_[i];
RenderTarget* render_target = binding.render_target;
if (!binding.is_bound || render_target == nullptr ||
binding.edram_dirty_rows < 0) {
continue;
}
store_bindings[store_binding_count++] = i;
copy_buffer_size =
std::max(copy_buffer_size, render_target->copy_buffer_size);
}
if (store_binding_count == 0) {
return;
}
// Allocate descriptors for the buffers.
D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
if (command_processor_->RequestViewDescriptors(0, 2, 2, descriptor_cpu_start,
descriptor_gpu_start) == 0) {
return;
}
// Get the buffer for copying.
D3D12_RESOURCE_STATES copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST;
ID3D12Resource* copy_buffer = command_processor_->RequestScratchGPUBuffer(
copy_buffer_size, copy_buffer_state);
if (copy_buffer == nullptr) {
return;
}
// Transition the render targets that need to be stored to copy sources and
// the EDRAM buffer to a UAV.
for (uint32_t i = 0; i < store_binding_count; ++i) {
RenderTarget* render_target =
current_bindings_[store_bindings[i]].render_target;
command_processor_->PushTransitionBarrier(render_target->resource,
render_target->state,
D3D12_RESOURCE_STATE_COPY_SOURCE);
render_target->state = D3D12_RESOURCE_STATE_COPY_SOURCE;
}
TransitionEDRAMBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
// Set up the bindings.
auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
auto device = provider->GetDevice();
command_list->D3DSetComputeRootSignature(edram_load_store_root_signature_);
ui::d3d12::util::CreateRawBufferSRV(device, descriptor_cpu_start, copy_buffer,
copy_buffer_size);
WriteEDRAMRawUAVDescriptor(
provider->OffsetViewDescriptor(descriptor_cpu_start, 1));
command_list->D3DSetComputeRootDescriptorTable(1, descriptor_gpu_start);
// Sort the bindings in ascending order of EDRAM base so data in the render
// targets placed farther in EDRAM isn't lost in case of overlap.
std::sort(store_bindings, store_bindings + store_binding_count,
[this](uint32_t a, uint32_t b) {
uint32_t base_a = current_bindings_[a].edram_base;
uint32_t base_b = current_bindings_[b].edram_base;
if (base_a == base_b) {
// If EDRAM bases are the same (not really a valid usage, but
// happens in Banjo-Tooie - in case color writing was enabled
// for invalid render targets in some draw call), treat the
// render targets with the lowest index as more important (it's
// the primary one after all, while the rest are additional).
// Depth buffer has lower priority, otherwise the Xbox Live
// Arcade logo disappears.
return a > b;
}
return base_a < base_b;
});
// Calculate the dispatch width.
uint32_t surface_pitch_ss =
current_surface_pitch_ *
(current_msaa_samples_ >= MsaaSamples::k4X ? 2 : 1);
uint32_t surface_pitch_tiles = (surface_pitch_ss + 79) / 80;
assert_true(surface_pitch_tiles != 0);
// Store each render target.
for (uint32_t i = 0; i < store_binding_count; ++i) {
const RenderTargetBinding& binding = current_bindings_[store_bindings[i]];
const RenderTarget* render_target = binding.render_target;
bool is_64bpp = false;
// Transition the copy buffer to copy destination.
command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state,
D3D12_RESOURCE_STATE_COPY_DEST);
copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST;
command_processor_->SubmitBarriers();
// Copy from the render target planes and set up the layout.
D3D12_TEXTURE_COPY_LOCATION location_source, location_dest;
location_source.pResource = render_target->resource;
location_source.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
location_source.SubresourceIndex = 0;
location_dest.pResource = copy_buffer;
location_dest.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
location_dest.PlacedFootprint = render_target->footprints[0];
// TODO(Triang3l): Box for color render targets.
command_list->CopyTexture(location_dest, location_source);
EDRAMLoadStoreRootConstants root_constants;
uint32_t rt_pitch_tiles = surface_pitch_tiles;
if (!render_target->key.is_depth &&
IsColorFormat64bpp(
ColorRenderTargetFormat(render_target->key.format))) {
rt_pitch_tiles *= 2;
}
// TODO(Triang3l): log2(sample count, resolution scale).
root_constants.base_samples_2x_depth_pitch =
binding.edram_base | (rt_pitch_tiles << 16);
root_constants.rt_color_depth_offset =
uint32_t(location_dest.PlacedFootprint.Offset);
root_constants.rt_color_depth_pitch =
location_dest.PlacedFootprint.Footprint.RowPitch;
if (render_target->key.is_depth) {
root_constants.base_samples_2x_depth_pitch |= 1 << 15;
location_source.SubresourceIndex = 1;
location_dest.PlacedFootprint = render_target->footprints[1];
command_list->CopyTexture(location_dest, location_source);
root_constants.rt_stencil_offset =
uint32_t(location_dest.PlacedFootprint.Offset);
root_constants.rt_stencil_pitch =
location_dest.PlacedFootprint.Footprint.RowPitch;
}
// Transition the copy buffer to SRV.
command_processor_->PushTransitionBarrier(
copy_buffer, copy_buffer_state,
D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
copy_buffer_state = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
command_processor_->SubmitBarriers();
// Store the data.
command_list->D3DSetComputeRoot32BitConstants(
0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
render_target->key.format);
command_processor_->SetComputePipeline(
edram_store_pipelines_[size_t(mode)]);
// 1 group per 80x16 samples.
command_list->D3DDispatch(surface_pitch_tiles, binding.edram_dirty_rows, 1);
// Commit the UAV write.
CommitEDRAMBufferUAVWrites(true);
}
command_processor_->ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state);
}
// Fills render targets with data read from the EDRAM buffer.
// - render_target_count: number of entries in both arrays, must be 1 to 5
//   (0 or more than 5 results in an early return).
// - render_targets: the render targets to load into.
// - edram_bases: the EDRAM base, in tiles, of each render target. Entries with
//   a base of 2048 or above are skipped.
//
// For each render target, an EDRAM load compute shader (chosen via
// GetLoadStoreMode) writes the data into a scratch buffer laid out according
// to the render target's placed footprints, and the scratch buffer is then
// copied to the render target plane(s). Returns without loading anything if
// the view descriptors or the scratch buffer can't be obtained.
void RenderTargetCache::LoadRenderTargetsFromEDRAM(
    uint32_t render_target_count, RenderTarget* const* render_targets,
    const uint32_t* edram_bases) {
  assert_true(render_target_count <= 5);
  if (render_target_count == 0 || render_target_count > 5) {
    return;
  }
  auto command_list = command_processor_->GetDeferredCommandList();

  // Allocate descriptors for the buffers (the EDRAM SRV and the scratch buffer
  // UAV).
  D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start;
  D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start;
  if (command_processor_->RequestViewDescriptors(0, 2, 2, descriptor_cpu_start,
                                                 descriptor_gpu_start) == 0) {
    return;
  }

  // Get the buffer for copying - large enough for the biggest render target,
  // since it's reused for every load.
  uint32_t copy_buffer_size = 0;
  for (uint32_t i = 0; i < render_target_count; ++i) {
    copy_buffer_size =
        std::max(copy_buffer_size, render_targets[i]->copy_buffer_size);
  }
  D3D12_RESOURCE_STATES copy_buffer_state =
      D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
  ID3D12Resource* copy_buffer = command_processor_->RequestScratchGPUBuffer(
      copy_buffer_size, copy_buffer_state);
  if (copy_buffer == nullptr) {
    return;
  }

  // Transition the render targets to copy destinations and the EDRAM buffer to
  // a SRV.
  for (uint32_t i = 0; i < render_target_count; ++i) {
    RenderTarget* render_target = render_targets[i];
    command_processor_->PushTransitionBarrier(render_target->resource,
                                              render_target->state,
                                              D3D12_RESOURCE_STATE_COPY_DEST);
    render_target->state = D3D12_RESOURCE_STATE_COPY_DEST;
  }
  TransitionEDRAMBuffer(D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);

  // Set up the bindings: descriptor 0 is the EDRAM buffer as a raw SRV,
  // descriptor 1 is the scratch buffer as a raw UAV.
  auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider();
  auto device = provider->GetDevice();
  command_list->D3DSetComputeRootSignature(edram_load_store_root_signature_);
  WriteEDRAMRawSRVDescriptor(descriptor_cpu_start);
  ui::d3d12::util::CreateRawBufferUAV(
      device, provider->OffsetViewDescriptor(descriptor_cpu_start, 1),
      copy_buffer, copy_buffer_size);
  command_list->D3DSetComputeRootDescriptorTable(1, descriptor_gpu_start);

  // Load each render target.
  for (uint32_t i = 0; i < render_target_count; ++i) {
    if (edram_bases[i] >= 2048) {
      // Something is wrong with the load.
      continue;
    }
    const RenderTarget* render_target = render_targets[i];

    // Get the number of EDRAM tiles per row.
    uint32_t edram_pitch_tiles = render_target->key.width_ss_div_80;
    if (!render_target->key.is_depth &&
        IsColorFormat64bpp(
            ColorRenderTargetFormat(render_target->key.format))) {
      // 64bpp color render targets have twice the pitch in tiles.
      edram_pitch_tiles *= 2;
    }
    if (edram_pitch_tiles == 0) {
      // A zero-width render target has nothing to load, and the pitch is used
      // as a divisor below.
      continue;
    }
    // Clamp the height if somehow requested a render target that is too large.
    uint32_t edram_rows =
        std::min(render_target->key.height_ss_div_16,
                 (2048u - edram_bases[i]) / edram_pitch_tiles);
    if (edram_rows == 0) {
      continue;
    }

    // Transition the copy buffer back to UAV if it's not the first load.
    command_processor_->PushTransitionBarrier(
        copy_buffer, copy_buffer_state, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
    copy_buffer_state = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;

    // Load the data.
    command_processor_->SubmitBarriers();
    // Zero-fill the constants first: the stencil fields are only assigned for
    // depth render targets, but the whole structure is uploaded below, so
    // without this indeterminate values would be passed for color targets.
    EDRAMLoadStoreRootConstants root_constants;
    std::memset(&root_constants, 0, sizeof(root_constants));
    // TODO(Triang3l): log2(sample count, resolution scale).
    root_constants.base_samples_2x_depth_pitch =
        edram_bases[i] | (edram_pitch_tiles << 16);
    root_constants.rt_color_depth_offset =
        uint32_t(render_target->footprints[0].Offset);
    root_constants.rt_color_depth_pitch =
        render_target->footprints[0].Footprint.RowPitch;
    if (render_target->key.is_depth) {
      // Bit 15 marks a depth render target; the second footprint describes
      // where the stencil plane goes in the scratch buffer.
      root_constants.base_samples_2x_depth_pitch |= 1 << 15;
      root_constants.rt_stencil_offset =
          uint32_t(render_target->footprints[1].Offset);
      root_constants.rt_stencil_pitch =
          render_target->footprints[1].Footprint.RowPitch;
    }
    command_list->D3DSetComputeRoot32BitConstants(
        0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0);
    EDRAMLoadStoreMode mode = GetLoadStoreMode(render_target->key.is_depth,
                                               render_target->key.format);
    command_processor_->SetComputePipeline(edram_load_pipelines_[size_t(mode)]);
    // 1 group per 80x16 samples.
    command_list->D3DDispatch(render_target->key.width_ss_div_80, edram_rows,
                              1);

    // Commit the UAV write and transition the copy buffer to copy source now.
    command_processor_->PushUAVBarrier(copy_buffer);
    command_processor_->PushTransitionBarrier(copy_buffer, copy_buffer_state,
                                              D3D12_RESOURCE_STATE_COPY_SOURCE);
    copy_buffer_state = D3D12_RESOURCE_STATE_COPY_SOURCE;

    // Copy to the render target planes.
    command_processor_->SubmitBarriers();
    D3D12_TEXTURE_COPY_LOCATION location_source, location_dest;
    location_source.pResource = copy_buffer;
    location_source.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
    location_source.PlacedFootprint = render_target->footprints[0];
    location_dest.pResource = render_target->resource;
    location_dest.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
    location_dest.SubresourceIndex = 0;
    command_list->CopyTexture(location_dest, location_source);
    if (render_target->key.is_depth) {
      // Depth render targets have a second plane (subresource 1).
      location_source.PlacedFootprint = render_target->footprints[1];
      location_dest.SubresourceIndex = 1;
      command_list->CopyTexture(location_dest, location_source);
    }
  }

  command_processor_->ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state);
}
} // namespace d3d12
} // namespace gpu
} // namespace xe