From ddc8f17fa5236ab1e4265a6a65eeb54debf59462 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Tue, 21 Aug 2018 23:05:41 +0300 Subject: [PATCH] [D3D12] Depth untiling, update depth resolve documentation --- src/xenia/gpu/d3d12/render_target_cache.cc | 28 +++++++++---------- .../shaders/texture_load_depth_float.cs.hlsl | 25 +++++++++++++++++ .../shaders/texture_load_depth_unorm.cs.hlsl | 25 +++++++++++++++++ src/xenia/gpu/d3d12/texture_cache.cc | 10 +++++-- src/xenia/gpu/d3d12/texture_cache.h | 2 ++ 5 files changed, 73 insertions(+), 17 deletions(-) create mode 100644 src/xenia/gpu/d3d12/shaders/texture_load_depth_float.cs.hlsl create mode 100644 src/xenia/gpu/d3d12/shaders/texture_load_depth_unorm.cs.hlsl diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index 9b99677e0..e2dc48be4 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -922,8 +922,11 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, } assert_true(src_texture_format != TextureFormat::kUnknown); src_texture_format = GetBaseFormat(src_texture_format); + // The destination format is specified as k_8_8_8_8 when resolving depth, + // apparently there's no format conversion. TextureFormat dest_format = - GetBaseFormat(TextureFormat((dest_info >> 7) & 0x3F)); + is_depth ? src_texture_format + : GetBaseFormat(TextureFormat((dest_info >> 7) & 0x3F)); // Get the destination location. uint32_t dest_address = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32 & 0x1FFFFFFF; @@ -946,30 +949,25 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, // There are 3 paths for resolving in this function - they don't necessarily // have to map directly to kRaw and kConvert CopyCommands. + // - Depth - tiling raw D24S8 or D24FS8 directly from the EDRAM buffer to the + // shared memory. Only 1 sample is resolved from a depth buffer, and it + // looks like format conversion can't be done when resolving depth buffers + // since k_8_8_8_8 is specified as the destination format, while the texture + // is being used as k_24_8 or k_24_8_FLOAT. // - Raw color - when the source is single-sampled and has the same format as // the destination, and there's no need to apply exponent bias. A regular // EDRAM load is done to a buffer, and the buffer is then tiled to the // shared memory. Because swapping red and blue is very common, this path // supports swapping. - // - Depth to depth - when the source and the destination formats are - // renderable depth-stencil ones (D24S8 or D24FS8). A single sample is - // taken from the EDRAM buffer, converted between D24 and D24F if needed, - // and tiled directly to the shared memory buffer. // - Conversion - when a simple copy is not enough. The EDRAM region is loaded // to a render target resource, which is then used as a texture in a shader // performing the resolve (by sampling the texture on or between pixels with // bilinear filtering), applying exponent bias and swapping red and blue in // a format-agnostic way, then the resulting color is written to a temporary - // RTV of the destination format. This also works for converting depth to - // 16-bit or 32-bit. - if (dest_format == TextureFormat::k_24_8 || - dest_format == TextureFormat::k_24_8_FLOAT) { - // Depth to depth. - XELOGGPU("Resolving to a depth texture"); - if (!is_depth) { - return false; - } - // TODO(Triang3l): Depth to depth. + // RTV of the destination format. + if (is_depth) { + // Depth. + // TODO(Triang3l): Resolve depth. return false; } else if (src_texture_format == dest_format && msaa_samples == MsaaSamples::k1X && dest_exp_bias == 0) { diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_depth_float.cs.hlsl new file mode 100644 index 000000000..2f88746eb --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/texture_load_depth_float.cs.hlsl @@ -0,0 +1,25 @@ +#include "pixel_formats.hlsli" +#include "texture_copy.hlsli" + +[numthreads(8, 32, 1)] +void main(uint3 xe_thread_id : SV_DispatchThreadID) { + // 1 thread = 4 depth texels (24-bit float depth converted to 32-bit, can't + // read stencil in shaders anyway because it would require a separate + // DXGI_FORMAT_X32_TYPELESS_G8X24_UINT SRV). + uint3 block_index = xe_thread_id; + block_index.x <<= 2u; + [branch] if (any(block_index >= xe_texture_copy_size_blocks)) { + return; + } + uint4 block_offsets_guest = + XeTextureCopyGuestBlockOffsets(block_index, 4u, 2u); + uint4 blocks = uint4(xe_texture_copy_source.Load(block_offsets_guest.x), + xe_texture_copy_source.Load(block_offsets_guest.y), + xe_texture_copy_source.Load(block_offsets_guest.z), + xe_texture_copy_source.Load(block_offsets_guest.w)); + blocks = XeByteSwap(blocks, xe_texture_copy_endianness); + uint block_offset_host = XeTextureHostLinearOffset( + block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch, + 4u) + xe_texture_copy_host_base; + xe_texture_copy_dest.Store4(block_offset_host, XeFloat20e4To32(blocks >> 8u)); +} diff --git a/src/xenia/gpu/d3d12/shaders/texture_load_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/texture_load_depth_unorm.cs.hlsl new file mode 100644 index 000000000..98d89b3a1 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/texture_load_depth_unorm.cs.hlsl @@ -0,0 +1,25 @@ +#include "texture_copy.hlsli" + +[numthreads(8, 32, 1)] +void main(uint3 xe_thread_id : SV_DispatchThreadID) { + // 1 thread = 4 depth texels (24-bit unorm depth converted to 32-bit, can't + // read stencil in shaders anyway because it would require a separate + // DXGI_FORMAT_X24_TYPELESS_G8_UINT SRV). + uint3 block_index = xe_thread_id; + block_index.x <<= 2u; + [branch] if (any(block_index >= xe_texture_copy_size_blocks)) { + return; + } + uint4 block_offsets_guest = + XeTextureCopyGuestBlockOffsets(block_index, 4u, 2u); + uint4 blocks = uint4(xe_texture_copy_source.Load(block_offsets_guest.x), + xe_texture_copy_source.Load(block_offsets_guest.y), + xe_texture_copy_source.Load(block_offsets_guest.z), + xe_texture_copy_source.Load(block_offsets_guest.w)); + blocks = XeByteSwap(blocks, xe_texture_copy_endianness); + uint block_offset_host = XeTextureHostLinearOffset( + block_index, xe_texture_copy_size_blocks.y, xe_texture_copy_host_pitch, + 4u) + xe_texture_copy_host_base; + xe_texture_copy_dest.Store4(block_offset_host, + asuint(float4(blocks >> 8u) / 16777215.0)); +} diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index b2d0e96e5..ebf37a31f 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -31,6 +31,8 @@ namespace d3d12 { #include "xenia/gpu/d3d12/shaders/bin/texture_load_64bpb_cs.h" #include "xenia/gpu/d3d12/shaders/bin/texture_load_8bpb_cs.h" #include "xenia/gpu/d3d12/shaders/bin/texture_load_ctx1_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/texture_load_depth_float_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/texture_load_depth_unorm_cs.h" #include "xenia/gpu/d3d12/shaders/bin/texture_load_dxt3a_cs.h" const TextureCache::HostFormat TextureCache::host_formats_[64] = { @@ -56,8 +58,10 @@ const TextureCache::HostFormat TextureCache::host_formats_[64] = { {DXGI_FORMAT_BC2_UNORM, CopyMode::k128bpb}, // k_DXT2_3 {DXGI_FORMAT_BC3_UNORM, CopyMode::k128bpb}, // k_DXT4_5 {DXGI_FORMAT_R16G16B16A16_UNORM, CopyMode::k64bpb}, // k_16_16_16_16_EDRAM - {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_24_8 - {DXGI_FORMAT_UNKNOWN, CopyMode::kUnknown}, // k_24_8_FLOAT + // R32_FLOAT for depth because shaders would require an additional SRV to + // sample stencil, which we don't provide. + {DXGI_FORMAT_R32_FLOAT, CopyMode::kDepthUnorm}, // k_24_8 + {DXGI_FORMAT_R32_FLOAT, CopyMode::kDepthFloat}, // k_24_8_FLOAT {DXGI_FORMAT_R16_UNORM, CopyMode::k16bpb}, // k_16 {DXGI_FORMAT_R16G16_UNORM, CopyMode::k32bpb}, // k_16_16 {DXGI_FORMAT_R16G16B16A16_UNORM, CopyMode::k64bpb}, // k_16_16_16_16 @@ -112,6 +116,8 @@ const TextureCache::CopyModeInfo TextureCache::copy_mode_info_[] = { {texture_load_128bpb_cs, sizeof(texture_load_128bpb_cs)}, {texture_load_dxt3a_cs, sizeof(texture_load_dxt3a_cs)}, {texture_load_ctx1_cs, sizeof(texture_load_ctx1_cs)}, + {texture_load_depth_unorm_cs, sizeof(texture_load_depth_unorm_cs)}, + {texture_load_depth_float_cs, sizeof(texture_load_depth_float_cs)}, }; TextureCache::TextureCache(D3D12CommandProcessor* command_processor, diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index b9f35f19b..e717126af 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -92,6 +92,8 @@ class TextureCache { k128bpb, kDXT3A, kCTX1, + kDepthUnorm, + kDepthFloat, kCount,