diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 33d9f3448..3dcdf4da9 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -377,7 +377,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestScratchGPUBuffer( barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE; barrier.Transition.pResource = scratch_buffer_; - barrier.Transition.Subresource = 0; + barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; barrier.Transition.StateBefore = scratch_buffer_state_; barrier.Transition.StateAfter = state; GetCurrentCommandList()->ResourceBarrier(1, &barrier); @@ -489,6 +489,10 @@ bool D3D12CommandProcessor::SetupContext() { render_target_cache_ = std::make_unique<RenderTargetCache>(this, register_file_); + if (!render_target_cache_->Initialize()) { + XELOGE("Failed to initialize the render target cache"); + return false; + } return true; } diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index aa015c858..39d1585c1 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -21,13 +21,176 @@ namespace xe { namespace gpu { namespace d3d12 { +// Generated with `xb buildhlsl`. 
+#include "xenia/gpu/d3d12/shaders/bin/edram_load_color_32bpp_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_load_color_64bpp_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_load_color_7e3_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_load_depth_float_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_load_depth_unorm_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_store_color_32bpp_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_store_color_64bpp_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_store_color_7e3_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_float_cs.h" +#include "xenia/gpu/d3d12/shaders/bin/edram_store_depth_unorm_cs.h" + +const RenderTargetCache::EDRAMLoadStorePipelineInfo + RenderTargetCache::edram_load_store_pipeline_info_[size_t( + RenderTargetCache::EDRAMLoadStorePipelineIndex::kCount)] = { + {edram_load_color_32bpp_cs, sizeof(edram_load_color_32bpp_cs), + L"EDRAM Load 32bpp Color"}, + {edram_store_color_32bpp_cs, sizeof(edram_store_color_32bpp_cs), + L"EDRAM Store 32bpp Color"}, + {edram_load_color_64bpp_cs, sizeof(edram_load_color_64bpp_cs), + L"EDRAM Load 64bpp Color"}, + {edram_store_color_64bpp_cs, sizeof(edram_store_color_64bpp_cs), + L"EDRAM Store 64bpp Color"}, + {edram_load_color_7e3_cs, sizeof(edram_load_color_7e3_cs), + L"EDRAM Load 7e3 Color"}, + {edram_store_color_7e3_cs, sizeof(edram_store_color_7e3_cs), + L"EDRAM Store 7e3 Color"}, + {edram_load_depth_unorm_cs, sizeof(edram_load_depth_unorm_cs), + L"EDRAM Load UNorm Depth"}, + {edram_store_depth_unorm_cs, sizeof(edram_store_depth_unorm_cs), + L"EDRAM Store UNorm Depth"}, + {edram_load_depth_float_cs, sizeof(edram_load_depth_float_cs), + L"EDRAM Load Float Depth"}, + {edram_store_depth_float_cs, sizeof(edram_store_depth_float_cs), + L"EDRAM Store Float Depth"}, +}; + RenderTargetCache::RenderTargetCache(D3D12CommandProcessor* command_processor, RegisterFile* register_file) : command_processor_(command_processor), register_file_(register_file) 
{} RenderTargetCache::~RenderTargetCache() { Shutdown(); } -void RenderTargetCache::Shutdown() { ClearCache(); } +bool RenderTargetCache::Initialize() { + auto device = + command_processor_->GetD3D12Context()->GetD3D12Provider()->GetDevice(); + + // Create the buffer for reinterpreting EDRAM contents. + D3D12_RESOURCE_DESC edram_buffer_desc; + edram_buffer_desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + edram_buffer_desc.Alignment = 0; + // First 10 MB is guest pixel data, second 10 MB is 32-bit depth when using + // D24FS8 so loads/stores don't corrupt multipass rendering. + edram_buffer_desc.Width = 2 * 2048 * 5120; + edram_buffer_desc.Height = 1; + edram_buffer_desc.DepthOrArraySize = 1; + edram_buffer_desc.MipLevels = 1; + edram_buffer_desc.Format = DXGI_FORMAT_UNKNOWN; + edram_buffer_desc.SampleDesc.Count = 1; + edram_buffer_desc.SampleDesc.Quality = 0; + edram_buffer_desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; + edram_buffer_desc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS; + D3D12_HEAP_PROPERTIES edram_buffer_heap_properties = {}; + edram_buffer_heap_properties.Type = D3D12_HEAP_TYPE_DEFAULT; + // The first operation will be a clear. + edram_buffer_state_ = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + if (FAILED(device->CreateCommittedResource( + &edram_buffer_heap_properties, D3D12_HEAP_FLAG_NONE, + &edram_buffer_desc, edram_buffer_state_, nullptr, + IID_PPV_ARGS(&edram_buffer_)))) { + XELOGE("Failed to create the EDRAM buffer"); + return false; + } + edram_buffer_cleared_ = false; + + // Create the root signature for EDRAM buffer load/store. + D3D12_ROOT_PARAMETER root_parameters[2]; + // Parameter 0 is constants (changed for each render target binding). 
+ root_parameters[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; + root_parameters[0].Constants.ShaderRegister = 0; + root_parameters[0].Constants.RegisterSpace = 0; + root_parameters[0].Constants.Num32BitValues = + sizeof(EDRAMLoadStoreRootConstants) / sizeof(uint32_t); + root_parameters[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + // Parameter 1 is source and target. + D3D12_DESCRIPTOR_RANGE root_load_store_ranges[2]; + root_load_store_ranges[0].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV; + root_load_store_ranges[0].NumDescriptors = 1; + root_load_store_ranges[0].BaseShaderRegister = 0; + root_load_store_ranges[0].RegisterSpace = 0; + root_load_store_ranges[0].OffsetInDescriptorsFromTableStart = 0; + root_load_store_ranges[1].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_UAV; + root_load_store_ranges[1].NumDescriptors = 1; + root_load_store_ranges[1].BaseShaderRegister = 0; + root_load_store_ranges[1].RegisterSpace = 0; + root_load_store_ranges[1].OffsetInDescriptorsFromTableStart = 1; + root_parameters[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; + root_parameters[1].DescriptorTable.NumDescriptorRanges = 2; + root_parameters[1].DescriptorTable.pDescriptorRanges = root_load_store_ranges; + root_parameters[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + D3D12_ROOT_SIGNATURE_DESC root_signature_desc; + root_signature_desc.NumParameters = UINT(xe::countof(root_parameters)); + root_signature_desc.pParameters = root_parameters; + root_signature_desc.NumStaticSamplers = 0; + root_signature_desc.pStaticSamplers = nullptr; + root_signature_desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE; + ID3DBlob* root_signature_blob; + ID3DBlob* root_signature_error_blob = nullptr; + if (FAILED(D3D12SerializeRootSignature( + &root_signature_desc, D3D_ROOT_SIGNATURE_VERSION_1, + &root_signature_blob, &root_signature_error_blob))) { + XELOGE("Failed to serialize the EDRAM buffer load/store root signature"); + if (root_signature_error_blob != nullptr) 
{ + XELOGE("%s", reinterpret_cast<const char*>( + root_signature_error_blob->GetBufferPointer())); + root_signature_error_blob->Release(); + } + Shutdown(); + return false; + } + if (root_signature_error_blob != nullptr) { + root_signature_error_blob->Release(); + } + if (FAILED(device->CreateRootSignature( + 0, root_signature_blob->GetBufferPointer(), + root_signature_blob->GetBufferSize(), + IID_PPV_ARGS(&edram_load_store_root_signature_)))) { + XELOGE("Failed to create the EDRAM buffer load/store root signature"); + root_signature_blob->Release(); + Shutdown(); + return false; + } + root_signature_blob->Release(); + + // Create the load/store pipelines. + D3D12_COMPUTE_PIPELINE_STATE_DESC pipeline_desc; + pipeline_desc.pRootSignature = edram_load_store_root_signature_; + pipeline_desc.NodeMask = 0; + pipeline_desc.CachedPSO.pCachedBlob = nullptr; + pipeline_desc.CachedPSO.CachedBlobSizeInBytes = 0; + pipeline_desc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE; + for (uint32_t i = 0; i < uint32_t(EDRAMLoadStorePipelineIndex::kCount); ++i) { + const EDRAMLoadStorePipelineInfo& pipeline_info = + edram_load_store_pipeline_info_[i]; + pipeline_desc.CS.pShaderBytecode = pipeline_info.shader; + pipeline_desc.CS.BytecodeLength = pipeline_info.shader_size; + if (FAILED(device->CreateComputePipelineState( + &pipeline_desc, IID_PPV_ARGS(&edram_load_store_pipelines_[i])))) { + XELOGE("Failed to create EDRAM load/store pipeline for mode %u", i); + Shutdown(); + return false; + } + } + + return true; +} + +void RenderTargetCache::Shutdown() { + ClearCache(); + + // Release the load/store pipelines before the root signature they were + // created with. Entries that were never created stay null, so this is safe + // after a partially failed Initialize and when Shutdown is called again. + for (uint32_t i = 0; i < uint32_t(EDRAMLoadStorePipelineIndex::kCount); ++i) { + if (edram_load_store_pipelines_[i] != nullptr) { + edram_load_store_pipelines_[i]->Release(); + edram_load_store_pipelines_[i] = nullptr; + } + } + + if (edram_load_store_root_signature_ != nullptr) { + edram_load_store_root_signature_->Release(); + edram_load_store_root_signature_ = nullptr; + } + + if (edram_buffer_ != nullptr) { + edram_buffer_->Release(); + edram_buffer_ = nullptr; + } +} void RenderTargetCache::ClearCache() { for (auto render_target_pair : render_targets_) { @@ -334,7 +497,7 @@ bool RenderTargetCache::UpdateRenderTargets() { uint32_t heap_usage[5] = {}; if 
(full_update) { // Export the currently bound render targets before we ruin the bindings. - WriteRenderTargetsToEDRAM(); + StoreRenderTargetsToEDRAM(); ClearBindings(); current_surface_pitch_ = surface_pitch; @@ -527,7 +690,7 @@ bool RenderTargetCache::UpdateRenderTargets() { } void RenderTargetCache::EndFrame() { - WriteRenderTargetsToEDRAM(); + StoreRenderTargetsToEDRAM(); ClearBindings(); } @@ -709,6 +872,7 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget( } ++descriptor_heap->descriptors_used; + // Get the layout for copying to the EDRAM buffer. RenderTarget* render_target = new RenderTarget; render_target->resource = resource; render_target->state = state; @@ -716,11 +880,245 @@ RenderTargetCache::RenderTarget* RenderTargetCache::FindOrCreateRenderTarget( render_target->key = key; render_target->heap_page_first = heap_page_first; render_target->heap_page_count = heap_page_count; + UINT64 copy_buffer_size; + device->GetCopyableFootprints(&resource_desc, 0, key.is_depth ? 2 : 1, 0, + render_target->footprints, nullptr, nullptr, + &copy_buffer_size); + render_target->copy_buffer_size = uint32_t(copy_buffer_size); render_targets_.insert(std::make_pair(key.value, render_target)); return render_target; } -void RenderTargetCache::WriteRenderTargetsToEDRAM() {} +void RenderTargetCache::StoreRenderTargetsToEDRAM() { + auto command_list = command_processor_->GetCurrentCommandList(); + if (command_list == nullptr) { + return; + } + + uint32_t surface_pitch_ss = + current_surface_pitch_ * + (current_msaa_samples_ >= MsaaSamples::k4X ? 2 : 1); + uint32_t surface_pitch_tiles = (surface_pitch_ss + 79) / 80; + assert_true(surface_pitch_tiles != 0); + + // TODO(Triang3l): Clear the buffer if calling for the first time. 
+ + uint32_t store_bindings[5]; + uint32_t store_binding_count = 0; + + D3D12_RESOURCE_BARRIER barriers[6]; + uint32_t barrier_count; + + // Extract only the render targets that need to be stored, transition them to + // copy sources and calculate intermediate buffer size. + uint32_t copy_buffer_size = 0; + barrier_count = 0; + for (uint32_t i = 0; i < 5; ++i) { + const RenderTargetBinding& binding = current_bindings_[i]; + RenderTarget* render_target = binding.render_target; + // TODO(Triang3l): Change edram_dirty_length to dirty row count. + if (!binding.is_bound || render_target == nullptr || + binding.edram_dirty_length < surface_pitch_tiles) { + continue; + } + store_bindings[store_binding_count] = i; + copy_buffer_size = + std::max(copy_buffer_size, render_target->copy_buffer_size); + ++store_binding_count; + if (render_target->state != D3D12_RESOURCE_STATE_COPY_SOURCE) { + D3D12_RESOURCE_BARRIER& barrier = barriers[barrier_count++]; + barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; + barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE; + barrier.Transition.pResource = render_target->resource; + barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; + barrier.Transition.StateBefore = render_target->state; + barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE; + render_target->state = D3D12_RESOURCE_STATE_COPY_SOURCE; + } + } + if (store_binding_count == 0) { + return; + } + if (edram_buffer_state_ != D3D12_RESOURCE_STATE_UNORDERED_ACCESS) { + // Also transition the EDRAM buffer to UAV. 
+ D3D12_RESOURCE_BARRIER& barrier = barriers[barrier_count++]; + barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; + barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE; + barrier.Transition.pResource = edram_buffer_; + barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; + barrier.Transition.StateBefore = edram_buffer_state_; + barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + edram_buffer_state_ = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; + } + if (barrier_count != 0) { + command_list->ResourceBarrier(barrier_count, barriers); + } + + // Allocate descriptors for the buffers. + D3D12_CPU_DESCRIPTOR_HANDLE descriptor_cpu_start; + D3D12_GPU_DESCRIPTOR_HANDLE descriptor_gpu_start; + if (command_processor_->RequestViewDescriptors(0, 2, 2, descriptor_cpu_start, + descriptor_gpu_start) == 0) { + return; + } + + // Get the buffer for copying. + D3D12_RESOURCE_STATES copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST; + ID3D12Resource* copy_buffer = command_processor_->RequestScratchGPUBuffer( + copy_buffer_size, copy_buffer_state); + if (copy_buffer == nullptr) { + return; + } + + // Prepare for writing. 
+ auto provider = command_processor_->GetD3D12Context()->GetD3D12Provider(); + auto device = provider->GetDevice(); + auto descriptor_size_view = provider->GetDescriptorSizeView(); + D3D12_SHADER_RESOURCE_VIEW_DESC srv_desc; + srv_desc.Format = DXGI_FORMAT_R32_TYPELESS; + srv_desc.ViewDimension = D3D12_SRV_DIMENSION_BUFFER; + srv_desc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; + srv_desc.Buffer.FirstElement = 0; + srv_desc.Buffer.NumElements = copy_buffer_size >> 2; + srv_desc.Buffer.StructureByteStride = 0; + srv_desc.Buffer.Flags = D3D12_BUFFER_SRV_FLAG_RAW; + device->CreateShaderResourceView(copy_buffer, &srv_desc, + descriptor_cpu_start); + D3D12_UNORDERED_ACCESS_VIEW_DESC uav_desc; + uav_desc.Format = DXGI_FORMAT_R32_TYPELESS; + uav_desc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER; + uav_desc.Buffer.FirstElement = 0; + uav_desc.Buffer.NumElements = 2 * 2048 * 1280; + uav_desc.Buffer.StructureByteStride = 0; + uav_desc.Buffer.CounterOffsetInBytes = 0; + uav_desc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_RAW; + D3D12_CPU_DESCRIPTOR_HANDLE uav_cpu_handle; + uav_cpu_handle.ptr = descriptor_cpu_start.ptr + descriptor_size_view; + device->CreateUnorderedAccessView(edram_buffer_, nullptr, &uav_desc, + uav_cpu_handle); + command_list->SetComputeRootSignature(edram_load_store_root_signature_); + command_list->SetComputeRootDescriptorTable(1, descriptor_gpu_start); + + // Sort the bindings in ascending order of EDRAM base so data in the render + // targets placed farther in EDRAM isn't lost in case of overlap. The binding + // index only breaks ties between equal bases: std::sort requires a strict + // weak ordering, so the index must not be consulted when the bases differ. + std::sort( + store_bindings, store_bindings + store_binding_count, + [this](uint32_t a, uint32_t b) { + if (current_bindings_[a].edram_base != current_bindings_[b].edram_base) { + return current_bindings_[a].edram_base < current_bindings_[b].edram_base; + } + return a < b; + }); + + // Store each render target. 
+ for (uint32_t i = 0; i < store_binding_count; ++i) { + const RenderTargetBinding& binding = current_bindings_[store_bindings[i]]; + const RenderTarget* render_target = binding.render_target; + EDRAMLoadStorePipelineIndex pipeline_index; + bool is_64bpp = false; + if (render_target->key.is_depth) { + if (DepthRenderTargetFormat(render_target->key.format) == + DepthRenderTargetFormat::kD24FS8) { + pipeline_index = EDRAMLoadStorePipelineIndex::kDepthFloatStore; + } else { + pipeline_index = EDRAMLoadStorePipelineIndex::kDepthUnormStore; + } + } else { + switch (ColorRenderTargetFormat(render_target->key.format)) { + case ColorRenderTargetFormat::k_8_8_8_8: + case ColorRenderTargetFormat::k_8_8_8_8_GAMMA: + case ColorRenderTargetFormat::k_2_10_10_10: + case ColorRenderTargetFormat::k_16_16: + case ColorRenderTargetFormat::k_16_16_FLOAT: + case ColorRenderTargetFormat::k_2_10_10_10_AS_16_16_16_16: + case ColorRenderTargetFormat::k_32_FLOAT: + pipeline_index = EDRAMLoadStorePipelineIndex::kColor32bppStore; + break; + case ColorRenderTargetFormat::k_16_16_16_16: + case ColorRenderTargetFormat::k_16_16_16_16_FLOAT: + case ColorRenderTargetFormat::k_32_32_FLOAT: + pipeline_index = EDRAMLoadStorePipelineIndex::kColor64bppStore; + is_64bpp = true; + break; + case ColorRenderTargetFormat::k_2_10_10_10_FLOAT: + case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: + pipeline_index = EDRAMLoadStorePipelineIndex::kColor7e3Store; + break; + default: + assert_unhandled_case(render_target->key.format); + continue; + } + } + + D3D12_TEXTURE_COPY_LOCATION location_source, location_dest; + location_source.pResource = render_target->resource; + location_source.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX; + location_source.SubresourceIndex = 0; + location_dest.pResource = copy_buffer; + location_dest.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT; + location_dest.PlacedFootprint = render_target->footprints[0]; + // TODO(Triang3l): Box for color render targets. 
+ command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source, + nullptr); + EDRAMLoadStoreRootConstants root_constants; + root_constants.base_tiles = binding.edram_base; + root_constants.pitch_tiles = surface_pitch_tiles * (is_64bpp ? 2 : 1); + root_constants.rt_color_depth_pitch = + location_dest.PlacedFootprint.Footprint.RowPitch; + if (render_target->key.is_depth) { + location_source.SubresourceIndex = 1; + location_dest.PlacedFootprint = render_target->footprints[1]; + command_list->CopyTextureRegion(&location_dest, 0, 0, 0, &location_source, + nullptr); + root_constants.rt_stencil_offset = + uint32_t(location_dest.PlacedFootprint.Offset); + root_constants.rt_stencil_pitch = + location_dest.PlacedFootprint.Footprint.RowPitch; + } + + // Transition the copy buffer to SRV. + barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; + barriers[0].Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE; + barriers[0].Transition.pResource = copy_buffer; + barriers[0].Transition.Subresource = + D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; + barriers[0].Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST; + barriers[0].Transition.StateAfter = + D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE; + copy_buffer_state = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE; + command_list->ResourceBarrier(1, barriers); + + // Store the data. + command_list->SetComputeRoot32BitConstants( + 0, sizeof(root_constants) / sizeof(uint32_t), &root_constants, 0); + command_processor_->SetPipeline( + edram_load_store_pipelines_[size_t(pipeline_index)]); + command_list->Dispatch( + root_constants.pitch_tiles, + binding.edram_dirty_length / root_constants.pitch_tiles, 1); + + // Commit the UAV write and prepare for copying again. 
+ barrier_count = 1; + barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_UAV; + barriers[0].Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE; + barriers[0].UAV.pResource = edram_buffer_; + if (i + 1 < store_binding_count) { + barrier_count = 2; + barriers[1].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; + barriers[1].Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE; + barriers[1].Transition.pResource = copy_buffer; + barriers[1].Transition.Subresource = + D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; + barriers[1].Transition.StateBefore = + D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE; + barriers[1].Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_DEST; + copy_buffer_state = D3D12_RESOURCE_STATE_COPY_DEST; + } + command_list->ResourceBarrier(barrier_count, barriers); + } + + command_processor_->ReleaseScratchGPUBuffer(copy_buffer, copy_buffer_state); +} } // namespace d3d12 } // namespace gpu diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index b300f3e8d..9f5ff69f2 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -201,6 +201,7 @@ class RenderTargetCache { RegisterFile* register_file); ~RenderTargetCache(); + bool Initialize(); void Shutdown(); void ClearCache(); @@ -233,6 +234,27 @@ class RenderTargetCache { } private: + enum class EDRAMLoadStorePipelineIndex { + kColor32bppLoad, + kColor32bppStore, + kColor64bppLoad, + kColor64bppStore, + kColor7e3Load, + kColor7e3Store, + kDepthUnormLoad, + kDepthUnormStore, + kDepthFloatLoad, + kDepthFloatStore, + + kCount + }; + + struct EDRAMLoadStorePipelineInfo { + const void* shader; + size_t shader_size; + const WCHAR* name; + }; + union RenderTargetKey { struct { // Supersampled (_ss - scaled 2x if needed) dimensions, divided by 80x16. @@ -267,8 +289,12 @@ class RenderTargetCache { RenderTargetKey key; // The first 4 MB page in the heaps. uint32_t heap_page_first; - // Number of 4 MB pages this render target uses. 
+ // The number of 4 MB pages this render target uses. uint32_t heap_page_count; + // Color/depth and stencil layouts. + D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprints[2]; + // Buffer size needed to copy the render target to the EDRAM buffer. + uint32_t copy_buffer_size; }; struct RenderTargetBinding { @@ -294,13 +320,34 @@ class RenderTargetCache { RenderTarget* FindOrCreateRenderTarget(RenderTargetKey key, uint32_t heap_page_first); - // Must be in a frame to call. Writes the dirty areas of the currently bound + // Must be in a frame to call. Stores the dirty areas of the currently bound // render targets and marks them as clean. - void WriteRenderTargetsToEDRAM(); + void StoreRenderTargetsToEDRAM(); D3D12CommandProcessor* command_processor_; RegisterFile* register_file_; + // The EDRAM buffer allowing color and depth data to be reinterpreted. + ID3D12Resource* edram_buffer_ = nullptr; + D3D12_RESOURCE_STATES edram_buffer_state_; + bool edram_buffer_cleared_; + + // EDRAM buffer load/store root signature. + ID3D12RootSignature* edram_load_store_root_signature_ = nullptr; + struct EDRAMLoadStoreRootConstants { + uint32_t base_tiles; + uint32_t pitch_tiles; + uint32_t rt_color_depth_pitch; + uint32_t rt_stencil_offset; + uint32_t rt_stencil_pitch; + }; + // EDRAM buffer load/store pipelines. + static const EDRAMLoadStorePipelineInfo + edram_load_store_pipeline_info_[size_t( + EDRAMLoadStorePipelineIndex::kCount)]; + ID3D12PipelineState* edram_load_store_pipelines_[size_t( + EDRAMLoadStorePipelineIndex::kCount)] = {}; + // 32 MB heaps backing used render targets resources, created when needed. // 24 MB proved to be not enough to store a single render target occupying the // entire EDRAM - a 32-bit depth/stencil one - at some resolution. 
diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl new file mode 100644 index 000000000..cd4079c67 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_32bpp.cs.hlsl @@ -0,0 +1,14 @@ +#include "edram_load_store.hlsli" + +[numthreads(20, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + uint2 tile_dword_index = xe_group_thread_id.xy; + tile_dword_index.x *= 4u; + uint4 pixels = xe_edram_load_store_source.Load4( + XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); + uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u; + xe_edram_load_store_dest.Store4(rt_offset, pixels); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl new file mode 100644 index 000000000..273ee41cf --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_64bpp.cs.hlsl @@ -0,0 +1,19 @@ +#include "edram_load_store.hlsli" + +[numthreads(40, 8, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + // One tile contains 80x8 texels, and 2 rows within an 80x16 tile contain data + // from 1 render target row rather than 2. Threads with X 0-19 are for the + // first row, with 20-39 are for the second. 
+ uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u); + [flatten] if (xe_group_thread_id.x >= 20u) { + tile_dword_index += uint2(uint(-80), 1u); + } + uint4 pixels = xe_edram_load_store_source.Load4( + XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); + uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u; + xe_edram_load_store_dest.Store4(rt_offset, pixels); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl new file mode 100644 index 000000000..dd8611ae6 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_load_color_7e3.cs.hlsl @@ -0,0 +1,20 @@ +#include "edram_load_store.hlsli" +#include "pixel_formats.hlsli" + +[numthreads(40, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + uint2 tile_dword_index = xe_group_thread_id.xy; + tile_dword_index.x *= 2u; + uint2 pixels_7e3_packed = xe_edram_load_store_source.Load2( + XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); + uint4 pixel_0_f16u32 = XeFloat7e3To16(pixels_7e3_packed.x); + uint4 pixel_1_f16u32 = XeFloat7e3To16(pixels_7e3_packed.y); + uint4 pixels_f16u32_packed = + uint4(pixel_0_f16u32.xz, pixel_1_f16u32.xz) | + (uint4(pixel_0_f16u32.yw, pixel_1_f16u32.yw) << 16u); + uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u; + xe_edram_load_store_dest.Store4(rt_offset, pixels_f16u32_packed); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl new file mode 100644 index 000000000..fc2644705 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_float.cs.hlsl @@ -0,0 +1,31 @@ +#include "edram_load_store.hlsli" +#include "pixel_formats.hlsli" + +[numthreads(20, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 
xe_thread_id : SV_DispatchThreadID) { + uint2 tile_dword_index = xe_group_thread_id.xy; + tile_dword_index.x *= 4u; + uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index); + uint4 depth24_stencil = xe_edram_load_store_source.Load4(edram_offset); + uint4 depth24 = depth24_stencil & 0xFFFFFFu; + uint4 depth32 = xe_edram_load_store_source.Load4(10485760u + edram_offset); + // Depth. If the stored 32-bit depth converted to 24-bit is the same as the + // stored 24-bit depth, load the 32-bit value because it has more precision + // (and multipass rendering is possible), if it's not, convert the 24-bit + // depth because it was overwritten by aliasing. + uint4 depth24to32 = XeFloat20e4To32(depth24); + uint4 depth = depth24to32 + (depth32 - depth24to32) * + uint4(XeFloat32To20e4(depth32) == depth24); + uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u; + xe_edram_load_store_dest.Store4(rt_offset, depth); + // Stencil. + uint4 stencil = (depth24_stencil >> 24u) << uint4(0u, 8u, 16u, 24u); + stencil.xy |= stencil.zw; + stencil.x |= stencil.y; + rt_offset = xe_edram_rt_stencil_offset + + xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u; + xe_edram_load_store_dest.Store(rt_offset, stencil.x); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl new file mode 100644 index 000000000..0d85248c7 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_load_depth_unorm.cs.hlsl @@ -0,0 +1,22 @@ +#include "edram_load_store.hlsli" + +[numthreads(20, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + uint2 tile_dword_index = xe_group_thread_id.xy; + tile_dword_index.x *= 4u; + uint4 pixels = xe_edram_load_store_source.Load4( + XeEDRAMOffset(xe_group_id.xy, tile_dword_index)); + // Depth. 
+ uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u; + xe_edram_load_store_dest.Store4(rt_offset, pixels & 0xFFFFFFu); + // Stencil. + uint4 stencil = (pixels >> 24u) << uint4(0u, 8u, 16u, 24u); + stencil.xy |= stencil.zw; + stencil.x |= stencil.y; + rt_offset = xe_edram_rt_stencil_offset + + xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u; + xe_edram_load_store_dest.Store(rt_offset, stencil.x); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli new file mode 100644 index 000000000..f7636266a --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_load_store.hlsli @@ -0,0 +1,21 @@ +#ifndef XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_ +#define XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_ + +cbuffer XeEDRAMLoadStoreConstants : register(b0) { + uint xe_edram_base_tiles; + uint xe_edram_pitch_tiles; + uint xe_edram_rt_color_depth_pitch; + uint xe_edram_rt_stencil_offset; + uint xe_edram_rt_stencil_pitch; +}; + +ByteAddressBuffer xe_edram_load_store_source : register(t0); +RWByteAddressBuffer xe_edram_load_store_dest : register(u0); + +uint XeEDRAMOffset(uint2 tile_index, uint2 tile_dword_index) { + return (xe_edram_base_tiles + (tile_index.y * xe_edram_pitch_tiles) + + tile_index.x) * 5120u + tile_dword_index.y * 320u + + tile_dword_index.x * 4u; +} + +#endif // XENIA_GPU_D3D12_SHADERS_EDRAM_LOAD_STORE_HLSLI_ diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl new file mode 100644 index 000000000..584416fdb --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_32bpp.cs.hlsl @@ -0,0 +1,14 @@ +#include "edram_load_store.hlsli" + +[numthreads(20, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch 
+ + xe_thread_id.x * 16u; + uint4 pixels = xe_edram_load_store_source.Load4(rt_offset); + uint2 tile_dword_index = xe_group_thread_id.xy; + tile_dword_index.x *= 4u; + xe_edram_load_store_dest.Store4( + XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl new file mode 100644 index 000000000..ec3cab476 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_64bpp.cs.hlsl @@ -0,0 +1,19 @@ +#include "edram_load_store.hlsli" + +[numthreads(40, 8, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u; + uint4 pixels = xe_edram_load_store_source.Load4(rt_offset); + // One tile contains 80x8 texels, and 2 rows within an 80x16 tile contain data + // from 1 render target row rather than 2. Threads with X 0-19 are for the + // first row, with 20-39 are for the second. 
+ uint2 tile_dword_index = xe_group_thread_id.xy * uint2(4u, 2u); + [flatten] if (xe_group_thread_id.x >= 20u) { + tile_dword_index += uint2(uint(-80), 1u); + } + xe_edram_load_store_dest.Store4( + XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl new file mode 100644 index 000000000..7b9c5cc03 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_store_color_7e3.cs.hlsl @@ -0,0 +1,19 @@ +#include "edram_load_store.hlsli" +#include "pixel_formats.hlsli" + +[numthreads(40, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u; + uint4 pixels_f16u32_packed = xe_edram_load_store_source.Load4(rt_offset); + uint4 pixel_0_f16u32 = pixels_f16u32_packed.xxyy >> uint4(0u, 16u, 0u, 16u); + uint4 pixel_1_f16u32 = pixels_f16u32_packed.zzww >> uint4(0u, 16u, 0u, 16u); + uint2 pixels_7e3_packed = + uint2(XeFloat16To7e3(pixel_0_f16u32), XeFloat16To7e3(pixel_1_f16u32)); + uint2 tile_dword_index = xe_group_thread_id.xy; + tile_dword_index.x *= 2u; + xe_edram_load_store_dest.Store2( + XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels_7e3_packed); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl new file mode 100644 index 000000000..17cb1acdf --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_float.cs.hlsl @@ -0,0 +1,25 @@ +#include "edram_load_store.hlsli" +#include "pixel_formats.hlsli" + +[numthreads(20, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + // Depth (4 pixels of 32-bit float depth per thread, converted to 20e4).
+ uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u; + uint4 depth32 = xe_edram_load_store_source.Load4(rt_offset); + uint4 depth24_stencil = XeFloat32To20e4(depth32); + // Stencil (4 bytes per thread, byte i goes to bits 24-31 of pixel i). + rt_offset = xe_edram_rt_stencil_offset + + xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u; + depth24_stencil |= xe_edram_load_store_source.Load(rt_offset).xxxx >> + uint4(0u, 8u, 16u, 24u) << 24u; + uint2 tile_dword_index = xe_group_thread_id.xy; + tile_dword_index.x *= 4u; + uint edram_offset = XeEDRAMOffset(xe_group_id.xy, tile_dword_index); + // Store 24-bit depth for aliasing and checking if 32-bit depth is up to date. + xe_edram_load_store_dest.Store4(edram_offset, depth24_stencil); + // Store 32-bit depth so precision isn't lost when doing multipass rendering. + xe_edram_load_store_dest.Store4(10485760u + edram_offset, depth32); +} diff --git a/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl new file mode 100644 index 000000000..3a318645e --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/edram_store_depth_unorm.cs.hlsl @@ -0,0 +1,20 @@ +#include "edram_load_store.hlsli" + +[numthreads(20, 16, 1)] +void main(uint3 xe_group_id : SV_GroupID, + uint3 xe_group_thread_id : SV_GroupThreadID, + uint3 xe_thread_id : SV_DispatchThreadID) { + // Depth (the low 24 bits of each of the thread's 4 pixels). + uint rt_offset = xe_thread_id.y * xe_edram_rt_color_depth_pitch + + xe_thread_id.x * 16u; + uint4 pixels = xe_edram_load_store_source.Load4(rt_offset) & 0xFFFFFFu; + // Stencil (4 bytes per thread, byte i goes to bits 24-31 of pixel i).
+ rt_offset = xe_edram_rt_stencil_offset + + xe_thread_id.y * xe_edram_rt_stencil_pitch + xe_thread_id.x * 4u; + pixels |= xe_edram_load_store_source.Load(rt_offset).xxxx >> + uint4(0u, 8u, 16u, 24u) << 24u; + uint2 tile_dword_index = xe_group_thread_id.xy; + tile_dword_index.x *= 4u; + xe_edram_load_store_dest.Store4( + XeEDRAMOffset(xe_group_id.xy, tile_dword_index), pixels); +} diff --git a/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli new file mode 100644 index 000000000..fbdbb0221 --- /dev/null +++ b/src/xenia/gpu/d3d12/shaders/pixel_formats.hlsli @@ -0,0 +1,74 @@ +#ifndef XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_ +#define XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_ + +// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp + +uint XeFloat16To7e3(uint4 rgba_f16u32) { + float4 rgba_f32 = f16tof32(rgba_f16u32); + uint3 rgb_f32u32 = asuint(rgba_f32.xyz); + // Keep only positive (high bit set means negative for both float and int) and + // saturate to 31.875 (also dropping NaNs). + rgb_f32u32 = uint3(clamp(int3(rgb_f32u32), 0, 0x41FF0000)); + uint3 normalized = rgb_f32u32 + 0xC2000000u; + uint3 denormalized = ((rgb_f32u32 & 0x7FFFFFu) | 0x800000u) >> + ((125u).xxx - (rgb_f32u32 >> 23u)); + uint3 rgb_f10u32 = normalized + (denormalized - normalized) * + uint3(rgb_f32u32 < 0x3E800000u); + rgb_f10u32 = + ((rgb_f10u32 + 0x7FFFu + ((rgb_f10u32 >> 16u) & 1u)) >> 16u) & 0x3FFu; + return rgb_f10u32.r | (rgb_f10u32.g << 10u) | (rgb_f10u32.b << 20u) | + (uint(saturate(rgba_f32.a) * 3.0 + 0.5) << 30u); // +0.5 rounds to nearest so a 7e3 load->store keeps alpha. +} + +uint4 XeFloat7e3To16(uint rgba_packed) { + uint3 rgb_f10u32 = (rgba_packed.xxx >> uint3(0u, 10u, 20u)) & 0x3FFu; + uint3 mantissa = rgb_f10u32 & 0x7Fu; + uint3 exponent = rgb_f10u32 >> 7u; + // Normalize the values for the denormalized components.
+ // Exponent = 1; + // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x80) == 0); + uint3 is_denormalized = uint3(exponent == 0u); + uint3 mantissa_lzcnt = (7u).xxx - firstbithigh(mantissa); + exponent += ((1u).xxx - mantissa_lzcnt - exponent) * is_denormalized; + mantissa += + (((mantissa << mantissa_lzcnt) & 0x7Fu) - mantissa) * is_denormalized; + // Combine into 32-bit float bits and clear zeros. + uint3 rgb_f32u32 = (((exponent + 124u) << 23u) | (mantissa << 16u)) * + uint3(rgb_f10u32 != 0u); + return f32tof16(float4(asfloat(rgb_f32u32), + float(rgba_packed >> 30u) * (1.0 / 3.0))); +} + +// Based on CFloat24 from d3dref9.dll and the 6e4 code from: +// https://github.com/Microsoft/DirectXTex/blob/master/DirectXTex/DirectXTexConvert.cpp +// 6e4 has a different exponent bias allowing [0,512) values, 20e4 allows [0,2). +// We also can't clamp the stored value to 1 as load->store->load must be exact. + +uint4 XeFloat32To20e4(uint4 f32u32) { + // Keep only positive (high bit set means negative for both float and int) and + // saturate to the maximum representable value near 2 (also dropping NaNs). + f32u32 = uint4(clamp(int4(f32u32), 0, 0x3FFFFFF8)); + uint4 normalized = f32u32 + 0xC8000000u; + uint4 denormalized = ((f32u32 & 0x7FFFFFu) | 0x800000u) >> + min((113u).xxxx - (f32u32 >> 23u), 24u); // Shift clamped: D3D uses only its low 5 bits, so 0.0 would not stay 0. + uint4 f24u32 = + normalized + (denormalized - normalized) * uint4(f32u32 < 0x38800000u); + return ((f24u32 + 3u + ((f24u32 >> 3u) & 1u)) >> 3u) & 0xFFFFFFu; +} + +uint4 XeFloat20e4To32(uint4 f24u32) { + uint4 mantissa = f24u32 & 0xFFFFFu; // Low 20 bits; the exponent is above them. + uint4 exponent = f24u32 >> 20u; + // Normalize the values for the denormalized components.
+ // Exponent = 1; + // do { Exponent--; Mantissa <<= 1; } while ((Mantissa & 0x100000) == 0); + uint4 is_denormalized = uint4(exponent == 0u); + uint4 mantissa_lzcnt = (20u).xxxx - firstbithigh(mantissa); + exponent += ((1u).xxxx - mantissa_lzcnt - exponent) * is_denormalized; + mantissa += + (((mantissa << mantissa_lzcnt) & 0xFFFFFu) - mantissa) * is_denormalized; + // Combine into 32-bit float bits and clear zeros. + return (((exponent + 112u) << 23u) | (mantissa << 3u)) * uint4(f24u32 != 0u); +} + +#endif // XENIA_GPU_D3D12_SHADERS_PIXEL_FORMATS_HLSLI_ diff --git a/src/xenia/gpu/d3d12/shared_memory.cc b/src/xenia/gpu/d3d12/shared_memory.cc index 033045336..7c60b696e 100644 --- a/src/xenia/gpu/d3d12/shared_memory.cc +++ b/src/xenia/gpu/d3d12/shared_memory.cc @@ -394,7 +394,7 @@ void SharedMemory::TransitionBuffer(D3D12_RESOURCE_STATES new_state, barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE; barrier.Transition.pResource = buffer_; - barrier.Transition.Subresource = 0; + barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; barrier.Transition.StateBefore = buffer_state_; barrier.Transition.StateAfter = new_state; command_list->ResourceBarrier(1, &barrier); diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index 8cb1e4b14..8e4db0f0c 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -741,7 +741,8 @@ bool TextureCache::LoadTextureData(Texture* texture) { if (copy_buffer_state != D3D12_RESOURCE_STATE_UNORDERED_ACCESS) { barriers[0].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; barriers[0].Transition.pResource = copy_buffer; - barriers[0].Transition.Subresource = 0; + barriers[0].Transition.Subresource = + D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; barriers[0].Transition.StateBefore = copy_buffer_state; barriers[0].Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; command_list->ResourceBarrier(1,
barriers); @@ -792,7 +793,8 @@ bool TextureCache::LoadTextureData(Texture* texture) { barriers[0].UAV.pResource = copy_buffer; barriers[1].Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; barriers[1].Transition.pResource = copy_buffer; - barriers[1].Transition.Subresource = 0; + barriers[1].Transition.Subresource = + D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; barriers[1].Transition.StateBefore = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; barriers[1].Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE; command_list->ResourceBarrier(2, barriers);