From cd631fc44705d752060215d740d2268e29973b07 Mon Sep 17 00:00:00 2001 From: Joel Linn Date: Tue, 8 Jun 2021 02:17:59 +0200 Subject: [PATCH 1/4] [APU, SDL] Refactor sample submission - Move sample conversion to SDL callback thread - Add early channel down-conversion --- src/xenia/apu/conversion.h | 53 +++++++++++++ src/xenia/apu/sdl/sdl_audio_driver.cc | 109 +++++++++++++++++--------- src/xenia/apu/sdl/sdl_audio_driver.h | 3 + 3 files changed, 127 insertions(+), 38 deletions(-) create mode 100644 src/xenia/apu/conversion.h diff --git a/src/xenia/apu/conversion.h b/src/xenia/apu/conversion.h new file mode 100644 index 000000000..2a487af78 --- /dev/null +++ b/src/xenia/apu/conversion.h @@ -0,0 +1,53 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2021 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_APU_CONVERSION_H_ +#define XENIA_APU_CONVERSION_H_ + +#include + +#include "xenia/base/byte_order.h" + +namespace xe { +namespace apu { +namespace conversion { + +inline void sequential_6_BE_to_interleaved_6_LE(float* output, + const float* input, + size_t ch_sample_count) { + for (size_t sample = 0; sample < ch_sample_count; sample++) { + for (size_t channel = 0; channel < 6; channel++) { + output[sample * 6 + channel] = + xe::byte_swap(input[channel * ch_sample_count + sample]); + } + } +} +inline void sequential_6_BE_to_interleaved_2_LE(float* output, + const float* input, + size_t ch_sample_count) { + // Default 5.1 channel mapping is fl, fr, fc, lf, bl, br + // https://docs.microsoft.com/en-us/windows/win32/xaudio2/xaudio2-default-channel-mapping + for (size_t sample = 0; sample < ch_sample_count; sample++) { + // put center on left and right, discard low frequency + float fl = xe::byte_swap(input[0 * ch_sample_count + sample]); + float fr = xe::byte_swap(input[1 * ch_sample_count + sample]); + float fc = xe::byte_swap(input[2 * ch_sample_count + sample]); + float bl = xe::byte_swap(input[4 * ch_sample_count + sample]); + float br = xe::byte_swap(input[5 * ch_sample_count + sample]); + float center_halved = fc * 0.5f; + output[sample * 2] = (fl + bl + center_halved) * (1.0f / 2.5f); + output[sample * 2 + 1] = (fr + br + center_halved) * (1.0f / 2.5f); + } +} + +} // namespace conversion +} // namespace apu +} // namespace xe + +#endif diff --git a/src/xenia/apu/sdl/sdl_audio_driver.cc b/src/xenia/apu/sdl/sdl_audio_driver.cc index a16e9e5d1..bdda565e1 100644 --- a/src/xenia/apu/sdl/sdl_audio_driver.cc +++ b/src/xenia/apu/sdl/sdl_audio_driver.cc @@ -10,9 +10,13 @@ #include "xenia/apu/sdl/sdl_audio_driver.h" #include +#include #include "xenia/apu/apu_flags.h" +#include "xenia/apu/conversion.h" +#include "xenia/base/assert.h" #include "xenia/base/logging.h" 
+#include "xenia/base/profiling.h" #include "xenia/helper/sdl/sdl_helper.h" namespace xe { @@ -46,41 +50,37 @@ bool SDLAudioDriver::Initialize() { } sdl_initialized_ = true; - SDL_AudioCallback audio_callback = [](void* userdata, Uint8* stream, - int len) -> void { - assert_true(len == frame_size_); - const auto driver = static_cast(userdata); - - std::unique_lock guard(driver->frames_mutex_); - if (driver->frames_queued_.empty()) { - memset(stream, 0, len); - } else { - auto buffer = driver->frames_queued_.front(); - driver->frames_queued_.pop(); - if (cvars::mute) { - memset(stream, 0, len); - } else { - memcpy(stream, buffer, len); - } - driver->frames_unused_.push(buffer); - - auto ret = driver->semaphore_->Release(1, nullptr); - assert_true(ret); + SDL_AudioSpec desired_spec = {}; + SDL_AudioSpec obtained_spec; + desired_spec.freq = frame_frequency_; + desired_spec.format = AUDIO_F32; + desired_spec.channels = frame_channels_; + desired_spec.samples = channel_samples_; + desired_spec.callback = SDLCallback; + desired_spec.userdata = this; + // Allow the hardware to decide between 5.1 and stereo + int allowed_change = SDL_AUDIO_ALLOW_CHANNELS_CHANGE; + for (int i = 0; i < 2; i++) { + sdl_device_id_ = SDL_OpenAudioDevice(nullptr, 0, &desired_spec, + &obtained_spec, allowed_change); + if (sdl_device_id_ <= 0) { + XELOGE("SDL_OpenAudioDevice() failed."); + return false; } - }; - - SDL_AudioSpec wanted_spec = {}; - wanted_spec.freq = frame_frequency_; - wanted_spec.format = AUDIO_F32; - wanted_spec.channels = frame_channels_; - wanted_spec.samples = channel_samples_; - wanted_spec.callback = audio_callback; - wanted_spec.userdata = this; - sdl_device_id_ = SDL_OpenAudioDevice(nullptr, 0, &wanted_spec, nullptr, 0); + if (obtained_spec.channels == 2 || obtained_spec.channels == 6) { + break; + } + // If the system is 4 or 7.1, let SDL convert + allowed_change = 0; + SDL_CloseAudioDevice(sdl_device_id_); + sdl_device_id_ = -1; + } if (sdl_device_id_ <= 0) { - 
XELOGE("SDL_OpenAudioDevice() failed."); + XELOGE("Failed to get a compatible SDL Audio Device."); return false; } + sdl_device_channels_ = obtained_spec.channels; + SDL_PauseAudioDevice(sdl_device_id_, 0); return true; @@ -99,13 +99,7 @@ void SDLAudioDriver::SubmitFrame(uint32_t frame_ptr) { } } - // interleave the data - for (size_t index = 0, o = 0; index < channel_samples_; ++index) { - for (size_t channel = 0, table = 0; channel < frame_channels_; - ++channel, table += channel_samples_) { - output_frame[o++] = xe::byte_swap(input_frame[table + index]); - } - } + std::memcpy(output_frame, input_frame, frame_samples_ * sizeof(float)); { std::unique_lock guard(frames_mutex_); @@ -133,6 +127,45 @@ void SDLAudioDriver::Shutdown() { }; } +void SDLAudioDriver::SDLCallback(void* userdata, Uint8* stream, int len) { + SCOPE_profile_cpu_f("apu"); + if (!userdata || !stream) { + XELOGE("SDLAudioDriver::sdl_callback called with nullptr."); + return; + } + const auto driver = static_cast<SDLAudioDriver*>(userdata); + assert_true(len == + sizeof(float) * channel_samples_ * driver->sdl_device_channels_); + + std::unique_lock guard(driver->frames_mutex_); + if (driver->frames_queued_.empty()) { + std::memset(stream, 0, len); + } else { + auto buffer = driver->frames_queued_.front(); + driver->frames_queued_.pop(); + if (cvars::mute) { + std::memset(stream, 0, len); + } else { + switch (driver->sdl_device_channels_) { + case 2: + conversion::sequential_6_BE_to_interleaved_2_LE( + reinterpret_cast<float*>(stream), buffer, channel_samples_); + break; + case 6: + conversion::sequential_6_BE_to_interleaved_6_LE( + reinterpret_cast<float*>(stream), buffer, channel_samples_); + break; + default: + assert_unhandled_case(driver->sdl_device_channels_); + break; + } + } + driver->frames_unused_.push(buffer); + + auto ret = driver->semaphore_->Release(1, nullptr); + assert_true(ret); + } +}; } // namespace sdl } // namespace apu } // namespace xe diff --git a/src/xenia/apu/sdl/sdl_audio_driver.h 
b/src/xenia/apu/sdl/sdl_audio_driver.h index cb957105f..3d32b86e0 100644 --- a/src/xenia/apu/sdl/sdl_audio_driver.h +++ b/src/xenia/apu/sdl/sdl_audio_driver.h @@ -32,10 +32,13 @@ class SDLAudioDriver : public AudioDriver { void Shutdown(); protected: + static void SDLCallback(void* userdata, Uint8* stream, int len); + xe::threading::Semaphore* semaphore_ = nullptr; SDL_AudioDeviceID sdl_device_id_ = -1; bool sdl_initialized_ = false; + uint8_t sdl_device_channels_ = 0; static const uint32_t frame_frequency_ = 48000; static const uint32_t frame_channels_ = 6; From 0ad939b2f10571b03a8652dc1df8a5e444bbac13 Mon Sep 17 00:00:00 2001 From: Joel Linn Date: Tue, 8 Jun 2021 03:36:07 +0200 Subject: [PATCH 2/4] [APU] Add AVX intrinsic variants for conversion --- src/xenia/apu/conversion.h | 66 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/src/xenia/apu/conversion.h b/src/xenia/apu/conversion.h index 2a487af78..211243348 100644 --- a/src/xenia/apu/conversion.h +++ b/src/xenia/apu/conversion.h @@ -13,11 +13,76 @@ #include #include "xenia/base/byte_order.h" +#include "xenia/base/platform.h" namespace xe { namespace apu { namespace conversion { +#if XE_ARCH_AMD64 +inline void sequential_6_BE_to_interleaved_6_LE(float* output, + const float* input, + size_t ch_sample_count) { + const uint32_t* in = reinterpret_cast<const uint32_t*>(input); + uint32_t* out = reinterpret_cast<uint32_t*>(output); + const __m128i byte_swap_shuffle = + _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); + for (size_t sample = 0; sample < ch_sample_count; sample++) { + __m128i sample0 = _mm_set_epi32( + in[3 * ch_sample_count + sample], in[2 * ch_sample_count + sample], + in[1 * ch_sample_count + sample], in[0 * ch_sample_count + sample]); + uint32_t sample1 = in[4 * ch_sample_count + sample]; + uint32_t sample2 = in[5 * ch_sample_count + sample]; + sample0 = _mm_shuffle_epi8(sample0, byte_swap_shuffle); + _mm_storeu_si128(reinterpret_cast<__m128i*>(&out[sample * 6]), 
sample0); + sample1 = xe::byte_swap(sample1); + out[sample * 6 + 4] = sample1; + sample2 = xe::byte_swap(sample2); + out[sample * 6 + 5] = sample2; + } +} + +inline void sequential_6_BE_to_interleaved_2_LE(float* output, + const float* input, + size_t ch_sample_count) { + assert_true(ch_sample_count % 4 == 0); + const uint32_t* in = reinterpret_cast<const uint32_t*>(input); + uint32_t* out = reinterpret_cast<uint32_t*>(output); + const __m128i byte_swap_shuffle = + _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); + const __m128 half = _mm_set1_ps(0.5f); + const __m128 two_fifths = _mm_set1_ps(1.0f / 2.5f); + + // put center on left and right, discard low frequency + for (size_t sample = 0; sample < ch_sample_count; sample += 4) { + // load 4 samples from each of the 5 mixed channels (lf is skipped) + __m128 fl = _mm_loadu_ps(&input[0 * ch_sample_count + sample]); + __m128 fr = _mm_loadu_ps(&input[1 * ch_sample_count + sample]); + __m128 fc = _mm_loadu_ps(&input[2 * ch_sample_count + sample]); + __m128 bl = _mm_loadu_ps(&input[4 * ch_sample_count + sample]); + __m128 br = _mm_loadu_ps(&input[5 * ch_sample_count + sample]); + // byte swap + fl = _mm_castsi128_ps( + _mm_shuffle_epi8(_mm_castps_si128(fl), byte_swap_shuffle)); + fr = _mm_castsi128_ps( + _mm_shuffle_epi8(_mm_castps_si128(fr), byte_swap_shuffle)); + fc = _mm_castsi128_ps( + _mm_shuffle_epi8(_mm_castps_si128(fc), byte_swap_shuffle)); + bl = _mm_castsi128_ps( + _mm_shuffle_epi8(_mm_castps_si128(bl), byte_swap_shuffle)); + br = _mm_castsi128_ps( + _mm_shuffle_epi8(_mm_castps_si128(br), byte_swap_shuffle)); + + __m128 center_halved = _mm_mul_ps(fc, half); + __m128 left = _mm_add_ps(_mm_add_ps(fl, bl), center_halved); + __m128 right = _mm_add_ps(_mm_add_ps(fr, br), center_halved); + left = _mm_mul_ps(left, two_fifths); + right = _mm_mul_ps(right, two_fifths); + _mm_storeu_ps(&output[sample * 2], _mm_unpacklo_ps(left, right)); + _mm_storeu_ps(&output[(sample + 2) * 2], _mm_unpackhi_ps(left, right)); + } +} +#else inline void 
sequential_6_BE_to_interleaved_6_LE(float* output, const float* input, size_t ch_sample_count) { @@ -45,6 +110,7 @@ inline void sequential_6_BE_to_interleaved_2_LE(float* output, output[sample * 2 + 1] = (fr + br + center_halved) * (1.0f / 2.5f); } } +#endif } // namespace conversion } // namespace apu From f15e3d07e737e4f9e4d73a647b90446f237277d9 Mon Sep 17 00:00:00 2001 From: Joel Linn Date: Tue, 8 Jun 2021 03:53:18 +0200 Subject: [PATCH 3/4] [APU] Use vectorized converter in xaudio2 backend --- src/xenia/apu/xaudio2/xaudio2_audio_driver.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc b/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc index 4d3ef9c37..f393706c9 100644 --- a/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc +++ b/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc @@ -13,6 +13,7 @@ #include "xenia/base/platform_win.h" #include "xenia/apu/apu_flags.h" +#include "xenia/apu/conversion.h" #include "xenia/base/clock.h" #include "xenia/base/logging.h" @@ -208,12 +209,8 @@ void XAudio2AudioDriver::SubmitFrame(uint32_t frame_ptr) { auto interleave_channels = frame_channels_; // interleave the data - for (uint32_t index = 0, o = 0; index < channel_samples_; ++index) { - for (uint32_t channel = 0, table = 0; channel < interleave_channels; - ++channel, table += channel_samples_) { - output_frame[o++] = xe::byte_swap(input_frame[table + index]); - } - } + conversion::sequential_6_BE_to_interleaved_6_LE(output_frame, input_frame, + channel_samples_); api::XAUDIO2_BUFFER buffer; buffer.Flags = 0; From 357aa1cdd36208d33ace4d7a651a5c452a3c49d8 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Mon, 14 Jun 2021 17:30:56 +0300 Subject: [PATCH 4/4] [GPU] Fix RT view and pipeline sRGB mismatch --- .../gpu/d3d12/d3d12_command_processor.cc | 1 + .../gpu/d3d12/d3d12_render_target_cache.cc | 39 +++++-------- .../gpu/d3d12/d3d12_render_target_cache.h | 13 ++--- src/xenia/gpu/render_target_cache.cc | 56 
++++++++++--------- src/xenia/gpu/render_target_cache.h | 36 ++++++------ 5 files changed, 66 insertions(+), 79 deletions(-) diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 7259efc68..fca476673 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1928,6 +1928,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, if (host_render_targets_used) { bound_depth_and_color_render_target_bits = render_target_cache_->GetLastUpdateBoundRenderTargets( + render_target_cache_->gamma_render_target_as_srgb(), bound_depth_and_color_render_target_formats); } else { bound_depth_and_color_render_target_bits = 0; diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc index aac898d4e..234b3f2ae 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc @@ -1858,9 +1858,10 @@ DXGI_FORMAT D3D12RenderTargetCache::GetColorDrawDXGIFormat( xenos::ColorRenderTargetFormat format) const { switch (format) { case xenos::ColorRenderTargetFormat::k_8_8_8_8: - case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: - // sRGB is handled in a different way, not via the RenderTargetKey format. return DXGI_FORMAT_R8G8B8A8_UNORM; + case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: + return gamma_render_target_as_srgb_ ? 
DXGI_FORMAT_R8G8B8A8_UNORM_SRGB + : DXGI_FORMAT_R8G8B8A8_UNORM; case xenos::ColorRenderTargetFormat::k_16_16: return DXGI_FORMAT_R16G16_SNORM; case xenos::ColorRenderTargetFormat::k_16_16_16_16: @@ -1954,20 +1955,6 @@ DXGI_FORMAT D3D12RenderTargetCache::GetDepthSRVStencilDXGIFormat( } } -xenos::ColorRenderTargetFormat -D3D12RenderTargetCache::GetHostRelevantColorFormat( - xenos::ColorRenderTargetFormat format) const { - switch (format) { - case xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA: - // Currently handled in the shader (with incorrect blending), but even if - // handling is changed (to true sRGB), it will still be able to alias it - // with R8G8B8A8_UNORM. - return xenos::ColorRenderTargetFormat::k_8_8_8_8; - default: - return format; - } -} - RenderTargetCache::RenderTarget* D3D12RenderTargetCache::CreateRenderTarget( RenderTargetKey key) { ID3D12Device* device = @@ -1990,7 +1977,7 @@ RenderTargetCache::RenderTarget* D3D12RenderTargetCache::CreateRenderTarget( assert_true(resource_desc.Format != DXGI_FORMAT_UNKNOWN); if (resource_desc.Format == DXGI_FORMAT_UNKNOWN) { XELOGE("D3D12RenderTargetCache: Unknown {} render target format {}", - key.is_depth ? "depth" : "color", key.host_relevant_format); + key.is_depth ? 
"depth" : "color", key.resource_format); return nullptr; } if (key.msaa_samples == xenos::MsaaSamples::k2X && !msaa_2x_supported()) { @@ -2228,16 +2215,16 @@ D3D12RenderTargetCache::GetOrCreateTransferPipelines(TransferShaderKey key) { bool dest_is_color = (mode.output == TransferOutput::kColor); xenos::ColorRenderTargetFormat dest_color_format = - xenos::ColorRenderTargetFormat(key.dest_host_relevant_format); + xenos::ColorRenderTargetFormat(key.dest_resource_format); xenos::DepthRenderTargetFormat dest_depth_format = - xenos::DepthRenderTargetFormat(key.dest_host_relevant_format); + xenos::DepthRenderTargetFormat(key.dest_resource_format); bool dest_is_64bpp = dest_is_color && xenos::IsColorRenderTargetFormat64bpp(dest_color_format); xenos::ColorRenderTargetFormat source_color_format = - xenos::ColorRenderTargetFormat(key.source_host_relevant_format); + xenos::ColorRenderTargetFormat(key.source_resource_format); xenos::DepthRenderTargetFormat source_depth_format = - xenos::DepthRenderTargetFormat(key.source_host_relevant_format); + xenos::DepthRenderTargetFormat(key.source_resource_format); // If not source_is_color, it's depth / stencil - 40-sample columns are // swapped as opposed to color destination. bool source_is_color = (rs & kTransferUsedRootParameterColorSRVBit) != 0; @@ -4920,8 +4907,8 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears( uint32_t rt_sort_index = 0; TransferShaderKey new_transfer_shader_key; new_transfer_shader_key.dest_msaa_samples = dest_rt_key.msaa_samples; - new_transfer_shader_key.dest_host_relevant_format = - dest_rt_key.host_relevant_format; + new_transfer_shader_key.dest_resource_format = + dest_rt_key.resource_format; uint32_t stencil_clear_rectangle_count = 0; for (uint32_t j = 0; j <= uint32_t(need_stencil_bit_draws); ++j) { // j == 0 - color or depth. 
@@ -4958,8 +4945,8 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears( RenderTargetKey source_rt_key = source_d3d12_rt.key(); new_transfer_shader_key.source_msaa_samples = source_rt_key.msaa_samples; - new_transfer_shader_key.source_host_relevant_format = - source_rt_key.host_relevant_format; + new_transfer_shader_key.source_resource_format = + source_rt_key.resource_format; bool host_depth_source_is_copy = host_depth_source_d3d12_rt == &dest_d3d12_rt; new_transfer_shader_key.host_depth_source_is_copy = @@ -6492,7 +6479,7 @@ void D3D12RenderTargetCache::DumpRenderTargets(uint32_t dump_base, any_sources_32bpp_64bpp[size_t(rt_key.Is64bpp())] = true; DumpPipelineKey pipeline_key; pipeline_key.msaa_samples = rt_key.msaa_samples; - pipeline_key.host_relevant_format = rt_key.host_relevant_format; + pipeline_key.resource_format = rt_key.resource_format; pipeline_key.is_depth = rt_key.is_depth; dump_invocations_.emplace_back(rectangle, pipeline_key); } diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h index db2795b06..ea9115251 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h @@ -224,9 +224,6 @@ class D3D12RenderTargetCache final : public RenderTargetCache { return D3D12_REQ_TEXTURE2D_U_OR_V_DIMENSION; } - xenos::ColorRenderTargetFormat GetHostRelevantColorFormat( - xenos::ColorRenderTargetFormat format) const override; - RenderTarget* CreateRenderTarget(RenderTargetKey key) override; bool IsHostDepthEncodingDifferent( @@ -418,14 +415,14 @@ class D3D12RenderTargetCache final : public RenderTargetCache { union TransferShaderKey { struct { xenos::MsaaSamples dest_msaa_samples : xenos::kMsaaSamplesBits; - uint32_t dest_host_relevant_format : xenos::kRenderTargetFormatBits; + uint32_t dest_resource_format : xenos::kRenderTargetFormatBits; xenos::MsaaSamples source_msaa_samples : xenos::kMsaaSamplesBits; // Always 1x when 
host_depth_source_is_copy is true not to create the same // pipeline for different MSAA sample counts as it doesn't matter in this // case. xenos::MsaaSamples host_depth_source_msaa_samples : xenos::kMsaaSamplesBits; - uint32_t source_host_relevant_format : xenos::kRenderTargetFormatBits; + uint32_t source_resource_format : xenos::kRenderTargetFormatBits; // If host depth is also fetched, whether it's pre-copied to the EDRAM // buffer (but since it's just a scratch buffer, with tiles laid out // linearly with the same pitch as in the original render target; also no @@ -557,7 +554,7 @@ class D3D12RenderTargetCache final : public RenderTargetCache { union DumpPipelineKey { struct { xenos::MsaaSamples msaa_samples : 2; - uint32_t host_relevant_format : 4; + uint32_t resource_format : 4; // Last bit because this affects the root signature - after sorting, only // change it at most once. Depth buffers have an additional stencil SRV. uint32_t is_depth : 1; @@ -580,11 +577,11 @@ class D3D12RenderTargetCache final : public RenderTargetCache { xenos::ColorRenderTargetFormat GetColorFormat() const { assert_false(is_depth); - return xenos::ColorRenderTargetFormat(host_relevant_format); + return xenos::ColorRenderTargetFormat(resource_format); } xenos::DepthRenderTargetFormat GetDepthFormat() const { assert_true(is_depth); - return xenos::DepthRenderTargetFormat(host_relevant_format); + return xenos::DepthRenderTargetFormat(resource_format); } }; diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index 573cb29a0..2b4ea97ef 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -424,7 +424,7 @@ bool RenderTargetCache::Update(bool is_rasterization_done, uint32_t depth_and_color_rts_used_bits = 0; // depth_and_color_rts_used_bits -> EDRAM base. 
uint32_t edram_bases[1 + xenos::kMaxColorRenderTargets]; - uint32_t host_relevant_formats[1 + xenos::kMaxColorRenderTargets]; + uint32_t resource_formats[1 + xenos::kMaxColorRenderTargets]; uint32_t rts_are_64bpp = 0; uint32_t color_rts_are_gamma = 0; if (is_rasterization_done) { @@ -438,7 +438,7 @@ bool RenderTargetCache::Update(bool is_rasterization_done, std::min(rb_depth_info.depth_base, xenos::kEdramTileCount); // With pixel shader interlock, always the same addressing disregarding // the format. - host_relevant_formats[0] = + resource_formats[0] = interlock_barrier_only ? 0 : uint32_t(rb_depth_info.depth_format); } if (regs.Get().edram_mode == @@ -468,20 +468,19 @@ bool RenderTargetCache::Update(bool is_rasterization_done, if (color_format == xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA) { color_rts_are_gamma |= uint32_t(1) << rt_index; } - xenos::ColorRenderTargetFormat color_host_relevant_format; + xenos::ColorRenderTargetFormat color_resource_format; if (interlock_barrier_only) { // Only changes in mapping between coordinates and addresses are // interesting (along with access overlap between draw calls), thus // only pixel size is relevant. - color_host_relevant_format = + color_resource_format = is_64bpp ? 
xenos::ColorRenderTargetFormat::k_16_16_16_16 : xenos::ColorRenderTargetFormat::k_8_8_8_8; } else { - color_host_relevant_format = GetHostRelevantColorFormat( + color_resource_format = GetColorResourceFormat( xenos::GetStorageColorFormat(color_format)); } - host_relevant_formats[rt_bit_index] = - uint32_t(color_host_relevant_format); + resource_formats[rt_bit_index] = uint32_t(color_resource_format); } } } @@ -659,7 +658,7 @@ bool RenderTargetCache::Update(bool is_rasterization_done, rt_key.pitch_tiles_at_32bpp = pitch_tiles_at_32bpp; rt_key.msaa_samples = msaa_samples; rt_key.is_depth = rt_bit_index == 0; - rt_key.host_relevant_format = host_relevant_formats[rt_bit_index]; + rt_key.resource_format = resource_formats[rt_bit_index]; if (!interlock_barrier_only) { RenderTarget* render_target = GetOrCreateRenderTarget(rt_key); if (!render_target) { @@ -801,10 +800,11 @@ bool RenderTargetCache::Update(bool is_rasterization_done, } uint32_t RenderTargetCache::GetLastUpdateBoundRenderTargets( - uint32_t* depth_and_color_formats_out) const { + bool distinguish_gamma_formats, + uint32_t* depth_and_color_resource_formats_out) const { if (GetPath() != Path::kHostRenderTargets) { - if (depth_and_color_formats_out) { - std::memset(depth_and_color_formats_out, 0, + if (depth_and_color_resource_formats_out) { + std::memset(depth_and_color_resource_formats_out, 0, sizeof(uint32_t) * (1 + xenos::kMaxColorRenderTargets)); } return 0; @@ -814,15 +814,19 @@ uint32_t RenderTargetCache::GetLastUpdateBoundRenderTargets( const RenderTarget* render_target = last_update_accumulated_render_targets_[i]; if (!render_target) { - if (depth_and_color_formats_out) { - depth_and_color_formats_out[i] = 0; + if (depth_and_color_resource_formats_out) { + depth_and_color_resource_formats_out[i] = 0; } continue; } rts_used |= uint32_t(1) << i; - if (depth_and_color_formats_out) { - depth_and_color_formats_out[i] = - render_target->key().host_relevant_format; + if (depth_and_color_resource_formats_out) 
{ + depth_and_color_resource_formats_out[i] = + (distinguish_gamma_formats && i && + (last_update_accumulated_color_targets_are_gamma_ & + (uint32_t(1) << (i - 1)))) + ? uint32_t(xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA) + : render_target->key().resource_format; } } return rts_used; @@ -1083,7 +1087,7 @@ bool RenderTargetCache::PrepareHostRenderTargetsResolveClear( depth_render_target_key.pitch_tiles_at_32bpp = pitch_tiles_at_32bpp; depth_render_target_key.msaa_samples = msaa_samples; depth_render_target_key.is_depth = 1; - depth_render_target_key.host_relevant_format = + depth_render_target_key.resource_format = resolve_info.depth_edram_info.format; depth_render_target = GetOrCreateRenderTarget(depth_render_target_key); if (!depth_render_target) { @@ -1098,9 +1102,8 @@ bool RenderTargetCache::PrepareHostRenderTargetsResolveClear( color_render_target_key.pitch_tiles_at_32bpp = pitch_tiles_at_32bpp; color_render_target_key.msaa_samples = msaa_samples; color_render_target_key.is_depth = 0; - color_render_target_key.host_relevant_format = - uint32_t(GetHostRelevantColorFormat(xenos::ColorRenderTargetFormat( - resolve_info.color_edram_info.format))); + color_render_target_key.resource_format = uint32_t(GetColorResourceFormat( + xenos::ColorRenderTargetFormat(resolve_info.color_edram_info.format))); color_render_target = GetOrCreateRenderTarget(color_render_target_key); if (!color_render_target) { color_render_target_key = RenderTargetKey(); @@ -1161,8 +1164,8 @@ RenderTargetCache::PrepareFullEdram1280xRenderTargetForSnapshotRestoration( } RenderTargetKey render_target_key; render_target_key.pitch_tiles_at_32bpp = kPitchTilesAt32bpp; - render_target_key.host_relevant_format = uint32_t( - GetHostRelevantColorFormat(xenos::GetStorageColorFormat(color_format))); + render_target_key.resource_format = + uint32_t(GetColorResourceFormat(color_format)); RenderTarget* render_target = GetOrCreateRenderTarget(render_target_key); if (!render_target) { return nullptr; @@ 
-1214,14 +1217,14 @@ RenderTargetCache::RenderTarget* RenderTargetCache::GetOrCreateRenderTarget( "Created a {}x{} {}xMSAA {} render target with guest format {} at " "EDRAM base {}", width, height, uint32_t(1) << uint32_t(key.msaa_samples), - key.is_depth ? "depth" : "color", key.host_relevant_format, + key.is_depth ? "depth" : "color", key.resource_format, key.base_tiles); } else { XELOGE( "Failed to create a {}x{} {}xMSAA {} render target with guest format " "{} at EDRAM base {}", width, height, uint32_t(1) << uint32_t(key.msaa_samples), - key.is_depth ? "depth" : "color", key.host_relevant_format, + key.is_depth ? "depth" : "color", key.resource_format, key.base_tiles); } // Insert even if failed to create, not to try to create again. @@ -1339,8 +1342,7 @@ void RenderTargetCache::ChangeOwnership( nullptr, resolve_clear_cutout)) { RenderTargetKey transfer_host_depth_source = host_depth_encoding_different - ? it->second - .host_depth_render_targets[dest.host_relevant_format] + ? it->second.host_depth_render_targets[dest.resource_format] : RenderTargetKey(); if (transfer_host_depth_source == transfer_source) { // Same render target, don't provide a separate host depth source. @@ -1385,7 +1387,7 @@ void RenderTargetCache::ChangeOwnership( // Claim the current range. it->second.render_target = dest; if (host_depth_encoding_different) { - it->second.host_depth_render_targets[dest.host_relevant_format] = dest; + it->second.host_depth_render_targets[dest.resource_format] = dest; } // Check if can merge with the next range after claiming. std::map::iterator it_next; diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h index 74207a58c..383c12f94 100644 --- a/src/xenia/gpu/render_target_cache.h +++ b/src/xenia/gpu/render_target_cache.h @@ -181,8 +181,10 @@ class RenderTargetCache { // Returns bits where 0 is whether a depth render target is currently bound on // the host and 1... 
are whether the same applies to color render targets, and - // "host-relevant" formats of each. + // formats (resource formats, but if needed, with gamma taken into account) of + // each. uint32_t GetLastUpdateBoundRenderTargets( + bool distinguish_gamma_formats, uint32_t* depth_and_color_formats_out = nullptr) const; protected: @@ -223,11 +225,8 @@ class RenderTargetCache { uint32_t pitch_tiles_at_32bpp : 8; // 19 xenos::MsaaSamples msaa_samples : xenos::kMsaaSamplesBits; // 21 uint32_t is_depth : 1; // 22 - // Not always the original format - blending precision ignored, formats - // handled through the same render targets on the host are normalized, and - // with pixel shader interlock, replaced with some single 32bpp or 64bpp - // format because it's only needed for addressing. - uint32_t host_relevant_format : xenos::kRenderTargetFormatBits; // 26 + // Ignoring the blending precision and sRGB. + uint32_t resource_format : xenos::kRenderTargetFormatBits; // 26 }; uint32_t key = 0; struct Hasher { @@ -250,11 +249,11 @@ class RenderTargetCache { xenos::ColorRenderTargetFormat GetColorFormat() const { assert_false(is_depth); - return xenos::ColorRenderTargetFormat(host_relevant_format); + return xenos::ColorRenderTargetFormat(resource_format); } xenos::DepthRenderTargetFormat GetDepthFormat() const { assert_true(is_depth); - return xenos::DepthRenderTargetFormat(host_relevant_format); + return xenos::DepthRenderTargetFormat(resource_format); } bool Is64bpp() const { if (is_depth) { @@ -436,15 +435,6 @@ class RenderTargetCache { uint32_t GetRenderTargetHeight(uint32_t pitch_tiles_at_32bpp, xenos::MsaaSamples msaa_samples) const; - // Normalizes the format if it's fine to use the same render target textures - // for the provided and the returned guest formats. - // xenos::GetStorageColorFormat is supposed to be done before calling, so - // redoing what it does in the implementations is not needed. 
- virtual xenos::ColorRenderTargetFormat GetHostRelevantColorFormat( - xenos::ColorRenderTargetFormat format) const { - return format; - } - virtual RenderTarget* CreateRenderTarget(RenderTargetKey key) = 0; // Whether depth buffer is encoded differently on the host, thus after @@ -567,7 +557,7 @@ class RenderTargetCache { return false; } if (host_depth_encoding_different && !key.is_depth && - host_depth_render_targets[key.host_relevant_format] != key) { + host_depth_render_targets[key.resource_format] != key) { // Depth encoding is the same, but different addressing is needed. return false; } @@ -582,6 +572,16 @@ class RenderTargetCache { } }; + static constexpr xenos::ColorRenderTargetFormat GetColorResourceFormat( + xenos::ColorRenderTargetFormat format) { + // sRGB, if used on the host, is a view property or global state - linear + // and sRGB host render targets can share data directly without transfers. + if (format == xenos::ColorRenderTargetFormat::k_8_8_8_8_GAMMA) { + return xenos::ColorRenderTargetFormat::k_8_8_8_8; + } + return xenos::GetStorageColorFormat(format); + } + RenderTarget* GetOrCreateRenderTarget(RenderTargetKey key); // Checks if changing ownership of the range to the specified render target